Set up environment

In [1]:
# Board representation: a 3x3 grid held as a list of three row lists.
# Cell codes -> 0: empty, 1: player X, 2: player O

board = [[0] * 3 for _ in range(3)]
current_player = 1  # 1 = X, 2 = O

In [2]:
def display_board(board):
    """
    Write the board to stdout: one line per row with cells separated by '|',
    each row followed by a five-dash rule (including the last row).
    """
    glyph_for = {0: ' ', 1: 'X', 2: 'O'}
    divider = '-' * 5
    for cells in board:
        rendered_row = '|'.join(glyph_for[value] for value in cells)
        print(rendered_row)
        print(divider)

In [3]:
def check_win(board):
    """
    Determine the winner of a 3x3 board.

    Returns the mark (1 for X, 2 for O) that fills a complete row, column,
    or diagonal, or 0 when no line is complete.
    """
    # Lines are listed in the scan order: row i, then column i, for i = 0..2,
    # followed by the main diagonal and the anti-diagonal.
    lines = []
    for idx in range(3):
        lines.append(((idx, 0), (idx, 1), (idx, 2)))
        lines.append(((0, idx), (1, idx), (2, idx)))
    lines.append(((0, 0), (1, 1), (2, 2)))
    lines.append(((0, 2), (1, 1), (2, 0)))

    for (r0, c0), (r1, c1), (r2, c2) in lines:
        # A line wins only if all three cells match and are not empty (0)
        if board[r0][c0] == board[r1][c1] == board[r2][c2] != 0:
            return board[r0][c0]
    return 0

In [4]:
def get_possible_moves(board):
    """
    Collect the coordinates of every empty cell (value 0) as (row, col)
    tuples, scanning the 3x3 board row by row.
    """
    return [
        (row_idx, col_idx)
        for row_idx in range(3)
        for col_idx in range(3)
        if board[row_idx][col_idx] == 0
    ]

In [5]:
def is_game_over(board):
    """
    Report whether play has finished: True when check_win finds a winner or
    when get_possible_moves finds no empty cell left (a draw), else False.
    """
    return check_win(board) != 0 or not get_possible_moves(board)

Define an agent

In [10]:
import random

class TTT_Agent:
    """
    Tabular tic-tac-toe learner that is rewarded once per finished game.

    The agent keeps a table ``q_table[state][move] -> value`` where ``state``
    is the board as a tuple of tuples and ``move`` is a ``(row, col)`` pair.
    Moves are picked epsilon-greedily, remembered in ``history``, and credited
    with the final reward when ``update_q_values`` is called.

    Args:
        player_num: Mark this agent plays (1 for X, 2 for O).
        exploration_rate: Probability of picking a uniformly random legal move
            instead of the best-known one.
        learning_rate: Step size in (0, 1] used when moving a stored value
            towards a new reward. 1.0 replaces the stored value outright.

    Raises:
        ValueError: If ``learning_rate`` is outside (0, 1].
    """

    def __init__(self, player_num=1, exploration_rate=0.2, learning_rate=0.1):
        if not 0 < learning_rate <= 1:
            raise ValueError("learning_rate must be in the interval (0, 1]")
        self.player_num = player_num
        self.q_table = {}
        self.exploration_rate = exploration_rate
        self.learning_rate = learning_rate
        self.history = [] # Stores (state, action) pairs

    def get_state_key(self, board):
        """Return a hashable snapshot of the board (tuple of row tuples)."""
        return tuple(tuple(row) for row in board)

    def choose_action(self, board):
        """
        Pick a move for the given board and record it in ``history``.

        Returns:
            The chosen ``(row, col)`` of an empty cell.

        Raises:
            ValueError: If the board has no empty cell to play.
        """
        state_key = self.get_state_key(board)
        possible_moves = get_possible_moves(board)

        # Fail clearly (and before touching q_table/history) instead of letting
        # random.choice([]) or max() of an empty sequence raise further down.
        if not possible_moves:
            raise ValueError("choose_action called on a board with no empty cells")

        # Initialize Q-values for new states
        if state_key not in self.q_table:
            self.q_table[state_key] = {move: 0 for move in possible_moves}

        # Epsilon-greedy strategy
        if random.random() < self.exploration_rate:
            move = random.choice(possible_moves)
        else:
            # Exploitation: find the best move
            q_values = self.q_table[state_key]
            max_q = max(q_values.values())

            # Break ties between equally good moves at random
            best_moves = [move for move, q_val in q_values.items() if q_val == max_q]
            move = random.choice(best_moves)

        # Record state and action for later reward update
        self.history.append((state_key, move))
        return move

    def update_q_values(self, reward):
        """
        Credit every move recorded since the last update with ``reward``.

        Each stored value is moved a fraction ``learning_rate`` of the way
        towards ``reward`` rather than being overwritten, so one game's result
        cannot erase what earlier games taught. ``history`` is emptied
        afterwards.
        """
        for state_key, move in self.history:
            action_values = self.q_table[state_key]
            action_values[move] += self.learning_rate * (reward - action_values[move])

        self.history = []

Train the agent

In [15]:
def play_game_against_self(agent1, agent2):
    """
    Play one complete game between two agents and report who won.

    agent1 always moves first and its moves are written to the board as
    mark 1; agent2's moves are written as mark 2. Each move is obtained from
    ``agent.choose_action(board)``.

    Returns:
        1 if agent1 wins, -1 if agent2 wins, 0 for a draw.
    """
    board = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
    current_player = 1

    while True:
        agent = agent1 if current_player == 1 else agent2

        row, col = agent.choose_action(board)
        board[row][col] = current_player

        winner = check_win(board)
        if winner != 0:
            # Marks are assigned by turn order (agent1 -> 1, agent2 -> 2), so
            # the result is decided by the mark itself. Comparing against
            # agent1.player_num would misreport the winner whenever agent1
            # was constructed with a player_num other than 1.
            return 1 if winner == 1 else -1

        if not get_possible_moves(board):
            return 0 # Draw

        # Switch between marks 1 and 2
        current_player = 3 - current_player

In [16]:
# One learner per side: X is player 1, O is player 2
agent_x, agent_o = TTT_Agent(player_num=1), TTT_Agent(player_num=2)

# Number of self-play games used for training
num_games = 60_000

In [17]:
# (reward for X, reward for O) keyed by the result of a self-play game;
# any other result (a draw) gives both agents a small positive reward.
rewards_by_result = {1: (1, -1), -1: (-1, 1)}
draw_rewards = (0.1, 0.1)

print("Training the agent...")
for i in range(num_games):
    winner = play_game_against_self(agent_x, agent_o)

    # Hand each agent its reward for the finished game
    reward_x, reward_o = rewards_by_result.get(winner, draw_rewards)
    agent_x.update_q_values(reward_x)
    agent_o.update_q_values(reward_o)

    # Every 1000th game (starting with the first), shrink exploration by 1%
    if i % 1000 == 0:
        for learner in (agent_x, agent_o):
            learner.exploration_rate *= 0.99

print("Training complete!")

Training the agent...
Training complete!


To test, play against the agent

In [18]:
def play_against_human(agent):
    """
    Play an interactive console game between a human and a trained agent.

    The human plays X (mark 1) and moves first; the agent plays O (mark 2).
    The board is printed before every turn and the result is printed when
    the game ends.

    While the game runs the agent's exploration is switched off so it always
    plays its best-known move, and the moves it records are kept out of its
    training history. Both attributes are restored when the function exits,
    including when it exits through an exception.

    Args:
        agent: Object exposing ``choose_action(board)`` plus the
            ``exploration_rate`` and ``history`` attributes.
    """
    board = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
    current_player = 1 # Human is always X

    # Evaluate the learned policy only: no random exploration, and no
    # leftover (state, move) pairs that a later training update would credit
    # with an unrelated game's reward.
    saved_exploration_rate = agent.exploration_rate
    saved_history = agent.history
    agent.exploration_rate = 0
    agent.history = []

    try:
        while True:
            display_board(board)

            if current_player == 1:
                # Human's turn
                try:
                    row = int(input("Enter row (0, 1, or 2): "))
                    col = int(input("Enter column (0, 1, or 2): "))
                except ValueError:
                    print("Invalid input. Please enter numbers 0-2.")
                    continue

                # Check the range explicitly: negative integers are valid list
                # indices in Python, so relying on IndexError would let "-1"
                # silently select the last row or column.
                if not (0 <= row < 3 and 0 <= col < 3):
                    print("Invalid input. Please enter numbers 0-2.")
                    continue

                if board[row][col] != 0:
                    print("This spot is taken. Try again.")
                    continue

                board[row][col] = 1
            else:
                # Agent's turn
                print("Agent is thinking...")
                move = agent.choose_action(board)
                row, col = move
                board[row][col] = 2 # Agent is always O
                print(f"Agent places O at ({row}, {col})")

            winner = check_win(board)
            if winner != 0:
                display_board(board)
                if winner == 1:
                    print("You win!")
                else:
                    print("Agent wins!")
                break

            if not get_possible_moves(board):
                display_board(board)
                print("It's a draw!")
                break

            # Switch between marks 1 (human) and 2 (agent)
            current_player = 3 - current_player
    finally:
        agent.exploration_rate = saved_exploration_rate
        agent.history = saved_history

# Start an interactive game against the trained agent.
# agent_o is used because play_against_human writes the agent's moves as O (mark 2).
play_against_human(agent_o) # The agent we trained as 'O'

 | | 
-----
 | | 
-----
 | | 
-----
Enter row (0, 1, or 2): 1
Enter column (0, 1, or 2): 1
 | | 
-----
 |X| 
-----
 | | 
-----
Agent is thinking...
Agent places O at (2, 0)
 | | 
-----
 |X| 
-----
O| | 
-----
Enter row (0, 1, or 2): 0
Enter column (0, 1, or 2): 0
X| | 
-----
 |X| 
-----
O| | 
-----
Agent is thinking...
Agent places O at (0, 2)
X| |O
-----
 |X| 
-----
O| | 
-----
Enter row (0, 1, or 2): 2
Enter column (0, 1, or 2): 1
X| |O
-----
 |X| 
-----
O|X| 
-----
Agent is thinking...
Agent places O at (1, 0)
X| |O
-----
O|X| 
-----
O|X| 
-----
Enter row (0, 1, or 2): 2
Enter column (0, 1, or 2): 2
X| |O
-----
O|X| 
-----
O|X|X
-----
You win!
