In [8]:
import numpy as np
import random

In [9]:
class TicTacToe:
    def __init__(self):
        self.reset()

    def reset(self):
        self.board = np.zeros((3, 3), dtype=int)  # 3x3 board initialized to 0
        self.current_player = 1  # Player 1 starts
        return self.get_state()

    def get_state(self):
        return tuple(self.board.flatten())  # Flattened board as a tuple

    def available_actions(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]

    def make_move(self, action):
        i, j = action
        if self.board[i, j] == 0:
            self.board[i, j] = self.current_player
            self.current_player = -self.current_player  # Switch player
            return True
        return False

    def check_winner(self):
        for i in range(3):
            if abs(sum(self.board[i, :])) == 3:  # Row win
                return np.sign(sum(self.board[i, :]))
            if abs(sum(self.board[:, i])) == 3:  # Column win
                return np.sign(sum(self.board[:, i]))
        if abs(sum(self.board.diagonal())) == 3:  # Main diagonal
            return np.sign(sum(self.board.diagonal()))
        if abs(sum(np.fliplr(self.board).diagonal())) == 3:  # Anti-diagonal
            return np.sign(sum(np.fliplr(self.board).diagonal()))
        if not self.available_actions():
            return 0  # Draw
        return None  # Game ongoing


In [10]:
class QLearningAgent:
    def __init__(self, learning_rate=0.1, discount_factor=0.9, exploration_rate=1.0, exploration_decay=0.99):
        self.q_table = {}
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay

    def get_q_value(self, state, action):
        return self.q_table.get((state, action), 0)

    def choose_action(self, state, available_actions):
        if random.random() < self.exploration_rate:
            return random.choice(available_actions)
        q_values = [self.get_q_value(state, action) for action in available_actions]
        max_q = max(q_values)
        return available_actions[q_values.index(max_q)]

    def update_q_value(self, state, action, reward, next_state, next_available_actions):
        old_q = self.get_q_value(state, action)
        max_future_q = max([self.get_q_value(next_state, a) for a in next_available_actions], default=0)
        new_q = (1 - self.learning_rate) * old_q + self.learning_rate * (reward + self.discount_factor * max_future_q)
        self.q_table[(state, action)] = new_q

    def decay_exploration(self):
        self.exploration_rate *= self.exploration_decay


In [11]:
def train(agent, episodes=10000):
    game = TicTacToe()
    for episode in range(episodes):
        state = game.reset()
        done = False
        while not done:
            available_actions = game.available_actions()
            action = agent.choose_action(state, available_actions)
            game.make_move(action)
            reward = 0
            winner = game.check_winner()
            if winner is not None:
                done = True
                if winner == 1:
                    reward = 1  # Agent wins
                elif winner == -1:
                    reward = -1  # Opponent wins
                else:
                    reward = 0.5  # Draw
            next_state = game.get_state()
            next_available_actions = game.available_actions()
            agent.update_q_value(state, action, reward, next_state, next_available_actions)
            state = next_state
            if done:
                agent.decay_exploration()

In [12]:
def play_game(agent):
    game = TicTacToe()
    state = game.reset()
    done = False
    while not done:
        available_actions = game.available_actions()
        action = agent.choose_action(state, available_actions)
        game.make_move(action)
        print(f"Player {game.current_player * -1} moved to position {action}")
        print(game.board)
        winner = game.check_winner()
        if winner is not None:
            if winner == 1:
                print("Agent wins!")
            elif winner == -1:
                print("Opponent wins!")
            else:
                print("It's a draw!")
            break
        state = game.get_state()

In [13]:
# Initialize agent
agent = QLearningAgent()

# Train the agent
train(agent, episodes=10000)

# Test the trained agent
play_game(agent)

Player 1 moved to position (0, 0)
[[1 0 0]
 [0 0 0]
 [0 0 0]]
Player -1 moved to position (0, 1)
[[ 1 -1  0]
 [ 0  0  0]
 [ 0  0  0]]
Player 1 moved to position (2, 0)
[[ 1 -1  0]
 [ 0  0  0]
 [ 1  0  0]]
Player -1 moved to position (0, 2)
[[ 1 -1 -1]
 [ 0  0  0]
 [ 1  0  0]]
Player 1 moved to position (1, 0)
[[ 1 -1 -1]
 [ 1  0  0]
 [ 1  0  0]]
Agent wins!
