In [1]:
import numpy as np
import random
import pickle

In [2]:
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3))  # 3x3 board initialized with zeros
        self.player = 1  # Player 1 starts the game

    def available_moves(self):
        """Returns a list of available moves (positions) on the board."""
        return np.argwhere(self.board == 0)
    def make_move(self, row, col):
        """Places the player's mark (1 or -1) on the board at the given position."""
        if self.board[row, col] == 0:
            self.board[row, col] = self.player
            return True
        return False

    def switch_player(self):
        """Switches the current player (from 1 to -1, or vice versa)."""
        self.player *= -1

    def check_winner(self):
        """Checks if there's a winner or if the game is a draw."""
        for i in range(3):
            if abs(sum(self.board[i, :])) == 3:  # Check rows
                return self.board[i, 0]
            if abs(sum(self.board[:, i])) == 3:  # Check columns
                return self.board[0, i]
         # Check diagonals
        if abs(self.board.trace()) == 3:
            return self.board[0, 0]
        if abs(np.fliplr(self.board).trace()) == 3:
            return self.board[0, 2]
        # Check for draw
        if not self.available_moves().size:
            return 0  # Draw
        return None  # Game ongoing

In [3]:


# Task 2: Defining the Reinforcement Learning Model
class Agent:
    def __init__(self, player, epsilon=0.1, alpha=0.5, gamma=0.9):
        self.player = player  # 1 for player 1, -1 for player 2
        self.epsilon = epsilon  # Exploration rate
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.q_table = {}  # State-action value table

    def get_state(self, board):
        """Converts the board into a tuple to use as a state."""
        return tuple(map(tuple, board))

    def choose_action(self, game):
        """Chooses an action based on epsilon-greedy policy."""
        state = self.get_state(game.board)
        if random.uniform(0, 1) < self.epsilon:
            # Exploration: choose a random move
            available_moves = game.available_moves()
            move = random.choice(available_moves)
        else:
            # Exploitation: choose the best move
            q_values = self.q_table.get(state, {})
            if q_values:
                max_q = max(q_values.values())
                best_moves = [move for move, q in q_values.items() if q == max_q]
                move = random.choice(best_moves)
            else:
                move = random.choice(game.available_moves())
        return move

    def update_q_value(self, state, action, reward, next_state):
        """Updates the Q-value for the given state and action."""
        current_q = self.q_table.get(state, {}).get(action, 0)
        max_future_q = max(self.q_table.get(next_state, {}).values(), default=0)
        new_q = current_q + self.alpha * (reward + self.gamma * max_future_q - current_q)
        if state not in self.q_table:
            self.q_table[state] = {}
        self.q_table[state][action] = new_q

    def learn(self, game, reward):
        """Updates Q-values after the game is finished."""
        state = self.get_state(game.board)
        available_moves = game.available_moves()
        if available_moves.size:
            action = tuple(random.choice(available_moves))
            next_state = self.get_state(game.board)
            self.update_q_value(state, action, reward, next_state)

# Create agents for both players
player1 = Agent(player=1)
player2 = Agent(player=-1)

# Task 3: Training the Model
def play_game(player1, player2, episodes=10000):
    for episode in range(episodes):
        game = TicTacToe()
        while True:
            if game.player == 1:
                action = player1.choose_action(game)
            else:
                action = player2.choose_action(game)
            game.make_move(*action)

            winner = game.check_winner()
            if winner is not None:
                # Update Q-values after the game ends
                if winner == 1:
                    player1.learn(game, reward=1)
                    player2.learn(game, reward=-1)
                elif winner == -1:
                    player1.learn(game, reward=-1)
                    player2.learn(game, reward=1)
                else:
                    player1.learn(game, reward=0)  # Draw
                    player2.learn(game, reward=0)  # Draw
                break
            game.switch_player()

# Train the agents
play_game(player1, player2, episodes=10000)



In [4]:
# Save the trained model
with open('tictactoe_q_learning.pkl', 'wb') as f:
    pickle.dump((player1.q_table, player2.q_table), f)

# Task 4: Testing the Model
def test_agent(agent, episodes=100):
    """Test the performance of the trained agent against a random opponent."""
    wins, losses, draws = 0, 0, 0
    for episode in range(episodes):
        game = TicTacToe()
        while True:
            if game.player == 1:
                action = agent.choose_action(game)
            else:
                action = random.choice(game.available_moves())
            game.make_move(*action)

            winner = game.check_winner()
            if winner is not None:
                if winner == 1:
                    wins += 1
                elif winner == -1:
                    losses += 1
                else:
                    draws += 1
                break
            game.switch_player()

    return wins, losses, draws

# Test player1 agent (trained) against a random player
test_results = test_agent(player1, episodes=100)
test_results  # Display test results (wins, losses, draws)

(54, 33, 13)