In [7]:
import random
import numpy as np
from collections import defaultdict

# =====================================================
# a. Setting up the environment
# =====================================================
class TicTacToeEnv:
    def __init__(self):
        self.reset()

    def reset(self):
        self.board = [' '] * 9  # 3x3 board
        self.done = False
        self.winner = None
        return self.get_state()

    def get_state(self):
        return ''.join(self.board)

    def available_actions(self):
        return [i for i, v in enumerate(self.board) if v == ' ']

    def step(self, action, player):
        if self.board[action] != ' ' or self.done:
            return self.get_state(), -10, True  # illegal move
        self.board[action] = player

        # Check if game ended
        self.winner = self.check_winner()
        if self.winner:
            self.done = True
            return self.get_state(), 1 if self.winner == 'X' else -1, True
        elif ' ' not in self.board:
            self.done = True
            return self.get_state(), 0.5, True  # draw
        else:
            return self.get_state(), 0, False

    def check_winner(self):
        combos = [
            [0,1,2],[3,4,5],[6,7,8], # rows
            [0,3,6],[1,4,7],[2,5,8], # cols
            [0,4,8],[2,4,6]          # diagonals
        ]
        for a,b,c in combos:
            if self.board[a] == self.board[b] == self.board[c] and self.board[a] != ' ':
                return self.board[a]
        return None

    def render(self):
        print("\n".join(["|".join(self.board[i:i+3]) for i in range(0,9,3)]))
        print("-----")

# =====================================================
# b. Defining the Tic-Tac-Toe game (already done above)
# =====================================================

# =====================================================
# c. Building the reinforcement learning model (Q-learning)
# =====================================================
class QLearningAgent:
    def __init__(self, player='X', alpha=0.1, gamma=0.9, epsilon=1.0, epsilon_decay=0.9995, epsilon_min=0.05):
        self.q_table = defaultdict(float)  # maps (state, action) â†’ value
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.player = player

    def choose_action(self, state, available_actions):
        if random.random() < self.epsilon:
            return random.choice(available_actions)
        q_vals = [self.q_table[(state, a)] for a in available_actions]
        max_q = max(q_vals)
        best_actions = [a for a in available_actions if self.q_table[(state, a)] == max_q]
        return random.choice(best_actions)

    def update(self, state, action, reward, next_state, next_available, done):
        best_next = 0 if done else max([self.q_table[(next_state, a)] for a in next_available], default=0)
        old_value = self.q_table[(state, action)]
        self.q_table[(state, action)] = old_value + self.alpha * (reward + self.gamma * best_next - old_value)

    def decay_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

# =====================================================
# d. Training the model
# =====================================================
def train(agent, episodes=50000):
    env = TicTacToeEnv()
    opponent = 'O' if agent.player == 'X' else 'X'

    for ep in range(episodes):
        state = env.reset()
        done = False
        current_player = 'X'

        while not done:
            if current_player == agent.player:
                action = agent.choose_action(state, env.available_actions())
                next_state, reward, done = env.step(action, agent.player)
                agent.update(state, action, reward, next_state, env.available_actions(), done)
                state = next_state
                if done: 
                    break
            else:
                # Opponent plays randomly
                action = random.choice(env.available_actions())
                next_state, reward, done = env.step(action, opponent)
                state = next_state
                if done: 
                    # If opponent wins, negative reward to agent
                    if env.winner == opponent:
                        agent.update(state, action, -1, next_state, env.available_actions(), done)
                    break
            current_player = opponent if current_player == agent.player else agent.player

        agent.decay_epsilon()

    print("Training complete. Final epsilon:", agent.epsilon)

# =====================================================
# e. Testing the model
# =====================================================
def test(agent, games=20):
    env = TicTacToeEnv()
    opponent = 'O' if agent.player == 'X' else 'X'
    results = {"win": 0, "loss": 0, "draw": 0}

    for _ in range(games):
        state = env.reset()
        done = False
        current_player = 'X'

        while not done:
            if current_player == agent.player:
                action = agent.choose_action(state, env.available_actions())
                state, reward, done = env.step(action, agent.player)
            else:
                action = random.choice(env.available_actions())
                state, reward, done = env.step(action, opponent)

            if done:
                if env.winner == agent.player:
                    results["win"] += 1
                elif env.winner == opponent:
                    results["loss"] += 1
                else:
                    results["draw"] += 1
                break
            current_player = opponent if current_player == agent.player else agent.player

    print("Test Results:", results)


# =====================================================
# Run training and testing
# =====================================================
if __name__ == "__main__":
    agent = QLearningAgent(player='X')
    train(agent, episodes=50000)   # Train agent
    test(agent, games=500)          # Test agent

Training complete. Final epsilon: 0.05
Test Results: {'win': 408, 'loss': 63, 'draw': 29}
