In [5]:
import numpy as np
import random


In [6]:
# Tic-Tac-Toe Game Environment
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)
    
    def available_actions(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]
    
    def make_move(self, row, col, player):
        self.board[row, col] = player
    
    def check_winner(self, player):
        for i in range(3):
            if np.all(self.board[i, :] == player) or np.all(self.board[:, i] == player):
                return True
        if np.all(np.diag(self.board) == player) or np.all(np.diag(np.fliplr(self.board)) == player):
            return True
        return False
    
    def is_full(self):
        return len(self.available_actions()) == 0
    
    def reset(self):
        self.board = np.zeros((3, 3), dtype=int)

In [7]:
# Q-Learning Agent
class QLearningAgent:
    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.q_table = {}
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
    
    def get_state(self, board):
        return str(board.reshape(9))
    
    def choose_action(self, state, available_actions):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(available_actions)
        q_values = [self.q_table.get((state, action), 0) for action in available_actions]
        return available_actions[q_values.index(max(q_values))]
    
    def update_q_value(self, state, action, reward, next_state, next_actions):
        old_q = self.q_table.get((state, action), 0)
        future_q = max([self.q_table.get((next_state, a), 0) for a in next_actions], default=0)
        new_q = old_q + self.alpha * (reward + self.gamma * future_q - old_q)
        self.q_table[(state, action)] = new_q


In [8]:
# Training the Agent
def train_agent(episodes=1000):
    agent = QLearningAgent()
    game = TicTacToe()
    for _ in range(episodes):
        game.reset()
        state = agent.get_state(game.board)
        done = False
        player = 1
        while not done:
            available_actions = game.available_actions()
            if not available_actions: # If no actions are available (board is full), break
                break
            action = agent.choose_action(state, available_actions)
            game.make_move(action[0], action[1], player)

            if game.check_winner(player):
                reward = 1 if player == 1 else -1
                agent.update_q_value(state, action, reward, None, [])
                break
            if game.is_full(): # If the board is full, it's a draw
                reward = 0
                agent.update_q_value(state, action, reward, None, [])
                break
            next_state = agent.get_state(game.board)
            next_available_actions = game.available_actions()
            agent.update_q_value(state, action, 0, next_state, next_available_actions)

            state = next_state
            player = 3 - player # Switch player
            
    return agent


In [9]:
# Testing the Agent
def test_agent(agent, episodes=100):
    game = TicTacToe()
    wins = 0

    for _ in range(episodes):
        game.reset()
        state = agent.get_state(game.board)
        player = 1

        while True:
            available_actions = game.available_actions()
            if not available_actions: # If no actions are available (board is full), it's a draw
                break

            action = agent.choose_action(state, available_actions)
            game.make_move(action[0], action[1], player)

            if game.check_winner(player):
                if player == 1:
                    wins += 1
                break

            if game.is_full():
                break
            state = agent.get_state(game.board)
            player = 3 - player # Switch player
    print(f"Wins: {wins}/{episodes}")


In [17]:
# Run Training and Testing
agent = train_agent(episodes=1000)
test_agent(agent, episodes=100)


Wins: 88/100
