# Q Learning applied to TicTacToe.

Player X: the computer agent, which uses Q-learning to play TicTacToe.

Player O: places its mark on a randomly chosen empty cell of the board.

In [7]:
import numpy as np
import random

class TicTacToe:
    """Minimal tic-tac-toe board backed by a 3x3 NumPy array of strings.

    A cell holding a single space is empty; any other value is the mark of
    the player who claimed it.
    """

    def __init__(self):
        self.board = np.full((3, 3), ' ', dtype=str)

    def valid_moves(self):
        """Return the (row, col) pairs of all empty cells in row-major order."""
        open_cells = []
        for row in range(3):
            for col in range(3):
                if self.board[row, col] == ' ':
                    open_cells.append((row, col))
        return open_cells

    def make_move(self, move, player):
        """Write ``player`` into cell ``move`` if that cell is empty.

        Returns True when the mark was placed, and False (leaving the board
        untouched) when the cell was already taken.
        """
        if self.board[move] != ' ':
            return False
        self.board[move] = player
        return True

    def check_winner(self, player):
        """Return True if ``player`` fills a whole row, column or diagonal."""
        owned = self.board == player
        lines = [owned[k, :] for k in range(3)]
        lines += [owned[:, k] for k in range(3)]
        lines.append(owned.diagonal())
        # Reversing the columns turns the anti-diagonal into the main one.
        lines.append(owned[:, ::-1].diagonal())
        return any(line.all() for line in lines)

    def is_full(self):
        """Return True when no cell is empty."""
        return bool((self.board != ' ').all())

def _fresh_q_row(board):
    """Return initial action values for ``board``: 0 for empty cells, -inf for occupied ones."""
    row = np.zeros(9)
    # Masking occupied cells keeps every max/argmax over the row on legal moves.
    row[(board != ' ').ravel()] = -np.inf
    return row


def train_agent(episodes=10000, learning_rate=0.1, discount_factor=0.9, epsilon=0.2):
    """Learn a tabular Q-function for tic-tac-toe by epsilon-greedy self-play.

    Both marks are played from one shared table. The entry for a state holds
    the value of each move from the point of view of the player whose turn it
    is in that state.

    Args:
        episodes: Number of self-play games to run.
        learning_rate: Step size of each Q-value update.
        discount_factor: Weight given to the value of the successor state.
        epsilon: Probability of playing a random legal move instead of the
            greedy one.

    Returns:
        A dict mapping a board state (a tuple of three row tuples) to a
        length-9 NumPy array of action values indexed by ``3 * row + col``.
        Entries for occupied cells are ``-inf``.
    """
    q_table = {}

    for _ in range(episodes):
        game = TicTacToe()
        player = 'X'
        state = tuple(map(tuple, game.board))
        if state not in q_table:
            q_table[state] = _fresh_q_row(game.board)

        while True:
            moves = game.valid_moves()
            if random.random() < epsilon:
                action = random.choice(moves)
            else:
                move_indices = [3 * row + col for row, col in moves]
                action = moves[np.argmax(q_table[state][move_indices])]
            action_index = 3 * action[0] + action[1]

            game.make_move(action, player)
            next_state = tuple(map(tuple, game.board))
            if next_state not in q_table:
                q_table[next_state] = _fresh_q_row(game.board)

            if game.check_winner(player):
                reward, finished = 1, True
            elif game.is_full():
                reward, finished = 0.5, True
            else:
                reward, finished = 0, False

            if finished:
                # Nothing follows a terminal position, so there is no future term.
                target = reward
            else:
                # The opponent moves in next_state, so its best value counts
                # against the player who just moved.
                target = reward - discount_factor * np.max(q_table[next_state])

            q_table[state][action_index] += learning_rate * (
                target - q_table[state][action_index]
            )

            if finished:
                break

            state = next_state
            player = 'O' if player == 'X' else 'X'

    return q_table

def play_game(ArR_variable):
    """Play one game of the Q-table agent (X) against a random player (O).

    Prints the result line ("Player <mark> wins!" or "It's a draw!") followed
    by the final board.

    Args:
        ArR_variable: Mapping from a board state (a tuple of three row
            tuples) to a length-9 sequence of action values indexed by
            ``3 * row + col``.
    """
    game = TicTacToe()
    player = 'X'

    while True:
        moves = game.valid_moves()

        if player == 'X':
            state = tuple(map(tuple, game.board))
            q_values = ArR_variable.get(state)
            if q_values is None:
                # Board never visited during training: no estimate to rank
                # moves by, so play any legal move rather than fail.
                action = random.choice(moves)
            else:
                # Rank only the empty cells; an argmax over all nine entries
                # could select an occupied cell and waste X's turn.
                move_indices = [3 * row + col for row, col in moves]
                action = moves[np.argmax(np.asarray(q_values)[move_indices])]
        else:
            action = random.choice(moves)

        game.make_move(action, player)

        if game.check_winner(player):
            print(f"Player {player} wins!")
            break

        if game.is_full():
            print("It's a draw!")
            break

        player = 'O' if player == 'X' else 'X'

    print(game.board)

# Train a Q-table with train_agent's default settings, then play one game with it.
Q = train_agent()
play_game(Q)

Player X wins!
[['X' 'X' 'X']
 [' ' ' ' 'O']
 ['O' ' ' ' ']]
