In [10]:
import numpy as np
import random
import pickle

In [8]:
class TicTacToe:
    """Minimal 3x3 Tic-Tac-Toe environment with a reset/step interface.

    The board is a flat list of nine cells addressed row-major (0-8). Each
    cell holds 'X', 'O' or ' ' (empty). 'X' always moves first and the
    player to move alternates after every legal move.
    """

    # Every index triple that forms a winning line. Kept on the class so the
    # table is built once rather than on each check_winner() call.
    WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
        (0, 4, 8), (2, 4, 6),             # diagonals
    )

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the board, give the move to 'X' and return the new state."""
        self.board = [' '] * 9  # 3x3 board
        self.current_player = 'X'
        return self.get_state()

    def available_actions(self):
        """Return the indices of all empty cells, in ascending order."""
        return [i for i, cell in enumerate(self.board) if cell == ' ']

    def step(self, action):
        """Place the current player's mark on cell `action`.

        Args:
            action: Board index in the range 0-8.

        Returns:
            A `(state, reward, done)` tuple. `reward` is from the point of
            view of the player who just moved: 1 for a win, 0.5 for a draw,
            0 for a non-terminal move and -10 for an invalid move. An invalid
            move (off-board index or occupied cell) leaves the board and the
            player to move untouched and reports `done=True`.
        """
        # Reject off-board indices explicitly: a negative index would
        # otherwise wrap around (board[-1] is cell 8) and silently play a
        # different square, while an index >= 9 would raise IndexError.
        on_board = 0 <= action < len(self.board)
        if not on_board or self.board[action] != ' ':
            return self.get_state(), -10, True  # Invalid move penalty

        self.board[action] = self.current_player
        winner = self.check_winner()
        done = winner is not None or ' ' not in self.board
        reward = 0

        if winner == self.current_player:
            reward = 1
        elif done:
            reward = 0.5  # Draw reward

        # Switch player
        self.current_player = 'O' if self.current_player == 'X' else 'X'
        return self.get_state(), reward, done

    def check_winner(self):
        """Return 'X' or 'O' if that player owns a full line, else None."""
        for a, b, c in self.WIN_LINES:
            if self.board[a] == self.board[b] == self.board[c] != ' ':
                return self.board[a]
        return None

    def get_state(self):
        """Return the board as an immutable (hashable) 9-tuple."""
        return tuple(self.board)

    def render(self):
        """Print the board as three `a | b | c` rows framed by blank lines."""
        print("\n")
        for i in range(0, 9, 3):
            print(self.board[i], "|", self.board[i+1], "|", self.board[i+2])
        print("\n")

In [9]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values are stored in a dict keyed by `(state, action)`; pairs that
    have never been updated read as 0.0.
    """

    def __init__(self, alpha=0.3, gamma=0.9, epsilon=1.0, epsilon_min=0.1, decay=0.995):
        """Store the hyperparameters and start with an empty Q-table.

        Args:
            alpha: Learning rate applied to the temporal-difference error.
            gamma: Discount factor applied to the best next-state value.
            epsilon: Initial probability of picking a random action.
            epsilon_min: Floor that `decay_epsilon` will not go below.
            decay: Multiplicative factor applied by `decay_epsilon`.
        """
        self.Q = {}
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.decay = decay

    def get_q(self, state, action):
        """Return the stored Q-value for `(state, action)`, or 0.0 if unseen."""
        return self.Q.get((state, action), 0.0)

    def choose_action(self, state, available_actions):
        """Pick an action epsilon-greedily.

        With probability `epsilon` a uniformly random available action is
        returned; otherwise one of the actions with the highest Q-value is
        returned, with ties broken at random.

        Raises:
            IndexError or ValueError: if `available_actions` is empty
                (raised by `random.choice` / `max` respectively).
        """
        if np.random.uniform(0, 1) < self.epsilon:
            return random.choice(available_actions)
        qs = [self.get_q(state, a) for a in available_actions]
        max_q = max(qs)
        return random.choice([a for a, q in zip(available_actions, qs) if q == max_q])

    def update(self, state, action, reward, next_state, next_actions, done):
        """Apply one Q-learning update to `(state, action)`.

        The target is `reward + gamma * max_a Q(next_state, a)`. The
        bootstrap term is 0 when the episode is over or when there is no
        action left to evaluate in `next_state`.
        """
        old_q = self.get_q(state, action)
        if done:
            future_q = 0.0
        else:
            # default=0.0 keeps an empty `next_actions` from raising
            # ValueError; a state with no moves has nothing to bootstrap from.
            future_q = max(
                (self.get_q(next_state, a) for a in next_actions),
                default=0.0,
            )
        new_q = old_q + self.alpha * (reward + self.gamma * future_q - old_q)
        self.Q[(state, action)] = new_q

    def decay_epsilon(self):
        """Shrink epsilon by `decay`, never dropping below `epsilon_min`.

        An epsilon that is already at or below the floor (for example one
        set to 0.0 by hand to disable exploration) is left untouched.
        """
        if self.epsilon > self.epsilon_min:
            # Clamp: the last multiplication would otherwise undershoot the
            # floor (e.g. 0.1003 * 0.995 < 0.1).
            self.epsilon = max(self.epsilon_min, self.epsilon * self.decay)


In [11]:
# Seed both random sources used during training (the stdlib `random` module
# and NumPy's legacy global generator, which choose_action draws from) so
# that Restart & Run All reproduces the same counts and Q-table.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

env = TicTacToe()
agent = QLearningAgent()

episodes = 50000
win, draw, lose = 0, 0, 0

for episode in range(episodes):
    state = env.reset()
    done = False

    while not done:
        # Agent (X) moves with its epsilon-greedy policy.
        actions = env.available_actions()
        action = agent.choose_action(state, actions)
        next_state, reward, done = env.step(action)

        # Opponent plays randomly. While the game is not done the board
        # still has an empty cell, so there is always a move to choose.
        if not done:
            opp_action = random.choice(env.available_actions())
            next_state, opp_reward, done = env.step(opp_action)
            if opp_reward == 1:  # Opponent wins
                # The environment already reports done=True for a win; the
                # agent's preceding move is what gets penalised.
                reward = -1

        # Learn from the agent's move using the state it will face next,
        # i.e. the board after the opponent's reply.
        next_actions = env.available_actions()
        agent.update(state, action, reward, next_state, next_actions, done)
        state = next_state

    agent.decay_epsilon()

    # Track performance: `reward` holds the agent's final reward this episode.
    if reward == 1:
        win += 1
    elif reward == 0.5:
        draw += 1
    elif reward == -1:
        lose += 1

    if episode % 5000 == 0:
        print(f"Episode {episode} | Epsilon={agent.epsilon:.3f}")

print("\n✅ Training completed!")
print(f"Wins: {win}, Draws: {draw}, Losses: {lose}")

# Save Q-table. NOTE: pickle files can execute code when loaded, so only
# load this file back from a location you trust.
with open("tictactoe_qtable.pkl", "wb") as f:
    pickle.dump(agent.Q, f)
print("Q-table saved as 'tictactoe_qtable.pkl'.")

Episode 0 | Epsilon=0.995
Episode 5000 | Epsilon=0.100
Episode 10000 | Epsilon=0.100
Episode 15000 | Epsilon=0.100
Episode 20000 | Epsilon=0.100
Episode 25000 | Epsilon=0.100
Episode 30000 | Epsilon=0.100
Episode 35000 | Epsilon=0.100
Episode 40000 | Epsilon=0.100
Episode 45000 | Epsilon=0.100

✅ Training completed!
Wins: 46634, Draws: 2142, Losses: 1224
Q-table saved as 'tictactoe_qtable.pkl'.


In [12]:
agent.epsilon = 0.0  # Disable exploration
env = TicTacToe()

print("\n🎮 Testing trained agent vs Random Player")

for game in range(3):  # play 3 sample games
    state = env.reset()
    done = False
    env.render()

    while not done:
        # Agent (X) moves; with epsilon at 0 it always picks a best-valued action.
        actions = env.available_actions()
        action = agent.choose_action(state, actions)
        next_state, reward, done = env.step(action)
        env.render()

        if done:
            if reward == 1:
                print("Agent (X) wins ✅")
            elif reward == 0.5:
                print("It's a draw 😐")
            else:
                # Only the environment's invalid-move penalty ends a game
                # on the agent's turn with any other reward.
                print("Agent lost ❌")
            break

        # Opponent move
        opp_actions = env.available_actions()
        if opp_actions:
            opp_action = random.choice(opp_actions)
            # From here on `reward` is the opponent's reward, not the agent's.
            next_state, reward, done = env.step(opp_action)
            env.render()
            if done:
                if reward == 1:
                    print("Opponent (O) wins ❌")
                elif reward == 0.5:
                    print("It's a draw 😐")
                break
        state = next_state




🎮 Testing trained agent vs Random Player


  |   |  
  |   |  
  |   |  




  |   |  
  | X |  
  |   |  




  | O |  
  | X |  
  |   |  




  | O |  
X | X |  
  |   |  




  | O |  
X | X |  
  | O |  




  | O |  
X | X | X
  | O |  


Agent (X) wins ✅


  |   |  
  |   |  
  |   |  




  |   |  
  | X |  
  |   |  




  |   |  
  | X |  
  | O |  




  |   |  
X | X |  
  | O |  




O |   |  
X | X |  
  | O |  




O |   |  
X | X | X
  | O |  


Agent (X) wins ✅


  |   |  
  |   |  
  |   |  




  |   |  
  | X |  
  |   |  




  |   |  
  | X |  
  | O |  




  |   |  
X | X |  
  | O |  




O |   |  
X | X |  
  | O |  




O |   |  
X | X | X
  | O |  


Agent (X) wins ✅


In [13]:
def play_vs_agent(agent):
    """Play one interactive console game: the agent is X, the human is O.

    The agent moves first and plays greedily (exploration is switched off
    for the duration of the game). The human is prompted with `input()` for
    a board index 0-8 until a free cell is entered. Every move is rendered
    and the result is printed when the game ends.

    Args:
        agent: Object exposing `choose_action(state, available_actions)` and
            a writable `epsilon` attribute.
    """
    env = TicTacToe()
    state = env.reset()
    done = False

    # Remember the caller's exploration rate so it can be put back; the game
    # itself must not permanently alter the agent.
    saved_epsilon = agent.epsilon
    agent.epsilon = 0.0  # Disable exploration

    try:
        print("\n🎮 Let's play Tic-Tac-Toe! You are O, AI is X")
        print("Board positions:\n0 | 1 | 2\n3 | 4 | 5\n6 | 7 | 8\n")
        env.render()

        while not done:
            # Agent (X) plays
            if env.current_player == 'X':
                actions = env.available_actions()
                action = agent.choose_action(state, actions)
                state, reward, done = env.step(action)
                print("🤖 AI's Move:")
                env.render()

                # `reward` is from the AI's point of view here.
                if done:
                    if reward == 1:
                        print("AI wins! 🤖🏆")
                    elif reward == 0.5:
                        print("It's a draw 😐")
                    else:
                        print("You win! 🎉")

            # Human (O) plays
            else:
                valid_move = False
                while not valid_move:
                    try:
                        pos = int(input("Enter your move (0-8): "))
                        if pos in env.available_actions():
                            valid_move = True
                        else:
                            print("❌ Invalid move, try again.")
                    except ValueError:
                        print("⚠️ Enter a number between 0 and 8.")

                state, reward, done = env.step(pos)
                print("🧍‍♂️ Your Move:")
                env.render()

                # `reward` is from the human's point of view here.
                if done:
                    if reward == 1:
                        print("You win! 🎉")
                    elif reward == 0.5:
                        print("It's a draw 😐")
                    else:
                        print("AI wins! 🤖🏆")
    finally:
        # Runs on normal completion and on interruption (e.g. Ctrl-C at the
        # input prompt) alike.
        agent.epsilon = saved_epsilon

# Run the game after training: starts an interactive console match against
# the trained `agent`; the cell blocks on input() until the game ends.
play_vs_agent(agent)


🎮 Let's play Tic-Tac-Toe! You are O, AI is X
Board positions:
0 | 1 | 2
3 | 4 | 5
6 | 7 | 8



  |   |  
  |   |  
  |   |  


🤖 AI's Move:


  |   |  
  | X |  
  |   |  


🧍‍♂️ Your Move:


  |   | O
  | X |  
  |   |  


🤖 AI's Move:


  |   | O
  | X |  
  |   | X


🧍‍♂️ Your Move:


O |   | O
  | X |  
  |   | X


🤖 AI's Move:


O | X | O
  | X |  
  |   | X


🧍‍♂️ Your Move:


O | X | O
  | X |  
  | O | X


🤖 AI's Move:


O | X | O
  | X | X
  | O | X


🧍‍♂️ Your Move:


O | X | O
O | X | X
  | O | X


🤖 AI's Move:


O | X | O
O | X | X
X | O | X


It's a draw 😐
