<a href="https://colab.research.google.com/github/pksdmsyi/UltimateTicTacToe-RL/blob/main/ultimate_tic_tac_toe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
# board.py
import numpy as np
import matplotlib.pyplot as plt

class Board:
    def __init__(self):
        # Initialize the overall 9x9 grid (9 sub-grids, each 3x3)
        self.board = np.zeros((9, 9), dtype=int)  # Overall 9x9 board
        self.subgrid_wins = [0] * 9  # Track wins for each subgrid: 0 = not won, 1 = Player 1, 2 = Player 2
        self.overall_winner = 0  # Track the overall winner: 0 = none, 1 = Player 1, 2 = Player 2
        self.next_subgrid = None  # Tracks the subgrid where the next move must be made
        self.last_move = -1

    def index_to_position(self, index, size):
        """
        Convert a flat index into row and column coordinates.
        For size = 3 (subgrid), index will be from 0-8.
        For size = 9 (full board), index will be from 0-80.
        """
        row = index // size
        col = index % size
        return row, col

    def get_subgrid(self, subgrid_index):
        """
        Get a specific 3x3 subgrid by index (0-8).
        """
        row_start = (subgrid_index // 3) * 3  # Row of the top-left corner of the subgrid
        col_start = (subgrid_index % 3) * 3   # Column of the top-left corner of the subgrid
        return self.board[row_start:row_start + 3, col_start:col_start + 3]

    def set_subgrid(self, subgrid_index, subgrid):
        """
        Set the values of a specific 3x3 subgrid.
        The subgrid should be a 3x3 numpy array.
        """
        if subgrid.shape != (3, 3):
            raise ValueError("Subgrid must be a 3x3 numpy array")

        row_start = (subgrid_index // 3) * 3
        col_start = (subgrid_index % 3) * 3
        self.board[row_start:row_start + 3, col_start:col_start + 3] = subgrid


    def is_valid_move(self, row, col):
        """
        Check if a move is valid by checking if the corresponding subgrid is available
        and the cell is empty.
        """
        # Check if move is in the valid subgrid, unless it's a free move
        if self.next_subgrid is not None:
            subgrid_row, subgrid_col = row // 3, col // 3
            if subgrid_row * 3 + subgrid_col != self.next_subgrid:
                return False

        return self.board[row, col] == 0

    def update_cell(self, index, value):
        """
        Update a single cell in the 9x9 board using a flat index (0-80).
        This method ensures that updates to the overall board are reflected in the corresponding subgrid.
        """
        if not (0 <= index < 81):
            raise ValueError("Index must be between 0 and 80.")
        if value not in (0, 1, 2):  # Assuming 0 = empty, 1 = Player 1, 2 = Player 2
            raise ValueError("Cell value must be 0 (empty), 1 (Player 1), or 2 (Player 2).")

        # Convert the flat index (0-80) to a row and column
        row, col = self.index_to_position(index, 9)

        if not self.is_valid_move(row, col):
            raise ValueError(f"Invalid move: Cell at index {index} is already occupied.")

        # Update the main 9x9 board
        self.board[row, col] = value

        # Check the winner for the subgrid
        subgrid_index = (row // 3) * 3 + (col // 3)
        self.check_subgrid_winner(subgrid_index)

        # Update the next subgrid based on this move
        self.update_next_subgrid(row, col)

        # Check overall winner after every move
        self.check_winner(value)

    def update_cell_in_subgrid(self, subgrid_index, subgrid_index_flat, value):
        """
        Update a specific cell within a subgrid (3x3) by subgrid index (0-8) and
        flat cell index (0-8), and reflect this change in the overall 9x9 board.

        subgrid_index: The index of the subgrid (0-8).
        subgrid_index_flat: The flat index within the subgrid (0-8).
        value: The value to set (0 = empty, 1 = Player 1, 2 = Player 2).
        """
        if not (0 <= subgrid_index < 9):
            raise ValueError("Subgrid index must be between 0 and 8.")
        if not (0 <= subgrid_index_flat < 9):
            raise ValueError("Subgrid cell index must be between 0 and 8.")
        if value not in (0, 1, 2):
            raise ValueError("Cell value must be 0 (empty), 1 (Player 1), or 2 (Player 2).")

        # Convert the flat index within subgrid (0-8) to local row and column
        local_row, local_col = self.index_to_position(subgrid_index_flat, 3)

        # Calculate the global row and column in the 9x9 board based on the subgrid index
        global_row = (subgrid_index // 3) * 3 + local_row
        global_col = (subgrid_index % 3) * 3 + local_col

        if not self.is_valid_move(global_row, global_col):
            raise ValueError(f"Invalid move: Cell in subgrid {subgrid_index}, index {subgrid_index_flat} is already occupied.")

        # Update the corresponding cell in the 9x9 board
        self.board[global_row, global_col] = value

        # Check the winner for the subgrid
        self.check_subgrid_winner(subgrid_index)

         # Update the next subgrid restriction based on the current move
        self.update_next_subgrid(global_row, global_col)

        # Check overall winner after every move
        self.check_winner(value)

    def check_winner(self, player):
        """
        Check if the given player has won the overall board or any subgrid.
        """
        # Check rows and columns for overall board
        for i in range(9):
            if all(self.board[i, j] == player for j in range(9)):  # Check row
                self.overall_winner = player
                return True
            if all(self.board[j, i] == player for j in range(9)):  # Check column
                self.overall_winner = player
                return True

        # Check diagonals for overall board
        if all(self.board[i, i] == player for i in range(9)):  # Main diagonal
            self.overall_winner = player
            return True
        if all(self.board[i, 8 - i] == player for i in range(9)):  # Anti-diagonal
            self.overall_winner = player
            return True

        return False

    def check_subgrid_winner(self, subgrid_index):
        """
        Check if there is a winner in a specific subgrid.
        """
        subgrid = self.get_subgrid(subgrid_index)

        for player in [1, 2]:
            # Check rows and columns for subgrid
            for i in range(3):
                if all(subgrid[i, j] == player for j in range(3)):  # Check row
                    self.subgrid_wins[subgrid_index] = player
                    return player
                if all(subgrid[j, i] == player for j in range(3)):  # Check column
                    self.subgrid_wins[subgrid_index] = player
                    return player

            # Check diagonals for subgrid
            if all(subgrid[i, i] == player for i in range(3)):  # Main diagonal
                self.subgrid_wins[subgrid_index] = player
                return player
            if all(subgrid[i, 2 - i] == player for i in range(3)):  # Anti-diagonal
                self.subgrid_wins[subgrid_index] = player
                return player

        return None  # No winner in this subgrid

    def is_subgrid_full(self, subgrid_index):
        """
        Check if a specific subgrid is full (i.e., no available moves).
        """
        subgrid = self.get_subgrid(subgrid_index)
        return np.all(subgrid != 0)

    def update_next_subgrid(self, row, col):
        """
        Update the next subgrid based on the position of the last move.
        """
        # Determine the subgrid index where the next move should be made
        next_subgrid = (row % 3) * 3 + (col % 3)
        if self.is_subgrid_full(next_subgrid) or self.subgrid_wins[next_subgrid] != 0:
            self.next_subgrid = None  # Free play if the next subgrid is full or won
        else:
            self.next_subgrid = next_subgrid

    def display_winners(self):
        """
        Display the winners for subgrids and the overall board.
        """
        print("Subgrid Winners:")
        for i, winner in enumerate(self.subgrid_wins):
            if winner == 0:
                print(f"Subgrid {i}: No winner")
            else:
                print(f"Subgrid {i}: Player {winner} wins")

        if self.overall_winner:
            print(f"\nOverall Winner: Player {self.overall_winner}")
        else:
            print("\nOverall Winner: None")

    def print_board(self):
        """
        Print the 9x9 overall board with separation between subgrids.
        """
        print("Ultimate Tic-Tac-Toe Board (9x9):\n")
        for i in range(3):  # Iterate over 3 rows of subgrids
            for row in range(3):  # Each row within subgrid
                row_display = ""
                for j in range(3):  # Iterate over 3 columns of subgrids
                    row_display += " ".join(map(str, self.board[i * 3 + row, j * 3:j * 3 + 3])) + " | "
                print(row_display)
            print("-" * 20)

    def plot_board(self):
        """
        Plot the current state of the 9x9 board using matplotlib.
        """
        plt.figure(figsize=(8, 8))
        plt.title("Ultimate Tic-Tac-Toe Board")

        # Create the grid
        for i in range(10):
            # Thicker lines for 3x3 subgrid borders
            linewidth = 2 if i % 3 == 0 else 0.5
            plt.axhline(i, color='black', linewidth=linewidth, linestyle='-')
            plt.axvline(i, color='black', linewidth=linewidth, linestyle='-')

        # Set the ticks and labels
        plt.xticks(np.arange(0.5, 9, 1), [])
        plt.yticks(np.arange(0.5, 9, 1), [])

        # Fill in the board with markers
        for i in range(9):
            for j in range(9):
                # Center the markers in their respective cells
                center_x = j + 0.5
                center_y = 8.45 - i
                if self.board[i, j] == 1:
                    plt.text(center_x, center_y, 'X', fontsize=40, ha='center', va='center', color='blue')
                elif self.board[i, j] == 2:
                    plt.text(center_x, center_y, 'O', fontsize=40, ha='center', va='center', color='red')

        plt.xlim(0, 9)
        plt.ylim(0, 9)
        plt.grid(False)
        plt.gca().set_aspect('equal', adjustable='box')  # Maintain aspect ratio
        plt.show()

In [67]:
import gym
from gym import spaces
import numpy as np

class UltimateTicTacToeEnv(gym.Env):
    def __init__(self):
        super(UltimateTicTacToeEnv, self).__init__()
        self.board = Board()
        self.action_space = spaces.Discrete(81)  # 9x9 board with 81 cells
        self.observation_space = spaces.Box(low=0, high=2, shape=(9, 9), dtype=int)

    def reset(self):
        self.board = Board()
        return self.board.board.flatten()

    def step(self, action):
        row, col = divmod(action, 9)
        player = 1  # Assuming the RL agent is Player 1

        if not self.board.is_valid_move(row, col):
            return self.board.board.flatten(), -1, True, {}  # Invalid move penalty

        # Update the Last action variable
        self.board.last_move = action

        # Update the cell and check if we have a winner
        self.board.update_cell(action, player)

        done = self.board.overall_winner != 0 or np.all(self.board.board != 0)  # Win or full board

        reward = 1 if self.board.overall_winner == player else 0  # Reward if agent wins
        return self.board.board.flatten(), reward, done, {}

    def render(self, mode='human'):
        self.board.print_board()


In [68]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # Discount factor
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        return nn.Sequential(
            nn.Linear(self.state_size, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, self.action_size)
        )

    def remember(self, last_move, state, action, reward, next_state, done):
        self.memory.append((last_move, state, action, reward, next_state, done))

    def get_next_subgrid(self, previous_move):
        # Determine the row and column within the 9x9 board (0-indexed)
        board_row, board_col = divmod(previous_move, 9)

        # Calculate the row and column within the current 3x3 subgrid
        subgrid_row = board_row % 3
        subgrid_col = board_col % 3

        # Calculate the next subgrid index (0-8)
        next_subgrid_index = subgrid_row * 3 + subgrid_col
        return next_subgrid_index

    def get_valid_actions(self, board, last_move):
        # This function should return a list of valid actions based on the `last_move`.
        # For example, in Ultimate Tic Tac Toe, it might return the set of moves allowed in the specific sub-board indicated by last_move.
        # You will need to implement this according to the game's rules.
        # Here is a placeholder implementation:
        valid_actions = []
        target_sub_board = self.get_next_subgrid(last_move)  # Assuming sub-boards are indexed 0 to 8

        # Calculate the start index of the target sub-board
        start_row = (target_sub_board // 3) * 3
        start_col = (target_sub_board % 3) * 3

        # Loop through the cells in the target sub-board
        for row in range(start_row, start_row + 3):
            for col in range(start_col, start_col + 3):
                action = row * 9 + col  # Calculate the flat index (0-80)
                if board[0][action] == 0:  # Check if the cell is empty
                    valid_actions.append(action)

        # Return Valid Actions if There are any valid actions possible for that Subgrid
        if valid_actions:
          return valid_actions

        # Else Return all the valid actions in the board
        valid_actions = []
        for row in range(9):
            for col in range(9):
                action = row * 9 + col  # Calculate the flat index (0-80)
                if board[0][action] == 0:  # Check if the cell is empty
                    valid_actions.append(action)

        return valid_actions

    def act(self, state, last_move):
        if np.random.rand() <= self.epsilon:
            # Choose a random valid action based on last_move
            return random.choice(self.get_valid_actions(state, last_move))

        # Combine state and last_move as inputs
        combined_input = np.concatenate((state, [[last_move]]), axis=1)
        state_tensor = torch.FloatTensor(combined_input).unsqueeze(0)

        with torch.no_grad():
            q_values = self.model(state_tensor)

        # Masking invalid actions based on last_move
        valid_actions = self.get_valid_actions(state, last_move)
        q_values = q_values.cpu().data.numpy()[0][0]
        # print(q_values.shape,last_move//9, valid_actions)
        masked_q_values = [q if i in valid_actions else -np.inf for i, q in enumerate(q_values)]
        # print("Predicted Action ", np.argmax(masked_q_values), masked_q_values)
        return np.argmax(masked_q_values)

    def replay(self, batch_size):
      minibatch = random.sample(self.memory, batch_size)
      criterion = nn.MSELoss()
      optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

      for last_move, state, action, reward, next_state, done in minibatch:
          target = reward
          if not done:
              next_state_tensor = torch.FloatTensor(np.concatenate((next_state, [[action]]), axis=1)).unsqueeze(0)
              target = reward + self.gamma * torch.max(self.model(next_state_tensor)).item()

          state_tensor = torch.FloatTensor(np.concatenate((state, [[last_move]]), axis=1)).unsqueeze(0)
          q_values = self.model(state_tensor)
          current_q_value = q_values[0][0][action]

          # Convert target to a FloatTensor
          target_tensor = torch.FloatTensor([target])

          optimizer.zero_grad()
          loss = criterion(current_q_value, target_tensor)

          loss.backward()
          optimizer.step()

      if self.epsilon > self.epsilon_min:
          self.epsilon *= self.epsilon_decay


In [None]:
env = UltimateTicTacToeEnv()
agent = DQNAgent(state_size=82, action_size=81)
episodes = 100
batch_size = 32

for e in range(episodes):
    state = env.reset()
    state = np.reshape(state, [1, 81])

    for time in range(500):
        last_move = env.board.last_move
        action = agent.act(state, last_move)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, 81])
        agent.remember(last_move, state, action, reward, next_state, done)
        state = next_state

        if done:
            print(f"Episode {e+1}/{episodes}, Score: {time}, Epsilon: {agent.epsilon}")
            break

        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

env.close()


# AGENT VS ME

In [None]:
# After training, set epsilon to 0 so the agent uses its learned policy
agent.epsilon = 0

# Initialize the environment
env = UltimateTicTacToeEnv()
state = env.reset()
state = np.reshape(state, [1, 81])
last_move = -1

done = False

print("\nStarting a new game of Ultimate Tic-Tac-Toe!")
print("You are Player 2 (O), and the agent is Player 1 (X).")
print("Enter your moves by typing a number between 0 and 80 corresponding to the cell on the 9x9 board.\n")

while not done:
    # Agent's turn
    print("Agent's Turn:")

    valid_move = False
    # while not valid_move:
    action = agent.act(state, last_move)

      # row, col = divmod(action, 9)
      # if env.board.is_valid_move(row, col):
      #     valid_move = True
      # else:
      #   pass

    next_state, reward, done, _ = env.step(action)
    state = np.reshape(next_state, [1, 81])
    env.render()
    if done:
        if reward == 1:
            print("Agent wins!")
        else:
            print("It's a tie!")
        break

    # User's turn
    valid_move = False
    while not valid_move:
        user_input = input("Your Turn. Enter your move (0-80): ")
        try:
            user_action = int(user_input)
            if not (0 <= user_action < 81):
                print("Invalid input. Enter a number between 0 and 80.")
                continue
            row, col = divmod(user_action, 9)
            if env.board.is_valid_move(row, col):
                valid_move = True
            else:
                print("Invalid move. The cell is either occupied or not in the valid subgrid. Try again.")
        except ValueError:
            print("Invalid input. Enter a number between 0 and 80.")

    last_move = user_action
    # Update the board with the user's move
    env.board.update_cell(user_action, 2)  # User is Player 2
    env.board.print_board()

    # Check if game is over after user's move
    if env.board.overall_winner != 0 or np.all(env.board.board != 0):
        done = True
        if env.board.overall_winner == 2:
            print("Congratulations! You win!")
        elif env.board.overall_winner == 1:
            print("Agent wins!")
        else:
            print("It's a tie!")
        break

    # Update the state for the agent
    state = env.board.board.flatten()
    state = np.reshape(state, [1, 81])

print("\nGame Over.")



Starting a new game of Ultimate Tic-Tac-Toe!
You are Player 2 (O), and the agent is Player 1 (X).
Enter your moves by typing a number between 0 and 80 corresponding to the cell on the 9x9 board.

Agent's Turn:
(81,) -1 [60, 61, 62, 69, 70, 71, 78, 79, 80]
Ultimate Tic-Tac-Toe Board (9x9):

0 0 0 | 0 0 0 | 0 0 0 | 
0 0 0 | 0 0 0 | 0 0 0 | 
0 0 0 | 0 0 0 | 0 0 0 | 
--------------------
0 0 0 | 0 0 0 | 0 0 0 | 
0 0 0 | 0 0 0 | 0 0 0 | 
0 0 0 | 0 0 0 | 0 0 0 | 
--------------------
0 0 0 | 0 0 0 | 0 0 0 | 
0 0 0 | 0 0 0 | 0 0 0 | 
0 0 0 | 0 0 0 | 0 1 0 | 
--------------------
Your Turn. Enter your move (0-80): 67
Ultimate Tic-Tac-Toe Board (9x9):

0 0 0 | 0 0 0 | 0 0 0 | 
0 0 0 | 0 0 0 | 0 0 0 | 
0 0 0 | 0 0 0 | 0 0 0 | 
--------------------
0 0 0 | 0 0 0 | 0 0 0 | 
0 0 0 | 0 0 0 | 0 0 0 | 
0 0 0 | 0 0 0 | 0 0 0 | 
--------------------
0 0 0 | 0 0 0 | 0 0 0 | 
0 0 0 | 0 2 0 | 0 0 0 | 
0 0 0 | 0 0 0 | 0 1 0 | 
--------------------
Agent's Turn:
(81,) 7 [30, 31, 32, 39, 40, 41, 48, 49, 50]


# AGENT VS RANDOM BEST SEED

Agent 1 With Complete Model Exploitation

In [69]:
env_agent = UltimateTicTacToeEnv()
model_path = "dqn_agent_weights.pth"
state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed
agent = DQNAgent(state_size=82, action_size=81)
agent.model.load_state_dict(state_dict)

  state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed


<All keys matched successfully>

Agent 2 With Complete Model Exploration

In [70]:
env_random = UltimateTicTacToeEnv()
model_path = "dqn_agent_weights.pth"
state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed
random_agent = DQNAgent(state_size=82, action_size=81)
random_agent.model.load_state_dict(state_dict)

  state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed


<All keys matched successfully>

In [75]:
# After training, set epsilon to 0 so the agent uses its learned policy
agent.epsilon = 0
# After training, set epsilon to 1 so the random agent never uses its learned policy
random_agent.epsilon = 1

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

#Initialize a counter
ctr_agent_max = 0
ctr_draw_max = 0
ctr_random_max = 0
best_seed = 0

for seed in range(101):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

  #Initialize a counter
  ctr_agent = 0
  ctr_draw = 0
  ctr_random = 0

  for _ in range(100) :
    # Initialize the environment
    state_agent = env_agent.reset()
    state_agent = np.reshape(state_agent, [1, 81])
    state_random = env_random.reset()
    state_random = np.reshape(state_random, [1, 81])
    last_move = -1

    done = False

    # print("\nStarting a new game of Ultimate Tic-Tac-Toe!")
    # print("Random is Player 2 (O), and the Agent is Player 1 (X).")

    while not done:
        # Agent's turn
        # print("\nAgent's Turn:")
        agent_action = agent.act(state_agent, last_move)

        # Apply Agent's action
        next_state, reward, done, _ = env_agent.step(agent_action)
        state_agent = np.reshape(next_state, [1, 81])

        # Update the last move and synchronize Random's board
        last_move = agent_action
        env_random.board.update_cell(agent_action, 2)  # Agent is Player 1 in Random's board

        # Update Random Agent's state
        state_random = env_random.board.board.flatten()
        state_random = np.reshape(state_random, [1, 81])

        # Print the boards
        # print("\nAgent's Board:")
        # env_agent.render()
        # print("\nRandom Agent's Board:")
        # env_random.render()

        if done:
            if reward == 1:
                ctr_agent+=1
                # print("Agent wins!")
            else:
                ctr_draw+=1
                # print("It's a tie!")
            break

        # Random Agent's turn
        # print("\nRandom Agent's Turn:")
        random_action = random_agent.act(state_random, last_move)

        # Apply Random Agent's action
        next_state, reward, done, _ = env_random.step(random_action)
        state_random = np.reshape(next_state, [1, 81])

        # Synchronize Agent's board
        env_agent.board.update_cell(random_action, 2)  # Random is Player 2 in Agent's board

        # Print the boards
        # print("\nAgent's Board:")
        # env_agent.render()
        # print("\nRandom Agent's Board:")
        # env_random.render()

        if done:
            if reward == 1:
                ctr_random+=1
                # print("Random Agent wins!")
            else:
                ctr_draw+=1
                # print("It's a tie!")
            break

        # Update the last move and Agent's state
        last_move = random_action
        state_agent = env_agent.board.board.flatten()
        state_agent = np.reshape(state_agent, [1, 81])

    # print("\nGame Over.")

    if ctr_agent > ctr_agent_max:
      ctr_agent_max = ctr_agent
      ctr_draw_max = ctr_draw
      ctr_random_max = ctr_random
      best_seed = seed


In [77]:
print(ctr_agent_max, ctr_draw_max, ctr_random_max, best_seed)

26 68 6 16


  and should_run_async(code)


# AGENT VS RANDOM

Agent 1 With Complete Model Exploitation

In [78]:
env_agent = UltimateTicTacToeEnv()
model_path = "dqn_agent_weights.pth"
state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed
agent = DQNAgent(state_size=82, action_size=81)
agent.model.load_state_dict(state_dict)

  and should_run_async(code)
  state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed


<All keys matched successfully>

Agent 2 With Complete Model Exploration

In [79]:
env_random = UltimateTicTacToeEnv()
model_path = "dqn_agent_weights.pth"
state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed
random_agent = DQNAgent(state_size=82, action_size=81)
random_agent.model.load_state_dict(state_dict)

  state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed


<All keys matched successfully>

In [80]:
# After training, set epsilon to 0 so the agent uses its learned policy
agent.epsilon = 0
# After training, set epsilon to 1 so the random agent never uses its learned policy
random_agent.epsilon = 1

random.seed(16)
np.random.seed(16)
torch.manual_seed(16)

#Initialize a counter
ctr_agent = 0
ctr_draw = 0
ctr_random = 0


for _ in range(100) :
  # Initialize the environment
  state_agent = env_agent.reset()
  state_agent = np.reshape(state_agent, [1, 81])
  state_random = env_random.reset()
  state_random = np.reshape(state_random, [1, 81])
  last_move = -1

  done = False

  # print("\nStarting a new game of Ultimate Tic-Tac-Toe!")
  # print("Random is Player 2 (O), and the Agent is Player 1 (X).")

  while not done:
      # Agent's turn
      # print("\nAgent's Turn:")
      agent_action = agent.act(state_agent, last_move)

      # Apply Agent's action
      next_state, reward, done, _ = env_agent.step(agent_action)
      state_agent = np.reshape(next_state, [1, 81])

      # Update the last move and synchronize Random's board
      last_move = agent_action
      env_random.board.update_cell(agent_action, 2)  # Agent is Player 1 in Random's board

      # Update Random Agent's state
      state_random = env_random.board.board.flatten()
      state_random = np.reshape(state_random, [1, 81])

      # Print the boards
      # print("\nAgent's Board:")
      # env_agent.render()
      # print("\nRandom Agent's Board:")
      # env_random.render()

      if done:
          if reward == 1:
              ctr_agent+=1
              # print("Agent wins!")
          else:
              ctr_draw+=1
              # print("It's a tie!")
          break

      # Random Agent's turn
      # print("\nRandom Agent's Turn:")
      random_action = random_agent.act(state_random, last_move)

      # Apply Random Agent's action
      next_state, reward, done, _ = env_random.step(random_action)
      state_random = np.reshape(next_state, [1, 81])

      # Synchronize Agent's board
      env_agent.board.update_cell(random_action, 2)  # Random is Player 2 in Agent's board

      # Print the boards
      # print("\nAgent's Board:")
      # env_agent.render()
      # print("\nRandom Agent's Board:")
      # env_random.render()

      if done:
          if reward == 1:
              ctr_random+=1
              # print("Random Agent wins!")
          else:
              ctr_draw+=1
              # print("It's a tie!")
          break

      # Update the last move and Agent's state
      last_move = random_action
      state_agent = env_agent.board.board.flatten()
      state_agent = np.reshape(state_agent, [1, 81])

  # print("\nGame Over.")



In [81]:
print(ctr_agent, ctr_draw, ctr_random)

26 68 6


# AGENT VS AGENT

Agent 1 With Complete Model Exploitation

In [82]:
env_agent = UltimateTicTacToeEnv()
model_path = "dqn_agent_weights.pth"
state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed
agent = DQNAgent(state_size=82, action_size=81)
agent.model.load_state_dict(state_dict)

  state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed


<All keys matched successfully>

Agent 2 With Complete Model Exploration

In [83]:
env_random = UltimateTicTacToeEnv()
model_path = "dqn_agent_weights.pth"
state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed
random_agent = DQNAgent(state_size=82, action_size=81)
random_agent.model.load_state_dict(state_dict)

  state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed


<All keys matched successfully>

In [89]:
# After training, set epsilon to 0 so the agent uses its learned policy
agent.epsilon = 0
# After training, set epsilon to 1 so the random agent never uses its learned policy
random_agent.epsilon = 0

#Initialize a counter
ctr_agent = 0
ctr_draw = 0
ctr_random = 0


for _ in range(100) :
  # Initialize the environment
  state_agent = env_agent.reset()
  state_agent = np.reshape(state_agent, [1, 81])
  state_random = env_random.reset()
  state_random = np.reshape(state_random, [1, 81])
  last_move = -1

  done = False

  # print("\nStarting a new game of Ultimate Tic-Tac-Toe!")
  # print("Random is Player 2 (O), and the Agent is Player 1 (X).")

  while not done:
      # Agent's turn
      # print("\nAgent's Turn:")
      agent_action = agent.act(state_agent, last_move)

      # Apply Agent's action
      next_state, reward, done, _ = env_agent.step(agent_action)
      state_agent = np.reshape(next_state, [1, 81])

      # Update the last move and synchronize Random's board
      last_move = agent_action
      env_random.board.update_cell(agent_action, 2)  # Agent is Player 1 in Random's board

      # Update Random Agent's state
      state_random = env_random.board.board.flatten()
      state_random = np.reshape(state_random, [1, 81])

      # Print the boards
      # print("\nAgent's Board:")
      # env_agent.render()
      # print("\nRandom Agent's Board:")
      # env_random.render()

      if done:
          if reward == 1:
              ctr_agent+=1
              # print("Agent wins!")
          else:
              ctr_draw+=1
              # print("It's a tie!")
          break

      # Random Agent's turn
      # print("\nRandom Agent's Turn:")
      random_action = random_agent.act(state_random, last_move)

      # Apply Random Agent's action
      next_state, reward, done, _ = env_random.step(random_action)
      state_random = np.reshape(next_state, [1, 81])

      # Synchronize Agent's board
      env_agent.board.update_cell(random_action, 2)  # Random is Player 2 in Agent's board

      # Print the boards
      # print("\nAgent's Board:")
      # env_agent.render()
      # print("\nRandom Agent's Board:")
      # env_random.render()

      if done:
          if reward == 1:
              ctr_random+=1
              # print("Random Agent wins!")
          else:
              ctr_draw+=1
              # print("It's a tie!")
          break

      # Update the last move and Agent's state
      last_move = random_action
      state_agent = env_agent.board.board.flatten()
      state_agent = np.reshape(state_agent, [1, 81])

  # print("\nGame Over.")



In [90]:
print(ctr_agent, ctr_draw, ctr_random)

0 100 0


# RANDOM VS RANDOM

Agent 1 With Complete Model Exploitation

In [91]:
env_agent = UltimateTicTacToeEnv()
model_path = "dqn_agent_weights.pth"
state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed
agent = DQNAgent(state_size=82, action_size=81)
agent.model.load_state_dict(state_dict)

  state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed


<All keys matched successfully>

Agent 2 With Complete Model Exploration

In [92]:
env_random = UltimateTicTacToeEnv()
model_path = "dqn_agent_weights.pth"
state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed
random_agent = DQNAgent(state_size=82, action_size=81)
random_agent.model.load_state_dict(state_dict)

  state_dict = torch.load(model_path, map_location=torch.device('cpu'))  # Adjust for GPU if needed


<All keys matched successfully>

In [93]:
# After training, set epsilon to 0 so the agent uses its learned policy
agent.epsilon = 1
# After training, set epsilon to 1 so the random agent never uses its learned policy
random_agent.epsilon = 1

random.seed(16)
np.random.seed(16)
torch.manual_seed(16)

#Initialize a counter
ctr_agent = 0
ctr_draw = 0
ctr_random = 0


for _ in range(100) :
  # Initialize the environment
  state_agent = env_agent.reset()
  state_agent = np.reshape(state_agent, [1, 81])
  state_random = env_random.reset()
  state_random = np.reshape(state_random, [1, 81])
  last_move = -1

  done = False

  # print("\nStarting a new game of Ultimate Tic-Tac-Toe!")
  # print("Random is Player 2 (O), and the Agent is Player 1 (X).")

  while not done:
      # Agent's turn
      # print("\nAgent's Turn:")
      agent_action = agent.act(state_agent, last_move)

      # Apply Agent's action
      next_state, reward, done, _ = env_agent.step(agent_action)
      state_agent = np.reshape(next_state, [1, 81])

      # Update the last move and synchronize Random's board
      last_move = agent_action
      env_random.board.update_cell(agent_action, 2)  # Agent is Player 1 in Random's board

      # Update Random Agent's state
      state_random = env_random.board.board.flatten()
      state_random = np.reshape(state_random, [1, 81])

      # Print the boards
      # print("\nAgent's Board:")
      # env_agent.render()
      # print("\nRandom Agent's Board:")
      # env_random.render()

      if done:
          if reward == 1:
              ctr_agent+=1
              # print("Agent wins!")
          else:
              ctr_draw+=1
              # print("It's a tie!")
          break

      # Random Agent's turn
      # print("\nRandom Agent's Turn:")
      random_action = random_agent.act(state_random, last_move)

      # Apply Random Agent's action
      next_state, reward, done, _ = env_random.step(random_action)
      state_random = np.reshape(next_state, [1, 81])

      # Synchronize Agent's board
      env_agent.board.update_cell(random_action, 2)  # Random is Player 2 in Agent's board

      # Print the boards
      # print("\nAgent's Board:")
      # env_agent.render()
      # print("\nRandom Agent's Board:")
      # env_random.render()

      if done:
          if reward == 1:
              ctr_random+=1
              # print("Random Agent wins!")
          else:
              ctr_draw+=1
              # print("It's a tie!")
          break

      # Update the last move and Agent's state
      last_move = random_action
      state_agent = env_agent.board.board.flatten()
      state_agent = np.reshape(state_agent, [1, 81])

  # print("\nGame Over.")



In [94]:
print(ctr_agent, ctr_draw, ctr_random)

5 91 4
