**Setting Up Environment**

In [None]:
import numpy as np
import random

**Tic-Tac-Toe Game**

In [None]:
class TicTacToe:
  """Two-player Tic-Tac-Toe environment on a ``height`` x ``width`` grid.

  Board cells hold 0 (empty), 1 ("X") or 2 ("O"). A player wins by filling an
  entire row, an entire column, or one of the two diagonals as returned by
  ``np.diag`` on the board and on its left-right mirror.
  """

  def __init__(self, width, height, state=None):
    """Create a game.

    Args:
      width: number of columns.
      height: number of rows.
      state: optional initial board (anything ``np.array`` accepts); it is
        copied. When omitted, the board starts empty.
    """
    self.board = np.array(state, dtype=int) if state is not None else np.zeros((height, width), dtype=int)
    self.players = ["X", "O"]
    self.current_player = self.players[0]
    self.winner = None
    self.game_over = False
    self.width = width
    self.height = height

  # Reset the Game
  def reset(self):
    """Clear the board, hand the turn back to "X", and return a copy of the board."""
    # Same dtype as __init__ so the board's dtype does not change after a reset.
    self.board = np.zeros((self.height, self.width), dtype=int)
    self.current_player = self.players[0]
    self.winner = None
    self.game_over = False
    return self.board.copy()

  # List of Available Moves
  def available_moves(self):
    """Return the ``(row, col)`` tuples of all empty cells, in row-major order."""
    return [
        (i, j)
        for i in range(self.height)
        for j in range(self.width)
        if self.board[i, j] == 0
    ]

  # Make a Move & Get the Reward/Penalty if there is a Winner
  def make_move(self, move):
    """Place the current player's mark at ``move`` and advance the game.

    Args:
      move: ``(row, col)`` index pair.

    Returns:
      ``(board_copy, reward, game_over)``. ``reward`` is 1 if "X" has won,
      -1 if "O" has won, and 0 otherwise. A move onto an occupied cell, or any
      move after the game is over, is ignored: the board and the turn are left
      unchanged and the reward is 0.
    """
    row, col = move[0], move[1]
    if self.board[row, col] != 0 or self.game_over:
      return self.board.copy(), 0, self.game_over
    # Marks are stored as 1-based player indices so that 0 can mean "empty".
    self.board[row, col] = self.players.index(self.current_player) + 1
    self.check_winner()
    reward = 1 if self.winner == "X" else -1 if self.winner == "O" else 0
    self.switch_player()
    return self.board.copy(), reward, self.game_over

  # Switches Players
  def switch_player(self):
    """Give the turn to the other player."""
    if self.current_player == self.players[0]:
      self.current_player = self.players[1]
    else:
      self.current_player = self.players[0]

  # Checks for a Winner
  def check_winner(self):
    """Set ``winner`` / ``game_over`` if a line is complete or the board is full."""
    lines = []

    lines.extend(self.board)                      # rows
    lines.extend(self.board.T)                    # columns
    lines.append(np.diag(self.board))             # main diagonal
    lines.append(np.diag(np.fliplr(self.board)))  # anti-diagonal

    for line in lines:
      if np.all(line == 1):
        self.winner = "X"
        self.game_over = True
        return

      if np.all(line == 2):
        self.winner = "O"
        self.game_over = True
        return

    # No complete line: the game ends in a tie once no empty cell remains.
    if not np.any(self.board == 0):
      self.game_over = True

  # Print the Current State of the Board
  def print_board(self):
    """Print the board as an ASCII grid with "X"/"O" marks."""
    # Each cell renders as 4 characters ("X | ") after the leading "|", so the
    # horizontal rule must be sized from the number of columns, not rows.
    separator = "-" * (self.width * 4 + 1)
    print(separator)
    for i in range(self.height):
      cells = (
          self.players[int(self.board[i, j] - 1)] if self.board[i, j] != 0 else " "
          for j in range(self.width)
      )
      print("| " + "".join(f"{cell} | " for cell in cells))
      print(separator)

In [None]:
# Test the TicTacToe Object with Playing the Game
def _parse_move(text):
  """Parse user input such as "1 2" into a tuple of ints.

  Returns None when a token is not an integer, so the caller can treat
  malformed input like any other invalid move instead of crashing.
  """
  try:
    return tuple(int(token) for token in text.split())
  except ValueError:
    return None


game = TicTacToe(3, 3)  # a new game starts with "X" to move
game.print_board()

while not game.game_over:
  move = _parse_move(input(f"{game.current_player}'s turn. Enter row and column (e.g. 0 0): "))
  # Re-prompt until the input names an empty cell on the board.
  while move not in game.available_moves():
    move = _parse_move(input("Invalid move. Try again: "))
  game.make_move(move)
  game.print_board()

if game.winner:
  print(f"{game.winner} wins!")
else:
  print("It's a tie!")

-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
X's turn. Enter row and column (e.g. 0 0): 1 1
-------------
|   |   |   | 
-------------
|   | X |   | 
-------------
|   |   |   | 
-------------
O's turn. Enter row and column (e.g. 0 0): 0 0
-------------
| O |   |   | 
-------------
|   | X |   | 
-------------
|   |   |   | 
-------------
X's turn. Enter row and column (e.g. 0 0): 1 0
-------------
| O |   |   | 
-------------
| X | X |   | 
-------------
|   |   |   | 
-------------
O's turn. Enter row and column (e.g. 0 0): 0 1
-------------
| O | O |   | 
-------------
| X | X |   | 
-------------
|   |   |   | 
-------------
X's turn. Enter row and column (e.g. 0 0): 1 2
-------------
| O | O |   | 
-------------
| X | X | X | 
-------------
|   |   |   | 
-------------
X wins!


**Q-learning Agent**

In [None]:
class QLearningAgent:
  """Tabular Q-learning agent with an epsilon-greedy action policy.

  Q-values live in ``self.Q``, a dict keyed by ``(state_key, action)``.
  """

  def __init__(self, alpha, epsilon, discount_factor):
    """Store the learning rate, exploration rate, and discount factor."""
    self.Q = {}
    self.alpha = alpha
    self.epsilon = epsilon
    self.gamma = discount_factor

  # Look up the Q-value of a state-action pair
  def get_Q_value(self, state_key, action):
    """Return the stored Q-value; unseen pairs are inserted as 0.0 first."""
    return self.Q.setdefault((state_key, action), 0.0)

  # Pick an action for the current state
  def choose_action(self, state_key, available_moves):
    """Explore with probability ``epsilon``; otherwise pick a best-valued move.

    Ties between equally valued moves are broken uniformly at random.
    """
    explore = random.random() < self.epsilon
    if explore:
      return random.choice(available_moves)

    scored_moves = [(move, self.get_Q_value(state_key, move)) for move in available_moves]
    top_value = max(value for _, value in scored_moves)
    candidates = [move for move, value in scored_moves if value == top_value]
    return random.choice(candidates)

  # Apply the Bellman update to one state-action pair
  def update_Q_value(self, state_key, action, reward, next_state_key, next_available_moves):
    """Move Q(state, action) toward ``reward + gamma * max_a' Q(next_state, a')``.

    An empty ``next_available_moves`` contributes a future value of 0.
    """
    future_values = [self.get_Q_value(next_state_key, move) for move in next_available_moves]
    best_future = max(future_values, default=0)
    current = self.get_Q_value(state_key, action)
    td_error = reward + self.gamma * best_future - current
    self.Q[(state_key, action)] = current + self.alpha * td_error

**Training**

In [None]:
def train(num_episodes, alpha, epsilon, discount_factor):
  """Train a Q-learning agent playing "X" against a uniformly random "O".

  Each agent step consists of the agent's move followed, if the game is still
  running, by one random opponent move; the Q-update then uses the resulting
  board and reward.

  Args:
    num_episodes: number of games to play.
    alpha: learning rate passed to ``QLearningAgent``.
    epsilon: exploration rate passed to ``QLearningAgent``.
    discount_factor: discount factor passed to ``QLearningAgent``.

  Returns:
    The trained ``QLearningAgent``.
  """
  agent = QLearningAgent(alpha, epsilon, discount_factor)

  # Loop through the specified number of training episodes
  for _ in range(num_episodes):
    env = TicTacToe(3, 3)  # a fresh game starts with "X" (the agent) to move
    state = env.board.copy()
    done = False

    while not done:
      # Convert state to tuple for dict key
      state_key = tuple(state.flatten()) + (env.current_player,)
      action = agent.choose_action(state_key, env.available_moves())

      # Agent makes move; make_move hands the turn to the opponent.
      next_state, reward, done = env.make_move(action)

      # If game not over, opponent makes a random move
      if not done:
        opponent_moves = env.available_moves()
        if opponent_moves:
          opp_action = random.choice(opponent_moves)
          next_state, reward, done = env.make_move(opp_action)

      next_state_key = tuple(next_state.flatten()) + (env.current_player,)
      # A finished game has no future value: do not bootstrap from (or create
      # Q-table entries for) moves in a terminal position.
      next_available_moves = [] if done else env.available_moves()

      # Update Q-table
      agent.update_Q_value(state_key, action, reward, next_state_key, next_available_moves)

      state = next_state

  return agent


**Testing**

In [None]:
def test(agent, num_games):
  num_wins = 0

  # Loop through number of games
  for _ in range(num_games):

    # Make a new game environment
    env = TicTacToe(3, 3)
    state = env.board.copy()
    env.current_player = env.players[0]
    done = False
    reward = 0

    # Loop through all the action needed before the game ends
    while not done:
      state_key = tuple(state.flatten()) + (env.current_player,)
      available_moves = env.available_moves()

      if env.current_player == env.players[0]:
          # Agent's turn
          action = agent.choose_action(state_key, available_moves)
      else:
          # Opponent's turn
          action = random.choice(available_moves)

      state, reward, done = env.make_move(action)

    # Add the number of wins the Agent gets
    if reward == 1:
      num_wins += 1

  return num_wins / num_games * 100


**Run**

In [None]:
# Seed the RNG so training and evaluation are reproducible across runs
SEED = 42
random.seed(SEED)

# Train the Agent
agent = train(num_episodes=100000, alpha=0.5, epsilon=0.1, discount_factor=1.0)

# Test the Agent and Get the Win Percentage
win_percentage = test(agent, num_games=1000)
print("Win percentage: {:.2f}%".format(win_percentage))

Win percentage: 92.10%
