Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.


# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

- Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
- Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

- Reviews will be assigned on Monday, December 4
- You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty submission than not to commit at all)


In [None]:
from tqdm import tqdm
import numpy as np
from copy import deepcopy
import pickle
import random

# Tic Tac Toe

In [None]:
class TicTacToe:
    """A 3x3 tic-tac-toe position.

    ``board`` is a NumPy array whose cells hold 1 (X), -1 (O) or 0 (empty).
    ``current_player`` is the mark that the next ``make_move`` will place.
    """

    def __init__(self):
        self.last_move = None
        self.reset()

    def print_board(self):
        """Print the grid row by row, then an empty line."""
        glyphs = {1: "❌", -1: "⭕️"}
        for row in self.board:
            # Every cell is followed by one space, including the last one.
            print("".join(f"{glyphs.get(cell, '--')} " for cell in row))
        print()

    def reset(self):
        """Empty the board and hand the turn back to player 1."""
        self.current_player = 1
        self.board = np.zeros((3, 3))

    def is_board_full(self):
        """Return True when no cell is empty."""
        return not np.any(self.board == 0)

    def is_winner(self, player):
        """Tell whether ``player`` owns a full column, row or diagonal."""
        owned = self.board == player
        return (
            owned.all(axis=0).any()
            or owned.all(axis=1).any()
            or np.diag(owned).all()
            or np.diag(np.fliplr(owned)).all()
        )

    def is_game_over(self):
        """Return a truthy value once either side has won or the board is full."""
        x_won = self.is_winner(1)
        return x_won or self.is_winner(-1) or self.is_board_full()

    def is_tie(self):
        """Return True when the board is full and nobody has won."""
        return self.is_board_full() and not (self.is_winner(1) or self.is_winner(-1))

    def get_available_moves(self):
        """Return the (row, col) indices of the empty cells as an (n, 2) array."""
        empty_cells = self.board == 0
        return np.argwhere(empty_cells)

    def make_move(self, move):
        """Place the current player's mark at ``move`` and switch turns."""
        row, col = move[0], move[1]
        self.board[row, col] = self.current_player
        self.current_player = -self.current_player

    def undo_move(self, move):
        """Clear the cell at ``move`` and switch turns back."""
        row, col = move[0], move[1]
        self.board[row, col] = 0
        self.current_player = -self.current_player

    def random_move(self):
        """Return one empty cell chosen with ``np.random.randint``."""
        candidates = self.get_available_moves()
        pick = np.random.randint(len(candidates))
        return candidates[pick]


In [None]:
class RandomPlayer:
    """Player that leaves the choice of move to the game itself."""

    def choose_move(self, game):
        """Return whatever ``game.random_move()`` proposes."""
        move = game.random_move()
        return move

# Minimax

In [None]:
class MinimaxPlayer:
    """Exhaustive minimax player for a ``TicTacToe``-style game object.

    ``player_index`` is the mark this player owns (1 for X, -1 for O). Every
    score is expressed from that player's point of view: higher is better
    for ``player_index``.
    """

    def __init__(self, player_index) -> None:
        self.player_index = player_index

    def evaluate(self, game) -> int:
        """Score a finished position: 9 if we won, -9 if we lost, 2 otherwise."""
        if game.is_winner(self.player_index):
            return 9
        if game.is_winner(-self.player_index):
            return -9
        return 2

    def choose_move(self, game):
        """Return the best move for this player, or None if the board is full.

        ``game`` must be at a position where it is this player's turn,
        because ``game.make_move`` places ``game.current_player``'s mark.
        ``game`` itself is left untouched: the search runs on a private copy.

        The two opening shortcuts return a plain tuple; every other move is
        a row of ``game.get_available_moves()``.
        """
        if game.is_board_full():
            return None

        candidates = game.get_available_moves()
        if len(candidates) == 9:
            return (0, 0)  # fixed opening: take a corner

        if len(candidates) == 8 and game.board[1, 1] == 0:
            return (1, 1)  # fixed reply: take the centre when it is free

        # One scratch copy is enough because every trial move is undone.
        scratch = deepcopy(game)
        best_move = None
        best_score = None
        for move in candidates:
            scratch.make_move(move)
            # After our own move the opponent is to play, so the next ply
            # minimises our score regardless of which mark we own.
            score = self.minimax(scratch, False)
            scratch.undo_move(move)
            if best_score is None or score > best_score:
                best_score = score
                best_move = move
        return best_move

    def minimax(self, game, maximizing):
        """Return the minimax value of ``game`` from this player's viewpoint.

        ``maximizing`` is True when this player is the one to move.
        ``game`` is modified during the search but restored before returning.
        """
        if game.is_game_over():
            return self.evaluate(game)

        pick = max if maximizing else min
        best_score = None
        for move in game.get_available_moves():
            game.make_move(move)
            score = self.minimax(game, not maximizing)
            # Restore the position so sibling moves are scored on the same board.
            game.undo_move(move)
            best_score = score if best_score is None else pick(best_score, score)

        # Shrink the score one step towards zero per ply so that quicker wins
        # and slower losses are preferred.
        if best_score > 0:
            return best_score - 1
        if best_score < 0:
            return best_score + 1
        return best_score

In [None]:
# game=TicTacToe()

# for MINIMAX_INDEX in [1,-1]:
#     minimax_player=MinimaxPlayer(MINIMAX_INDEX)
#     random_player=RandomPlayer()
#     wins=0
#     ties=0
#     losses=0
#     print(f"Minimax player as {MINIMAX_INDEX}")
#     for _ in tqdm(range(100)):
#         game.reset()
#         while not game.is_game_over():
#             if game.current_player==MINIMAX_INDEX:
#                 move=minimax_player.choose_move(deepcopy(game))
#                 game.make_move(move)
#             else:
#                 move=random_player.choose_move(game)
#                 game.make_move(move)
#         if game.is_winner(MINIMAX_INDEX):
#             wins+=1
#         if game.is_tie():
#             ties+=1
#         if game.is_winner(-MINIMAX_INDEX):
#             losses+=1
    
#     print(f"{wins} wins")
#     print(f"{ties} ties")
#     print(f"{losses} losses")


# # Minimax player as 1
# # 100%|██████████| 100/100 [00:01<00:00, 94.49it/s]
# # 90 wins
# # 2 ties
# # 8 losses
# # Minimax player as -1
# # 100%|██████████| 100/100 [00:00<00:00, 128.52it/s]
# # 92 wins
# # 4 ties
# # 4 losses



# Q-Learning

In [None]:
class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy move selection.

    Q-values live in ``q_table``, a dict keyed by
    ``(str(state.board), str(action))``. Pairs that have never been updated
    read as 0.05.
    """

    def __init__(self, epsilon=0.1, alpha=0.5, gamma=0.9):
        self.epsilon = epsilon  # Probability of picking a random move
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor

        # Q-table: state-action values
        self.q_table = {}

    def save_q_table(self, filename="q_table.pickle"):
        """Pickle the Q-table to ``filename``."""
        with open(filename, "wb") as f:
            pickle.dump(self.q_table, f)

    def load_q_table(self, filename="q_table.pickle"):
        """Replace the Q-table with the one pickled in ``filename``."""
        # NOTE: unpickling can execute arbitrary code; only load files you
        # created yourself or otherwise trust.
        with open(filename, "rb") as f:
            self.q_table = pickle.load(f)

    def get_q_value(self, state, action):
        """Return the stored value of ``(state, action)``, or 0.05 if unseen."""
        return self.q_table.get((str(state.board), str(action)), 0.05)

    def choose_move(self, state):
        """Pick a move for ``state``, or return None when no move is available.

        With probability ``epsilon`` a random available move is returned;
        otherwise one of the moves with the highest Q-value, with ties
        broken at random.
        """
        available_moves = state.get_available_moves()
        if len(available_moves) == 0:
            return None

        if random.uniform(0, 1) < self.epsilon:
            # Exploration: index explicitly rather than random.choice, which
            # some Python releases cannot apply to a multi-row NumPy array.
            return available_moves[random.randrange(len(available_moves))]

        # Exploitation: evaluate each move once and take the best.
        q_values = [self.get_q_value(state, action) for action in available_moves]
        best_value = max(q_values)
        best_actions = [
            action
            for action, q_value in zip(available_moves, q_values)
            if q_value == best_value
        ]
        return random.choice(best_actions)

    def update_q_value(self, state, action, next_state, reward):
        """Apply one Q-learning update for the transition into ``next_state``.

        ``state`` must describe the position in which ``action`` was chosen.
        When ``next_state`` is finished, or offers no moves, there is nothing
        left to bootstrap from and the future value is taken as 0.
        """
        if next_state.is_game_over():
            future_value = 0.0
        else:
            future_value = max(
                (
                    self.get_q_value(next_state, next_action)
                    for next_action in next_state.get_available_moves()
                ),
                default=0.0,
            )

        old_value = self.get_q_value(state, action)
        self.q_table[(str(state.board), str(action))] = (
            1 - self.alpha
        ) * old_value + self.alpha * (reward + self.gamma * future_value)

In [None]:
def _game_reward(environment):
    """Return 1 if player 1 has won, -1 if player -1 has won, otherwise 0."""
    if environment.is_winner(1):
        return 1
    if environment.is_winner(-1):
        return -1
    return 0


def _learn_from_transition(agent, state, action, next_state, reward):
    """Record what happened after ``agent`` played ``action`` in ``state``."""
    if next_state.is_game_over():
        # A finished game has no future value, so the target is the reward
        # alone. The entry is written here, using the same key layout as
        # agent.get_q_value, so that a full board (no moves left to take a
        # maximum over) is handled without bootstrapping.
        key = (str(state.board), str(action))
        agent.q_table[key] = (1 - agent.alpha) * agent.get_q_value(
            state, action
        ) + agent.alpha * reward
    else:
        agent.update_q_value(state, action, next_state, reward)


def _announce_result(reward):
    """Print the outcome of a finished game from the agent's point of view."""
    print("Agent wins" if reward == 1 else "Tie" if reward == 0 else "Opponent wins")


def play_game(agent, opponent, environment, printing=False):
    """Play one game and let ``agent`` learn from it.

    ``environment`` is reset first, so ``agent`` always moves first as
    player 1 and ``opponent`` answers as player -1. The agent receives one
    Q-update per move it makes, from the position it moved in to the
    position it faces next (after the opponent's reply) or to the final
    position. The reward is 1 when player 1 has won, -1 when player -1 has
    won and 0 otherwise.

    Args:
        agent: learner exposing ``choose_move``, ``get_q_value``,
            ``update_q_value``, ``q_table`` and ``alpha``.
        opponent: any object exposing ``choose_move(environment)``.
        environment: the game object; it is left at the final position.
        printing: when True, print the board after every move and the result.
    """
    environment.reset()

    while not environment.is_game_over():
        # Snapshot the position: the environment is mutated in place, and
        # the Q-update must be keyed on the board the agent actually saw.
        state_before = deepcopy(environment)

        # Agent move
        action = agent.choose_move(environment)
        environment.make_move(action)
        if printing:
            print("Agent's turn")
            environment.print_board()

        if environment.is_game_over():
            reward = _game_reward(environment)
            _learn_from_transition(agent, state_before, action, environment, reward)
            if printing:
                _announce_result(reward)
            break

        # Opponent move
        opponent_action = opponent.choose_move(environment)
        environment.make_move(opponent_action)
        if printing:
            print("Opponent's turn")
            environment.print_board()

        reward = _game_reward(environment)
        _learn_from_transition(agent, state_before, action, environment, reward)
        if environment.is_game_over():
            if printing:
                _announce_result(reward)
            break



## Training

In [None]:
num_episodes = 100000
train = False
# train = True  # uncomment/comment to train/not train
if train:
    # One fresh agent is trained per opponent and its table saved separately.
    # NOTE: in the last run the opponent is an untrained QLearningAgent;
    # play_game only ever updates `agent`, never `opponent`.
    training_runs = (
        (RandomPlayer(), "q_table_random.pkl"),
        (MinimaxPlayer(-1), "q_table_minimax.pkl"),
        (QLearningAgent(), "q_table_self.pkl"),
    )
    for opponent, q_table_file in training_runs:
        agent = QLearningAgent()
        for episode in tqdm(range(num_episodes)):
            environment = TicTacToe()
            play_game(agent, opponent, environment)
        agent.save_q_table(q_table_file)

## Testing

In [347]:
def test(agent, opponent):
    """Play 100 games of ``agent`` against ``opponent`` and print the tally.

    Games are run through ``play_game`` on a single ``TicTacToe`` board, so
    the agent's Q-values keep being updated while it is evaluated. Wins are
    games won by player 1, losses are games won by player -1.
    """
    board = TicTacToe()
    won, tied, lost = 0, 0, 0

    for _ in range(100):
        board.reset()
        play_game(agent, opponent, board)
        # The three outcomes are checked independently of one another.
        if board.is_winner(1):
            won += 1
        if board.is_tie():
            tied += 1
        if board.is_winner(-1):
            lost += 1

    print(f" {won} wins")
    print(f" {tied} ties")
    print(f" {lost} losses")

In [None]:
testing = False
testing = True  # uncomment/comment to test/not test

if testing:
    # (what the agent was trained against, what it is evaluated against),
    # listed in the order the results are printed. "self" as the rival means
    # a second agent loaded from the same Q-table file as the agent.
    matchups = [
        ("random", "random"),
        ("minimax", "minimax"),
        ("self", "self"),
        ("random", "minimax"),
        ("random", "self"),
        ("minimax", "random"),
        ("minimax", "self"),
        ("self", "random"),
        ("self", "minimax"),
    ]

    for trained_against, rival in matchups:
        print(f"\n{trained_against.upper()} TRAINED AGENT VS {rival.upper()}")

        # Deriving the file name from the label keeps the title and the
        # loaded table in agreement for every match-up.
        q_table_file = f"q_table_{trained_against}.pkl"
        agent = QLearningAgent()
        agent.load_q_table(q_table_file)

        if rival == "random":
            opponent = RandomPlayer()
        elif rival == "minimax":
            opponent = MinimaxPlayer(-1)
        else:
            opponent = QLearningAgent()
            opponent.load_q_table(q_table_file)

        test(agent, opponent)



# Human vs. Machine

In [None]:
def _ask_for_move(environment):
    """Prompt until the user names an empty cell; return it as (row, col)."""
    n_rows, n_cols = environment.board.shape
    while True:
        answer = input("Enter move (row,col): ")
        try:
            row, col = (int(token) for token in answer.split(","))
        except ValueError:
            # Covers non-numeric text as well as too few or too many values.
            print("Please enter two integers separated by a comma, e.g. 0,2")
            continue
        # Reject negative indices explicitly: NumPy would wrap them around.
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            print(f"Row must be in 0..{n_rows - 1} and column in 0..{n_cols - 1}")
            continue
        if environment.board[row, col] != 0:
            print("That cell is already taken")
            continue
        return (row, col)


# The human always plays first as X (player 1); the machine answers as O.
opponent_type = input("Enter opponent type (random/minimax/qagent): ")
if opponent_type == "random":
    opponent = RandomPlayer()
elif opponent_type == "minimax":
    opponent = MinimaxPlayer(-1)
else:
    # Any other answer selects a Q-learning opponent loaded from disk.
    ql_type = input("Enter qlearning type (random/minimax/self): ")
    opponent = QLearningAgent()
    opponent.load_q_table(f"q_table_{ql_type}.pkl")

environment = TicTacToe()
play_again = "y"
while play_again == "y":
    environment.reset()
    # Show the empty board at the start of every game, not only the first.
    environment.print_board()

    while not environment.is_game_over():
        print("Your turn")
        environment.make_move(_ask_for_move(environment))
        environment.print_board()
        if environment.is_game_over():
            break

        print("Opponent's turn")
        environment.make_move(opponent.choose_move(environment))
        environment.print_board()

    if environment.is_winner(1):
        print("You win!")
    elif environment.is_tie():
        print("Tie!")
    else:
        print("You lose!")
    play_again = input("Play again? (y/n): ")
