Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.


# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

- Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
- Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

- Reviews will be assigned on Monday, December 4
- You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)


In [213]:
from tqdm import tqdm
import numpy as np
import platform
from os import system
from copy import deepcopy

# Tic Tac Toe

In [214]:
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3))
        self.current_player = 1  # 1 for X  -1 for O

    def print_board(self):
        if 'windows' in platform.system().lower():
            system('cls')
        else:
            system('clear')
        for row in self.board:
            for cell in row:
                if cell == 1:
                    print("X", end=" ")
                elif cell == -1:
                    print("O", end=" ")
                else:
                    print("-", end=" ")
            print()

    def reset(self):
        self.board = np.zeros((3, 3))
        self.current_player = 1

    def is_board_full(self):
        return not any(0 in row for row in self.board)

    def is_winner(self, player):
        return (
            np.any(np.all(self.board == player, axis=0))
            or np.any(np.all(self.board == player, axis=1))
            or np.all(np.diag(self.board) == player)
            or np.all(np.diag(np.fliplr(self.board)) == player)
        )

    def is_game_over(self):
        return self.is_winner(1) or self.is_winner(-1) or self.is_board_full()

    def is_tie(self):
        return self.is_board_full() and not self.is_winner(1) and not self.is_winner(-1)

    def get_available_moves(self):
        return np.argwhere(self.board == 0)

    def make_move(self, move):
        self.board[move[0], move[1]] = self.current_player
        self.current_player *= -1

    def make_random_move(self):
        available_moves = self.get_available_moves()
        random_move = available_moves[np.random.randint(len(available_moves))]
        self.make_move(random_move)

# Minimax

In [215]:
class MinimaxPlayer:
    def __init__(self, player_index) -> None:
        self.player_index = player_index

    def evaluate(self, game) -> int:
        if game.is_winner(self.player_index):
            return 1
        elif game.is_winner(-self.player_index):
            return -1
        else:
            return 0

    def choose_move(self, game):
        if game.is_board_full():
            return None

        l = len(game.get_available_moves())
        if l == 9:
            return (np.random.randint(3), np.random.randint(3))

        maximizing = self.player_index == game.current_player
        best_move = None
        best_score = None

        for move in game.get_available_moves():
            game_copy = deepcopy(game)
            game_copy.make_move(move)
            score = self.minimax(game_copy, maximizing)
            if best_score is None or score > best_score:
                best_score = score
                best_move = move
        return best_move

    def minimax(self, game, maximizing):
        if game.is_game_over():
            return self.evaluate(game)

        best_score = None
        for move in game.get_available_moves():
            game.make_move(move)
            score = self.minimax(game, not maximizing)
            if best_score is None:
                best_score = score
            elif maximizing:
                best_score = max(best_score, score)
            else:
                best_score = min(best_score, score)
        return best_score

In [216]:
game=TicTacToe()

for MINIMAX_INDEX in [1,-1]:
    minimax_player=MinimaxPlayer(MINIMAX_INDEX)
    wins=0
    for _ in tqdm(range(1000)):
        game.reset()
        while not game.is_game_over():
            if game.current_player==MINIMAX_INDEX:
                move=minimax_player.choose_move(deepcopy(game))
                game.make_move(move)
            else:
                game.make_random_move()
        if game.is_winner(MINIMAX_INDEX):
            wins+=1
    print(f"minimax is player {MINIMAX_INDEX} -> {wins} wins")



100%|██████████| 1000/1000 [00:07<00:00, 142.56it/s]


minimax is player 1 -> 728 wins


100%|██████████| 1000/1000 [00:11<00:00, 89.49it/s]

minimax is player -1 -> 716 wins





# Q-Learning

In [218]:
class QLearningAgent():
    def __init__(self, player_index, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.player_index = player_index
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = {}

    def get_q_value(self, state, action):
        if state not in self.q_table:
            self.q_table[state] = np.zeros((3, 3))
        return self.q_table[state][action[0], action[1]]

    def set_q_value(self, state, action, value):
        if state not in self.q_table:
            self.q_table[state] = np.zeros((3, 3))
        self.q_table[state][action[0], action[1]] = value

    def choose_action(self, state, game):
        if np.random.uniform(0, 1) < self.epsilon:
            return game.get_available_moves()[np.random.randint(len(game.get_available_moves()))]
        else:
            q_values = np.array([self.get_q_value(state, action) for action in game.get_available_moves()])
            return game.get_available_moves()[np.argmax(q_values)]

    def learn(self, state, action, reward, next_state, next_action, done):
        q_value = self.get_q_value(state, action)
        if done:
            td_target = reward
        else:
            td_target = reward + self.gamma * self.get_q_value(next_state, next_action)
        self.set_q_value(state, action, q_value + self.alpha * (td_target - q_value))



  0%|          | 0/1000 [00:00<?, ?it/s]


TypeError: 'NoneType' object is not subscriptable