# LAB 10
Use reinforcement learning to devise a tic-tac-toe player.

## Deadlines

- Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
- Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

### Notes
- Reviews will be assigned on Monday, December 4
- You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [270]:
import numpy as np
from tqdm import tqdm, trange
from dataclasses import dataclass, field
from typing import Literal, Union
from abc import ABC, abstractmethod
from collections import defaultdict
from pprint import pprint
import random
import matplotlib.pyplot as plt

## Game Class

In [271]:
DirectIndex = Literal[0,1,2,3,4,5,6,7,8]
RowColIndex = tuple[Literal[0,1,2], Literal[0,1,2]]
Move = Union[DirectIndex, RowColIndex]
Cell = Literal[-1, 0, 1]
PlayerIndex = Literal[0,1]
BoardHash = str

CELL_TO_EMOJI=("⬜","❎","⏺️")
CELL_TO_CHAR=("B", "X", "O")

@dataclass(repr=False)
class Board:
    board: np.ndarray = field(default_factory=lambda: np.ones(9, dtype=np.int8) * -1)

    @staticmethod
    def i_to_rc(i: DirectIndex) -> RowColIndex:
        return i//3, i % 3
    
    @staticmethod
    def rc_to_i(rc: RowColIndex) -> DirectIndex:
        r, c = rc
        return r*3 + c

    @staticmethod
    def is_valid_index(idx: Move) -> bool:
        if isinstance(idx, tuple):
            return idx[0] >= 0 and idx[0] <= 2 and idx[1]>=0 and idx[1]<= 2
        else:
            return idx >= 0 and idx <= 8
        
    def __getitem__(self, idx: Move) -> Cell:
        """Access the cell directly with index or row-col"""
        assert Board.is_valid_index(idx), "Invalid Index: {idx}"
        if isinstance(idx, tuple):
            idx = Board.rc_to_i(idx)
        return self.board[idx]

    def __setitem__(self, idx: Move, value: Cell) -> None:
        assert Board.is_valid_index(idx), "Invalid Index: {idx}"
        if isinstance(idx, tuple):
            idx = Board.rc_to_i(idx)
        self.board[idx] = value

    def is_valid_move(self: "Board",move: Move) -> bool:
        return self[move] == -1
    
    def move(self: "Board", player: "PlayerIndex", move: Move) -> bool:
        valid = self[move] == -1 
        if valid:
            self[move] = player
        return valid
    
    def is_playable(self: "Board") -> bool:
        return any(self.board == -1) and self.won() == -1
    
    def won(self: "Board") -> Literal[0, 1, -1]:
        """Check if someone has won"""

        rows = [[0,1,2], [3,4,5], [6,7,8]]
        cols = [[0,3,6],[1,4,7], [2,5,8]]
        diag = [[0,4,8], [2,4,6]]
        all_ = [*rows, *cols, *diag]

        if any(all(self.board[c] == 0) for c in all_):
            return 0
        elif any(all(self.board[c] == 1) for c in all_):
            return 1
        else: 
            return -1
    
    def __repr__(self: "Board") -> str:
        winner = self.won()
        return f"Board({str(self.board)}, {winner=}) "

    def __str__(self) -> str:
        """Pretty print the board"""
        s = ""
        for r in range(3):
            for c in range(3):
                s += CELL_TO_EMOJI[self[(r,c)] + 1]
            s+="\n"
        winner = self.won()
        if winner != -1:
            s += f"Winner: Player {winner}"
        return s
    
    def hash(self: "Board", plind: PlayerIndex) -> BoardHash: 
        """Stringified version of the board, so it can be used as a dict key"""
        return "".join([CELL_TO_CHAR[c+1] for c in self.board]) + str(plind)
    
    @staticmethod
    def from_hash(s: BoardHash) -> "Board":
        assert len(s) >= 9, "Invalid board"
        b: list[int]
        try:
            b = [CELL_TO_CHAR.index(c)-1 for c in s[:9]]
        except ValueError:
            raise AssertionError("InvalidError")
        return Board(np.array(b))
        


In [272]:
def clamp(value, min_, max_):
    """Clamp value between min_ and max_"""
    return min(max(value, min_), max_)

def avg(iterable):
    return sum(iterable)/len(iterable)

## Players

In [273]:
class Player(ABC):
    """Abstract Player class"""
    
    @property 
    @abstractmethod
    def name(self: "Player") -> str:
        pass

    @abstractmethod
    def choose_move(self, board: "Board", player_index: PlayerIndex) -> Move:
        raise NotImplementedError
    


In [350]:
def game(player0: "Player", player1: "Player", verbose: bool = False) -> Literal[-1, 0, 1]:
    """Play a single game"""
    board = Board()
    if verbose: 
        print(board)
    players = [player0, player1]
    plind: PlayerIndex = 1
    while board.is_playable():
        plind = 1-plind
        player = players[plind]
        move = None
        while move is None or not board.is_valid_move(move):
            move = player.choose_move(board, plind)
        board.move(plind, move)
        if verbose:
            print(board)
    return board.won()

def benchmark(player_to_benchmark: "Player", opponent: "Player", games: int = 100, *, quiet: bool = False) -> tuple[float, float, float]:
    """Benchmark a player, in both position"""
    wins_as_first, wins_as_second = 0, 0
    draws_as_first, draws_as_second = 0, 0
    for i in range(games):
        if i % 2 == 0:
            end = game(player_to_benchmark, opponent)
            wins_as_first += 1 if end == 0 else 0
            draws_as_first += 1 if end == -1 else 0
        else:
            end = game(opponent, player_to_benchmark)
            wins_as_second += 1 if end == 1 else 0
            draws_as_second += 1 if end == -1 else 0
    acc, first_acc, sec_acc = (wins_as_first + wins_as_second) / games, wins_as_first*2/games, wins_as_second*2/games
    draw_acc, draw_first_acc, draw_sec_acc = (wins_as_first + wins_as_second + draws_as_first + draws_as_second) / games, (wins_as_first+draws_as_first)*2/games, (wins_as_second+draws_as_second)*2/games
    if not quiet:
        print(f"[{player_to_benchmark.name} vs {opponent.name} for {games} games]")
        print(f"        Wins: {acc:.2%}, {first_acc:.2%} as first, {sec_acc:.2%} as second")
        print(f"Wins + Draws: {draw_acc:.2%}, {draw_first_acc:.2%} as first, {draw_sec_acc:.2%} as second")
    else:
        return (acc, first_acc, sec_acc), (draw_acc, draw_first_acc, draw_sec_acc)

        

### Random Player and Human Player

In [298]:
@dataclass
class AndyDwyer(Player):
    """Random Player"""

    @property
    def name(self):
        return "Andy Dwyer"

    def choose_move(self, board, player_index) -> DirectIndex:
        """Make random move"""
        return random.randrange(0,9)
    
@dataclass
class TomHaverford(Player):
    """Human Player, I wanted to have fun :)"""

    @property
    def name(self):
        return "Tom Haverford"

    def choose_move(self, board, player_index) -> DirectIndex:
        print(board)
        while True:
            inp = input(f"{CELL_TO_EMOJI[player_index+1]} choose your move (row, column):")
            try:
                r, c = inp.split(",")
                r = int(r.strip())
                c = int(c.strip())
                return Board.rc_to_i((r,c))
            except:
                pass


### Q-Learning

In [343]:
@dataclass
class RonSwanson(Player):
    """Q-Learning Player"""

    learning_rate: float = field(default=0.1)
    discount_rate: float = field(default=0.99)
    exploration_rate: float = field(default=1)
    min_exploration_rate: float= field(default=0.01)
    exploration_decay_rate: float= field(default=2.5e-5)
    num_of_episodes: int = field(default=1_000)
    qtable: dict[BoardHash, list[float]] = field(default_factory=lambda: defaultdict(lambda: [0]*9), repr=False)

    @property
    def name(self): 
        return "Ron Swanson"

    def reward(self, type: Literal["action", "game"], board: "Board", *, move: Move = None, player_position: PlayerIndex = None) -> float:
        assert type in ["action", "game"], "Invalid reward type"
        if type == "action":
            assert move is not None, "Cannot retrieve reward for action if no move is provided"
            return 1 if board.is_valid_move(move) else float('-inf')
        else:
            assert player_position is not None, "Cannot retrieve reward for game if no player position is provided"
            won = board.won() 
            draw = won == -1
            if draw: return 0
            else:
                return 10 if won == player_position else -10
        
    def training_move_chooser(self, board: "Board", player_position: PlayerIndex) -> Move:
        if random.uniform(0, 1) > self.exploration_rate:
            # exploit
            if board.hash(plind=player_position) in self.qtable:
                return np.argmax(self.qtable[board.hash(player_position)])
        # explore or nothing to exploit
        return random.randrange(0, 9)    

    def train(self: "RonSwanson", opponent: "Player" = None, verbose: bool = False):
        if opponent is None:
            opponent = AndyDwyer()
        rewards_per_episode = [0] * self.num_of_episodes
        pbar = trange(self.num_of_episodes, unit="episode", desc=f"Training against {opponent.name}")

        if not verbose:
            vprint = lambda x: None
        else:
            vprint = print
        for episode in pbar:
            board = Board()
            if episode % 2 == 0:
                whoami = 0
            else:
                whoami = 1
            plind: PlayerIndex = 1

            previous_board_hash: BoardHash 
            next_board_hash: BoardHash 
            move: Move
            
            while board.is_playable():
                plind = 1-plind
                if whoami == plind:
                    move_was_valid = False
                    vprint(f"{self.name}'s turn ({plind})")
                    while not move_was_valid:
                        move = self.training_move_chooser(board, plind)
                        reward = self.reward("action", board, move=move)
                        rewards_per_episode[episode] += reward
                        previous_board_hash = board.hash(plind)
                        move_was_valid = board.move(plind, move)
                        next_board_hash = board.hash(plind)
                        vprint(f"{self.name} is picking: {move=},{reward=},{previous_board_hash=},{next_board_hash=}")

                        self.qtable[previous_board_hash][move] *= 1-self.learning_rate
                        self.qtable[previous_board_hash][move] += self.learning_rate * (
                            reward + self.discount_rate * np.max(self.qtable[next_board_hash])
                            )
                else:
                    opponent_move: Move = None
                    vprint(f"{opponent.name}'s turn ({plind})")
                    while opponent_move is None or not board.is_valid_move(opponent_move):
                        opponent_move = opponent.choose_move(board, plind)
                    board.move(plind, opponent_move)
                    next_board_hash = board.hash(plind)

            
            reward = self.reward("game", board, player_position=whoami)
            rewards_per_episode[episode] += reward
            self.qtable[previous_board_hash][move] *= 1-self.learning_rate
            self.qtable[previous_board_hash][move] += self.learning_rate * (
                reward + self.discount_rate * np.max(self.qtable[next_board_hash])
                )

            self.exploration_rate = clamp(np.exp(-self.exploration_decay_rate * episode), self.min_exploration_rate, 1)
            if episode % 1000 == 0:
                pbar.set_postfix({
                    "Explored": len(self.qtable.keys())
                    })

        return rewards_per_episode
    
    def choose_move(self, board: Board, player_index: PlayerIndex) -> Move:
        if board.hash(player_index) in self.qtable:
            move = np.argmax(self.qtable[board.hash(player_index)])
            if board.is_valid_move(move):
                return move
        return random.randrange(0,9)

In [344]:
qlearning = RonSwanson(num_of_episodes=100_000)
rewards_history = qlearning.train()

Training against Andy Dwyer: 100%|██████████| 100000/100000 [04:21<00:00, 382.66episode/s, Explored=9994]


In [351]:
benchmark(qlearning, AndyDwyer(), games=1000)

[Ron Swanson vs Andy Dwyer for 1000 games]
        Wins: 84.90%, 95.20% as first, 74.60% as second
Wins + Draws: 97.00%, 99.80% as first, 94.20% as second
