Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.


# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

- Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
- Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

- Reviews will be assigned on Monday, December 4
- You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)


In [1]:
import numpy as np


class TicTacToe:
    """Tic-tac-toe environment backed by a 3x3 NumPy board.

    A cell holds 0 when empty, 1 for X and -1 for O.  ``current_player``
    carries the mark of the side to move and changes sign after each move.
    """

    def __init__(self):
        self.board = np.zeros((3, 3))
        self.current_player = 1  # X is 1 and moves first; O is -1

    def print_board(self):
        """Print one line per row, each cell followed by a single space."""
        for line in self.board:
            glyphs = (
                "X" if value == 1 else "O" if value == -1 else "-"
                for value in line
            )
            print("".join(f"{glyph} " for glyph in glyphs))

    def reset(self):
        """Empty the board and give the first move back to X."""
        self.current_player = 1
        self.board = np.zeros((3, 3))

    def is_board_full(self):
        """Return True when no cell equals 0."""
        has_empty_cell = np.any(self.board == 0)
        return not has_empty_cell

    def is_winner(self, player):
        """Return a truthy value when ``player`` fills a column, row or diagonal."""
        owned = self.board == player
        return (
            owned.all(axis=0).any()  # a complete column
            or owned.all(axis=1).any()  # a complete row
            or np.diag(owned).all()  # main diagonal
            or np.diag(np.fliplr(owned)).all()  # anti-diagonal
        )

    def is_game_over(self):
        """Return a truthy value once either side has won or the board is full."""
        someone_won = self.is_winner(1) or self.is_winner(-1)
        return someone_won or self.is_board_full()

    def is_tie(self):
        """Return True when the board is full and neither side has won."""
        if not self.is_board_full():
            return False
        return not (self.is_winner(1) or self.is_winner(-1))

    def get_available_moves(self):
        """Return the ``[row, col]`` index pairs of every empty cell."""
        empty_cells = self.board == 0
        return np.argwhere(empty_cells)

    def make_move(self, move):
        """Write the current player's mark at ``move`` and pass the turn."""
        row, col = move[0], move[1]
        self.board[row, col] = self.current_player
        self.current_player = -self.current_player

In [2]:
import pickle

class QAgent:
    """Tabular agent that keeps one value per (state, action) pair.

    The table ``q`` is a dict keyed by ``str((state, action))``.  Moves are
    picked epsilon-greedily and ties among the best values are broken at
    random.
    """

    def __init__(self, epsilon=0.1, alpha=0.5, gamma=1):
        self.epsilon = epsilon  # probability of picking a random move
        self.alpha = alpha  # step size applied by updateQ
        self.gamma = gamma  # stored only; no method of this class reads it
        self.q = {}

    def getQ(self, state, action):
        """Return the value stored for (state, action).

        A pair that is missing from the table (or stored as None) is written
        back as 1.0 and that value is returned.
        """
        key = str((state, action))
        stored = self.q.get(key)
        if stored is None:
            stored = self.q[key] = 1.0
        return stored

    def updateQ(self, state, action, reward, value):
        """Update the entry for (state, action).

        A pair without an entry is set to ``reward``; an existing entry is
        moved towards ``value`` by a fraction ``alpha`` of the difference.
        """
        key = str((state, action))
        previous = self.q.get(key)
        if previous is not None:
            self.q[key] = previous + self.alpha * (value - previous)
            return
        self.q[key] = reward

    def choose_action(self, state, available_moves):
        """Pick a move from ``available_moves`` epsilon-greedily."""
        # Explore: with probability epsilon take a uniformly random move.
        if np.random.uniform(0, 1) < self.epsilon:
            return available_moves[np.random.randint(0, len(available_moves))]

        # Exploit: take the best-valued move, drawing at random among ties.
        scores = [self.getQ(state, move) for move in available_moves]
        best = max(scores)
        if scores.count(best) > 1:
            tied = [idx for idx, score in enumerate(scores) if score == best]
            return available_moves[np.random.choice(tied)]
        return available_moves[scores.index(best)]

    def save_q_values(self, filename='q_values.pkl'):
        """Pickle the Q-table to ``filename``."""
        with open(filename, 'wb') as sink:
            pickle.dump(self.q, sink)

    def load_q_values(self, filename='q_values.pkl'):
        """Replace the Q-table with the object unpickled from ``filename``.

        Unpickling can execute arbitrary code, so only load files you trust.
        """
        with open(filename, 'rb') as source:
            self.q = pickle.load(source)



In [3]:
# from tqdm import tqdm

# def train_QAgent(agent, environment, episodes):
#     for _ in tqdm(range(episodes)):
#         environment.reset()
#         while not environment.is_game_over():
#             available_moves = environment.get_available_moves()
#             action = agent.choose_action(environment.board, available_moves)

#             environment.make_move(action)

#             if environment.is_winner(1):
#                 reward = 1
#             elif environment.is_winner(-1):
#                 reward = -1
#             else:
#                 reward = 0

#             agent.updateQ(environment.board, action, reward, reward)


# env = TicTacToe()
# agent = QAgent()
# train_QAgent(agent, env, 100_000)
# agent.save_q_values()

In [4]:
# Interactive match: the agent plays X (1), a human plays O (-1) via stdin.
env = TicTacToe()
agent = QAgent()
# SECURITY: load_q_values() unpickles 'q_values.pkl'; unpickling can execute
# arbitrary code, so only load a table you generated yourself.
agent.load_q_values()
# NOTE(review): QAgent() keeps its default epsilon=0.1, so about one X move in
# ten is random even at play time -- confirm this is intended.
# NOTE(review): the state is passed below as str(env.board.flatten()), while
# the commented-out training cell passes the raw board array. QAgent keys its
# table on str((state, action)), so confirm the loaded table uses the same
# encoding; otherwise every lookup falls back to the default Q-value.

env.reset()
while not env.is_game_over():
    print(f"Current Player: {env.current_player}")
    if env.current_player == 1:
        action = agent.choose_action(
            str(env.board.flatten()), env.get_available_moves()
        )

    else:
        # Re-prompt until the human names a free cell: a typo would otherwise
        # raise, and an occupied or negative index would overwrite a mark.
        free_cells = {tuple(cell) for cell in env.get_available_moves().tolist()}
        action = None
        while action not in free_cells:
            reply = input("Enter O's move (row, column): ")
            try:
                # Accept both "1 2" and "1, 2", as the prompt suggests.
                action = tuple(map(int, reply.replace(",", " ").split()))
            except ValueError:
                action = None
            if action not in free_cells:
                print("Invalid move: enter the row and column (0-2) of an empty cell.")

    env.make_move(action)
    env.print_board()
    # Announce the result as soon as the move that ends the game is played.
    if env.is_winner(1):
        print("\nX wins!")
    if env.is_winner(-1):
        print("\nO wins!")
    if env.is_tie():
        print("\nTie!")

Current Player: 1
X - - 
- - - 
- - - 
Current Player: -1
X - - 
- O - 
- - - 
Current Player: 1
X - X 
- O - 
- - - 
Current Player: -1
X O X 
- O - 
- - - 
Current Player: 1
X O X 
- O X 
- - - 
Current Player: -1
X O X 
- O X 
- O - 

O wins!
