Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: 25/12 ([CET](https://www.timeanddate.com/time/zones/cet))
* Reviews: 06/01 ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

You can see it this way: picking 3 numbers whose sum is 15

In [1449]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice, random, randint
from copy import deepcopy
from tqdm.auto import tqdm
import numpy as np

In [1450]:
any(sum(c)==15 for c in combinations({1,2,3,4,5,6},3))

True

In [1451]:
State= namedtuple('State',['x','o'])

In [1452]:
MAGIC=[2,7,6,
       9,5,1,
       4,3,8]

In [1453]:
def win(elements):
    """ Checks if elements is winning"""
    return any(sum(c)==15 for c in combinations(elements,3))

def state_value(pos:State):
    """Evaluate position: +1 first player wins"""
    if win(pos.x):
        return 1
    elif win(pos.o):
        return -1
    else:
        return 0

In [1454]:
def print_board(pos):
    """ Nicely prints the board"""
    for r in range(3):
        for c in range(3):
            index=r*3+c
            if MAGIC[index] in pos.x:
                print('X', end='')
            elif MAGIC[index] in pos.o:
                print('O', end='')
            else:
                print('-', end='')
        print("\n")

print("\n")





In [1455]:
class RandomPlayer:
    def __init__(self, symbol) -> None:
        self._symbol=symbol
        
    @property
    def symbol(self)-> int:
        return self._symbol
    
    def move(self,available_moves, state=None):
        #print("random move")
        return choice(list(available_moves))

# Q-LEARNING

In [1456]:

class QLearningAgent:
    def __init__(self, symbol):
        self.q_table = {}
        self.symbol=symbol
        self._winning_games=0
        self._drawn_games=0
        self.exploration_rate=0.5
        self.alpha=0.7
        self.gamma=0.9
        self.is_train=True


    def move(self, available_moves, state)-> int:
        
        if self.exploration_rate>0.01:
            self.exploration_rate*=0.99

        if self.alpha>0.1:
            self.alpha*=0.99

        if self.gamma<0.99:
            self.gamma*=1.01

        state_key = (frozenset(state.x), frozenset(state.o))

        if random() < self.exploration_rate:
            # Scelta casuale di un'azione
            action = choice(list(available_moves))
        else:
            # Scelta dell'azione basata sulla Q-table
            if state_key not in self.q_table:
                self.q_table[state_key] = dict.fromkeys(range(1,10), 0)
                
            action = max(self.q_table[state_key], key=self.q_table[state_key].get)

        if self.is_train:
            self.update_q_table(action, available_moves, state)
       
        if action not in available_moves:
            action=choice(list(available_moves))
            #self.update_q_table(action, available_moves, state)
            if state_key not in self.q_table:
                self.q_table[state_key] = dict.fromkeys([action], 0)
            return action
            
        return action

    def add_winning(self):
        self._winning_games+=1
        
    def add_drawn(self):
        self._drawn_games+=1

    def update_q_table(self, action, available_moves, state, reward=None):
        #print("updating")
        state_key=(frozenset(state.x), frozenset(state.o))
        if reward is None:

            if action not in available_moves:
                reward=-5
                
            else:
                reward=0.1
                
        if state_key not in self.q_table:
            self.q_table[state_key] = dict.fromkeys([action], 0)
        
        temp_state=deepcopy(state)
        temp_state.x.add(action)

        new_state = deepcopy(state)
        new_state.x.add(action)
        next_state_key = (frozenset(new_state.x), frozenset(new_state.o))

        if next_state_key not in self.q_table:
            self.q_table[next_state_key] = dict.fromkeys(range(1,10), 0)

        self.q_table[state_key][action] = (1 - self.alpha) * self.q_table[state_key].get(action, 0) + self.alpha * (reward + self.gamma * (max(self.q_table[next_state_key].values(), default=0)))


In [1457]:

def game(p1,p2, index):
    
    trajectory=list()
    state=State(set(), set())
    available=set(range(1,9+1))
    
    players=[p1,p2]

    while True:
    
        current_player=players[index]

        move=current_player.move(list(available),state)
        trajectory.append((deepcopy(state), move))
        available.remove(move)
        
        if(current_player.symbol == -1):
            state.o.add(move)
            if(win(state.o)) or not available:
                break
        
        else:
            state.x.add(move)
            if(win(state.x)) or not available:
                break
        last_index=index
        index=1-index

    # i compute the final reward
        
    final_reward=state_value(state)
    #print(index, final_reward)
    return trajectory, final_reward, last_index
    

In [1458]:
#o
p1=RandomPlayer(-1)
#x
p2=QLearningAgent(1)

In [1459]:
def train(p1,p2, index):


      num_iterations=10_000
      p2._winning_games=0
      p2._drawn_games=0

      for _ in tqdm(range(num_iterations)):

            trajectory, final_reward=game(p1,p2,index)

            if final_reward ==0:
                  p2.add_drawn()
            elif final_reward==1:
                  p2.add_winning()

            if final_reward == -1:
            
                  s=trajectory[-2][0]
                  a=trajectory[-2][1]

            else:  
                  s=trajectory[-1][0]
                  a=trajectory[-1][1]   
                  

            p2.update_q_table(a,(),s,final_reward)
            

      if index==0:
            print("RANDOM STARTS")
      else:
            print("Q AGENT STARTS")
      print("Winning percentage of the agent ",p2._winning_games/num_iterations*100)
      print("Drawn percentage of the agent ",p2._drawn_games/num_iterations*100)
      #for c,v in p2.q_table.items():
            #print(c,v)
      
train(p1,p2,1)
train(p1,p2,0)

  6%|▋         | 650/10000 [00:00<00:04, 2187.19it/s]

100%|██████████| 10000/10000 [00:03<00:00, 2624.92it/s]


Q AGENT STARTS
Winning percentage of the agent  80.08
Drawn percentage of the agent  14.000000000000002


100%|██████████| 10000/10000 [00:03<00:00, 2820.24it/s]

RANDOM STARTS
Winning percentage of the agent  48.59
Drawn percentage of the agent  38.21





In [1460]:
def test(p1,p2, index):
    
    global count_winning
    global count_losing
    global count_tie

    #trajectory=list()
    state=State(set(), set())
    available=set(range(1,9+1))
    
    players=[p1,p2]
    

    while True:
    
        current_player=players[index]

        move=current_player.move(list(available),state)
        #trajectory.append((deepcopy(state), move))
        
        if(current_player.symbol == -1):
            state.o.add(move)
            available.remove(move) 
            if(win(state.o)) or not available:
                if (win(state.o)):
                    #print("O ")
                    
                    count_losing+=1
                else: 
                    #print("tie")
                    count_tie+=1
                break
        
        else:
            state.x.add(move)
            #trajectory.append((deepcopy(state),move))
            available.remove(move) 
            if(win(state.x)) or not available:
                #current_player.add_winning()
                if(win(state.x)):
                    #print("X")
                    count_winning+=1
                else:
                    #print("tie")
                    count_tie+=1

                break
                
        index=1-index
    return


count_winning=0
count_losing=0
count_tie=0

p2.is_train=False


for _ in tqdm(range(1000)):
    test(p1,p2,0)

print("RANDOM STARTS")
    
print("winning or tie", (count_winning+count_tie)/1000)
print("losing ", count_losing/1000)


count_winning=0
count_losing=0
count_tie=0


for _ in tqdm(range(1000)):
    test(p1,p2,1)
print("Q AGENT STARTS")

print("winning or tie", (count_winning+count_tie)/1000)
print("losing ", count_losing/1000)



100%|██████████| 1000/1000 [00:00<00:00, 56113.34it/s]


RANDOM STARTS
winning or tie 0.913
losing  0.087


100%|██████████| 1000/1000 [00:00<00:00, 27560.02it/s]

Q AGENT STARTS
winning or tie 0.95
losing  0.05





In [1461]:

for c,v in p2.q_table.items():
    print(c,v)

(frozenset(), frozenset()) {7: 0.09999999999999991, 9: 0.09746328402025484, 5: 0.09585185820226724, 1: 0.08128869860331746, 6: 0.09633597785800481, 3: 0.0864761033596508, 4: 0.0684710518292968, 2: 0.07443971145578174, 8: 0.07927846032393907}
(frozenset({7}), frozenset()) {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
(frozenset({7}), frozenset({6})) {5: 0.09999999999999991, 2: 0.009961524144746199, 4: 0.01893072865662878, 9: 0.027006463695468517, 8: 0.009961524144746199}
(frozenset({5, 7}), frozenset({6})) {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
(frozenset({5, 7}), frozenset({1, 6})) {2: -0.18670312680118506, 9: -0.11822029039788899, 4: -0.17741722645727287}
(frozenset({2, 5, 7}), frozenset({1, 6})) {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
(frozenset({9}), frozenset()) {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
(frozenset({9}), frozenset({4})) {1: 0.0698995103973016, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
(frozenset({9, 1}), fro

# MONTECARLO (attempts)

In [1462]:
class Agent:
    def __init__(self, symbol) -> None:
        self._symbol=symbol
        self._dict_moves=defaultdict(float)
        self._winning_games=0

    @property
    def symbol(self)-> int:
        return self._symbol
    
    @property
    def dict_moves(self)-> int:
        return self._dict_moves
    
    def move(self,available_moves, state)->int:


        temp_state = deepcopy(state) 
        if random()<0.1:
            best_move = choice(list(available_moves))
            temp_state.x.add(best_move)
            if (frozenset(temp_state.x), frozenset(state.o)) not in self.dict_moves.keys():
                self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]=0
 
        else:
        
        
            keys= [k for k in self.dict_moves.keys() if (k[1] == state.o and k[0]==state.x )]

            if keys:
                best_move = None
                max_value = float('-inf')

                for move in available_moves:
                    temp_state=deepcopy(state)
                    temp_state.x.add(move)
                    hashable_state = (frozenset(temp_state.x), frozenset(temp_state.o))

                    if hashable_state in self.dict_moves.keys():
                        move_value = self.dict_moves[hashable_state]
                        if move_value > max_value:
                            max_value = move_value
                            best_move = move

                
                if best_move is None:
                    
                    best_move = choice(list(available_moves))
                    #state.x.add(best_move)

                    temp_state.x.add(best_move)
                    self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]=0
            else:
                best_move = choice(list(available_moves))
                #state.x.add(best_move)
                temp_state.x.add(best_move)
                self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]=0


        """ if state_value(temp_state) == 1:
            keys= [k for k in self.dict_moves.keys() if (k[1] == temp_state.o and k[0]==temp_state.x )]
            new_state = sorted(keys, key=lambda k: self.dict_moves[k], reverse=True)[0]
            self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]=0.99*self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]+0.01*(0.5+0.01*self.dict_moves[(frozenset(new_state[0]), frozenset(new_state[1]))])
        elif state_value(temp_state) == -1:
            keys= [k for k in self.dict_moves.keys() if (k[1] == temp_state.o and k[0]==temp_state.x )]
            new_state = sorted(keys, key=lambda k: self.dict_moves[k], reverse=True)[0]
    
            self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]=0.99*self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]+0.01*(-0.5+0.01*self.dict_moves[(frozenset(new_state.x), frozenset(new_state.o))])
        """
        
        keys= [k for k in self.dict_moves.keys() if (k[1] == temp_state.o and k[0]==temp_state.x )]
      
    
        if keys:
            new_state = sorted(keys, key=lambda k: self.dict_moves[k], reverse=True)[0]

            self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]=0.99*self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]+0.01*(1+0.01*self.dict_moves[(frozenset(new_state[0]), frozenset(new_state[1]))])
        
        return best_move
    
    def add_winning(self):
        self._winning_games+=1

In [1463]:
class Agent:
    def __init__(self, symbol) -> None:
        self._symbol=symbol
        self._dict_moves=defaultdict(float)
        self._winning_games=0

    @property
    def symbol(self)-> int:
        return self._symbol
    
    @property
    def dict_moves(self)-> int:
        return self._dict_moves
    
    def move(self,available_moves, state)->int:

        if random()<0.05:
            best_move = choice(list(available_moves))
            temp_state = deepcopy(state) 
            temp_state.x.add(best_move)
            if (frozenset(state.x), frozenset(state.o)) not in self.dict_moves.keys():
                self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]=0.001
            
        else:
            
            #keys= [k for k in self.dict_moves.keys() if (k[1] == state.o and len(k[0])==len(state.x)+1 )]
            keys= [k for k in self.dict_moves.keys() if (k[1] == state.o and k[0]==state.x )]

            if keys:

                best_move = None
                max_value = float('-inf')

                for move in available_moves:
                    temp_state = deepcopy(state)  
                    temp_state.x.add(move)
                    hashable_state = (frozenset(temp_state.x), frozenset(temp_state.o))

                    if hashable_state in self.dict_moves.keys():
                        move_value = self.dict_moves[hashable_state]
                        if move_value > max_value:
                            max_value = move_value
                            best_move = move

                if best_move is None:
                    best_move = choice(list(available_moves))
                    #state.x.add(best_move)
                    temp_state = deepcopy(state) 
                    temp_state.x.add(best_move)
                    self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]=0.001
            else:
                best_move = choice(list(available_moves))
                #state.x.add(best_move)
                temp_state = deepcopy(state) 
                temp_state.x.add(best_move)
                self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]=0.001
                
        return best_move
    
    def add_winning(self):
        self._winning_games+=1

def random_game_2(p1,p2):
    
    trajectory=list()
    state=State(set(), set())
    available=set(range(1,9+1))


    players=[p1,p2]
    index=choice([0,1])
    #index=1
    #index=0
    while True:
        
        current_player=players[index]

        move=current_player.move(list(available),state)

        if(current_player.symbol == -1):
            state.o.add(move)
            trajectory.append(deepcopy(state))
            available.remove(move) 
            if(win(state.o)) or not available:
                break
        
        else:
            state.x.add(move)
            trajectory.append(deepcopy(state))
            available.remove(move) 
            if(win(state.x)) or not available:
                current_player.add_winning()
                break
    
        index=1-index

    return trajectory
"""     
value_dictionary=defaultdict(float)
epsilon=0.001


#o
p1=RandomPlayer(-1)
#x
p2=Agent(1)

num_iterations=500_000

for steps in tqdm(range(num_iterations)):
    trajectory=random_game_2(p1,p2)
   
    # i compute the final reward
    final_reward=state_value(trajectory[-1])

    #update all the state according to this reward
    for s in trajectory:
        hashable_state=(frozenset(s.x),frozenset(s.o))
        p2.dict_moves[hashable_state]= p2.dict_moves[hashable_state]+epsilon*(final_reward-p2.dict_moves[hashable_state])


print("Winning percentage of the agent ",p2._winning_games/num_iterations*100) """

'     \nvalue_dictionary=defaultdict(float)\nepsilon=0.001\n\n\n#o\np1=RandomPlayer(-1)\n#x\np2=Agent(1)\n\nnum_iterations=500_000\n\nfor steps in tqdm(range(num_iterations)):\n    trajectory=random_game_2(p1,p2)\n   \n    # i compute the final reward\n    final_reward=state_value(trajectory[-1])\n\n    #update all the state according to this reward\n    for s in trajectory:\n        hashable_state=(frozenset(s.x),frozenset(s.o))\n        p2.dict_moves[hashable_state]= p2.dict_moves[hashable_state]+epsilon*(final_reward-p2.dict_moves[hashable_state])\n\n\nprint("Winning percentage of the agent ",p2._winning_games/num_iterations*100) '