Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: 25/12 ([CET](https://www.timeanddate.com/time/zones/cet))
* Reviews: 06/01 ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

You can see it this way: picking 3 numbers whose sum is 15

In [1]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice, random, randint
from copy import deepcopy
from tqdm.auto import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
any(sum(c)==15 for c in combinations({1,2,3,4,5,6},3))

True

In [3]:
State= namedtuple('State',['x','o'])

In [4]:
MAGIC=[2,7,6,
       9,5,1,
       4,3,8]

In [5]:
def win(elements):
    """ Checks if elements is winning"""
    return any(sum(c)==15 for c in combinations(elements,3))

def state_value(pos:State):
    """Evaluate position: +1 first player wins"""
    if win(pos.x):
        return 1
    elif win(pos.o):
        return -1
    else:
        return 0

In [6]:
def print_board(pos):
    """ Nicely prints the board"""
    for r in range(3):
        for c in range(3):
            index=r*3+c
            if MAGIC[index] in pos.x:
                print('X', end='')
            elif MAGIC[index] in pos.o:
                print('O', end='')
            else:
                print('-', end='')
        print("\n")

print("\n")





In [7]:
class RandomPlayer:
    def __init__(self, symbol) -> None:
        self._symbol=symbol
        
    @property
    def symbol(self)-> int:
        return self._symbol
    
    def move(self,available_moves, state=None):
        #print("random move")
        return choice(list(available_moves))

# Q-LEARNING

In [8]:

class QLearningAgent:
    def __init__(self, symbol):
        self.q_table = {}
        self.symbol=symbol
        self._winning_games=0
        self._drawn_games=0
        self.exploration_rate=0.5
        self.alpha=0.7
        self.gamma=0.9
        self.is_train=True


    def move(self, available_moves, state)-> int:
        
        if self.exploration_rate>0.01:
            self.exploration_rate*=0.99

        if self.alpha>0.1:
            self.alpha*=0.99

        if self.gamma<0.99:
            self.gamma*=1.01

        state_key = (frozenset(state.x), frozenset(state.o))

        if random() < self.exploration_rate:
            # Scelta casuale di un'azione
            action = choice(list(available_moves))
        else:
            # Scelta dell'azione basata sulla Q-table
            if state_key not in self.q_table:
                self.q_table[state_key] = dict.fromkeys(range(1,10), 0)
                
            action = max(self.q_table[state_key], key=self.q_table[state_key].get)

        if self.is_train:
            self.update_q_table(action, available_moves, state)
       
        if action not in available_moves:
            action=choice(list(available_moves))
            #self.update_q_table(action, available_moves, state)
            if state_key not in self.q_table:
                self.q_table[state_key] = dict.fromkeys([action], 0)
            return action
            
        return action

    def add_winning(self):
        self._winning_games+=1
        
    def add_drawn(self):
        self._drawn_games+=1

    def update_q_table(self, action, available_moves, state, reward=None):
        #print("updating")
        state_key=(frozenset(state.x), frozenset(state.o))
        if reward is None:

            if action not in available_moves:
                reward=-5
                
            else:
                reward=0.1
                
        if state_key not in self.q_table:
            self.q_table[state_key] = dict.fromkeys([action], 0)

        new_state = deepcopy(state)
        new_state.x.add(action)
        next_state_key = (frozenset(new_state.x), frozenset(new_state.o))

        if next_state_key not in self.q_table:
            self.q_table[next_state_key] = dict.fromkeys(range(1,10), 0)

        self.q_table[state_key][action] = (1 - self.alpha) * self.q_table[state_key].get(action, 0) + self.alpha * (reward + self.gamma * (max(self.q_table[next_state_key].values(), default=0)))


In [9]:

def game(p1,p2, index):
    
    trajectory=list()
    state=State(set(), set())
    available=set(range(1,9+1))
    
    players=[p1,p2]

    while True:
    
        current_player=players[index]

        move=current_player.move(list(available),state)
        trajectory.append((deepcopy(state), move))
        available.remove(move)
        
        if(current_player.symbol == -1):
            state.o.add(move)
            if(win(state.o)) or not available:
                break
        
        else:
            state.x.add(move)
            if(win(state.x)) or not available:
                break
        last_index=index
        index=1-index

    # i compute the final reward
        
    final_reward=state_value(state)
    #print(index, final_reward)
    return trajectory, final_reward
    

In [10]:
#o
p1=RandomPlayer(-1)
#x
p2=QLearningAgent(1)

In [11]:
def train(p1,p2, index):


      num_iterations=100_000
      p2._winning_games=0
      p2._drawn_games=0

      for _ in tqdm(range(num_iterations)):

            trajectory, final_reward=game(p1,p2,index)

            if final_reward ==0:
                  p2.add_drawn()
            elif final_reward==1:
                  p2.add_winning()

            if final_reward == -1:
            
                  s=trajectory[-2][0]
                  a=trajectory[-2][1]

            else:  
                  s=trajectory[-1][0]
                  a=trajectory[-1][1]   
                  

            p2.update_q_table(a,(),s,final_reward)
            

      if index==0:
            print("RANDOM STARTS")
      else:
            print("Q AGENT STARTS")
      print("Winning percentage of the agent ",p2._winning_games/num_iterations*100)
      print("Drawn percentage of the agent ",p2._drawn_games/num_iterations*100)
      #for c,v in p2.q_table.items():
            #print(c,v)
      
train(p1,p2,1)
train(p1,p2,0)

  1%|          | 612/100000 [00:00<02:10, 759.20it/s]

100%|██████████| 100000/100000 [01:29<00:00, 1119.24it/s]


Q AGENT STARTS
Winning percentage of the agent  92.252
Drawn percentage of the agent  6.582000000000001


100%|██████████| 100000/100000 [01:39<00:00, 1005.94it/s]

RANDOM STARTS
Winning percentage of the agent  59.297
Drawn percentage of the agent  27.959





In [12]:
def test(p1,p2, index):
    
    global count_winning
    global count_losing
    global count_tie

    #trajectory=list()
    state=State(set(), set())
    available=set(range(1,9+1))
    
    players=[p1,p2]
    

    while True:
    
        current_player=players[index]

        move=current_player.move(list(available),state)
        #trajectory.append((deepcopy(state), move))
        
        if(current_player.symbol == -1):
            state.o.add(move)
            available.remove(move) 
            if(win(state.o)) or not available:
                if (win(state.o)):
                    #print("O ")
                    
                    count_losing+=1
                else: 
                    #print("tie")
                    count_tie+=1
                break
        
        else:
            state.x.add(move)
            #trajectory.append((deepcopy(state),move))
            available.remove(move) 
            if(win(state.x)) or not available:
                #current_player.add_winning()
                if(win(state.x)):
                    #print("X")
                    count_winning+=1
                else:
                    #print("tie")
                    count_tie+=1

                break
                
        index=1-index
    return


count_winning=0
count_losing=0
count_tie=0
num_iterations=5000
p2.is_train=False


for _ in tqdm(range(num_iterations)):
    test(p1,p2,0)

print("RANDOM STARTS")
    
print("winning or tie", (count_winning+count_tie)/num_iterations*100,"%")
print("losing ", count_losing/num_iterations*100,"%")


count_winning=0
count_losing=0
count_tie=0


for _ in tqdm(range(num_iterations)):
    test(p1,p2,1)
    
print("Q AGENT STARTS")

print("winning or tie", (count_winning+count_tie)/num_iterations*100,"%")
print("losing ", count_losing/num_iterations*100,"%")



 12%|█▏        | 602/5000 [00:00<00:00, 6007.77it/s]

100%|██████████| 5000/5000 [00:00<00:00, 10390.70it/s]


RANDOM STARTS
winning or tie 89.72 %
losing  10.280000000000001 %


100%|██████████| 5000/5000 [00:00<00:00, 16913.28it/s]

Q AGENT STARTS
winning or tie 99.48 %
losing  0.52 %





In [None]:
""" 
for c,v in p2.q_table.items():
    print(c,v) """

# MONTECARLO 

In [14]:

class MontecarloAgent:
    def __init__(self, symbol):
        self.q_table = {}
        self.symbol=symbol
        self._winning_games=0
        self._drawn_games=0
        self.exploration_rate=0.1
        self.rewards=[]
        self.gamma=0.9
        self.is_train=True


    def move(self, available_moves, state)-> int:
        
        if self.exploration_rate>0.01:
            self.exploration_rate*=0.99


        if self.gamma<0.99:
            self.gamma*=1.01

        state_key = (frozenset(state.x), frozenset(state.o))

        if random() < self.exploration_rate:
            # Scelta casuale di un'azione
            action = choice(list(available_moves))
        else:
            # Scelta dell'azione basata sulla Q-table
            if state_key not in self.q_table:
                self.q_table[state_key] = dict.fromkeys(range(1,10), 0)
                
            action = max(self.q_table[state_key], key=self.q_table[state_key].get)


        if action not in available_moves:
            reward=-5
            
        else:
            reward=0.1
            
        if action not in available_moves:
            action=choice(list(available_moves))
            #self.update_q_table(action, available_moves, state)
            if state_key not in self.q_table:
                self.q_table[state_key] = dict.fromkeys([action], 0)
            return action
            
        self.rewards.append(reward)
        return action

    def add_winning(self):
        self._winning_games+=1
        
    def add_drawn(self):
        self._drawn_games+=1



In [15]:
def train_montecarlo(p1,p2, index):

    num_iterations=50_000
    p2._winning_games=0
    p2._drawn_games=0

    for _ in tqdm(range(num_iterations)):

        p2.rewards=[]
        trajectory, final_reward=game(p1,p2,index)

        if final_reward ==0:
                p2.add_drawn()
        elif final_reward==1:
                p2.add_winning()


        #p2.rewards.append(final_reward)
     
        for state,action in trajectory:
            if (frozenset(state.x), frozenset(state.o)) in p2.q_table:
                p2.q_table[(frozenset(state.x), frozenset(state.o))][action]+=0.001 * (final_reward -  p2.q_table[(frozenset(state.x), frozenset(state.o))][action])
                
    print("Winning percentage of the agent ",p2._winning_games/num_iterations*100) 
    print("Tie percentage ",p2._drawn_games/num_iterations*100 )
    print("win and ties",(p2._winning_games+p2._drawn_games)/num_iterations*100 )

            
def test_montecarlo(p1,p2, index):
    
    global count_winning
    global count_losing
    global count_tie

    #trajectory=list()
    state=State(set(), set())
    available=set(range(1,9+1))
    
    players=[p1,p2]
    

    while True:
    
        current_player=players[index]

        move=current_player.move(list(available),state)
        #trajectory.append((deepcopy(state), move))
        
        if(current_player.symbol == -1):
            state.o.add(move)
            available.remove(move) 
            if(win(state.o)) or not available:
                if (win(state.o)):
                    #print("O ")
                    
                    count_losing+=1
                else: 
                    #print("tie")
                    count_tie+=1
                break
        
        else:
            state.x.add(move)
            #trajectory.append((deepcopy(state),move))
            available.remove(move) 
            if(win(state.x)) or not available:
                #current_player.add_winning()
                if(win(state.x)):
                    #print("X")
                    count_winning+=1
                else:
                    #print("tie")
                    count_tie+=1

                break
                
        index=1-index
    return


#o
p1=RandomPlayer(-1)
#x
p2=MontecarloAgent(1)

print("--------------TRAIN----------------")

print("RANDOM STARTS")
train_montecarlo(p1,p2,0)
print("MONTECARLO AGENT STARTS")
train_montecarlo(p1,p2,1)


print("--------------TEST----------------")
count_winning=0
count_losing=0
count_tie=0

num_iterations=10000
for _ in tqdm(range(num_iterations)):
    test_montecarlo(p1,p2,0)

print("RANDOM STARTS")
    
print("winning or tie", (count_winning+count_tie)/num_iterations*100,"%")
print("losing ", count_losing/num_iterations*100,"%")


count_winning=0
count_losing=0
count_tie=0


for _ in tqdm(range(num_iterations)):
    test_montecarlo(p1,p2,1)

print("MONTECARLO AGENT STARTS")

print("winning or tie", (count_winning+count_tie)/num_iterations*100,"%")
print("losing ", count_losing/num_iterations*100,"%")




--------------TRAIN----------------
RANDOM STARTS


100%|██████████| 50000/50000 [00:33<00:00, 1492.99it/s]


Winning percentage of the agent  79.706
Tie percentage  5.726
win and ties 85.432
MONTECARLO AGENT STARTS


100%|██████████| 50000/50000 [00:32<00:00, 1523.23it/s]


Winning percentage of the agent  87.862
Tie percentage  9.478
win and ties 97.34
--------------TEST----------------


100%|██████████| 10000/10000 [00:00<00:00, 12395.20it/s]


RANDOM STARTS
winning or tie 87.29 %
losing  12.709999999999999 %


100%|██████████| 10000/10000 [00:00<00:00, 16541.17it/s]

MONTECARLO AGENT STARTS
winning or tie 97.42 %
losing  2.58 %



