# REINFORCEMENT LEARNING FOR TIC-TAC-TOE USING THE STATE-ACTION FUNCTION


In [14]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy
from tqdm.auto import tqdm
import numpy as np

In [15]:
#a game state: the cells taken by the first player (x) and by the second player (o)
State = namedtuple('State', 'x o')

#3x3 magic square laid out row by row: every row, column and diagonal adds up to 15,
#so holding three cells that sum to 15 means holding a full line
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]


In [16]:
#function used to print the game state
def print_board(state):
    '''Print the 3x3 board, one row per line, followed by an empty line.

    A cell is shown as X if its MAGIC number is in state.x, as O if it is in
    state.o, and as a dot otherwise.'''
    for row_start in range(0, 9, 3):
        for cell in MAGIC[row_start:row_start + 3]:
            if cell in state.x:
                mark = ' X '
            elif cell in state.o:
                mark = ' O '
            else:
                mark = ' . '
            print(mark, end='')
        print()
    print()

#function to check if a set of taken cells contains a winning line
def win(elements):
    '''Return True if any three of the given numbers add up to 15, False otherwise.'''
    for triple in combinations(elements, 3):
        if sum(triple) == 15:
            return True
    return False

#reward of a state seen from the first player's side; it also tells who (if anyone) has won
def state_value_first(state:State):
    '''Return the reward of `state` from the FIRST player's (x) point of view.

    1 if x holds a winning triple, -1 if o does, 0 when nobody has won
    (a tie, or a game that is not over yet).'''
    if win(state.x):
        return 1   #--> player 1 wins
    if win(state.o):
        return -1  #--> player 1 loses
    return 0       #--> no winner

#reward of a state seen from the second player's side
def state_value_second(state:State):
    '''Return the reward of `state` from the SECOND player's (o) point of view.

    -1 if x holds a winning triple, 1 if o does, 0 when nobody has won.'''
    if win(state.x):
        return -1
    return 1 if win(state.o) else 0

def state_value_second_v2(state:State):
    '''Return the reward of `state` from the SECOND player's (o) point of view,
    with a state that has no winner (e.g. a tie) rewarded more than a win.

    -1 if x holds a winning triple, 1 if o does, 5 when nobody has won.'''
    if win(state.x):
        return -1
    if win(state.o):
        return 1
    return 5   #no winner is worth more than a win (1)



In [17]:
#the idea is to play many games against a random opponent, updating the state-action table after each one
def train_agent_1(state_action_dict_first, epsilon, n_games, alpha, decay):
    '''Train the FIRST player (x) with an epsilon-greedy policy against a random second player.

    After each game every (state, action) pair of the game - the moves of both
    players - is moved towards the final reward given by state_value_first:
        value <- value + alpha * (final_reward - value)

    Parameters:
        state_action_dict_first: table mapping (frozenset of x cells, frozenset of
            o cells, action) to a value. It is read with keys that may be missing,
            so it must provide a default (e.g. defaultdict(float)). Updated in place.
        epsilon: initial probability that x plays a random move.
        n_games: number of games to play.
        alpha: learning rate of the update above.
        decay: factor epsilon is multiplied by after every game.

    Returns:
        The same table that was passed in.

    The greedy move is the best-valued action among the ones already stored in the
    table for the current state; if there is none, a random move is played. Ties
    between equal values are broken by the iteration order of `available`.
    '''
    for _ in tqdm(range(n_games)):
        trajectory = []   #table keys (x cells, o cells, action) in the order they were played
        state = State(set(), set())
        available = set(range(1,10))  #the available actions

        while available:
            if np.random.uniform(0,1) < epsilon: # --> random move
                x = choice(list(available))
            else:  #--> best move
                board = (frozenset(state.x), frozenset(state.o))
                #direct key lookups instead of scanning the whole table;
                #`in` does not create entries in a defaultdict
                explored = [a for a in available if (*board, a) in state_action_dict_first]
                if explored:
                    x = max(explored, key=lambda a: state_action_dict_first[(*board, a)])
                else:
                    x = choice(list(available))
            trajectory.append((frozenset(state.x), frozenset(state.o), x)) #state before the move + the move
            state.x.add(x)
            available.remove(x)
            if win(state.x) or not available:
                break

            o = choice(list(available))   #the second player always plays randomly
            trajectory.append((frozenset(state.x), frozenset(state.o), o)) #state before the move + the move
            state.o.add(o)
            available.remove(o)
            if win(state.o) or not available:
                break

        #the loop above always ends on a finished game (a win or a full board)
        final_reward = state_value_first(state)

        for key in trajectory: #for each state-action of the game update its value
            value = state_action_dict_first[key]
            state_action_dict_first[key] = value + alpha*(final_reward - value)

        epsilon = epsilon*decay  #decrease the probability of doing random moves

    return state_action_dict_first

def train_agent_2(state_action_dict_sec, epsilon, n_games, alpha, decay):
    '''Train the SECOND player (o) with an epsilon-greedy policy against a random first player.

    After each game every (state, action) pair of the game - the moves of both
    players - is moved towards the final reward given by state_value_second:
        value <- value + alpha * (final_reward - value)

    Parameters:
        state_action_dict_sec: table mapping (frozenset of x cells, frozenset of
            o cells, action) to a value. It is read with keys that may be missing,
            so it must provide a default (e.g. defaultdict(float)). Updated in place.
        epsilon: initial probability that o plays a random move.
        n_games: number of games to play.
        alpha: learning rate of the update above.
        decay: factor epsilon is multiplied by after every game.

    Returns:
        The same table that was passed in.

    The greedy move is the best-valued action among the ones already stored in the
    table for the current state; if there is none, a random move is played. Ties
    between equal values are broken by the iteration order of `available`.
    '''
    for _ in tqdm(range(n_games)):
        trajectory = []   #table keys (x cells, o cells, action) in the order they were played
        state = State(set(), set())
        available = set(range(1,10))  #the available actions

        while available:
            x = choice(list(available))   #the first player always plays randomly
            trajectory.append((frozenset(state.x), frozenset(state.o), x)) #state before the move + the move
            state.x.add(x)
            available.remove(x)
            if win(state.x) or not available:
                break

            if np.random.uniform(0,1) < epsilon: # --> random move
                o = choice(list(available))
            else:  #--> best move
                board = (frozenset(state.x), frozenset(state.o))
                #direct key lookups instead of scanning the whole table;
                #`in` does not create entries in a defaultdict
                explored = [a for a in available if (*board, a) in state_action_dict_sec]
                if explored:
                    o = max(explored, key=lambda a: state_action_dict_sec[(*board, a)])
                else:
                    o = choice(list(available))
            trajectory.append((frozenset(state.x), frozenset(state.o), o)) #state before the move + the move
            state.o.add(o)
            available.remove(o)
            if win(state.o) or not available:
                break

        #the loop above always ends on a finished game (a win or a full board)
        final_reward = state_value_second(state)

        for key in trajectory: #for each state-action of the game update its value
            value = state_action_dict_sec[key]
            state_action_dict_sec[key] = value + alpha*(final_reward - value)

        epsilon = epsilon*decay #decrease the probability of doing random moves

    return state_action_dict_sec


def train_agent_2_v2(state_action_dict_sec, epsilon, n_games, alpha, decay):
    '''Train the SECOND player (o) like train_agent_2, but with the reward of state_value_second_v2.

    The first player moves randomly and o moves epsilon-greedily. After each game
    every (state, action) pair of the game - the moves of both players - is moved
    towards the final reward given by state_value_second_v2:
        value <- value + alpha * (final_reward - value)

    Parameters:
        state_action_dict_sec: table mapping (frozenset of x cells, frozenset of
            o cells, action) to a value. It is read with keys that may be missing,
            so it must provide a default (e.g. defaultdict(float)). Updated in place.
        epsilon: initial probability that o plays a random move.
        n_games: number of games to play.
        alpha: learning rate of the update above.
        decay: factor epsilon is multiplied by after every game.

    Returns:
        The same table that was passed in.

    The greedy move is the best-valued action among the ones already stored in the
    table for the current state; if there is none, a random move is played. Ties
    between equal values are broken by the iteration order of `available`.
    '''
    for _ in tqdm(range(n_games)):
        trajectory = []   #table keys (x cells, o cells, action) in the order they were played
        state = State(set(), set())
        available = set(range(1,10))  #the available actions

        while available:
            x = choice(list(available))   #the first player always plays randomly
            trajectory.append((frozenset(state.x), frozenset(state.o), x)) #state before the move + the move
            state.x.add(x)
            available.remove(x)
            if win(state.x) or not available:
                break

            if np.random.uniform(0,1) < epsilon: # --> random move
                o = choice(list(available))
            else:  #--> best move
                board = (frozenset(state.x), frozenset(state.o))
                #direct key lookups instead of scanning the whole table;
                #`in` does not create entries in a defaultdict
                explored = [a for a in available if (*board, a) in state_action_dict_sec]
                if explored:
                    o = max(explored, key=lambda a: state_action_dict_sec[(*board, a)])
                else:
                    o = choice(list(available))
            trajectory.append((frozenset(state.x), frozenset(state.o), o)) #state before the move + the move
            state.o.add(o)
            available.remove(o)
            if win(state.o) or not available:
                break

        #the loop above always ends on a finished game (a win or a full board)
        final_reward = state_value_second_v2(state)

        for key in trajectory: #for each state-action of the game update its value
            value = state_action_dict_sec[key]
            state_action_dict_sec[key] = value + alpha*(final_reward - value)

        epsilon = epsilon*decay #decrease the probability of doing random moves

    return state_action_dict_sec



# COMPUTE THE STATE-ACTION DICTIONARIES FOR BOTH THE FIRST AND SECOND PLAYER

In [18]:
#hyper-parameters shared by the three training runs
alpha = 0.001       #learning rate
epsilon = 0.95      #initial probability of a random move
n_games = 90000     #games played per training run
decay = 0.999999    #factor applied to epsilon after every game

#each run starts from an empty table whose missing entries read as 0.0
#table trained with the rewards of the first player
state_action_dict_first = train_agent_1(defaultdict(float), epsilon, n_games, alpha, decay)
#table trained with the rewards of the second player
state_action_dict_sec = train_agent_2(defaultdict(float), epsilon, n_games, alpha, decay)
#table trained with the second player's rewards where a state without a winner is worth 5
state_action_dict_sec_v2 = train_agent_2_v2(defaultdict(float), epsilon, n_games, alpha, decay)

    

  0%|          | 0/90000 [00:00<?, ?it/s]

  0%|          | 0/90000 [00:00<?, ?it/s]

  0%|          | 0/90000 [00:00<?, ?it/s]

In [19]:
#size of each table, in the order: first player, second player, second player v2
for table in (state_action_dict_first, state_action_dict_sec, state_action_dict_sec_v2):
    print(f'Number of different state-action values explored: {len(table)}')

Number of different state-action values explored: 16167
Number of different state-action values explored: 16167
Number of different state-action values explored: 16167


In [20]:
def policy_vs_random(state_action_dict):
    '''Play one game: the agent moves first (x) following the table, the second player (o) is random.

    The agent picks the best-valued action among the ones stored in
    `state_action_dict` for the current state (ties are broken by the iteration
    order of the available moves). If the table has no entry for the state, it
    falls back to a random move instead of failing.

    Returns:
        (trajectory, final_reward): trajectory is the list of (State before the
        move, move) tuples of the game; final_reward is state_value_first of the
        final state (1 x wins, -1 o wins, 0 tie).
    '''
    trajectory = list()   #in the trajectory we append the tuple (state,action)
    state = State(set(), set())  #initial state
    available = set(range(1,10))  #the available actions

    while available:
        board = (frozenset(state.x), frozenset(state.o))
        #direct key lookups; `in` does not create entries in a defaultdict
        explored = [a for a in available if (*board, a) in state_action_dict]
        if explored:
            x = max(explored, key=lambda a: state_action_dict[(*board, a)])   #taking the move with the maximum value
        else:
            x = choice(list(available))   #state missing from the table
        trajectory.append((deepcopy(state),x)) #append both the state and the action made
        state.x.add(x)      #update the state
        available.remove(x) #update the available moves, removing the move done
        if win(state.x) or not available:
            break

        o = choice(list(available))   #the second player always plays randomly
        trajectory.append((deepcopy(state),o)) #append both the state and the action made
        state.o.add(o)
        available.remove(o)
        if win(state.o) or not available:
            break

    #the loop always ends on a finished game: check who is the winner
    final_reward = state_value_first(state)
    return trajectory, final_reward

def random_vs_policy(state_action_dict):
    '''Play one game: the first player (x) is random, the agent moves second (o) following the table.

    The agent picks the best-valued action among the ones stored in
    `state_action_dict` for the current state (ties are broken by the iteration
    order of the available moves). If the table has no entry for the state, it
    falls back to a random move instead of failing.

    Returns:
        (trajectory, final_reward): trajectory is the list of (State before the
        move, move) tuples of the game; final_reward is state_value_first of the
        final state, i.e. it is seen from the FIRST player's side
        (1 x wins, -1 o wins, 0 tie).
    '''
    trajectory = list()   #in the trajectory we append the tuple (state,action)
    state = State(set(), set())
    available = set(range(1,10))  #the available actions

    while available:
        x = choice(list(available))     #the first player always plays randomly
        trajectory.append((deepcopy(state),x)) #append both the state and the action made
        state.x.add(x)
        available.remove(x)
        if win(state.x) or not available:
            break

        board = (frozenset(state.x), frozenset(state.o))
        #direct key lookups; `in` does not create entries in a defaultdict
        explored = [a for a in available if (*board, a) in state_action_dict]
        if explored:
            o = max(explored, key=lambda a: state_action_dict[(*board, a)])
        else:
            o = choice(list(available))   #state missing from the table
        trajectory.append((deepcopy(state),o)) #append both the state and the action made
        state.o.add(o)
        available.remove(o)
        if win(state.o) or not available:
            break

    #the loop always ends on a finished game: check who is the winner
    final_reward = state_value_first(state)
    return trajectory, final_reward

    
def policy_vs_policy(state_action_dict):
    '''Play one game in which both players follow the same state-action table.

    Each player picks the best-valued action among the ones stored in
    `state_action_dict` for the current state (ties are broken by the iteration
    order of the available moves). If the table has no entry for the state, a
    random move is played instead of failing.

    NOTE(review): both sides maximise the same values, so a table trained with
    one player's rewards makes the other player chase its opponent's reward -
    confirm this is intended.

    Returns:
        (trajectory, final_reward): trajectory is the list of (State before the
        move, move) tuples of the game; final_reward is state_value_first of the
        final state (1 x wins, -1 o wins, 0 tie).
    '''
    trajectory = list()   #in the trajectory we append the tuple (state,action)
    state = State(set(), set())  #initial state
    available = set(range(1,10))  #the available actions

    while available:
        board = (frozenset(state.x), frozenset(state.o))
        #direct key lookups; `in` does not create entries in a defaultdict
        explored = [a for a in available if (*board, a) in state_action_dict]
        if explored:
            x = max(explored, key=lambda a: state_action_dict[(*board, a)])
        else:
            x = choice(list(available))   #state missing from the table
        trajectory.append((deepcopy(state),x)) #append both the state and the action made
        state.x.add(x)
        available.remove(x)
        if win(state.x) or not available:
            break

        board = (frozenset(state.x), frozenset(state.o))
        explored = [a for a in available if (*board, a) in state_action_dict]
        if explored:
            o = max(explored, key=lambda a: state_action_dict[(*board, a)])
        else:
            o = choice(list(available))   #state missing from the table
        trajectory.append((deepcopy(state),o)) #append both the state and the action made
        state.o.add(o)
        available.remove(o)
        if win(state.o) or not available:
            break

    #the loop always ends on a finished game: check who is the winner
    final_reward = state_value_first(state)
    return trajectory, final_reward


def policy1_vs_policy2(state_action_dict_first, state_action_dict_sec):
    '''Play one game in which each player follows its own state-action table.

    The first player (x) reads `state_action_dict_first`, the second player (o)
    reads `state_action_dict_sec`. Each picks the best-valued action among the
    ones stored in its table for the current state (ties are broken by the
    iteration order of the available moves). If the table has no entry for the
    state, a random move is played instead of failing.

    Returns:
        (trajectory, final_reward): trajectory is the list of (State before the
        move, move) tuples of the game; final_reward is state_value_first of the
        final state (1 x wins, -1 o wins, 0 tie).
    '''
    trajectory = list()   #in the trajectory we append the tuple (state,action)
    state = State(set(), set())
    available = set(range(1,10))  #the available actions

    while available:
        board = (frozenset(state.x), frozenset(state.o))
        #direct key lookups; `in` does not create entries in a defaultdict
        explored = [a for a in available if (*board, a) in state_action_dict_first]
        if explored:
            x = max(explored, key=lambda a: state_action_dict_first[(*board, a)])
        else:
            x = choice(list(available))   #state missing from the table
        trajectory.append((deepcopy(state),x)) #append both the state and the action made
        state.x.add(x)
        available.remove(x)
        if win(state.x) or not available:
            break

        board = (frozenset(state.x), frozenset(state.o))
        explored = [a for a in available if (*board, a) in state_action_dict_sec]
        if explored:
            o = max(explored, key=lambda a: state_action_dict_sec[(*board, a)])
        else:
            o = choice(list(available))   #state missing from the table
        trajectory.append((deepcopy(state),o)) #append both the state and the action made
        state.o.add(o)
        available.remove(o)
        if win(state.o) or not available:
            break

    #the loop always ends on a finished game: check who is the winner
    final_reward = state_value_first(state)
    return trajectory, final_reward





# POLICY PLAYER VS RANDOM PLAYER

In [21]:
#play 1000 games with the trained agent as first player against a random second player
#and count the results from the first player's side
cnt_w = cnt_t = cnt_l = 0
for steps in tqdm(range(1000)):
    trajectory, final_reward = policy_vs_random(state_action_dict_first)
    if final_reward == 0:
        cnt_t += 1
    elif final_reward == 1:
        cnt_w += 1
    else:
        cnt_l += 1

print('FOR PLAYER 1', f'WIN: {cnt_w}', f'DRAW: {cnt_t}', f'LOSS: {cnt_l}', sep='\n')

  0%|          | 0/1000 [00:00<?, ?it/s]

FOR PLAYER 1
WIN: 982
DRAW: 18
LOSS: 0


# RANDOM PLAYER VS POLICY PLAYER

In [22]:
#play 1000 games with a random first player against the trained second player
#and count the results from the first (random) player's side
cnt_w = cnt_t = cnt_l = 0
for steps in tqdm(range(1000)):
    trajectory, final_reward = random_vs_policy(state_action_dict_sec)
    if final_reward == 0:
        cnt_t += 1
    elif final_reward == 1:
        cnt_w += 1
    else:
        cnt_l += 1

print('FOR PLAYER 1', f'WIN: {cnt_w}', f'DRAW: {cnt_t}', f'LOSS: {cnt_l}', sep='\n')

  0%|          | 0/1000 [00:00<?, ?it/s]

FOR PLAYER 1
WIN: 26
DRAW: 58
LOSS: 916


# BEST POLICY PLAYER VS BEST POLICY PLAYER

In the following cells two cases are shown:
in the first one the second player is trained to win, whereas
in the second one the second player is trained to avoid losing.

In [23]:
#play 1000 games between the trained first player and the second player trained to win
#and count the results from the first player's side
cnt_w = cnt_t = cnt_l = 0
for steps in tqdm(range(1000)):
    trajectory, final_reward = policy1_vs_policy2(state_action_dict_first, state_action_dict_sec)
    if final_reward == 0:
        cnt_t += 1
    elif final_reward == 1:
        cnt_w += 1
    else:
        cnt_l += 1

print('FOR PLAYER 1', f'WIN: {cnt_w}', f'DRAW: {cnt_t}', f'LOSS: {cnt_l}', sep='\n')



  0%|          | 0/1000 [00:00<?, ?it/s]

FOR PLAYER 1
WIN: 1000
DRAW: 0
LOSS: 0


This is because the second player was trained to win rather than to avoid losing.

In [24]:
#play 1000 games between the trained first player and the second player trained with the v2 rewards
#and count the results from the first player's side
cnt_w = cnt_t = cnt_l = 0

for steps in tqdm(range(1000)):
    trajectory, final_reward = policy1_vs_policy2(state_action_dict_first, state_action_dict_sec_v2)
    if final_reward == 0:
        cnt_t += 1
    elif final_reward == 1:
        cnt_w += 1
    else:
        cnt_l += 1

print('FOR PLAYER 1', f'WIN: {cnt_w}', f'DRAW: {cnt_t}', f'LOSS: {cnt_l}', sep='\n')

  0%|          | 0/1000 [00:00<?, ?it/s]

FOR PLAYER 1
WIN: 0
DRAW: 1000
LOSS: 0


# HUMAN VS COMPUTER

In [25]:
def human_vs_policy(state_action_dict):
    '''Play one interactive game: the human moves first (x), the agent answers as second player (o).

    Before every human move the board and the MAGIC number of each cell are
    printed, then a move is read from standard input. Anything that is not the
    number of a free cell (non-numeric text included) is rejected and asked again.

    The agent picks the best-valued action among the ones stored in
    `state_action_dict` for the current state (ties are broken by the iteration
    order of the available moves). If the table has no entry for the state, it
    falls back to a random move instead of failing.

    Returns:
        (trajectory, final_reward): trajectory is the list of (State before the
        move, move) tuples of the game; final_reward is state_value_first of the
        final state, i.e. 1 when the human (x) wins, -1 when the agent (o) wins,
        0 for a tie.
    '''
    trajectory = list()   #in the trajectory we append the tuple (state,action)
    state = State(set(), set())
    available = set(range(1,10))  #the available actions

    while available:

        print_board(state)
        for i in range(3):
            for j in range(3):
                print(f' {MAGIC[i * 3 + j]} ', end=" ")
            print()

        x = None
        while x not in available:
            try:
                x = int(input(f"Enter 'x' value (available: {available}): "))
            except ValueError:
                x = None   #not a number: handled like any other invalid choice
            if x not in available:
                print("Invalid input. Please choose from available options.")

        trajectory.append((deepcopy(state),x)) #append both the state and the action made
        state.x.add(x)
        available.remove(x)
        if win(state.x) or not available:
            break

        board = (frozenset(state.x), frozenset(state.o))
        #direct key lookups; `in` does not create entries in a defaultdict
        explored = [a for a in available if (*board, a) in state_action_dict]
        if explored:
            o = max(explored, key=lambda a: state_action_dict[(*board, a)])
        else:
            o = choice(list(available))   #state missing from the table
        trajectory.append((deepcopy(state),o)) #append both the state and the action made
        state.o.add(o)
        available.remove(o)
        if win(state.o) or not available:
            break

    #the loop always ends on a finished game: check who is the winner
    final_reward = state_value_first(state)
    return trajectory, final_reward

In [27]:
#try to play against my agent
trajectory, final_reward = human_vs_policy(state_action_dict_sec_v2)
#the human plays x (first player) and final_reward comes from state_value_first,
#so 1 means that the human won and -1 that the computer (o) won
if final_reward == 1:
    print('LUCKY; YOU WIN')
elif final_reward == -1:
    print('COMPUTER WINS')
else:
    print('TIE')

 .  .  . 
 .  .  . 
 .  .  . 

 2   7   6  
 9   5   1  
 4   3   8  
 .  .  . 
 .  X  . 
 O  .  . 

 2   7   6  
 9   5   1  
 4   3   8  
 .  O  X 
 .  X  . 
 O  .  . 

 2   7   6  
 9   5   1  
 4   3   8  
 .  O  X 
 X  X  O 
 O  .  . 

 2   7   6  
 9   5   1  
 4   3   8  
 O  O  X 
 X  X  O 
 O  .  X 

 2   7   6  
 9   5   1  
 4   3   8  
TIE
