In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

class Board():
    def __init__(self):
        self.state = np.zeros((4,4)) # Cambio del tamaño de la matriz

    def valid_moves(self):
        return [(i, j) for j in range(4) for i in range(4) if self.state[i, j] == 0]

    def update(self, symbol, row, col):
        if self.state[row, col] == 0:
            self.state[row, col] = symbol
        else:
            raise ValueError ("movimiento ilegal !")

    def is_game_over(self):
        # comprobar filas y columnas de 4 en raya
        if (self.state.sum(axis=0) == 4).sum() >= 1 or (self.state.sum(axis=1) == 4).sum() >= 1:
            return 1
        if (self.state.sum(axis=0) == -4).sum() >= 1 or (self.state.sum(axis=1) == -4).sum() >= 1:
            return -1 
        # comprobar diagonales de 4 en raya
        diag_sums = [
            sum([self.state[i, i] for i in range(4)]),
            sum([self.state[i, 3 - i ] for i in range(4)]),
        ]
        if diag_sums[0] == 4 or diag_sums[1] == 4:
            return 1
        if diag_sums[0] == -4 or diag_sums[1] == -4:
            return -1        
        # empate 
        if len(self.valid_moves()) == 0:
            return 0
        # seguir jugando
        return None

    def reset(self):
        self.state = np.zeros((4,4))

In [8]:
# tqdm es una librería que nos permite visualizar el progreso de un bucle
from tqdm import tqdm 

class Game():
    def __init__(self, player1, player2):
        player1.symbol = 1
        player2.symbol = -1
        self.players = [player1, player2]
        self.board = Board()

    def selfplay(self, rounds=100):
        wins = [0, 0]
        for i in tqdm(range(1, rounds + 1)):
            self.board.reset()
            for player in self.players:
                player.reset()
            game_over = False
            while not game_over:
                for player in self.players:
                    action = player.move(self.board)
                    self.board.update(player.symbol, action[0], action[1])
                    for player in self.players:
                        player.update(self.board)
                    if self.board.is_game_over() is not None:
                        game_over = True
                        break
            self.reward()
            for ix, player in enumerate(self.players):
                if self.board.is_game_over() == player.symbol: 
                    wins[ix] += 1
        return wins


    def reward(self):
        winner = self.board.is_game_over()
        if winner == 0: # empate
            for player in self.players:
                player.reward(0.2)
        else: # le damos 1 recompensa al jugador que gana
            for player in self.players:
                if winner == player.symbol:
                    player.reward(1)
                else:
                    player.reward(0)

In [9]:
class Agent():
    def __init__(self, alpha=0.5, prob_exp=0.5):
        self.value_function = {} # tabla con pares estado -> valor
        self.alpha = alpha         # learning rate
        self.positions = []       # guardamos todas las posiciones de la partida
        self.prob_exp = prob_exp   # probabilidad de explotar

    def reset(self):
        self.positions = []

    def move(self, board, explore=True):
        valid_moves = board.valid_moves()
        # exploracion
        if explore and np.random.uniform(0, 1) < self.prob_exp:
            # vamos a una posición aleatoria
            ix = np.random.choice(len(valid_moves))
            return valid_moves[ix]
        # explotacion
        # vamos a la posición con más valor
        max_value = -1000
        for row, col in valid_moves:
            next_board = board.state.copy()
            next_board[row, col] = self.symbol
            next_state = str(next_board.reshape(4*4))  # 3*3 a 4*4
            value = 0 if self.value_function.get(next_state) is None else self.value_function.get(next_state)
            if value >= max_value:
                max_value = value
                best_row, best_col = row, col
        return best_row, best_col

    def update(self, board):
        self.positions.append(str(board.state.reshape(4*4)))  # 3*3 a 4*4 string

    #Actualizar el valor de la recompensa de la ultima partida
    def reward(self, reward):
        # al final de la partida (cuando recibimos la recompensa)
        # iteramos por tods los estados actualizando su valor en la tabla a la inversa porque depende de 
        # las recompensas futuras 
        for p in reversed(self.positions):
            if self.value_function.get(p) is None:
                self.value_function[p] = 0
            self.value_function[p] += self.alpha * (reward - self.value_function[p])
            reward = self.value_function[p]

In [75]:
# Definir los valores de alpha y prob_exp que quieres probar
alphas = [0.1, 0.3, 0.5, 0.9]
prob_exps = [0.1, 0.2, 0.3, 0.4, 0.5]
n_jugadas = 1000
# Crear una lista para almacenar los resultados
results = []

# Iterar sobre los valores de alpha y prob_exp
for alpha in alphas:
    for prob_exp in prob_exps:
        # Crear un nuevo agente con los valores actuales de alpha y prob_exp
        agent1 = Agent(alpha=alpha, prob_exp=prob_exp)
        agent2 = Agent()
        game = Game(agent1, agent2)
        wins = 0
        game = Game(agent1, agent2)
        wins += sum(game.selfplay(n_jugadas))
        # Almacenar los resultados
        results.append((alpha, prob_exp, wins))

100%|██████████| 1000/1000 [00:15<00:00, 63.14it/s]
100%|██████████| 1000/1000 [00:15<00:00, 63.75it/s]
100%|██████████| 1000/1000 [00:16<00:00, 61.07it/s]
100%|██████████| 1000/1000 [00:16<00:00, 61.67it/s]
100%|██████████| 1000/1000 [00:17<00:00, 56.38it/s]
100%|██████████| 1000/1000 [00:20<00:00, 49.82it/s]
100%|██████████| 1000/1000 [00:17<00:00, 56.69it/s]
100%|██████████| 1000/1000 [00:14<00:00, 70.04it/s]
100%|██████████| 1000/1000 [00:13<00:00, 75.17it/s]
100%|██████████| 1000/1000 [00:12<00:00, 81.26it/s]
100%|██████████| 1000/1000 [00:14<00:00, 68.24it/s]
100%|██████████| 1000/1000 [00:14<00:00, 69.15it/s]
100%|██████████| 1000/1000 [00:13<00:00, 71.82it/s]
100%|██████████| 1000/1000 [00:13<00:00, 75.47it/s]
100%|██████████| 1000/1000 [00:12<00:00, 80.68it/s]
100%|██████████| 1000/1000 [00:15<00:00, 64.74it/s]
100%|██████████| 1000/1000 [00:15<00:00, 66.58it/s]
100%|██████████| 1000/1000 [00:14<00:00, 68.88it/s]
100%|██████████| 1000/1000 [00:13<00:00, 72.91it/s]
100%|███████

In [77]:
# Convertir results en un DataFrame de pandas
df = pd.DataFrame(results, columns=['alpha', 'prob_exp', 'ganadas'])
# ver el mayor numero de la columna de ganadas
print(df[df['ganadas'] == df['ganadas'].max()])
print(df)

   alpha  prob_exp  ganadas
0    0.1       0.1      736
    alpha  prob_exp  ganadas
0     0.1       0.1      736
1     0.1       0.2      586
2     0.1       0.3      623
3     0.1       0.4      536
4     0.1       0.5      575
5     0.3       0.1      654
6     0.3       0.2      607
7     0.3       0.3      600
8     0.3       0.4      539
9     0.3       0.5      535
10    0.5       0.1      702
11    0.5       0.2      640
12    0.5       0.3      627
13    0.5       0.4      588
14    0.5       0.5      545
15    0.9       0.1      598
16    0.9       0.2      577
17    0.9       0.3      572
18    0.9       0.4      572
19    0.9       0.5      559


In [80]:
agent1 = Agent(alpha=0.1 , prob_exp=0.1)
agent1 = Agent(alpha=0.3 , prob_exp=0.1)

agent2 = Agent()

game = Game(agent1, agent2)

game.selfplay(30000)

100%|██████████| 30000/30000 [07:40<00:00, 65.08it/s]


[8584, 14818]

In [83]:
funcion_de_valor = sorted(agent1.value_function.items(), key=lambda kv: kv[1], reverse=True)
tabla = pd.DataFrame({'estado': [x[0] for x in funcion_de_valor], 'valor': [x[1] for x in funcion_de_valor]})

tabla

Unnamed: 0,estado,valor
0,[-1. -1. 1. -1. 0. 0. 1. 1. 0. -1. 1. ...,0.999995
1,[ 0. -1. 0. 1. 0. 0. 0. 1. 0. -1. 0. ...,0.999977
2,[ 0. 0. -1. 1. -1. 0. 0. 1. 0. 0. 0. ...,0.999977
3,[-1. -1. 0. 1. 0. 0. 0. 1. 0. 0. 0. ...,0.999954
4,[ 1. -1. -1. -1. 0. 1. 1. 1. -1. -1. 1. ...,0.999954
...,...,...
64148,[-1. -1. 0. -1. 1. 0. 1. 1. -1. 0. 1. -...,0.000000
64149,[-1. -1. 0. -1. 1. 0. 0. 1. -1. 0. 1. -...,0.000000
64150,[ 0. -1. 0. -1. 1. 0. 0. 1. -1. 0. 1. -...,0.000000
64151,[ 0. -1. 0. -1. 1. 0. 0. 1. -1. 0. 0. -...,0.000000


In [85]:
import pickle

with open('examen.pickle', 'wb') as handle:
    pickle.dump(agent1.value_function, handle, protocol=pickle.HIGHEST_PROTOCOL)