In [50]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

In [51]:
def puntiCarta(carta):
    valore = carta%10 + 1
    if valore == 1: return 11
    if valore == 3: return 10
    if valore >= 8: return valore - 6
    return 0

class Mazzo:
    def __init__(self):
        self.carte = []
        for carta in range (0,40):
            self.carte.append(carta)
    
    def carteRimaste(self):
        return len(self.carte)
    
    def pesca(self):
        if self.carteRimaste() == 0:
            return None
        if self.carteRimaste() == 1:
            return self.carte.pop()
        n = np.random.randint(0, self.carteRimaste())
        return self.carte.pop(n)

In [52]:
class Briscola:
    def __init__(self):
        self.mazzo = Mazzo()
        self.cartaInFondo = self.mazzo.pesca()
        self.briscola = int(self.cartaInFondo/10)
        self.mano0 = []
        self.mano1 = []
        for _ in range(3):
            self.mano0.append(self.mazzo.pesca())
            self.mano1.append(self.mazzo.pesca())
        self.cartaTirata = self.mano0.pop()

    def fasePescata(self):
        pescata = self.mazzo.pesca()
        self.mano0.append(pescata)
        pescata = self.mazzo.pesca()
        self.mano1.append(pescata)
    
    def cartaVincente(self, carta0, carta1):
        seme0 = int(carta0/10)
        seme1 = int(carta1/10)
        if seme0 == seme1:
            if puntiCarta(carta0) > puntiCarta(carta1): return carta0
            if puntiCarta(carta0) < puntiCarta(carta1): return carta1
            if carta0 > carta1: return carta0
            return carta1
        if seme0 == self.briscola: return carta0
        if seme1 == self.briscola: return carta1
        return carta0
    
    def partitaFinita(self):
        return self.mazzo.carteRimaste() <= 31

In [53]:
class NewEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    def __init__(self):
        super(NewEnv, self).__init__()
        self.observation_space = spaces.MultiDiscrete([40, # carte in mano
                                                       40,
                                                       40,
                                                       4, # briscola
                                                       40]) # carta tirata
        self.action_space = spaces.Discrete(3)
        self.reward = 0
        self.briscola = Briscola()
        self.state = (self.briscola.mano1[0],
                      self.briscola.mano1[1],
                      self.briscola.mano1[2],
                      self.briscola.briscola,
                      self.briscola.cartaTirata)

    def step(self, posizioneCarta):
        done = False
        info = {}

        cartaPrimo = self.briscola.cartaTirata
        cartaSecondo = self.briscola.mano1.pop(posizioneCarta)
        cartaVincente = self.briscola.cartaVincente(cartaPrimo, cartaSecondo)
        # reward = puntiCarta(cartaPrimo) + puntiCarta(cartaSecondo)
        reward = 1
        if cartaSecondo != cartaVincente:
            reward = 0
        self.briscola.fasePescata()
        self.briscola.cartaTirata = self.briscola.mano0.pop()
        if self.briscola.partitaFinita():
            done = True

        observation = (self.briscola.mano1[0],
                      self.briscola.mano1[1],
                      self.briscola.mano1[2],
                      self.briscola.briscola,
                      self.briscola.cartaTirata)
        
        return observation, reward, done, info
    
    def reset(self):
        # self.reward = 0
        self.briscola = Briscola()
        observation = (self.briscola.mano1[0],
                      self.briscola.mano1[1],
                      self.briscola.mano1[2],
                      self.briscola.briscola,
                      self.briscola.cartaTirata)
        return observation
    
    def close (self):
        self.reset()

In [54]:
env = NewEnv()
done = False
observation = env.reset()

In [55]:
from collections import defaultdict
class Agent:
    def __init__(
        self,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
    ):
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        self.training_error = []

    def get_action(self, obs: tuple[int, int, int, int, int]) -> int:
        """
        Returns the best action with probability (1 - epsilon)
        otherwise a random action with probability epsilon to ensure exploration.
        """
        # with probability epsilon return a random action to explore the environment
        if np.random.random() < self.epsilon:
            return env.action_space.sample()

        # with probability (1 - epsilon) act greedily (exploit)
        else:
            return int(np.argmax(self.q_values[obs]))

    def update(
        self,
        obs: tuple[int, int, int, int, int],
        action: int,
        reward: float,
        terminated: bool,
        next_obs: tuple[int, int, int, int, int],
    ):
        """Updates the Q-value of an action."""
        future_q_value = (not terminated) * np.max(self.q_values[next_obs])
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[obs][action]
        )

        self.q_values[obs][action] = (
            self.q_values[obs][action] + self.lr * temporal_difference
        )
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)

In [56]:
learning_rate = 0.01
n_episodes = 1_000_000
start_epsilon = 1.0
epsilon_decay = start_epsilon / (n_episodes / 2)  # reduce the exploration over time
final_epsilon = 0.1

agent = Agent(
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)

In [58]:
for episode in range(n_episodes):
    if episode%(n_episodes/100) == 0: print(int(100*(episode/n_episodes)),"%")
    obs = env.reset()
    done = False

    # play one episode
    while not done:
        action = agent.get_action(obs)
        next_obs, reward, done, info = env.step(action)

        # update the agent
        agent.update(obs, action, reward, done, next_obs)
        obs = next_obs

    agent.decay_epsilon()

0 %
1 %
2 %
3 %
4 %
5 %
6 %
7 %
8 %
9 %
10 %
11 %
12 %
13 %
14 %
15 %
16 %
17 %
18 %
19 %
20 %
21 %
22 %
23 %
24 %
25 %
26 %
27 %
28 %
28 %
30 %
31 %
32 %
33 %
34 %
35 %
36 %
37 %
38 %
39 %
40 %
41 %
42 %
43 %
44 %
45 %
46 %
47 %
48 %
49 %
50 %
51 %
52 %
53 %
54 %
55 %
56 %
56 %
57 %
59 %
60 %
61 %
62 %
63 %
64 %
65 %
66 %
67 %
68 %
69 %
70 %
71 %
72 %
73 %
74 %
75 %
76 %
77 %
78 %
79 %
80 %
81 %
82 %
83 %
84 %
85 %
86 %
87 %
88 %
89 %
90 %
91 %
92 %
93 %
94 %
95 %
96 %
97 %
98 %
99 %


In [59]:
def printCarta(carta):
    seme = int(carta/10)
    if seme == 0: seme="denara"
    elif seme == 1: seme="bastoni"
    elif seme == 2: seme="spade"
    else: seme="coppe"
    valore = str(carta%10 + 1)
    print(valore + " di " + seme)

def printBriscola(briscola):
    if briscola == 0: print("denara")
    elif briscola == 1: print("bastoni")
    elif briscola == 2: print("spade")
    else: print("coppe")

In [66]:
from IPython.display import clear_output
from time import sleep
while True:
    clear_output(wait=True)
    briscola = Briscola()
    state = (briscola.mano1[0],
             briscola.mano1[1],
             briscola.mano1[2],
             briscola.briscola,
             briscola.cartaTirata)
    printBriscola(briscola.briscola)
    printCarta(briscola.cartaTirata)
    print("mano")
    printCarta(briscola.mano1[0])
    printCarta(briscola.mano1[1])
    printCarta(briscola.mano1[2])

    mossa = agent.get_action(state)
    print(mossa)

    seme0 = int(briscola.cartaTirata/10)
    seme1 = int(briscola.mano1[mossa]/10)
    if seme0 == seme1:
        if puntiCarta(briscola.cartaTirata) > puntiCarta(briscola.mano1[mossa]): print("no")
        elif puntiCarta(briscola.cartaTirata) < puntiCarta(briscola.mano1[mossa]): print("daje")
        elif briscola.cartaTirata > briscola.mano1[mossa]: print("no")
        else: print("daje")
    elif seme0 == briscola.briscola: print("no")
    elif seme1 == briscola.briscola: print("daje")
    else: print("no")
    sleep(10)

bastoni
5 di bastoni
mano
8 di bastoni
8 di coppe
7 di bastoni
0
daje


KeyboardInterrupt: 