In [330]:
from random import randint, random, shuffle
import numpy as np
from time import time, sleep
import pickle
import os

# Card
rappresenta una certa del gioco briscola, gestisce info quali seme e punti associati alla carta

- attributi
    - number (il numero rappresentativo della carta)

- funzioni  
    - getSeed()
    - getValue()
    - getPoints()

In [331]:
class Card:
    POINT_SPLITTER = 9

    def __init__(self, number) -> None:
        self.number = number
    
    def getSeed(self) -> int:
        return self.number // 10
    
    def getValue(self) -> int:
        return self.number % 10 + 1
    
    def getPoints(self) -> int:
        match self.getValue():
            case 1: return 11
            case 3: return 10
            case x if x < 8: return 0
            case x: return x - 6
    
    def getStatus(self) -> tuple:
        match self.getPoints():
                case 0 : cardZone = 0 
                case x if x < self.POINT_SPLITTER: cardZone = 1
                case _: cardZone = 2
             
        return (self.getSeed(), cardZone)

    

# Deck
rappresenta il mazzo di carte del gioco briscola

- attributi
    - cards (lista delle carte nel mazzo)
    - lastCard (l'ultima carta del mazzo e di conseguenza la briscola)

- funzioni
    - reset() 
    - draw()
    - cardsLeft()
    - getLastCard()

In [332]:
class Deck:
    
    def __init__(self) -> None:
        self.cards = []
        for i in range(0,40):
            self.cards.append(Card(i))
        
        # si randomizza il mazzo
        shuffle(self.cards)
    
    def reset(self) -> None:
        self.__init__()
    
    def draw(self) -> Card:
        return self.cards.pop(0)

    def cardsLeft(self) -> int:
        return len(self.cards)
    
    def getLastCard(self) -> Card:
        return self.cards[-1]

# PlayerStatus
tiene traccia delle info relative allo stato di un giocatore 

- **PlayerStatus**
    - oppoOverThreshold [true, flase]
    - fstCard (seed, value) [0, 1, 2, 3] [0, 1, 2] 
    - sndCard (seed, value) [0, 1, 2, 3] [0, 1, 2] 
    - trdCard (seed, value) [0, 1, 2, 3] [0, 1, 2] 
- attributi
        - hand 
        - oppoOverThreshold 
        - points
- funzioni
    - reset()
    - getStatus()
    - addCard(card)
    - removeCard(handPosition)
    - toggleOppoOverThreshold()
    - victoryPassed()
    - getPoints() 
    - addPoints(points)

In [333]:
class PlayerStatus:

    def __init__(self) -> None:
        self.hand = [] # 3 x Cards
        self.oppoOverThreshold = 0 #[0, 1]
        self.points = 0
    
    def reset(self) -> None:
        self.__init__()
    
    def getStatus(self) -> tuple:
        handStatus = ()
        for card in self.hand:
            handStatus = handStatus + card.getStatus()

        return (self.oppoOverThreshold,) + handStatus
    
    def getCard(self, action) -> Card:
        return self.hand[action]
    
    def addCard(self, card) -> None:
        self.hand.append(card)
    
    def removeCard(self, handPosition) -> Card:
        return self.hand.pop(handPosition)

    def toggleOppoOverThreshold(self) -> None:
        self.oppoOverThreshold = 1
    
    def victoryPassed(self) -> bool:
        return self.points > 60
    
    def getPoints(self) -> int:
        return self.points
    
    def addPoints(self, points) -> None:
        self.points += points

## Status
- **EnvStatus**
    - briscolaOverThreshold [true, flase]
    - briscolaSeed [0, 1, 2, 3]
    - briscoleOverThreshold [true, false]
    - loadBySeed (denara, spade, bastoni, coppe) [true, false]
    - fstPlay (seed, pointValue) [0, 1, 2, 3] [0, 1, 2] 


- **PlayerStatus**
    - oppoOverThreshold [true, flase]
    - fstCard (seed, value) [0, 1, 2, 3] [0, 1, 2] 
    - sndCard (seed, value) [0, 1, 2, 3] [0, 1, 2] 
    - trdCard (seed, value) [0, 1, 2, 3] [0, 1, 2] 





tuple dimension -> 16 <br>
state dimension -> 2 * 4 * 2 * 2 * 2 * 2 * 2 * 4 * 3 * 2 * 4 * 3 * 4 * 3 * 4 * 3 = 10616832


## Action
- **play** [0, 1, 2]

# Environment
- **costanti**
    - STATUS_DIM
    - ACTION_DIM
    - BONUS_FACTOR
    - WIN_REWARD 
    - VICTORY_THRESHOLD 
    - BRISCOLA_THRESHOLD 
    - BRISCOLE_THRESHOLD 
- **attributi**
    - briscolaOverThreshold [true, flase]
    - briscolaSeed [0, 1, 2, 3]
    - briscoleOverThreshold [true, false]
    - briscoleOut
    - loadBySeed (denara, spade, bastoni, coppe) [true, false]
    - deck 
    - broker
    - slave 
    - players
    - winByPlayer

- **funzioni**
    - getShape() [restituisce la dimensionalità dell'ambiente]
    - envStatus()
    - getStatus(playerIndex) [restituisce lo stato riferito ad un player specifo]
    - reset(currPlayerIndex, nextPlayerIndex) [restituisce lo stato iniziale di entrambi i giocatori]
    - processPlaya(currPlay, nextPlay) [valuta le giocate restituendo il vincitore e il corrispondente reward]
    - updateStatus(currPlay, currPlayerIndex, nextPlay, nextPlayerIndex)
    - step(currPlayerAction, currPlayerIndex, nextPlayerAction, nextPlayerIndex) [restituisce il next state di p1 e p2, i reward di p1 e p2 e done a specificare se la partita è conclusa]
    

In [334]:
class Environment():

    WIN_REWARD = 200
    BONUS_FACTOR = 1
    VICTORY_THRESHOLD = 45
    BRISCOLA_THRESHOLD = 10
    BRISCOLE_THRESHOLD = 7
    
    Q1_STATUS_DIM = (2, 4, 2) + (2, 2, 2, 2) + (2, 4, 3, 4, 3, 4, 3)
    Q2_STATUS_DIM = (2, 4, 2) + (2, 2, 2, 2) + (2, 4, 3, 4, 3, 4, 3) + (4, 3)
    ACTION_DIM = (3,)

    def __init__(self) -> None:
        # info sullo stato della partita
        self.briscolaOverThreshold = 0  # [0, 1]
        self.briscolaSeed = None  # [0, 1, 2, 3]
        self.briscoleOut = 0
        self.briscoleOverThreshold = 0 # [0, 1]
        self.loadBySeed = [0, 0, 0, 0] # (denara, spade, bastoni, coppe) [0, 1]
        self.winByPlayer = [0,0,0] # [vittorie p1, vittorie p2, pareggi]

        # collegamenti esterni per la gestione dell'apprendimento della IA secondaria
        self.deck = Deck()
        self.broker = None
        self.slave = None
        self.players = (PlayerStatus(), PlayerStatus())
    
    def getShape(self) -> tuple:
        return (self.Q1_STATUS_DIM + self.ACTION_DIM, self.Q2_STATUS_DIM + self.ACTION_DIM)
            
    def envStatus(self) -> tuple:
        return (self.briscolaOverThreshold, self.briscolaSeed, self.briscoleOverThreshold) + tuple(self.loadBySeed)
    
    def getStatus(self, playerIndex) -> tuple:
        return self.envStatus() + self.players[playerIndex].getStatus()
    
    def getActionStatus(self, action, playerIndex):
        return self.players[playerIndex].getCard(action).getStatus()
    
    def reset(self, currPlayerIndex, nextPlayerIndex) -> tuple:
        self.players[0].reset()
        self.players[1].reset()
        self.deck.reset()

        # ripristino le info sullo stato della partita
        self.loadBySeed = [0, 0, 0, 0]
        self.briscoleOverThreshold = 0

        # aggiorno le info sulla nuova briscola
        self.briscolaSeed = self.deck.getLastCard().getSeed()
        self.briscolaOverThreshold = int(self.deck.getLastCard().getPoints() >= self.BRISCOLA_THRESHOLD)

        # ridistribuisco le carte ai giocatori
        for _ in range(3):
            self.players[nextPlayerIndex].addCard(self.deck.draw())
            self.players[currPlayerIndex].addCard(self.deck.draw())
        
        # il primo a giocare è colui che ha ricevuto le carte
        return (self.getStatus(currPlayerIndex), self.getStatus(nextPlayerIndex)) 
    
    # restituisce la coppia (winnerIndex, reward)
    def processPlays(self, currPlay, nextPlay, currPlayerIndex, nextPlayerIndex) -> tuple:
        totPoints = (currPlay.getPoints() + nextPlay.getPoints()) * self.BONUS_FACTOR
        winnerIndex = nextPlayerIndex

        if currPlay.getSeed() == nextPlay.getSeed():
            if currPlay.getValue() > nextPlay.getValue(): winnerIndex = currPlayerIndex
        elif nextPlay.getSeed() != self.briscolaSeed: winnerIndex = currPlayerIndex
        
        return (winnerIndex, totPoints)
    
    def updateStatus(self, currPlay, currPlayerIndex, nextPlay, nextPlayerIndex) -> None:
        if self.players[nextPlayerIndex].getPoints() > self.VICTORY_THRESHOLD: self.players[currPlayerIndex].toggleOppoOverThreshold()
        if self.players[currPlayerIndex].getPoints() > self.VICTORY_THRESHOLD: self.players[nextPlayerIndex].toggleOppoOverThreshold()

        # aggiorno il contatore delle briscole
        for play in [currPlay, nextPlay]:
            if play.getSeed() == self.briscolaSeed: 
                self.briscoleOut += 1
            if play.getPoints() >= 10: self.loadBySeed[play.getSeed()] = 1
        
        if self.briscoleOut > self.briscoleOverThreshold: self.briscoleOverThreshold = 1

    def step(self, currPlayerAction, currPlayerIndex, nextPlayerAction, nextPlayerIndex) -> tuple:
        currPlay = self.players[currPlayerIndex].removeCard(currPlayerAction)
        nextPlay = self.players[nextPlayerIndex].removeCard(nextPlayerAction)

        # valuto le giocate e aggiorno i punti
        stepWinner, reward = self.processPlays(currPlay, nextPlay, currPlayerIndex, nextPlayerIndex)
        self.players[stepWinner].addPoints(reward)
        print(stepWinner, reward)

        # genero i reward
        if stepWinner == currPlayerIndex: rewards = [reward, -reward]
        else: rewards = [-reward, reward]

        # aggiorno le info sullo stato
        self.updateStatus(currPlay, currPlayerIndex, nextPlay, nextPlayerIndex)

        # ridistribuisco le carte
        self.players[stepWinner].addCard(self.deck.draw())
        self.players[(stepWinner + 1) % 2].addCard(self.deck.draw())

        # controllo la vittoria o il fine partita
        done = False

        for playerIndex in range(len(self.players)):
            if self.players[playerIndex].victoryPassed():
                done = True
                self.winByPlayer[playerIndex] += 1
                if playerIndex == currPlayerIndex: 
                    rewards[0] += self.WIN_REWARD
                    rewards[1] -= self.WIN_REWARD
                else: 
                    rewards[0] -= self.WIN_REWARD
                    rewards[1] += self.WIN_REWARD
                break
        
        # se nessuno ha vinto e le carte rimaste sono 0 allora è un pareggio 60 a 60
        if self.deck.cardsLeft() == 0 and not done:
            done = True
            self.winByPlayer[2] += 1

        return (self.getStatus(currPlayerIndex), self.getStatus(nextPlayerIndex), rewards[0], rewards[1], done)


# IA

ha il compito di gestire l'apprendimento

- funzioni: 
    - epsGreedy()
    - greedy()
    - def run_episodes(numEpisodes, toPrint) [processo di test]
    - sarsaLearning(alpha, numEpisodes, eps, gamma, epsDecay) [processo di apprendimento]
    

In [335]:
class IA:
    EPS_THRESHOLD = 0.01
    TEST_THRESHOLD = 300
    TEST_EPISODES = 1000
    
    Q1_ID = 0
    Q2_ID = 1

    def __init__(self) -> None:
        self.env = Environment()
        self.Q1 = None # the first to roll
        self.Q2 = None # the second to roll

    def epsGreedy(self, Q, state, eps=0.1) -> int:
        # Epsilon greedy policy
        if np.random.uniform(0,1) < eps:
            # Choose a random action
            return np.random.randint(Q.shape[1])
        else:
            # Choose the action of a greedy policy
            return self.greedy(Q, state)


    def greedy(self, Q, state) -> int:
        #Greedy policy
        #return the index corresponding to the maximum action-state value
        return np.argmax(Q[state])
    
    def runTest(self, numEpisodes=100, toPrint=False) -> tuple:
        # Run some episodes to test the policy
        Q1Rew = []
        Q2Rew = []
        state1, state2 = self.env.reset(self.Q1_ID, self.Q2_ID)

        for _ in range(numEpisodes):
            done = False
            Q1StepRew = 0
            Q2StepRew = 0

            while not done:
                # select a greedy action
                action1 = self.greedy(self.Q1, state1)

                state2 = state2 + self.env.getActionStatus(action1, self.Q1_ID)
                action2 = self.greedy(self.Q2, state2)

                nextState1, nextState2, rew1, rew2, done = self.env.step(action1, self.Q1_ID, action2, self.Q2_ID)

                state1, state2 = nextState1, nextState2
                Q1StepRew += rew1
                Q2StepRew += rew2

                if done:
                    state1, state2 = self.env.reset(self.Q1_ID, self.Q2_ID)
                    Q1Rew.append(Q1StepRew)
                    Q2Rew.append(Q2StepRew)

        if toPrint:
            print('Q1 -> Mean score: %.3f of %i games!'%(np.mean(Q1Rew), numEpisodes))
            print('Q2 -> Mean score: %.3f of %i games!'%(np.mean(Q2Rew), numEpisodes))

        return np.mean(Q1Rew, Q2Rew)


    # research for optimal policy Q1 and Q2
    def sarsaLearning(self, alpha=0.01, numEpisodes=10000, eps=0.3, gamma=0.95, epsDecay=0.00005) -> None:
        
        Q1Shape, Q2Shape = self.env.getShape()

        # briscola need two players
        self.Q1 = np.zeros(Q1Shape) # always roll first
        self.Q2 = np.zeros(Q2Shape) # always roll second
        
        for ep in range(numEpisodes):
            state1, state2 = self.env.reset(self.Q1_ID, self.Q2_ID) # initial iniziale for Q1 and Q2
            done = False

            # decay the epsilon value until it reaches the threshold
            if eps > self.EPS_THRESHOLD:
                eps -= epsDecay

            action1 = self.epsGreedy(self.Q1, state1, eps)

            state2 = state2 + self.env.getActionStatus(action1, self.Q1_ID)
            action2 = self.epsGreedy(self.Q2, state2, eps)

            # loop the main body until the environment stops
            while not done:
                # Take one step in the environment
                nextState1, nextState2, rew1, rew2, done = self.env.step(action1, self.Q1_ID, action2, self.Q2_ID)

                # choose the next actions (needed for the SARSA update)
                nextAction1 = self.epsGreedy(nextState1, eps) 

                nextState2 = nextState2 +  self.env.getActionStatus(nextAction1, self.Q1_ID)
                nextAction2 = self.epsGreedy(nextState2, eps) 

                # SARSA updates
                self.Q1[state1][action1] = self.Q1[state1][action1] 
                + alpha * (rew1 + gamma * self.Q1[nextState1][nextAction1] - self.Q1[state1][action1])

                self.Q2[state2][action2] = self.Q2[state2][action2] 
                + alpha * (rew2 + gamma * self.Q2[nextState2][nextAction2] - self.Q2[state2][action2])

                state1, state2 = nextState1, nextState2

                action1, action2 = nextAction1, nextAction2

                # Test the policy 
                if (ep % self.TEST_THRESHOLD) == 0:
                    Q1Stats, Q2Stats = self.runTest(self.TEST_EPISODES)
                    print("Q1 -> Episode:{:5d}  Eps:{:2.4f}  Rew:{:2.4f}".format(ep, eps, Q1Stats))
                    print("Q2 -> Episode:{:5d}  Eps:{:2.4f}  Rew:{:2.4f}".format(ep, eps, Q2Stats))



In [339]:
STATUS_SHAPE = (2, 4, 2) + (2, 2, 2, 2) + (2, 4, 3, 4, 3, 4, 3)
ACTION_SHAPE = (3,)
Q = np.zeros(STATUS_SHAPE + ACTION_SHAPE)

state = (0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0)
action = 2
rew = 11


Q[state][action] = 10

print(Q[state][action])
print(Q[state][0])

10.0
0.0


In [337]:
env = Environment()
print(env.reset(0,1))
env.players[1].addPoints(61)
env.deck.cards = [Card(0), Card(1)]
print(env.players[0].getStatus(), env.players[1].getStatus())
print(env.step(0,1,0,0))
print(env.winByPlayer)

((0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0), (0, 1, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 2, 2))
(0, 0, 0, 1, 0, 2, 0) (0, 3, 0, 0, 0, 2, 2)
1 0
((0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2), (0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 2, 0, 0, 0), 200, -200, True)
[0, 1, 0]
