In [392]:
import random as rnd
import numpy as np
import pickle
import os
import time

In [393]:
class Card:
    POINT_SPLITTER = 9
    ACE = 1
    THREE = 3
    POINTS = 8
    POINTS_VALUE = 6

    def __init__(self, number) -> None:
        self.number = number
    
    def getSeed(self) -> int:
        return self.number // 10
    
    def getValue(self) -> int:
        return (self.number % 10) + 1
    
    def getPoints(self) -> int:
        match self.getValue():
            case self.ACE: return 11
            case self.THREE: return 10
            case x if x < self.POINTS: return 0
            case x: return x - self.POINTS_VALUE
    
    def getState(self) -> tuple:
        match self.getPoints():
                case 0 : cardZone = 0 
                case x if x < self.POINT_SPLITTER: cardZone = 1
                case _: cardZone = 2
             
        return (self.getSeed(), cardZone)

    

In [394]:
class Deck:
    DECK_CARDS = 40
    
    def __init__(self) -> None:
        self.cards = []
        for i in range(0,self.DECK_CARDS):
            self.cards.append(Card(i))
        
        # randomizing the deck
        rnd.shuffle(self.cards)
    
    def reset(self) -> None:
        self.__init__()
    
    def draw(self) -> Card:
        return self.cards.pop(0)

    def cardsLeft(self) -> int:
        return len(self.cards)
    
    def getLastCard(self) -> Card:
        return self.cards[-1]

In [395]:
class Player:
    HAND_MAX_CARD = 3
    CARD_NULL_VALUE = (4,3)

    def __init__(self) -> None:
        self.hand = [] # 3 x Cards (seed = 4 and value = 3 mean null value)
        self.oppoOverThreshold = 0 #[0, 1]
        self.points = 0
        self.wins = 0
    
    def reset(self) -> None:
        self.hand = [] 
        self.oppoOverThreshold = 0
        self.points = 0
    
    def getState(self) -> tuple:
        handState = ()
        for card in self.hand:
            handState = handState + card.getState()

        # enter null values for each missing card
        missingCards = self.HAND_MAX_CARD - len(self.hand) 
        for _ in range(missingCards):
            handState = handState + self.CARD_NULL_VALUE

        return (self.oppoOverThreshold,) + handState
    
    def getCard(self, index) -> Card:
        if index > len(self.hand) - 1: print(index, len(self.hand))
        return self.hand[index]
    
    def addCard(self, card) -> None:
        self.hand.append(card)
    
    def removeCard(self, index) -> Card:
        return self.hand.pop(index)

    def toggleOppoOverThreshold(self) -> None:
        self.oppoOverThreshold = 1
    
    def victoryPassed(self) -> bool:
        return self.points > 60
    
    def getPoints(self) -> int:
        return self.points
    
    def addPoints(self, points) -> None:
        self.points += points
    
    def remainingCards(self) -> int:
        return len(self.hand)
    
    def addWin(self) -> None:
        self.wins += 1
    
    def resetWins(self) -> int:
        tmp = self.wins
        self.wins = 0
        return tmp
    
    def handIsEmpty(self) -> bool:
        return len(self.hand) == 0

In [396]:
class Environment:
    WIN_REWARD = 200
    VICTORY_THRESHOLD = 45
    BRISCOLA_THRESHOLD = 10
    BRISCOLE_THRESHOLD = 7
    
    Q_STATUS_DIM = (2, 4, 2) + (2, 2, 2, 2) + (2, 5, 4, 5, 4, 5, 4) + (5, 4)
    ACTION_DIM = (3,)

    def __init__(self) -> None:
        self.briscolaOverThreshold = 0  # [0, 1]
        self.briscolaSeed = None  # [0, 1, 2, 3]
        self.briscoleOut = 0
        self.briscoleOverThreshold = 0 # [0, 1]
        self.loadBySeed = [0, 0, 0, 0] # (denara, spade, bastoni, coppe) [0, 1]
        
        self.deck = Deck()
        self.players = (Player(), Player())
    
    def getShape(self) -> tuple:
        return self.Q_STATUS_DIM + self.ACTION_DIM
            
    def envState(self) -> tuple:
        return (self.briscolaOverThreshold, self.briscolaSeed, self.briscoleOverThreshold) + tuple(self.loadBySeed)
    
    def getState(self, index) -> tuple:
        return self.envState() + self.players[index].getState()
    
    def getActionState(self, action, index):
        return self.players[index].getCard(action).getState()
    
    def reset(self) -> tuple:
        for player in self.players:
            player.reset()
        self.deck.reset()

        # resetting game state info
        self.loadBySeed = [0, 0, 0, 0]
        self.briscoleOverThreshold = 0

        # updating new briscola info
        self.briscolaSeed = self.deck.getLastCard().getSeed()
        self.briscolaOverThreshold = int(self.deck.getLastCard().getPoints() >= self.BRISCOLA_THRESHOLD)

        # dealing cards to the players
        for _ in range(3):
            for player in self.players:
                player.addCard(self.deck.draw())

        # returns the pair (fstPlayer state, sndPlayer state)
        return (self.getState(0), self.getState(1)) 
    
    def processPlays(self, fstPlay, sndPlay, fstPlayer, sndPlayer) -> tuple:
        totPoints = (fstPlay.getPoints() + sndPlay.getPoints())
       
        winner = sndPlayer
        if fstPlay.getSeed() == sndPlay.getSeed():
            if fstPlay.getValue() > sndPlay.getValue(): winner = fstPlayer
        elif sndPlay.getSeed() != self.briscolaSeed: winner = fstPlayer
        
        return (winner, totPoints)
    
    def stateUpdate(self, plays) -> None:
        if self.players[0].getPoints() > self.VICTORY_THRESHOLD: self.players[1].toggleOppoOverThreshold()
        if self.players[1].getPoints() > self.VICTORY_THRESHOLD: self.players[0].toggleOppoOverThreshold()

        # updating briscola counter
        for play in plays:
            if play.getSeed() == self.briscolaSeed: 
                self.briscoleOut += 1
            if play.getPoints() >= 10: self.loadBySeed[play.getSeed()] = 1
        
        if self.briscoleOut > self.briscoleOverThreshold: self.briscoleOverThreshold = 1

    def step(self, fstPlayerAction, fstPlayerIndex, sndPlayerAction, sndPlayerIndex) -> tuple:
        fstPlay = self.players[fstPlayerIndex].removeCard(fstPlayerAction)
        sndPlay = self.players[sndPlayerIndex].removeCard(sndPlayerAction)

        # evaluating plays and updating points
        stepWinner, points = self.processPlays(fstPlay, sndPlay, fstPlayerIndex, sndPlayerIndex)
        self.players[stepWinner].addPoints(points)
        #print(stepWinner, fstPlayerIndex, (fstPlay.getValue(), fstPlay.getSeed()), (sndPlay.getValue(), sndPlay.getSeed()), self.briscolaSeed)

        # generating rewards
        if stepWinner == fstPlayerIndex: rewards = [points, -points]
        else: rewards = [-points, points]

        # updating state info
        self.stateUpdate([fstPlay, sndPlay])

        # dealing cards
        if self.deck.cardsLeft() != 0:
            self.players[0].addCard(self.deck.draw())
            self.players[1].addCard(self.deck.draw())

        # checking players victory
        done = False
        
        for i in range(len(self.players)):
            if self.players[i].victoryPassed():
                done = True
                self.players[i].addWin()
                if i == fstPlayerIndex: 
                    rewards[0] += self.WIN_REWARD
                    rewards[1] -= self.WIN_REWARD
                else: 
                    rewards[0] -= self.WIN_REWARD
                    rewards[1] += self.WIN_REWARD
                break
        
        # nobody won if both players has empty hand
        if self.players[0].handIsEmpty() and not done:
            done = True
            
        return (self.getState(fstPlayerIndex), self.getState(sndPlayerIndex), rewards[0], rewards[1], 
                done, self.players[fstPlayerIndex].remainingCards(), self.players[sndPlayerIndex].remainingCards(), stepWinner)
    
    def getMatchStats(self) -> tuple:
        return (self.players[0].resetWins(), self.players[1].resetWins())
        


In [397]:
class IA:
    EPS_THRESHOLD = 0.01
    TEST_THRESHOLD = 20000
    TEST_EPISODES = 10000
    HAND_MAX_CARD = 3
    NULL_OPPO_PLAY = (4,3)
    FST_PLAYER_ID = 0
    SND_PLAYER_ID = 1

    def __init__(self) -> None:
        self.env = Environment()
        # briscola need two players
        QShape = self.env.getShape()
        self.Q = np.zeros(QShape) # training player
        self.trainedQ = self.Q.copy() # trained player

    
    def randomAction(self, remainingCards = HAND_MAX_CARD) -> int:
        return rnd.randint(0, remainingCards-1)

    def epsGreedy(self, state, Q, eps=0.1, remainingCards = HAND_MAX_CARD) -> int:
        # Epsilon greedy policy
        if np.random.uniform(0,1) < eps:
            # Choose a random action
            return self.randomAction(remainingCards)
        else:
            # Choose the action of a greedy policy
            return self.greedy(state, Q, remainingCards)


    def greedy(self, state, Q, remainingCards = HAND_MAX_CARD) -> int:
        #Greedy policy
        #return the index corresponding to the maximum action-state value
        return np.argmax(Q[state][:remainingCards])
    
    def runTest(self, numEpisodes=100, toPrint=False) -> tuple:
        # Run some episodes to test the policy against random player
        # in this case Q2 represents random player
        rewards = []
        fstStarting = True

        for _ in range(numEpisodes):
            done = False
            episodeRew = 0
            
            state, _ = self.env.reset()

            # choosing who starts the game
            if(fstStarting):
                # generating fstPlayer action
                state = state + self.NULL_OPPO_PLAY # null value on the opponent play
                fstAction = self.greedy(state, self.Q)
                    
                # generating sndPlayer action
                sndAction = self.randomAction()
                
                # take one step in the environment
                state, _, stepReward, _, done, fstCards, sndCards, stepWinner = self.env.step(fstAction, self.FST_PLAYER_ID, sndAction, self.SND_PLAYER_ID)
            else:
                # generating sndPlayer action
                sndAction = self.randomAction()

                #  generating fstPlayer action
                state = state + self.env.getActionState(sndAction, self.SND_PLAYER_ID)
                fstAction = self.greedy(state, self.Q)
                    
                # take one step in the environment
                _, state, _, stepReward, done, sndCards, fstCards, stepWinner = self.env.step(sndAction, self.SND_PLAYER_ID, fstAction, self.FST_PLAYER_ID)
            
            episodeRew += stepReward

            #playing the game
            while not done:
                # if fstPlayer is playing first
                if(stepWinner == self.FST_PLAYER_ID):
                    # generating fstPLayer action
                    state = state + self.NULL_OPPO_PLAY # null value on the opponent play
                    fstAction = self.greedy(state, self.Q, fstCards)

                    # generating sndPLayer action
                    sndAction = self.randomAction(sndCards)
                    
                    # take one step in the environment
                    state, _, stepReward, _, done, fstCards, sndCards, stepWinner = self.env.step(fstAction, self.FST_PLAYER_ID, sndAction, self.SND_PLAYER_ID)
                
                else: # if fstPlayer is playing second
                    # generating sndPLayer action
                    sndAction = self.randomAction(sndCards)

                    #  generating fstPLayer action
                    state = state + self.env.getActionState(sndAction, self.SND_PLAYER_ID)
                    fstAction = self.greedy(state, self.Q, fstCards)

                    # take one step in the environment
                    _, state, _, stepReward, done, sndCards, fstCards, stepWinner = self.env.step(sndAction, self.SND_PLAYER_ID, fstAction, self.FST_PLAYER_ID)

                episodeRew += stepReward

                if done:
                    rewards.append(episodeRew)
                    fstStarting = not fstStarting

        matchStats = self.env.getMatchStats()
        winPercentage =  (matchStats[0] * 100) / numEpisodes

        if toPrint:
            print('Mean score: %.3f Win percentage: %.2f out of %i games!'%(np.mean(rewards), winPercentage, numEpisodes))

        return (np.mean(rewards), winPercentage)
    
    # research for optimal policy Q
    def sarsaLearning(self, learningTime=60*10, alpha=0.1, eps=0.3, gamma=0.95, epsDecay=0.05) -> None:
        
        fstStarting = True
        ep = 0

        #for ep in range(numEpisodes):
        start = time.time()
        while time.time() - start < learningTime:
            fstState, sndState = self.env.reset() # initial iniziale for Q1 and Q2
            done = False
            ep += 1

            # choosing who starts the game
            if(fstStarting):
                # generating fstPlayer action
                fstState = fstState + self.NULL_OPPO_PLAY # null value on the opponent play
                fstAction = self.epsGreedy(fstState, self.Q, eps)
                    
                # generating sndPlayer action
                sndState = sndState + self.env.getActionState(fstAction, self.FST_PLAYER_ID)
                sndAction = self.greedy(sndState, self.trainedQ)
                
                # take one step in the environment
                fstNextState, sndNextState, stepReward, _, done, fstCards, sndCards, stepWinner = self.env.step(fstAction, self.FST_PLAYER_ID, sndAction, self.SND_PLAYER_ID)
            else:
                # generating sndPlayer action
                sndState = sndState + self.NULL_OPPO_PLAY
                sndAction = self.greedy(sndState, self.trainedQ)

                #  generating fstPLayer action
                fstState = fstState + self.env.getActionState(sndAction, self.SND_PLAYER_ID)
                fstAction = self.epsGreedy(fstState, self.Q, eps)
                    
                # take one step in the environment
                sndNextState, fstNextState, _, stepReward, done, sndCards, fstCards, stepWinner = self.env.step(sndAction, self.SND_PLAYER_ID, fstAction, self.FST_PLAYER_ID)

            # loop the main body until the environment stops
            while not done:
                # checking who's next playing first  
                # if first player won this round must play first            
                if(stepWinner == self.FST_PLAYER_ID): 
                    # generating first player next action (needed for the SARSA update)
                    fstNextState = fstNextState + self.NULL_OPPO_PLAY
                    fstNextAction = self.epsGreedy(fstNextState, self.Q, eps, fstCards)
                
                    # generating second player next action
                    sndNextState = sndNextState + self.env.getActionState(fstNextAction, self.FST_PLAYER_ID)
                    sndNextAction = self.greedy(sndNextState, self.trainedQ, sndCards)
                
                else: # if first player lost must play second 
                    # generating second player next action
                    sndNextState = sndNextState + self.NULL_OPPO_PLAY
                    sndNextAction = self.greedy(sndNextState, self.trainedQ, sndCards)

                    # generating first player next action
                    fstNextState = fstNextState + self.env.getActionState(sndNextAction, self.SND_PLAYER_ID)
                    fstNextAction = self.epsGreedy(fstNextState, self.Q, eps, fstCards)

                # SARSA update
                self.Q[fstState][fstAction] = (self.Q[fstState][fstAction] 
                                               + alpha * (stepReward + gamma * self.Q[fstNextState][fstNextAction] 
                                                          - self.Q[fstState][fstAction]))

                # updating players states and actions
                fstState, sndState = fstNextState, sndNextState
                fstAction, sndAction = fstNextAction, sndNextAction

                # take one step in the environment
                if(stepWinner == self.FST_PLAYER_ID):       
                    fstNextState, sndNextState, stepReward, _, done, fstCards, sndCards, stepWinner = self.env.step(fstAction, self.FST_PLAYER_ID, sndAction, self.SND_PLAYER_ID)
                else:
                    sndNextState, fstNextState, _, stepReward, done, sndCards, fstCards, stepWinner = self.env.step(sndAction, self.SND_PLAYER_ID, fstAction, self.FST_PLAYER_ID)
            
            # every few episodes trainedQ is updated
            if (ep % self.TEST_THRESHOLD) == 0:
                # updating trainedQ
                self.trainedQ = self.Q.copy()

                # decay the epsilon value until it reaches the threshold
                if eps > self.EPS_THRESHOLD: eps -= epsDecay

                # training stats
                trainingStats = self.env.getMatchStats()

                # testing policy Q against random player
                avgRew, winPercentage = self.runTest(self.TEST_EPISODES)
                print('Episode:%.5d  Epsylon: %.4f  Average Reward: %.4f Win Percentage: %.2f' %(ep, eps, avgRew, winPercentage))
        self.savePolicy()

    def savePolicy(self):
        with open("ia.pk1", "wb") as fp:
            pickle.dump(self.Q, fp)
            fp.close()

        # with open("infos.pk1", "wb") as fp:
        #     infos = {"Episodes": episodes,
        #              "Win Percentage": winPercentage}
        #     pickle.dump(infos, fp)
        #     fp.close()
        print("Done savings!")
        dir = os.getcwd()
        AIDim = int((os.stat(dir+"/ia.pk1").st_size)/(1024*1024))
        print("AI Dimension:", AIDim, "MB")

    def importPolicy(self):
        with open('ia.pk1', 'rb') as fp:
            self.Q = pickle.load(fp)
            fp.close()
        print("Done importing")




In [399]:
ia = IA()
ia.sarsaLearning(60*60, 0.1, 0.3, 0.95, 0.0005)


Episode:20000  Epsylon: 0.2995  Average Reward: -24.2400 Win Percentage: 43.64
