In [9]:
import random

# Action indices: they select the slot for each action in a Node's per-action lists.
PASS = 0
BET = 1
NUM_ACTIONS = 2


class Node:
    """Regret-matching bookkeeping for one information set.

    Every list attribute holds one entry per action (NUM_ACTIONS entries),
    addressed by the PASS / BET indices.
    """

    def __init__(self, info_set):
        """Create a node for `info_set` with all accumulators zeroed."""
        self.info_set = info_set
        # Cumulative regret accumulated for each action.
        self.regret_sum = [0.0] * NUM_ACTIONS
        # Strategy produced by the most recent get_strategy() call.
        self.strategy = [0.0] * NUM_ACTIONS
        # Weighted sum of every strategy produced so far; normalised on demand
        # by get_average_strategy().
        self.strategy_sum = [0.0] * NUM_ACTIONS

    def get_strategy(self, realisation_weight):
        """Compute the current strategy by regret matching.

        Negative regrets are clipped to zero and the remainder is normalised
        into a probability distribution; if no action has positive regret the
        strategy is uniform. The result is also added to `strategy_sum`,
        scaled by `realisation_weight`.

        Returns:
            The node's own `strategy` list (updated in place, not a copy).
        """
        clipped = [max(self.regret_sum[action], 0) for action in range(NUM_ACTIONS)]

        # Plain left-to-right accumulation of the clipped regrets.
        total = 0
        for value in clipped:
            total += value

        has_positive_regret = total > 0
        for action, value in enumerate(clipped):
            if has_positive_regret:
                self.strategy[action] = value / total
            else:
                self.strategy[action] = 1.0 / NUM_ACTIONS
            self.strategy_sum[action] += realisation_weight * self.strategy[action]
        return self.strategy

    def get_average_strategy(self):
        """Return `strategy_sum` normalised to sum to 1 as a new list.

        Falls back to the uniform distribution when the accumulated total is
        not positive (e.g. before any get_strategy() call).
        """
        total = sum(self.strategy_sum)
        if total > 0:
            return [self.strategy_sum[action] / total for action in range(NUM_ACTIONS)]
        return [1.0 / NUM_ACTIONS for _ in range(NUM_ACTIONS)]

    def __str__(self):
        """Render as '<info_set>: <average strategy list>'."""
        return "{}: {}".format(self.info_set, self.get_average_strategy())
    


In [10]:

    def cfr(cards, history, p0, p1, node_map):
        """Run one recursive CFR traversal from `history` and update `node_map`.

        Args:
            cards: Sequence indexed by player (0 or 1) giving each player's card.
            history: String of actions taken so far, 'p' for pass and 'b' for bet.
            p0: Product of player 0's action probabilities along `history`.
            p1: Product of player 1's action probabilities along `history`.
            node_map: Dict from information-set key (card + history) to Node;
                missing nodes are created and existing ones are updated in place.

        Returns:
            The utility of this state from the point of view of the player
            whose turn it is (the payoff itself at a terminal history).
        """
        num_plays = len(history)
        acting = num_plays % 2
        other = 1 - acting

        # Terminal histories: a closing pass, or a bet that has been called.
        if num_plays > 1:
            acting_holds_higher = cards[acting] > cards[other]
            if history[-1] == 'p':
                if history == "pp":
                    # Both passed: showdown for a stake of 1.
                    return 1 if acting_holds_higher else -1
                # A pass after a bet: the acting player collects 1.
                return 1
            if history[-2:] == "bb":
                # Bet and call: showdown for a stake of 2.
                return 2 if acting_holds_higher else -2

        # Information set = the acting player's own card plus the public history.
        key = str(cards[acting]) + history
        if key not in node_map:
            node_map[key] = Node(key)
        node = node_map[key]

        # The acting player's own reach weights the strategy sum; the
        # opponent's reach weights the regret update below.
        if acting == 0:
            own_reach, opp_reach = p0, p1
        else:
            own_reach, opp_reach = p1, p0
        strategy = node.get_strategy(own_reach)

        # Evaluate each action; child values are negated because the child
        # reports utility from the other player's perspective.
        action_utils = []
        expected_util = 0
        for action in range(NUM_ACTIONS):
            extended = history + ('b' if action != 0 else 'p')
            if acting == 0:
                child_value = cfr(cards, extended, p0 * strategy[action], p1, node_map)
            else:
                child_value = cfr(cards, extended, p0, p1 * strategy[action], node_map)
            action_utils.append(-child_value)
            expected_util += strategy[action] * action_utils[action]

        # Accumulate regret: how much better each action is than the mixed value.
        for action, action_util in enumerate(action_utils):
            node.regret_sum[action] += opp_reach * (action_util - expected_util)

        return expected_util
    


In [11]:
def shuffle_cards():
    """Return a new list holding the cards 1, 2 and 3 in random order."""
    deck = list(range(1, 4))
    random.shuffle(deck)
    return deck

#Method to train the agent
def train(iterations, seed=None):
    """Train by running `cfr` from the empty history on freshly shuffled cards.

    Args:
        iterations: Number of training iterations. Each one deals a new
            shuffle and performs one full `cfr` traversal.
        seed: Optional value passed to `random.seed` before training so a run
            can be reproduced. The default `None` leaves the RNG state alone.

    Returns:
        The dict of information-set key -> node populated by `cfr`. It is
        empty when no iterations were run.

    Side effects:
        Prints the average of the values returned by `cfr` over all iterations.
    """
    if seed is not None:
        random.seed(seed)

    node_map = {}
    util = 0
    for _ in range(iterations):
        cards = shuffle_cards()
        util += cfr(cards, "", 1, 1, node_map)

    # With no iterations there is nothing to average; report 0.0 rather than
    # raising ZeroDivisionError.
    average = util / iterations if iterations > 0 else 0.0
    print(f"Average game value: {average}")
    return node_map

# Displays the average strategy of the trained node for each information set
def display_results(node_map):
    """Print the average PASS / BET strategy of every node, ordered by key.

    Args:
        node_map: Dict from information-set key to a node exposing
            `get_average_strategy()`.
    """
    print("\nKuhn Poker Strategies for Each Information Set:")
    for key in sorted(node_map):
        average = node_map[key].get_average_strategy()
        pass_prob = average[PASS]
        bet_prob = average[BET]
        print(f"Information Set {key}: PASS = {pass_prob:.2f}, BET = {bet_prob:.2f}")
def main(iterations=100000):
    """Train for `iterations` iterations and print the resulting strategies.

    Args:
        iterations: Number of training iterations handed to `train`. Defaults
            to 100000, the value that was previously hard-coded, so calling
            `main()` with no arguments behaves as before.
    """
    trained_node_map = train(iterations)
    display_results(trained_node_map)

# Run training and print the learned strategies when this cell executes.
main()



Average game value: -0.05495546979326854

Kuhn Poker Strategies for Each Information Set:
Information Set 1: PASS = 0.76, BET = 0.24
Information Set 1b: PASS = 1.00, BET = 0.00
Information Set 1p: PASS = 0.67, BET = 0.33
Information Set 1pb: PASS = 1.00, BET = 0.00
Information Set 2: PASS = 1.00, BET = 0.00
Information Set 2b: PASS = 0.66, BET = 0.34
Information Set 2p: PASS = 1.00, BET = 0.00
Information Set 2pb: PASS = 0.42, BET = 0.58
Information Set 3: PASS = 0.26, BET = 0.74
Information Set 3b: PASS = 0.00, BET = 1.00
Information Set 3p: PASS = 0.00, BET = 1.00
Information Set 3pb: PASS = 0.00, BET = 1.00
