In [None]:
import numpy as np

### 牌堆

In [None]:
# Draw pile: only responsible for handing out cards.
class Cards:
    """A shuffled 52-card draw pile (ranks only, four copies of each rank).

    The pile is shuffled once on construction; every ``draw`` removes the
    drawn card, so a card can be dealt at most once per pile.
    """

    def __init__(self):
        ranks = ['2','3','4','5','6','7','8','9','10','J','Q','K','A']
        # Four copies of every rank; suits are not modelled.
        self.cardset = ranks * 4
        np.random.shuffle(self.cardset)

    def draw(self):
        """Remove and return one card from the pile.

        Returns:
            str: the rank of the drawn card (e.g. ``'7'`` or ``'K'``).

        Raises:
            IndexError: if the pile is empty.
        """
        if not self.cardset:
            raise IndexError("cannot draw from an empty card pile")
        # The pile was shuffled in __init__, so taking the last card is a
        # uniformly random draw without replacement.
        return self.cardset.pop()

    def showallcards(self):
        """Return the list of cards still in the pile (the live list, not a copy)."""
        return self.cardset

### 庄家

In [None]:
# The dealer draws while its hand is worth less than 17 points and stands at 17 or more.
class Dealer:
    """Fixed-policy dealer: hit below 17, otherwise stand."""

    def __init__(self):
        """The dealer keeps no state of its own; everything lives on the table."""

    def hit(self, deck):
        """Take one dealer turn on the given table.

        Args:
            deck: the table object; this method reads ``dealer_score`` and
                ``cards`` and updates ``dealercards``, ``dealer_score`` and
                ``dealer_stop`` on it.
        """
        if deck.dealer_score >= 17:
            # Stand: the hand already reached 17 points.
            deck.dealer_stop = True
            return

        # Below 17: take a card and refresh the dealer's score.
        new_card = deck.cards.draw()
        deck.dealercards.append(new_card)
        deck.dealer_score = deck.cal_score(deck.dealercards)
    

### 玩家

In [None]:
# The player decides whether to draw; a hand worth more than 21 points busts.
# Exercise note: fill in this class yourself; cards can be drawn from the pile the same way Dealer does.
class Player:
    """Blackjack agent that learns action values with off-policy Monte Carlo.

    Actions are encoded as 0 = hit (draw a card) and 1 = stand (stop).
    Episodes are generated with a uniform random behaviour policy
    (``hit_train``) and the Q-table is updated with importance sampling
    towards the softmax policy used at evaluation time (``hit_eval``).
    """

    def __init__(self):
        # Q-table indexed as [player_score][dealer_score][action].
        self.saG = np.zeros((22, 22, 2))
        self.gamma = 0.9  # discount factor
        self.alpha = 0.0005  # learning rate
        self.path = []  # (player_score, dealer_score, action) of the current episode

    @staticmethod
    def _softmax(values):
        """Return softmax probabilities of ``values``, computed stably.

        Subtracting the maximum leaves the result mathematically unchanged
        but keeps ``np.exp`` from overflowing on large Q-values.
        """
        weights = np.exp(values - np.max(values))
        return weights / np.sum(weights)

    def _apply_action(self, deck, action):
        """Carry out ``action`` (0 = hit, 1 = stand) for the player on ``deck``."""
        if action == 0:  # hit
            deck.playercards.append(deck.cards.draw())
            deck.player_score = deck.cal_score(deck.playercards)
        else:  # stand
            deck.player_stop = True

    def hit_eval(self, deck):
        """Evaluation move: sample an action from the softmax over Q-values.

        Args:
            deck: the table; ``player_score`` and ``dealer_score`` select the
                Q-table row, and the chosen action is applied to the table.
        """
        action_probs = self._softmax(self.saG[deck.player_score][deck.dealer_score])
        action = np.random.choice([0, 1], p=action_probs)
        self._apply_action(deck, action)

    def hit_train(self, deck):
        """Training move: pick hit/stand uniformly at random and record it.

        The visited (player_score, dealer_score, action) triple is appended
        to ``self.path`` before the action is applied, for use by ``training``.
        """
        action = np.random.choice([0, 1])
        self.path.append((deck.player_score, deck.dealer_score, action))
        self._apply_action(deck, action)

    def training(self, reward):
        """Update the Q-table from the recorded episode, then clear it.

        Walks the episode backwards. The last action receives the terminal
        ``reward`` undiscounted; each earlier step sees it discounted by one
        more factor of ``gamma``. The importance-sampling weight applied to
        the update of Q(s_t, a_t) is the product of target/behaviour
        probability ratios of the actions taken *after* step t only, since
        the action a_t itself is given when estimating Q(s_t, a_t).

        Args:
            reward: terminal reward of the episode.
        """
        G = reward
        pratio = 1.0  # importance-sampling weight of the steps after the current one
        behavior_prob = 0.5  # uniform random behaviour policy over two actions

        for i, j, a in reversed(self.path):
            # Target-policy probability of the recorded action, taken from the
            # Q-values as they were before this step's update.
            policy_prob = self._softmax(self.saG[i][j])[a]
            self.saG[i][j][a] += self.alpha * (G * pratio - self.saG[i][j][a])
            # Fold this step into the weight and the discount for earlier steps.
            pratio *= policy_prob / behavior_prob
            G = self.gamma * G

        self.path.clear()  # the episode has been consumed

### 桌子

In [None]:
# Table: hands out cards, computes hand scores and drives the flow of a game.
class Deck:
    """State and flow of one simplified blackjack game.

    Both hands start empty. Each round the player moves first (unless it has
    stood), then the dealer; the game ends when both have stood or either
    side busts (score above 21). Ties are settled in the player's favour.
    """

    def __init__(self,player,dealer):
        # Point value of each rank; an ace always counts as 1.
        self.rule ={'2':2,'3':3,'4':4,'5':5,'6':6,'7':7,'8':8,'9':9,'10':10,'J':10,'Q':10,'K':10,'A':1}
        self.cards = Cards()  # draw pile for this game
        self.player = player  # the player agent
        self.dealer = dealer  # the dealer agent
        self.playercards = []  # cards in the player's hand
        self.dealercards = []  # cards in the dealer's hand
        self.player_score = 0  # points of the player's hand
        self.dealer_score = 0  # points of the dealer's hand
        self.player_bust = False  # whether the player went over 21
        self.dealer_bust = False  # whether the dealer went over 21
        self.player_stop = False  # whether the player has stood
        self.dealer_stop = False  # whether the dealer has stood
        self.player_win = False  # whether the player won
        self.dealer_win = False  # whether the dealer won

    def cal_score(self,cards):
        """Return the total point value of ``cards`` according to ``self.rule``."""
        return sum(self.rule[card] for card in cards)

    def _play_rounds(self, player_move, verbose):
        """Run rounds until both sides have stood or one side busts.

        Args:
            player_move: callable taking this table; performs the player's turn.
            verbose: when true, print the progress of every round.
        """
        say = print if verbose else (lambda *args: None)
        turn = 0
        while not (self.player_stop and self.dealer_stop):
            say("第",turn,"轮:")
            if not self.player_stop:
                player_move(self)
                say("玩家手牌：",self.playercards,"点数：",self.player_score)
            if self.player_score > 21:
                self.player_bust = True
                say("玩家爆牌")
                break
            if not self.dealer_stop:
                self.dealer.hit(self)
                say("庄家手牌：",self.dealercards,"点数：",self.dealer_score)
            if self.dealer_score > 21:
                self.dealer_bust = True
                say("庄家爆牌")
                break
            # Advance the round counter so each printed round is numbered.
            turn += 1

    def _settle(self, verbose):
        """Decide the winner and set ``player_win`` / ``dealer_win``.

        The dealer wins if the player busted, or if the dealer did not bust
        and holds strictly more points; otherwise the player wins.
        """
        dealer_wins = self.player_bust or (
            not self.dealer_bust and self.player_score < self.dealer_score
        )
        if dealer_wins:
            self.dealer_win = True
            if verbose:
                print("庄家获胜")
        else:
            self.player_win = True
            if verbose:
                print("玩家获胜")

    def deal(self, verbose=True):
        """Evaluation mode: play one game using the player's ``hit_eval``.

        Args:
            verbose: print the game log (default, as before); pass ``False``
                to play silently, e.g. when measuring a win rate.
        """
        if verbose:
            print("游戏开始")
        self._play_rounds(self.player.hit_eval, verbose)
        if verbose:
            print("游戏结束,结果如下：")
        self._settle(verbose)

    def train(self):
        """Training mode: play one silent game with ``hit_train`` and learn from it.

        After settlement the player is trained with a reward of 5 for a win
        and -5 for a loss.
        """
        self._play_rounds(self.player.hit_train, False)
        self._settle(False)
        reward = 5 if self.player_win else -5
        self.player.training(reward)
        

### 初始化

In [None]:
# One learning agent and one fixed-policy dealer, shared by every game below.
player, dealer = Player(), Dealer()

### 训练

In [None]:
train_rounds = 50000
for i in range(train_rounds):
    # One training episode on a fresh table.
    deck = Deck(player,dealer)
    deck.train()

    # Only evaluate on every 1000th training round.
    if i % 1000 != 0:
        continue

    # Estimate the current win rate over 100 evaluation games.
    playerwintime = 0
    for j in range(100):
        deck = Deck(player,dealer)
        deck.deal()
        if deck.player_win:
            playerwintime += 1
    print("第",i,"轮训练，玩家胜率：",playerwintime/100)
        

        

### 测试代码

In [None]:
# Dump the learned Q-table, then play a single evaluation game.
print(player.saG)


def test():
    """Play one evaluation game between the trained player and the dealer."""
    game = Deck(player,dealer)
    game.deal()


test()