In [108]:
import numpy as np

### 牌堆

In [109]:
#仅负责抽牌功能
class Cards:
    """A shuffled 52-card deck (13 ranks x 4 copies) that cards are dealt from.

    Cards are dealt without replacement: every call to ``draw`` removes the
    returned card from ``cardset``, so a rank can appear at most four times
    in one game.
    """

    def __init__(self):
        ranks = ['2','3','4','5','6','7','8','9','10','J','Q','K','A']
        self.cardset = ranks * 4
        # Shuffle once up front; dealing from one end is then already random.
        np.random.shuffle(self.cardset)

    def draw(self):
        """Remove and return one card from the deck.

        Raises:
            IndexError: if no cards are left.
        """
        if not self.cardset:
            raise IndexError("cannot draw from an empty deck")
        return self.cardset.pop()

    def showallcards(self):
        """Return the list of cards still in the deck (the live list, not a copy)."""
        return self.cardset

### 庄家

In [110]:
#庄家如果手牌点数低于17就会抽牌，达到或高于17点则停牌
class Dealer:
    """Dealer with a fixed policy: draw while below 17, otherwise stand."""

    def __init__(self):
        pass

    def hit(self, deck):
        """Take the dealer's turn on ``deck``.

        Reads ``deck.dealer_score``. At 17 or more the dealer stands by
        setting ``deck.dealer_stop``; otherwise one card is drawn from
        ``deck.cards`` into ``deck.dealercards`` and the dealer's score is
        recomputed with ``deck.cal_score``.
        """
        # Guard clause: standing is the only thing left to do at 17 or above.
        if deck.dealer_score >= 17:
            deck.dealer_stop = True
            return
        new_card = deck.cards.draw()
        deck.dealercards.append(new_card)
        deck.dealer_score = deck.cal_score(deck.dealercards)
    

### 玩家

In [111]:
#玩家可以自己选择是否抽牌，如果手牌点数超过21点就会爆牌
#请自行完成代码填写，可以仿照dealer在牌库中抽牌
class Player:
    """Learning player that picks draw/stand from a tabular action-value estimate.

    The state is (player score, dealer score); actions are 0 = draw another
    card and 1 = stand. Values are learned by on-policy Monte Carlo updates
    from the terminal reward of each game.
    """

    def __init__(self):
        # Action-value table: saG[i][j][0] is the value of drawing and
        # saG[i][j][1] the value of standing, where i is the player's score
        # and j is the dealer's score.
        self.saG = np.zeros((22,22,2))
        self.gamma = 0.9  # discount factor
        self.alpha = 0.001  # learning rate
        self.path = []  # [player_score, dealer_score, action] for each step of the current game

    def hit(self, deck, training = False):
        """Choose and apply one action on ``deck``.

        The action is sampled from a softmax over the two action values of
        the current state. Action 0 draws a card into ``deck.playercards``
        and recomputes ``deck.player_score``; action 1 sets
        ``deck.player_stop``. When ``training`` is true the visited
        (state, action) is appended to ``self.path``.
        """
        q_values = self.saG[deck.player_score][deck.dealer_score]
        # Subtracting the max leaves the softmax unchanged mathematically but
        # keeps np.exp from overflowing to inf (and the probabilities to NaN).
        weights = np.exp(q_values - np.max(q_values))
        action = np.random.choice([0,1], p=weights / np.sum(weights))
        if training:
            self.path.append([deck.player_score, deck.dealer_score, action])
        if action == 0:
            deck.playercards.append(deck.cards.draw())
            deck.player_score = deck.cal_score(deck.playercards)
        else:
            deck.player_stop = True

    def training(self, reward):
        """Update the action values along the recorded path, then clear it.

        ``reward`` is the terminal reward of the game; there are no
        intermediate rewards. Walking the path backwards, the last action is
        credited the undiscounted reward and each earlier step receives one
        more factor of ``gamma``.
        """
        G = reward
        for i, j, a in reversed(self.path):
            self.saG[i][j][a] += self.alpha * (G - self.saG[i][j][a])
            # Discount only after the update so the final step sees the raw reward.
            G = self.gamma * G
        self.path = []
            

### 桌子

In [112]:
#负责发牌和计算分数，以及决定游戏进程
class Deck:
    """The table for one game: deals cards, scores hands and drives the game.

    Holds the card deck, both hands and scores, and the stop/bust/win flags.
    ``deal`` plays one game with printed commentary; ``train`` plays one
    game silently and feeds the resulting reward to the player.
    """

    def __init__(self, player, dealer):
        # Point value of each rank; note that an ace is always worth 1 here.
        self.rule = {'2':2,'3':3,'4':4,'5':5,'6':6,'7':7,'8':8,'9':9,'10':10,'J':10,'Q':10,'K':10,'A':1}
        self.cards = Cards()  # card deck for this game
        self.player = player
        self.dealer = dealer
        self.playercards = []  # cards in the player's hand
        self.dealercards = []  # cards in the dealer's hand
        self.player_score = 0
        self.dealer_score = 0
        self.player_bust = False  # player went over 21
        self.dealer_bust = False  # dealer went over 21
        self.player_stop = False  # player has chosen to stand
        self.dealer_stop = False  # dealer has chosen to stand
        self.player_win = False
        self.dealer_win = False

    def cal_score(self, cards):
        """Return the total point value of ``cards`` according to ``self.rule``."""
        return sum(self.rule[card] for card in cards)

    def _play_round(self, training, verbose):
        """Let the player, then the dealer, act once; return True if either busts."""
        if not self.player_stop:
            self.player.hit(self, training = training)
        if verbose:
            print("玩家手牌：",self.playercards,"点数：",self.player_score)
        if self.player_score > 21:
            self.player_bust = True
            if verbose:
                print("玩家爆牌")
            return True
        if not self.dealer_stop:
            self.dealer.hit(self)
        if verbose:
            print("庄家手牌：",self.dealercards,"点数：",self.dealer_score)
        if self.dealer_score > 21:
            self.dealer_bust = True
            if verbose:
                print("庄家爆牌")
            return True
        return False

    def _settle(self):
        """Set ``dealer_win`` or ``player_win`` from the final scores and bust flags."""
        # The dealer wins if the player busts, or if nobody busts and the
        # dealer's score is strictly higher; every other outcome (including a
        # tie) counts as a player win.
        dealer_takes_it = self.player_bust or (
            not self.dealer_bust and self.player_score < self.dealer_score
        )
        if dealer_takes_it:
            self.dealer_win = True
        else:
            self.player_win = True

    def deal(self):
        """Play one game in test mode, printing every round and the result."""
        print("游戏开始")
        turn = 0
        while not (self.player_stop and self.dealer_stop):
            print("第",turn,"轮:")
            # The player acts only while they have NOT stood yet (the original
            # check was inverted, so the player never moved and the loop
            # could not terminate).
            if self._play_round(training = False, verbose = True):
                break
            turn += 1

        print("游戏结束,结果如下：")
        self._settle()
        if self.dealer_win:
            print("庄家获胜")
        else:
            print("玩家获胜")

    def train(self):
        """Play one game silently, then pass the reward (+1 win, -1 loss) to the player."""
        while not (self.player_stop and self.dealer_stop):
            if self._play_round(training = True, verbose = False):
                break
        self._settle()
        reward = 1 if self.player_win else -1
        self.player.training(reward)
        

### 初始化

In [113]:
# Create the learning player and the dealer once; each game below gets a new
# Deck built around these same two objects.
player, dealer = Player(), Dealer()

### 训练

In [114]:
# Play train_rounds games in training mode, reusing the same player so its
# value table accumulates updates across games.
train_rounds = 5000
playerwintime = 0  # number of games the player has won so far
for i in range(train_rounds):
    deck = Deck(player,dealer)  # fresh table and card deck for every game
    deck.train()
    playerwintime += 1 if deck.player_win == True else 0
    # Report the running win rate on every 1000th iteration (i = 0, 1000, ...);
    # i + 1 games have been played at that point.
    if not i % 1000:
        print("已完成",i,"轮训练，玩家获胜率为",playerwintime/(i+1))
print("已完成",train_rounds,"轮训练，玩家获胜率为",playerwintime/train_rounds)

已完成 0 轮训练，玩家获胜率为 0.0
已完成 1000 轮训练，玩家获胜率为 0.2917082917082917
已完成 2000 轮训练，玩家获胜率为 0.3028485757121439
已完成 3000 轮训练，玩家获胜率为 0.3105631456181273
已完成 4000 轮训练，玩家获胜率为 0.31392151962009496
已完成 5000 轮训练，玩家获胜率为 0.3168


### 测试代码

In [115]:
# Dump the learned action-value table, then play one game with commentary.
print(player.saG)


def test():
    """Play a single game in test mode using the module-level player and dealer."""
    Deck(player,dealer).deal()


test()

[[[-2.22001479e-01 -3.17686465e-01]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00]]

 [[ 0.00000000e+00  0.00000000e+00]
  [ 1.56593152e-04 -2.69100900e-03]
  [-2.11038096e-03 -1.79640539e-03]
  [ 1.60984676e-03  2.68921169e-06]
  [ 8.22529519e-04  2.69103233e-03]
  [-1.45646423e-03  9.1165