In [1]:
from tqdm import tqdm
from random_agent import RandomAgent
from basic_strategy_agent import BasicStrategyAgent
from blackjackenv_extended import BlackjackEnv

In [4]:
# Human-readable names for the integer action codes the agent hands to env.step.
KEY = dict(enumerate(('STAND', 'HIT', 'DOUBLE', 'SPLIT')))

# Module-level reward log; play_game appends each episode's final reward here.
rewards = list()

def play_game(env, episodes, agent, collect_data=False):
    """Play ``episodes`` rounds of ``env`` with ``agent`` and record each round's final reward.

    Every round starts from ``env.reset()`` and feeds the current observation
    to ``agent.get_action`` until ``env.step`` reports ``terminated`` or
    ``truncated``. Only the reward returned by that last step is recorded.

    Args:
        env: Environment whose ``reset()`` returns ``(observation, info)`` and
            whose ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        episodes: Number of rounds to play.
        agent: Object providing ``get_action(observation)``. When
            ``collect_data`` is true it must also provide
            ``collect_data(observation, action)`` and ``save_data()``.
        collect_data: If true, pass every (observation, action) pair to the
            agent and call ``agent.save_data()`` once after all rounds.

    Returns:
        list: The final reward of each round played by this call, in order.
        The same values are also appended to the module-level ``rewards``
        list, which existing callers read.
    """
    episode_rewards = []

    for _ in tqdm(range(episodes)):
        # The environment is reset exactly once per round, here. Resetting
        # again after the round ends would only discard a freshly dealt hand.
        observation, _info = env.reset()

        while True:
            action = agent.get_action(observation)

            # Save the decision for later training before the state changes.
            if collect_data:
                agent.collect_data(observation, action)

            observation, reward, terminated, truncated, _info = env.step(action)

            if terminated or truncated:
                rewards.append(reward)  # global log kept for existing callers
                episode_rewards.append(reward)
                break

    if collect_data:
        agent.save_data()

    return episode_rewards

In [20]:
# Hyperparameters
episodes = 1000

env = BlackjackEnv(natural=True)
# Swap in the line below to evaluate the random baseline instead.
#agent = RandomAgent(env, filename="random_agent")
agent = BasicStrategyAgent(env, filename="basic_strategy_agent")
collect_data = True
rewards = []  # fresh log for this run; play_game appends to this module-level list

play_game(env, episodes, agent, collect_data)

# Base the statistics on the number of rewards actually recorded rather than
# on `episodes`, and fall back to NaN when there are too few samples, so a
# run with 0 or 1 episodes does not raise ZeroDivisionError.
n_rewards = len(rewards)
average_reward = sum(rewards) / n_rewards if n_rewards else float("nan")
# Unbiased sample variance (divides by n - 1).
variance = (
    sum((x - average_reward) ** 2 for x in rewards) / (n_rewards - 1)
    if n_rewards > 1
    else float("nan")
)

print("Average Reward: ", average_reward)
print("Variance: ", variance)

 22%|██▏       | 217/1000 [00:00<00:00, 2060.95it/s]

hand 1 [5, 10]
hand 2 []
Start Observation:  (15, 9, 0, 1, 0)
Action:  HIT
hand 1 [5, 10, 9]
hand 2 []
Observation:  (24, 9, 0, 0, 0) Reward:  -1.0
hand 1 [10, 9]
hand 2 []
Start Observation:  (19, 10, 0, 1, 0)
Action:  STAND
hand 1 [10, 9]
hand 2 []
Observation:  (19, 10, 0, 1, 0) Reward:  -1.0
hand 1 [9, 2]
hand 2 []
Start Observation:  (11, 10, 0, 1, 0)
Action:  DOUBLE
hand 1 [9, 2, 10]
hand 2 []
Observation:  (21, 10, 0, 0, 0) Reward:  2.0
hand 1 [10, 10]
hand 2 []
Start Observation:  (20, 10, 0, 1, 1)
Action:  STAND
hand 1 [10, 10]
hand 2 []
Observation:  (20, 10, 0, 1, 1) Reward:  0.0
hand 1 [10, 10]
hand 2 []
Start Observation:  (20, 9, 0, 1, 1)
Action:  STAND
hand 1 [10, 10]
hand 2 []
Observation:  (20, 9, 0, 1, 1) Reward:  1.0
hand 1 [10, 10]
hand 2 []
Start Observation:  (20, 6, 0, 1, 1)
Action:  STAND
hand 1 [10, 10]
hand 2 []
Observation:  (20, 6, 0, 1, 1) Reward:  1.0
hand 1 [4, 2]
hand 2 []
Start Observation:  (6, 10, 0, 1, 0)
Action:  HIT
hand 1 [4, 2, 5]
hand 2 []
Obser

 92%|█████████▏| 919/1000 [00:00<00:00, 2072.62it/s]

hand 1 [5, 7]
hand 2 []
Start Observation:  (12, 3, 0, 1, 0)
Action:  HIT
hand 1 [5, 7, 10]
hand 2 []
Observation:  (22, 3, 0, 0, 0) Reward:  -1.0
hand 1 [8, 10]
hand 2 []
Start Observation:  (18, 5, 0, 1, 0)
Action:  STAND
hand 1 [8, 10]
hand 2 []
Observation:  (18, 5, 0, 1, 0) Reward:  0.0
hand 1 [4, 2]
hand 2 []
Start Observation:  (6, 10, 0, 1, 0)
Action:  HIT
hand 1 [4, 2, 5]
hand 2 []
Observation:  (11, 10, 0, 0, 0) Reward:  0.0
Action:  HIT
hand 1 [4, 2, 5, 10]
hand 2 []
Observation:  (21, 10, 0, 0, 0) Reward:  0.0
Action:  STAND
hand 1 [4, 2, 5, 10]
hand 2 []
Observation:  (21, 10, 0, 0, 0) Reward:  1.0
hand 1 [10, 9]
hand 2 []
Start Observation:  (19, 10, 0, 1, 0)
Action:  STAND
hand 1 [10, 9]
hand 2 []
Observation:  (19, 10, 0, 1, 0) Reward:  0.0
hand 1 [7, 6]
hand 2 []
Start Observation:  (13, 2, 0, 1, 0)
Action:  STAND
hand 1 [7, 6]
hand 2 []
Observation:  (13, 2, 0, 1, 0) Reward:  -1.0
hand 1 [9, 6]
hand 2 []
Start Observation:  (15, 2, 0, 1, 0)
Action:  STAND
hand 1 [9, 6

100%|██████████| 1000/1000 [00:00<00:00, 2023.08it/s]

hand 1 [8, 8]
hand 2 []
Start Observation:  (16, 2, 0, 1, 1)
Action:  SPLIT
hand 1 [8, 6]
hand 2 [8, 2]
Observation:  (14, 2, 0, 0, 0) Reward:  0.0
Action:  STAND
hand 1 [8, 6]
hand 2 [8, 2]
Observation:  (10, 2, 0, 0, 0) Reward:  0.0
Action:  HIT
hand 1 [8, 6]
hand 2 [8, 2, 2]
Observation:  (12, 2, 0, 0, 0) Reward:  0.0
Action:  HIT
hand 1 [8, 6]
hand 2 [8, 2, 2, 10]
Observation:  (22, 2, 0, 0, 0) Reward:  -1.0
hand 1 [7, 1]
hand 2 []
Start Observation:  (18, 5, 1, 1, 0)
Action:  DOUBLE
hand 1 [7, 1, 9]
hand 2 []
Observation:  (17, 5, 0, 0, 0) Reward:  -2.0
hand 1 [5, 6]
hand 2 []
Start Observation:  (11, 9, 0, 1, 0)
Action:  DOUBLE
hand 1 [5, 6, 10]
hand 2 []
Observation:  (21, 9, 0, 0, 0) Reward:  2.0
hand 1 [3, 5]
hand 2 []
Start Observation:  (8, 2, 0, 1, 0)
Action:  HIT
hand 1 [3, 5, 10]
hand 2 []
Observation:  (18, 2, 0, 0, 0) Reward:  0.0
Action:  STAND
hand 1 [3, 5, 10]
hand 2 []
Observation:  (18, 2, 0, 0, 0) Reward:  1.0
hand 1 [6, 3]
hand 2 []
Start Observation:  (9, 5, 0, 


