In [1]:
from tqdm import tqdm
import torch.nn as nn
import torch

from blackjackenv_extended import BlackjackEnv
from random_agent import RandomAgent
from basic_strategy_agent import BasicStrategyAgent
from backprop_agent import BackpropAgent
from FFNN_agent import FFNNAgent
from DQFFNN_agent import DQFFNNAgent

In [2]:
# Human-readable label for each discrete action id (used by the debug prints).
KEY = dict(enumerate(('STAND', 'HIT', 'DOUBLE', 'SPLIT')))

# Module-level reward log: play_game appends the final reward of each episode.
rewards = list()

def play_game(env, episodes, agent, collect_data=False):
    """Play `episodes` complete games of `env` with `agent`, learning online.

    Every episode resets the environment exactly once and then repeats
    act -> step -> learn until the environment reports `terminated` or
    `truncated`. The reward returned by the last step of each episode is
    appended to the module-level `rewards` list, and `agent.decay_epsilon()`
    is called once per finished episode.

    Args:
        env: Environment whose `reset()` returns `(observation, info)` and
            whose `step(action)` returns
            `(observation, reward, terminated, truncated, info)`.
        episodes: Number of episodes to play.
        agent: Object providing `get_action`, `learn` and `decay_epsilon`
            (plus `collect_data` / `save_data` when `collect_data` is true).
        collect_data: When true, each (observation, action) pair is passed to
            `agent.collect_data` and `agent.save_data()` is called once after
            the last episode.

    Returns:
        None. Results are reported through the global `rewards` list.
    """
    for _episode in tqdm(range(episodes)):
        # The only reset of the episode. No reset is issued after the terminal
        # step: the next iteration starts with its own reset, so a second one
        # would only discard a freshly dealt game.
        observation, _info = env.reset()

        while True:
            action = agent.get_action(observation)

            # Record the decision before the environment reacts to it.
            if collect_data:
                agent.collect_data(observation, action)

            next_observation, reward, terminated, truncated, _info = env.step(action)
            episode_over = terminated or truncated

            agent.learn(observation, action, reward, next_observation, episode_over)
            observation = next_observation

            if episode_over:
                # Only the reward of the final step is logged for the episode.
                rewards.append(reward)
                break

        agent.decay_epsilon()

    if collect_data:
        agent.save_data()

In [3]:
# Pick the compute device once: CUDA when it is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print("using gpu: ", torch.cuda.get_device_name())
else:
    print("using cpu")

using cpu


In [8]:
# TODO: split this cell up (original note, Dutch: "opdelen").

# Environment instance handed to both the agent constructor and play_game.
# NOTE(review): `natural=True` presumably enables the natural-blackjack rule as
# in Gymnasium's Blackjack-v1 — confirm against blackjackenv_extended.
env = BlackjackEnv(natural=True)

# Run-level settings
episodes = 2          # number of episodes passed to play_game
collect_data = False  # forwarded to play_game(collect_data=...)

# Backprop hyperparameters (only referenced by the commented-out BackpropAgent)
input_size = 5
output_size = 4
hidden_size = 30
activation_fn = nn.ReLU()

# FFNN hyperparameters (only referenced by the commented-out FFNNAgent).
# They carry an `ffnn_` prefix so the DQFFNN values below no longer overwrite
# them; pass layers=ffnn_layers, threshold=ffnn_threshold, lr=ffnn_lr when
# enabling FFNNAgent.
# NOTE(review): model_path sits with the FFNN settings but points at the
# DQFFNN checkpoint — confirm this is intended.
model_path = '../models/DQFFNN_model.pth'
ffnn_layers = [9, 100, 100]
ffnn_threshold = 0.5
ffnn_lr = 0.03

# DQFFNN hyperparameters (consumed by the active DQFFNNAgent)
layers = [9, 100, 100]
threshold = 3
lr = 0.03
epsilon = 1.0
# One equal share of the initial epsilon per episode.
# NOTE(review): presumably subtracted by agent.decay_epsilon() after every
# episode so exploration reaches ~0 at the end of the run — confirm in
# DQFFNN_agent.
epsilon_decay = epsilon / episodes
discount_factor = 0.99
batch_size = 2


# Agent selection: exactly one `agent = ...` assignment should be active.
# The alternatives stay commented out so they can be swapped in.
#
# agent = RandomAgent(env, filename="random_agent")
# agent = BasicStrategyAgent(env, filename="basic_strategy_agent")
# agent = BackpropAgent(
#     env,
#     model=torch.load('../models/backprop_model.pth'),
#     input_size=input_size,
#     output_size=output_size,
#     hidden_size=hidden_size,
#     activation_fn=activation_fn,
#     filename="backprop_agent",
# )
# agent = FFNNAgent(
#     env,
#     model_path=model_path,
#     device=device,
#     layers=layers,
#     threshold=threshold,
#     lr=lr,
#     filename="ffnn_agent",
# )

# Active agent: DQFFNN, configured from the hyperparameters set earlier in
# this cell.
dqffnn_settings = dict(
    device=device,
    layers=layers,
    threshold=threshold,
    lr=lr,
    epsilon=epsilon,
    epsilon_decay=epsilon_decay,
    discount_factor=discount_factor,
    batch_size=batch_size,
    filename="dqffnn_agent",
)
agent = DQFFNNAgent(env, **dqffnn_settings)

# Start from an empty log; play_game appends the final reward of every episode
# to this module-level list.
rewards = []

play_game(env, episodes, agent, collect_data)

# Summarise over the rewards actually recorded rather than the configured
# episode count, so the statistics stay correct if the two ever differ.
n_rewards = len(rewards)
average_reward = sum(rewards) / n_rewards if n_rewards else float("nan")

# Sample variance (n - 1 denominator) is undefined for fewer than two samples;
# report NaN in that case instead of raising ZeroDivisionError.
if n_rewards > 1:
    variance = sum((x - average_reward) ** 2 for x in rewards) / (n_rewards - 1)
else:
    variance = float("nan")

#print("Rewards: ", rewards)
print("Average Reward: ", average_reward)
print("Variance: ", variance)

  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [08:43<00:00, 191.02it/s]

Average Reward:  -0.296475
Variance:  1.2807028814047645





In [9]:
# Save the agent's model through its own save_model helper (presumably writes
# the network weights to disk — confirm in DQFFNN_agent).
# NOTE(review): this path repeats the `model_path` literal from the config
# cell; keep the two in sync — TODO confirm whether they should share one name.
agent.model.save_model(path='../models/DQFFNN_model.pth')