In [1]:
import torch 
import numpy as np

import flappy_bird_gymnasium
import gymnasium

from deepq_agent import DQNAgent_pytorch

from gymnasium.wrappers import FlattenObservation

In [2]:
# --- Run configuration and hyperparameters ----------------------------------

# Compute device, kept as a plain string. Falls back to the CPU when CUDA is
# not available so the notebook also runs on machines without a GPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): TARGET_UPDATE is not referenced by any visible cell - confirm it is still needed.
TARGET_UPDATE = 10

LR = 1e-3                # passed to the agent as learn_rate
GAMMA = 0.99             # passed to the agent as gamma
EPS = 1.0                # initial exploration rate (agent argument eps)
EPS_DECAY = 0.0001       # passed to the agent as eps_decay_rate
EPS_END = 0.1            # passed to the agent as eps_floor
BATCH_SIZE = 128         # passed to the agent as training_batch_size
PLAY_MEMORY = 10000      # passed to the agent as pmem_buffer_size
LAYERS_SIZES = [64, 64]  # passed to the agent as network_shape

# Number of iterations of the outer training loop (one environment episode each).
EPOCHS = 1000000

In [3]:
# Create the Flappy Bird environment and reset it once to get an initial observation.
env = gymnasium.make("FlappyBird-v0")
# reset() is unpacked as an (observation, info) pair; the second element is discarded.
state,_ = env.reset()

In [4]:
# Observation and action sizes that are handed to the agent.
# obs_space is fixed by hand rather than derived from the observation.
# NOTE(review): presumably the raw observation has 12 entries and state_filter
# drops the last one, giving 11 - confirm against the installed environment version.
obs_space = 11 # overridden by hand; previously len(state)
act_space = env.action_space.n

# Show both sizes as the cell output.
obs_space, act_space

(11, 2)

In [5]:
# Build the DQN agent from the notebook-level configuration constants.
agent = DQNAgent_pytorch(
    # environment interface
    obs_space=obs_space,
    act_space=act_space,
    # network, optimisation and replay settings
    network_shape=LAYERS_SIZES,
    learn_rate=LR,
    gamma=GAMMA,
    training_batch_size=BATCH_SIZE,
    pmem_buffer_size=PLAY_MEMORY,
    # exploration: starting rate, its decay rate and its lower bound
    eps=EPS,
    eps_decay_rate=EPS_DECAY,
    eps_floor=EPS_END,
    # where the agent runs
    device=DEVICE,
)

In [6]:
#agent.load("model.pt")

In [7]:
# Recreate the environment, wrapped in FlattenObservation for the training loop.
env = FlattenObservation(gymnasium.make("FlappyBird-v0"))

def state_filter(state:np.ndarray):
    """Return ``state`` with its final entry removed (``state[:-1]``)."""
    return state[:-1]

import random

# Minimum number of frames between two jumps: a jump (action 1) requested
# sooner than this is replaced by action 0.
JUMP_COOLDOWN_FRAMES = 5

for i in range(EPOCHS):
    total_reward = 0
    # Every episode starts with a fresh cooldown counter, as in evaluate_hyper,
    # instead of inheriting the counter from the previous episode.
    frames_since_jump = 0

    state, _ = env.reset(seed=random.randint(0,1000))
    state = torch.tensor(state_filter(state), dtype=torch.float32, device=DEVICE).unsqueeze(0)
    terminated = False

    while not terminated:
        action = agent.get_action(state)

        # Enforce the jump cooldown before the action reaches the environment.
        if action.item() == 1:
            if frames_since_jump < JUMP_COOLDOWN_FRAMES:
                action[0] = 0
                frames_since_jump += 1
            else:
                frames_since_jump = 0
        else:
            frames_since_jump += 1

        new_state, reward, terminated, truncated, info = env.step(action.item())

        # NOTE(review): index 9 is presumably the bird's normalised vertical
        # position, so leaving [0, 1] means it left the screen - confirm against
        # the installed flappy_bird_gymnasium version.
        out_of_bounds = new_state[9] < 0 or new_state[9] > 1
        if truncated or out_of_bounds:
            terminated = True

        total_reward += reward

        # `state` must still be the observation the action was chosen from when
        # the transition is stored. The environment is therefore NOT reset here;
        # the reset at the top of the next episode takes care of that.
        # NOTE(review): terminal transitions are stored with their real next
        # observation; confirm how DQNAgent_pytorch is meant to mark episode ends.
        new_state = torch.tensor(state_filter(new_state), dtype=torch.float32, device=DEVICE).unsqueeze(0)
        reward = torch.tensor([reward], device=DEVICE)
        agent.remember(state, action, new_state, reward)
        agent.train()

        state = new_state
    print("Iteration: ", i, "Total reward: ", total_reward)





Iteration:  0 Total reward:  4.000000000000002
Iteration:  1 Total reward:  4.000000000000002
Iteration:  2 Total reward:  4.200000000000001
Iteration:  3 Total reward:  3.900000000000002
Iteration:  4 Total reward:  4.300000000000001
Iteration:  5 Total reward:  4.000000000000002
Iteration:  6 Total reward:  4.100000000000001
Iteration:  7 Total reward:  4.200000000000001
Iteration:  8 Total reward:  4.100000000000001
Iteration:  9 Total reward:  4.100000000000001
Iteration:  10 Total reward:  4.200000000000001
Iteration:  11 Total reward:  4.000000000000002
Iteration:  12 Total reward:  4.100000000000001
Iteration:  13 Total reward:  4.6
Iteration:  14 Total reward:  4.000000000000002
Iteration:  15 Total reward:  4.4
Iteration:  16 Total reward:  4.300000000000001
Iteration:  17 Total reward:  4.300000000000001
Iteration:  18 Total reward:  4.300000000000001
Iteration:  19 Total reward:  4.799999999999999
Iteration:  20 Total reward:  3.900000000000002
Iteration:  21 Total reward:  

KeyboardInterrupt: 

In [10]:
def evaluate_hyper(
    act_space,
    obs_space,
    batch_size,
    lr,
    gamma,
    eps_start,
    eps_decay,
    eps_floor,
    layer1_size,
    layer2_size,
    epochs=100
):
    """Train a fresh DQN agent for ``epochs`` episodes and return its mean episode reward.

    Relies on the notebook-level names ``DEVICE``, ``state_filter``,
    ``FlattenObservation``, ``gymnasium`` and ``DQNAgent_pytorch``.

    Args:
        act_space: forwarded to the agent as ``act_space``.
        obs_space: forwarded to the agent as ``obs_space``.
        batch_size: forwarded to the agent as ``training_batch_size``.
        lr: forwarded to the agent as ``learn_rate``.
        gamma: forwarded to the agent as ``gamma``.
        eps_start: forwarded to the agent as ``eps``.
        eps_decay: forwarded to the agent as ``eps_decay_rate``.
        eps_floor: forwarded to the agent as ``eps_floor``.
        layer1_size: first entry of the agent's ``network_shape``.
        layer2_size: second entry of the agent's ``network_shape``.
        epochs: number of episodes to run; must be at least 1.

    Returns:
        The sum of all rewards collected, divided by ``epochs``.

    Raises:
        ValueError: if ``epochs`` is smaller than 1 (checked before anything is built).
    """
    if epochs < 1:
        raise ValueError(f"epochs must be a positive integer, got {epochs!r}")

    agent = DQNAgent_pytorch(
        act_space=act_space,
        obs_space=obs_space,
        training_batch_size=batch_size,
        learn_rate=lr,
        gamma=gamma,
        eps=eps_start,
        eps_decay_rate=eps_decay,
        eps_floor=eps_floor,
        network_shape=[layer1_size, layer2_size],
        pmem_buffer_size=10000,
        # Same device as the tensors created below, so agent and data cannot diverge.
        device=DEVICE,
    )
    # Wrapped the same way as the environment used by the training cell.
    env = FlattenObservation(gymnasium.make("FlappyBird-v0"))

    # A jump (action 1) requested fewer than this many frames after the previous
    # one is replaced by action 0.
    jump_cooldown = 5

    total_reward = 0
    try:
        for i in range(epochs):
            state, _ = env.reset()
            state = torch.tensor(state_filter(state), dtype=torch.float32, device=DEVICE).unsqueeze(0)
            terminated = False

            episode_reward = 0
            frames_since_jump = 0
            while not terminated:
                action = agent.get_action(state)

                if action.item() == 1:
                    if frames_since_jump < jump_cooldown:
                        action[0] = 0
                        frames_since_jump += 1
                    else:
                        frames_since_jump = 0
                else:
                    frames_since_jump += 1

                new_state, reward, terminated, truncated, info = env.step(action.item())

                # NOTE(review): index 9 is presumably the bird's normalised
                # vertical position - confirm against the environment version.
                out_of_bounds = new_state[9] < 0 or new_state[9] > 1
                if truncated or out_of_bounds:
                    terminated = True

                episode_reward += reward

                # `state` must still be the observation the action was chosen
                # from when the transition is stored, so the environment is not
                # reset here; the next episode resets it.
                new_state = torch.tensor(state_filter(new_state), dtype=torch.float32, device=DEVICE).unsqueeze(0)
                reward = torch.tensor([reward], device=DEVICE)
                agent.remember(state, action, new_state, reward)
                agent.train()

                state = new_state

            total_reward += episode_reward
            # Report this episode's own reward rather than the running sum.
            print("Iteration: ", i, "Total reward: ", episode_reward)
    finally:
        # Release the environment even when training is interrupted.
        env.close()
    return total_reward/epochs
    


In [11]:
import hyperopt

# Candidate values for each tunable hyperparameter.
batch_sizes = [32, 64, 128, 256, 512]
lrs = [1e-2, 1e-3, 1e-4, 1e-5]
layer1_sizes = [16, 32, 64, 128]
layer2_sizes = [16, 32, 64, 128]
# Single-valued lists: these parameters are effectively held fixed.
gammas = [0.99]
eps_starts = [1.0]
eps_decays = [0.0001]
eps_floors = [0.01]

# Search space: one hp.choice per parameter, labelled with the parameter's own name.
space = {
    label: hyperopt.hp.choice(label, options)
    for label, options in (
        ('batch_size', batch_sizes),
        ('lr', lrs),
        ('gamma', gammas),
        ('eps_start', eps_starts),
        ('eps_decay', eps_decays),
        ('eps_floor', eps_floors),
        ('layer1_size', layer1_sizes),
        ('layer2_size', layer2_sizes),
    )
}

def objective(params):
    """Return the negated result of one ``evaluate_hyper`` run for ``params``.

    ``params`` is indexed by the eight hyperparameter names listed below; any
    other keys it may carry are ignored. The run always uses ``act_space=2``,
    ``obs_space=11`` and ``epochs=100``.
    """
    tuned_names = (
        'batch_size',
        'lr',
        'gamma',
        'eps_start',
        'eps_decay',
        'eps_floor',
        'layer1_size',
        'layer2_size',
    )
    chosen = {name: params[name] for name in tuned_names}
    mean_reward = evaluate_hyper(act_space=2, obs_space=11, epochs=100, **chosen)
    return -mean_reward

# Run 100 evaluations of `objective` over `space` using the TPE suggestion
# algorithm, recording every trial in `trials`.
trials = hyperopt.Trials()
best = hyperopt.fmin(
    fn=objective,
    space=space,
    algo=hyperopt.tpe.suggest,
    max_evals=100,
    trials=trials
)

# For hp.choice dimensions fmin reports the *index* of the selected option, not
# the option itself; space_eval maps those indices back to the actual values.
best_params = hyperopt.space_eval(space, best)

print(best_params)
print(trials.best_trial)




  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

job exception: local variable 'frames_since_jump' referenced before assignment



  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]


UnboundLocalError: local variable 'frames_since_jump' referenced before assignment