In [8]:
import torch 
import numpy as np

import flappy_bird_gymnasium
import gymnasium

from deepq_agent import DQNAgent_pytorch

from gymnasium.wrappers import FlattenObservation

In [9]:
game = "CartPole-v1" #CartPole-v1 FlappyBird-v0
env = gymnasium.make(game) 
state,_ = env.reset()

In [10]:
#get size of observation space
obs_space = len(state) #overriden
act_space = env.action_space.n

obs_space, act_space

(4, 2)

In [11]:
TARGET_UPDATE = 100
DEVICE = 'cuda' #torch.device("cuda" if torch.cuda.is_available() else "cpu")
LR = 1e-4
GAMMA = 0.99
EPS = 0.9
EPS_DECAY = 1000 
EPS_END = 0.05
BATCH_SIZE = 128
PLAY_MEMORY = 10000
LAYERS_SIZES = [256, 256]
EPOCHS = 1000000
TAU = 0.005

In [12]:
agent = DQNAgent_pytorch(
        device=DEVICE,
        act_space=act_space,
        obs_space=obs_space,
        training_batch_size=BATCH_SIZE,
        learn_rate=LR,
        gamma=GAMMA,
        eps_start=EPS,                                                               #rate of exploration
        eps_decay_rate=EPS_DECAY,                                                   
        eps_floor=EPS_END,                                                       
        network_shape=LAYERS_SIZES,
        tau=TAU,
        pmem_buffer_size=PLAY_MEMORY
    )

In [13]:
#agent.load("model.pt")
def state_filter(state:np.ndarray):
    state = state#[:-1]
    return state

In [14]:
from itertools import count
import random

for epoch in range(EPOCHS):
    state, _ = env.reset()
    state = torch.tensor(state_filter(state), dtype=torch.float32, device=DEVICE).unsqueeze(0)

    total_reward = 0

    for t in count():
        action = agent.get_action(state, env.action_space)
        next_state, reward, terminated, truncated, _ = env.step(action.item())
        reward = torch.tensor([reward], device=DEVICE)
        next_state = torch.tensor(state_filter(next_state), dtype=torch.float32, device=DEVICE).unsqueeze(0)
        done = terminated or truncated


        # Store the transition in memory    
        agent.memory.push(state, action, next_state, reward, torch.tensor([done], device=DEVICE))

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        agent.train()

        total_reward += reward.item()

        # Soft update of the target network's weights
        # θ′ ← τ θ + (1 −τ )θ′
        policy_dict = agent.policy_net.state_dict()
        target_dict = agent.target_net.state_dict()
        for key in policy_dict:
            target_dict[key] = agent.tau * policy_dict[key] + (1 - agent.tau) * target_dict[key]
        agent.target_net.load_state_dict(target_dict)

        if done:
            break
    print("Iteration: ", epoch, "Total reward: ", t )
    agent.save("model.pt")

Iteration:  0 Total reward:  26
Iteration:  1 Total reward:  44
Iteration:  2 Total reward:  23
Iteration:  3 Total reward:  33
Iteration:  4 Total reward:  40
Iteration:  5 Total reward:  15
Iteration:  6 Total reward:  20
Iteration:  7 Total reward:  16
Iteration:  8 Total reward:  18
Iteration:  9 Total reward:  12
Iteration:  10 Total reward:  9
Iteration:  11 Total reward:  9
Iteration:  12 Total reward:  12
Iteration:  13 Total reward:  9
Iteration:  14 Total reward:  19
Iteration:  15 Total reward:  11
Iteration:  16 Total reward:  16
Iteration:  17 Total reward:  16
Iteration:  18 Total reward:  18
Iteration:  19 Total reward:  23
Iteration:  20 Total reward:  11
Iteration:  21 Total reward:  11
Iteration:  22 Total reward:  17
Iteration:  23 Total reward:  8
Iteration:  24 Total reward:  13
Iteration:  25 Total reward:  13
Iteration:  26 Total reward:  11
Iteration:  27 Total reward:  9
Iteration:  28 Total reward:  9
Iteration:  29 Total reward:  11
Iteration:  30 Total rewar

KeyboardInterrupt: 

In [15]:
agent.load("modelstick.pt")
env = gymnasium.make(game, render_mode="human")
state, _ = env.reset()

for epoch in range(EPOCHS):
    state, _ = env.reset()
    state = torch.tensor(state_filter(state), dtype=torch.float32, device=DEVICE).unsqueeze(0)

    for t in count():
        action = agent.get_action(state, env.action_space)
        next_state, reward, terminated, truncated, _ = env.step(action.item())
        reward = torch.tensor([reward], device=DEVICE)
        next_state = torch.tensor(state_filter(next_state), dtype=torch.float32, device=DEVICE).unsqueeze(0)
        done = terminated or truncated
        state = next_state
        if done:
            break
    print("Iteration: ", epoch, "Total reward: ", t)
    agent.save("model.pt")

Iteration:  0 Total reward:  499
Iteration:  1 Total reward:  499
Iteration:  2 Total reward:  499


KeyboardInterrupt: 