In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [3]:
import torch
from torch.optim import Adam, Optimizer
import numpy as np
import gymnasium as gym
import ale_py
import matplotlib.pyplot as plt

from reinforce_implementation import *

# Register the ALE (Atari) environments with gymnasium so that the
# "ALE/..." environment id used by gym.make below can be resolved.
gym.register_envs(ale_py)

from IPython import display

In [4]:
# Pong with RAM observations: the observation is a flat vector, so the policy
# can be a plain MLP. Image observations would require a vision model instead.
env = gym.make("ALE/Pong-v5", obs_type="ram", frameskip=8)

# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Size the MLP from the environment's observation and action spaces,
# then move its parameters onto the selected device
number_observation_features = env.observation_space.shape[0]
number_actions = env.action_space.n
model = create_model(number_observation_features, number_actions).to(device)

print(model)

cuda
Sequential(
  (0): Linear(in_features=128, out_features=512, bias=True)
  (1): LeakyReLU(negative_slope=0.01)
  (2): Linear(in_features=512, out_features=256, bias=True)
  (3): LeakyReLU(negative_slope=0.01)
  (4): Linear(in_features=256, out_features=6, bias=True)
)


In [5]:
# --- Training hyperparameters ---

# Optimisation: learning rate handed to Adam, and how many times the
# training loop calls train_one_epoch
lr = 1e-4
num_epochs = 10000

# Rollout settings forwarded to train_one_epoch on every epoch
# (presumably trajectories per epoch and a per-trajectory step cap -- confirm
# against reinforce_implementation)
num_trajectories = 20
trajectory_length = 30000

In [6]:
# Create the optimizer
optimizer = Adam(model.parameters(), lr)

# Average return reported by train_one_epoch, one entry per epoch
returns = []

# One figure/axes pair reused for the whole run. Re-plotting onto the implicit
# pyplot figure every epoch would stack a new line (holding the full history)
# on top of all previous ones, so memory and redraw time would grow with
# every epoch.
fig, ax = plt.subplots()

# Loop for each epoch
for epoch in range(num_epochs):
    average_return = train_one_epoch(env,
                                     model,
                                     optimizer,
                                     device,
                                     num_trajectories=num_trajectories,
                                     trajectory_length=trajectory_length)
    returns.append(average_return)

    print('epoch: %3d \t return: %.3f' % (epoch, average_return))

    # Drop the previous curve before drawing the updated one, so the axes
    # only ever hold a single line
    ax.clear()
    ax.plot(returns)
    ax.set_xlabel("epoch")
    ax.set_ylabel("average return")

    display.display(fig)
    # wait=True defers the clear until the next output arrives, so the latest
    # plot stays visible while the next epoch is training
    display.clear_output(wait=True)


In [None]:
# Persist the trained policy network to disk.
# NOTE(review): this pickles the whole module object rather than its
# state_dict, so loading it later needs the model's defining code importable.
# Consider saving model.state_dict() if the loader can be updated to match.
torch.save(obj=model, f="reinforce_model.ckpt")