# CartPole Playground

### Import Packages

In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

### Create and reset the environment

In [2]:
# Fix the seeds before anything stochastic happens so the whole notebook is
# reproducible under Restart & Run All.
SEED = 1234

np.random.seed(SEED)
torch.manual_seed(SEED)

# Create the training and evaluation environments
env = gym.make("CartPole-v1") #, render_mode="human")
test_env = gym.make("CartPole-v1")

# Seeding NumPy/PyTorch does not seed Gymnasium: each environment owns its own
# RNG, which is initialised by passing `seed=` to reset(). Later reset() calls
# made without a seed keep drawing from that same RNG.
state, info = env.reset(seed=SEED)
# Different seed so the evaluation episodes do not mirror the training ones.
test_env.reset(seed=SEED + 1)

# Get the state and action sizes
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

### Create the policy network

In [3]:
# Policy network: observation in, action probabilities out
class Policy(nn.Module):
    """Fully connected policy that outputs a softmax distribution over actions.

    The layer widths come from the notebook-level ``state_size`` and
    ``action_size`` values, which are read when an instance is constructed.

    Args:
        use_activation: when True, a ReLU follows every hidden linear layer.
    """

    def __init__(self, use_activation = True):
        super().__init__()

        # Whether hidden linear layers are followed by a ReLU
        self.use_activation = use_activation

        # Layer widths: input -> hidden -> output
        self.num_hidden_layers = 1
        self.dimensions = [state_size, 128, action_size]

        # Build the stack: hidden (Linear [+ ReLU]) blocks, then the output layer
        self.layers = nn.ModuleList()
        hidden_in = self.dimensions[:self.num_hidden_layers]
        hidden_out = self.dimensions[1:self.num_hidden_layers + 1]
        for fan_in, fan_out in zip(hidden_in, hidden_out):
            self.layers.append(nn.Linear(fan_in, fan_out))
            if self.use_activation:
                self.layers.append(nn.ReLU())
        # Output layer maps the last hidden width to one logit per action
        self.layers.append(nn.Linear(self.dimensions[-2], self.dimensions[-1]))

    def forward(self, x):
        """Run ``x`` through every layer and return softmax probabilities over the last dim."""
        out = x
        for layer in self.layers:
            out = layer(out)

        # Normalise the logits into probabilities
        return torch.softmax(out, dim=-1)

# Option to initialize under the glorot distribution
def init_weights(m):
    """Apply Glorot (Xavier) normal initialisation to a linear layer.

    Can be passed to ``nn.Module.apply`` to visit every submodule. Modules
    whose type is not exactly ``nn.Linear`` are left untouched.

    Args:
        m: the module to initialise in place.
    """
    # Exact type match on purpose: subclasses of nn.Linear are skipped
    if type(m) is not nn.Linear:
        return

    torch.nn.init.xavier_normal_(m.weight)
    # A layer built with bias=False has bias set to None, so there is nothing to fill
    if m.bias is not None:
        torch.nn.init.constant_(m.bias, 0.001)

### Make the training graph

In [4]:
%matplotlib notebook
import matplotlib.pyplot as plt

max_episodes = 300

idxs = range(max_episodes)
fig, ax = plt.subplots(1, figsize=(10,6))
ax.set_xlabel('Steps')
ax.set_ylabel('Rewards')

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Rewards')

### Import the necessary functions and train the policy network

In [None]:
# NOTE(review): the wildcard import hides which names come from `reinforce`
# and can shadow notebook variables; only `Reinforce` is used in this cell.
# Kept as-is because other cells may rely on the extra names — confirm and
# replace with an explicit import.
from reinforce import *
import tqdm

# Configure parameters
n_runs = 1
print_every= 20
window_size = 25
train_batch_size = 5
test_batch_size = 30
max_eps_length = 500
normalize = True
reward_threshold = 400
discount_factor = 0.99

# Initialize reward storage points (one row per run, one column per episode)
train_rewards = torch.zeros(n_runs, max_episodes)
test_rewards = torch.zeros(n_runs, max_episodes)
losses = torch.zeros(n_runs, max_episodes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train policy using Reinforce with 0 baseline
for run in range(n_runs):

    # Fresh network and optimizer for every run
    policy = Policy()
    policy = policy.to(device)

    optimizer = optim.Adam(policy.parameters(), lr=1e-3)

    reinforce = Reinforce(normalize, max_eps_length, discount_factor, train_batch_size,
                          test_batch_size, policy, optimizer)

    for episode in tqdm.tqdm(range(max_episodes), desc=f'Run: {run}'):

        loss, train_reward = reinforce.train(env, device)

        test_reward = reinforce.evaluate(test_env, device)

        train_rewards[run][episode] = train_reward
        test_rewards[run][episode] = test_reward
        losses[run][episode] = loss

        # Rolling means over the last `window_size` episodes (fewer at the start).
        # They are refreshed on every episode so that the threshold check below
        # never compares against a value left over from an earlier print.
        window_start = max(0, episode - window_size + 1)
        mean_train_rewards = torch.mean(train_rewards[run, window_start:episode+1])
        mean_test_rewards = torch.mean(test_rewards[run, window_start:episode+1])
        mean_losses = torch.mean(losses[run, window_start:episode+1])

        if episode % print_every == 0:
            print(f'| Episode: {episode:3} | Mean Train Rewards: {mean_train_rewards:5.1f} ',
                  f'| Mean Test Rewards: {mean_test_rewards:5.1f} | Mean Losses: {mean_losses:5.1f} |')

        # Redraw the live plot. Only rows of runs that have started are used,
        # so zero rows of future runs do not drag the min/max band down.
        started_runs = test_rewards[:run+1, :]
        ax.clear()
        ax.plot(idxs, started_runs.mean(0))
        ax.fill_between(idxs, started_runs.min(0).values, started_runs.max(0).values, alpha=0.1)
        # ax.clear() also removes the axis labels, so restore them
        ax.set_xlabel('Steps')
        ax.set_ylabel('Rewards')

        fig.canvas.draw()

        # Early stop once the rolling test reward clears the threshold; checked
        # after the redraw so the final episode is visible in the figure.
        if (mean_test_rewards>reward_threshold):

            print(f'Reached reward threshold in {episode} episodes')
            # NOTE(review): file name says "unnorm" while normalize = True above — confirm
            torch.save(policy, "VPG_128_1e3_unnorm.pth") # Save best model
            break

Run: 0:   0%|▏                                                                         | 1/300 [00:00<01:40,  2.99it/s]

| Episode:   0 | Mean Train Rewards:  20.6  | Mean Test Rewards:   9.2 | Mean Losses:  -0.1 |


Run: 0:   6%|████▌                                                                    | 19/300 [00:08<02:55,  1.60it/s]

In [None]:
# Save through the figure handle created in the plotting cell rather than
# plt.savefig, which writes whatever pyplot considers the "current" figure
# and can produce a blank image when called from a later cell.
# NOTE(review): the name mentions "32times3" while Policy uses one hidden layer of 128 — confirm.
fig.savefig("VPG_32times3_1e3_norm_thresh400.png") # Save training figure

In [None]:
# Persist the whole trained policy object (not just its state_dict) to disk
torch.save(obj=policy, f="VPG_32times3_1e3_norm_thresh400.pth")