# Cartpole Playground

### Import Packages

In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

### Create and reset the environment

In [2]:
# Create the environment
env = gym.make("CartPole-v1") #, render_mode="human")
test_env = gym.make("CartPole-v1")
state, info = env.reset()

SEED = 1234

np.random.seed(SEED);
torch.manual_seed(SEED);

# Get the state and action sizes
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

### Create the policy network

In [3]:
# Define the policy function (neural network)
class Policy(nn.Module):
    def __init__(self, use_activation = True):
        super(Policy, self).__init__()
        
        # Policy activation and dropout
        self.use_activation = use_activation
        
        # Define paramaeters for layers
        self.num_hidden_layers = 3
        self.dimensions = [state_size, 32, 32, 32, action_size]
        
        # Define layers
        self.layers = nn.ModuleList()
        for i in range(self.num_hidden_layers):
            # Linear layer
            self.layers.append(nn.Linear(self.dimensions[i], self.dimensions[i+1]))
            # Activation layer
            if self.use_activation:
                self.layers.append(nn.ReLU())
        # Output layer
        self.layers.append(nn.Linear(self.dimensions[-2], self.dimensions[-1]))
        
    # Forward pass of the policy
    def forward(self, x):
        
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            
        # Softmax to get probs
        return torch.softmax(x, dim=-1)

# Option to initialize under the glorot distribution
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_normal_(m.weight)
        m.bias.data.fill_(0.001)

### Make the training graph

In [4]:
%matplotlib notebook
import matplotlib.pyplot as plt

max_episodes = 300

idxs = range(max_episodes)
fig, ax = plt.subplots(1, figsize=(10,6))
ax.set_xlabel('Steps')
ax.set_ylabel('Rewards')

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Rewards')

### Import the necessary functions and train the policy network

In [5]:
from reinforce import *
from rl_functions import * 
import tqdm

# Configure parameters
n_runs = 1
print_every= 20
window_size = 25
train_batch_size = 5
test_batch_size = 30
max_eps_length = 500
normalize = False
reward_threshold = 400
discount_factor = 0.99

# Initialize reward storage points
train_rewards = torch.zeros(n_runs, max_episodes)
test_rewards = torch.zeros(n_runs, max_episodes)
losses = torch.zeros(n_runs, max_episodes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train policy using Reinforce with 0 baseline
for run in range(n_runs):
    
    policy = Policy()
    policy = policy.to(device)

    optimizer = optim.Adam(policy.parameters(), lr=1e-3)
    
    for episode in tqdm.tqdm(range(max_episodes), desc=f'Run: {run}'):
        
        loss, train_reward = train(env, policy, optimizer, update_policy_reinforce_static_baseline, normalize, 
                                   max_eps_length, discount_factor, train_batch_size, device)
        
        test_reward = evaluate(test_env, policy, max_eps_length, test_batch_size, device)
        
        train_rewards[run][episode] = train_reward
        test_rewards[run][episode] = test_reward
        losses[run][episode] = loss
        
        if episode % print_every == 0:
            mean_train_rewards = torch.mean(train_rewards[run, max(0,episode-window_size):episode+1])
            mean_test_rewards = torch.mean(test_rewards[run, max(0,episode-window_size):episode+1])
            mean_losses = torch.mean(losses[run, max(0,episode-window_size):episode+1])
        
            print(f'| Episode: {episode:3} | Mean Train Rewards: {mean_train_rewards:5.1f} ',
                  f'| Mean Test Rewards: {mean_test_rewards:5.1f} | Mean Losses: {mean_losses:5.1f} |')
        
        if (mean_test_rewards>reward_threshold):
            
            print(f'Reached reward threshold in {episode} episodes')
            torch.save(policy, "VPG_128_1e3_unnorm.pth") # Save best model
            break
            
        ax.clear()
        ax.plot(idxs, test_rewards[:run+1, :].mean(0))
        ax.fill_between(idxs, test_rewards.min(0).values, test_rewards.max(0).values, alpha=0.1)
        
        fig.canvas.draw()

Run: 0:   0%|▏                                                                         | 1/300 [00:00<00:50,  5.95it/s]

| Episode:   0 | Mean Train Rewards:  17.6  | Mean Test Rewards:   9.3 | Mean Losses: 115.7 |


Run: 0:   7%|█████                                                                    | 21/300 [00:03<00:42,  6.55it/s]

| Episode:  20 | Mean Train Rewards:  21.5  | Mean Test Rewards:   9.3 | Mean Losses: 191.4 |


Run: 0:  14%|█████████▉                                                               | 41/300 [00:06<00:49,  5.23it/s]

| Episode:  40 | Mean Train Rewards:  22.8  | Mean Test Rewards:   9.4 | Mean Losses: 220.7 |


Run: 0:  20%|██████████████▊                                                          | 61/300 [00:10<00:52,  4.53it/s]

| Episode:  60 | Mean Train Rewards:  26.2  | Mean Test Rewards:  10.4 | Mean Losses: 268.4 |


Run: 0:  27%|███████████████████▋                                                     | 81/300 [00:20<02:23,  1.53it/s]

| Episode:  80 | Mean Train Rewards:  28.2  | Mean Test Rewards:  70.1 | Mean Losses: 301.2 |


Run: 0:  34%|████████████████████████▏                                               | 101/300 [00:28<01:52,  1.77it/s]

| Episode: 100 | Mean Train Rewards:  28.2  | Mean Test Rewards: 103.4 | Mean Losses: 298.0 |


Run: 0:  40%|█████████████████████████████                                           | 121/300 [00:42<02:24,  1.24it/s]

| Episode: 120 | Mean Train Rewards:  36.4  | Mean Test Rewards: 145.2 | Mean Losses: 483.7 |


Run: 0:  47%|█████████████████████████████████▊                                      | 141/300 [00:51<01:12,  2.21it/s]

| Episode: 140 | Mean Train Rewards:  41.4  | Mean Test Rewards: 100.7 | Mean Losses: 580.3 |


Run: 0:  54%|██████████████████████████████████████▋                                 | 161/300 [01:02<01:46,  1.30it/s]

| Episode: 160 | Mean Train Rewards:  46.0  | Mean Test Rewards:  88.1 | Mean Losses: 688.8 |


Run: 0:  60%|███████████████████████████████████████████▍                            | 181/300 [01:20<01:33,  1.27it/s]

| Episode: 180 | Mean Train Rewards:  55.5  | Mean Test Rewards: 189.8 | Mean Losses: 955.1 |


Run: 0:  67%|████████████████████████████████████████████████▏                       | 201/300 [01:41<01:24,  1.17it/s]

| Episode: 200 | Mean Train Rewards:  63.1  | Mean Test Rewards: 203.8 | Mean Losses: 1233.1 |


Run: 0:  74%|█████████████████████████████████████████████████████                   | 221/300 [02:01<01:44,  1.32s/it]

| Episode: 220 | Mean Train Rewards:  62.5  | Mean Test Rewards: 202.5 | Mean Losses: 1221.5 |


Run: 0:  80%|█████████████████████████████████████████████████████████▊              | 241/300 [02:31<01:42,  1.74s/it]

| Episode: 240 | Mean Train Rewards:  72.2  | Mean Test Rewards: 351.8 | Mean Losses: 1555.2 |


Run: 0:  87%|██████████████████████████████████████████████████████████████▍         | 260/300 [03:04<00:28,  1.41it/s]

| Episode: 260 | Mean Train Rewards:  76.1  | Mean Test Rewards: 404.2 | Mean Losses: 1661.2 |
Reached reward threshold in 260 episodes





In [6]:
plt.savefig("VPG_32times3_1e3_unnorm.png") # Save training figure

In [8]:
torch.save(policy, "VPG_32times3_1e3_unnorm.pth") # Save best model