# Homework 3 - Deep Reinforcement Learning
## 2. Cartpole with pixels

In [1]:
#Autoreload imported functions
%load_ext autoreload
%autoreload 2

%matplotlib widget 
#Or use %matplotlib notebook
#I'm running jupyter inside of Visual Studio Code, so %matplotlib widget is needed for me.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl2latex import mpl2latex, latex_figsize
from plotting import COLUMNWIDTH

from pathlib import Path
import json
import random

import logging 
logging.basicConfig(filename='02_Cartpole_pixels.log', encoding='utf-8', level=logging.INFO, filemode='w')

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
import torchvision

import pytorch_lightning as pl

print("Torch version:", torch.__version__)

#Select device for training:
#the GPU is used whenever CUDA is available, otherwise training falls back to the CPU.
#(To force the CPU, replace the next line with `device = torch.device("cpu")`.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

logging.info(f"Using {device} for training")
print(f'Using "{device}" for training')

Torch version: 1.9.0
Using "cuda" for training


In [4]:
#---Rendering videos---#
import gym
import glob
import io
import base64
import os
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
from gym.wrappers import Monitor

# Create and start a non-visible (visible=0) virtual display of 700x450 pixels.
# NOTE(review): presumably required so that gym can render frames on a machine
# without a physical screen -- confirm.
display = Display(visible=0, size=(700, 450))
display.start()

<pyvirtualdisplay.display.Display at 0x7f45b081dfa0>

# Screen Pixels

3 pt: extend the notebook used in Lab 07 in order to learn to control the CartPole environment directly from the screen pixels, rather than from the compact state representation used during the Lab (cart position, cart velocity, pole angle, pole angular velocity). This requires changing the “observation_space”.

In [5]:
import random
import numpy as np
import cv2
from utils import wrap_env, CartpolePixels

#---Roll out one episode of the pixel-based CartPole with a random policy---#
env = wrap_env(CartpolePixels(gym.make("CartPole-v0")))

state = env.reset()
states = [state]  # all observations of the episode, the initial one included
done = False

while not done:
    # Choose one of the two actions (0 or 1) uniformly at random
    state, reward, done, info = env.step(random.choice([0, 1]))
    states.append(state)

env.close()

# One observation is stored per step, plus the one returned by `reset`
i = len(states) - 1
print(f"Total steps: {i}")

#---Save a video of what the network sees---#
size = states[0].shape
fps = 5
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
frame_size = (size[1], size[0])  # VideoWriter takes (width, height), i.e. the array shape reversed
out = cv2.VideoWriter('output.mp4', fourcc, fps, frame_size, False)  # last argument: isColor=False
for state in states:
    # assumes observations are floats in [0, 1], rescaled here to 8-bit pixels -- TODO confirm
    out.write(np.array(state * 255, dtype=np.uint8))
out.release()

Total steps: 32


In [6]:
import functools
import operator

class DuelingConvDQN(nn.Module):

    def __init__(self,
                 state_space_dim : tuple,
                 action_space_dim : int):

        """Convolutional Neural Network for Reinforcement Learning of the
        Cartpole environment using pixels.

        Architecture is taken from the Atari paper by Deepmind:
        https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf

        "Dueling networks" are an improvement of RL shown in this paper: 
        https://arxiv.org/abs/1511.06581

        The network is made of a shared convolutional trunk (16 filters 8x8
        with stride 4, then 32 filters 4x4 with stride 2) followed by two
        fully-connected streams with 256 hidden units each: the "advantage"
        stream (one output per action) and the "value" stream (one output).

        Parameters
        ----------
        state_space_dim : tuple of int
            Dimensions of a state, channels first: `state_space_dim[0]` is
            used as the number of input channels of the first convolution.
        action_space_dim : int
            Number of available actions, i.e. number of outputs of the network.
        """
        
        super().__init__()

        self.input_shape = state_space_dim

        self.conv = nn.Sequential(
            nn.Conv2d(state_space_dim[0], 16, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2),
            nn.ReLU(),
        )

        # Infer the size of the flattened convolutional output by pushing a
        # dummy batch of one state through the trunk. No gradient is needed,
        # and a constant tensor is used so that the global RNG is not consumed.
        with torch.no_grad():
            num_features = self.conv(torch.zeros(1, *state_space_dim)).numel()

        self.advantage = nn.Sequential(
            nn.Linear(num_features, 256), 
            nn.ReLU(),
            nn.Linear(256, action_space_dim) 
        )

        self.value = nn.Sequential(
            nn.Linear(num_features, 256), 
            nn.ReLU(),
            nn.Linear(256, 1) 
        )

    def forward(self, x : "torch.tensor"):
        """Compute the Q-values of every action for a batch of states.

        Parameters
        ----------
        x : torch.tensor
            Batch of states, of shape (batch_size, *state_space_dim)

        Returns
        -------
        q_values : torch.tensor
            Tensor of shape (batch_size, action_space_dim), computed as
            Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')
        """
        x = self.conv(x)
        x = torch.flatten(x, start_dim=1) #Flatten everything but the batch dimension

        advantage = self.advantage(x)
        value = self.value(x)

        # The advantage must be centered separately for each sample, i.e. the
        # mean is taken over the actions only (dim=1). Averaging over the whole
        # tensor would make the Q-values of a state depend on the other states
        # in the batch. For a batch of one state the two coincide.
        return value + advantage - advantage.mean(dim=1, keepdim=True)

In [7]:
#---Temperature/Epsilon---#

# Parameters of the exponential decay: from `epsilon_start` towards
# `epsilon_final`, with a characteristic scale of `epsilon_decay` frames.
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 2500 # (1300 is the alternative value left here by the author)


def epsilon_per_frame(xs):
    """Value of the exponentially decaying schedule at frame index (or array
    of frame indices) `xs`: equal to `epsilon_start` at `xs = 0` and
    approaching `epsilon_final` for large `xs`."""
    span = epsilon_start - epsilon_final
    return epsilon_final + span * np.exp(-1 * xs / epsilon_decay)


num_frames = 100_000
# "temperature" names the schedule for both epsilon and temperature, since
# either "epsilon-greedy" or "softmax" can be chosen as decision policy.
temperature = epsilon_per_frame(np.arange(num_frames))

In [8]:
from model import DeepQLearner
from gym.wrappers import FrameStack
from utils import CartpolePixels

def preprocess_env(env):
    """Wrap a CartPole env so that it yields stacks of pixel frames.

    Two wrappers are applied, in this order:
    1. `CartpolePixels`: return pixels instead of the compact state
    2. `FrameStack(..., 4)`: stack 4 frames in each observation, so that the
       network can infer velocity/acceleration
    """

    pixel_env = CartpolePixels(env)
    return FrameStack(pixel_env, 4)

# Hyper-parameters for `DeepQLearner` (passed as `hyper_parameters=` in the
# commented-out constructor call below).
# NOTE(review): the per-key notes are inferred from the key names; how each value
# is actually consumed is defined in `model.DeepQLearner` -- confirm there.
hyper_parameters = {
    "gamma" : .99,  # presumably the discount factor
    "replay_memory_capacity" : 10000,  # presumably the max number of stored transitions
    "min_samples_for_training" : 1000,  # presumably the replay size required before updates start
    "batch_size" : 32,
    "target_net_update_steps" : 300,  # presumably the interval between target network updates
    "loss_function" : "SmoothL1Loss",
    "temperature_policy" : temperature,  # the decaying schedule computed above from `epsilon_per_frame`
    "steps_per_epoch" : 1000,
    "learning_rate" : 1e-4
    
}
# RL_net = DeepQLearner(env="CartPole-v1",
#                       preprocess_env=preprocess_env,
#                       Network=DuelingConvDQN, 
#                       hyper_parameters=hyper_parameters,
#                       chosen_policy="epsilon-greedy",
#                       count_steps=True,
#                       update_temp_every_frame=True,
#                       update_target_every_frame=True,
#                       save_video_every_n_episodes=25,
#                       add_to_reward = lambda state, reward : -1 * np.abs(state[0])
#                       ) 
# [UNCOMMENT] the whole `RL_net = DeepQLearner(...)` block above if you want to initialize the model (for re-training it)

In [9]:
from pytorch_lightning import Callback

class LearningRateAdjust(Callback):
    """
    Adjusts the learning rate based on the current value of the reward.
    Higher reward = lower learning rate, which helps avoiding "catastrophic forgetting".

    The learning rate written to every parameter group of the optimizer is:
        reward <= 100 -> 1e-4
        reward >  100 -> .5e-4
        reward >  200 -> 1e-5
        reward >  300 -> 1e-6
        reward >  400 -> 1e-7
    where `reward` is read from `pl_module.total_reward`.
    """

    def __init__(self):
        super().__init__()
        self.n_episodes = 0

    def on_batch_start(self, trainer, pl_module):
        # NOTE(review): `self.n_episodes` is never updated after `__init__`, so
        # this guard only skips the batches before the first episode ends. From
        # then on the learning rate is recomputed at the start of every batch.
        # If a once-per-episode adjustment is intended, the counter has to be
        # synced with `pl_module.n_episodes` here -- confirm against how
        # `DeepQLearner` maintains `n_episodes` and `total_reward`.
        if not pl_module.n_episodes > self.n_episodes:
            return

        reward = pl_module.total_reward

        # Thresholds are in increasing order: the last one exceeded wins
        lr = 1e-4
        for threshold, reduced_lr in ((100, .5e-4), (200, 1e-5), (300, 1e-6), (400, 1e-7)):
            if reward > threshold:
                lr = reduced_lr

        for param_group in pl_module.optimizers().param_groups:
            param_group['lr'] = lr
            

In [10]:
from callbacks import NotebookProgressBar, StopAfterNEpisodes


bar = NotebookProgressBar()
lr_adj = LearningRateAdjust()
stop = StopAfterNEpisodes(1000)

# Request a GPU only when one is actually available, so that this cell also
# runs on CPU-only machines (the Trainer raises if `gpus=1` cannot be satisfied).
n_gpus = 1 if torch.cuda.is_available() else 0

trainer = pl.Trainer(gpus=n_gpus, max_steps=num_frames, callbacks=[bar, stop, lr_adj], gradient_clip_val=2)

#trainer.fit(RL_net)
# [UNCOMMENT] the previous line to re-run training. Otherwise, a later cell will load a saved checkpoint.
# During training, scores of episodes are saved in `02_Cartpole_pixels.log`.

In [11]:
#---Save the model and the learning stats---#

import pickle
from copy import deepcopy
from datetime import datetime

save = False
# [SET] save to True if you want to save a checkpoint for the trained model
# (unnecessary if training is not re-executed)

if save:
    # Files are named after the current time
    DATE_FMT = "%d_%m_%y-%Hh%Mm%S"
    date = datetime.now().strftime(DATE_FMT)

    # Hyper-parameters of the run, plus the recorded score/temperature histories
    all_info = deepcopy(RL_net.hyper_parameters)
    all_info["score"] = deepcopy(RL_net.episode_history)
    all_info["temp"]  = deepcopy(RL_net.temp_history)

    os.makedirs("SavedModels/2", exist_ok=True)

    with open(f"SavedModels/2/{date}.result", 'wb') as result_file:
        pickle.dump(all_info, result_file)

    trainer.save_checkpoint(f"SavedModels/2/{date}.ckpt")

In [15]:
from glob import glob
import pickle

#---Plot learning curves---#

def moving_average(x : "np.ndarray", w_size : int):
    """Compute the rolling average of a 1D array `x`, averaging the values
    within a window of length `w_size`.

    Only windows that lie entirely inside `x` are used, so the result has
    `len(x) - w_size + 1` points. If `x` is shorter than the window (or empty),
    two empty arrays are returned.

    Parameters
    ----------
    x : array-like
        1D sequence of values
    w_size : int
        Length of the averaging window (must be >= 1)

    Returns
    -------
    xs : np.ndarray
        Indices of the original array representing the "centers" of convolved points
    ys : np.ndarray
        Convolved points

    Raises
    ------
    ValueError
        If `w_size` is smaller than 1
    """

    if w_size < 1:
        raise ValueError(f"w_size must be a positive integer, got {w_size}")

    x = np.asarray(x)
    n_points = x.size - w_size + 1

    # With fewer samples than the window, np.convolve(..., 'valid') would swap
    # the roles of the two arrays and return values that do not match `xs`.
    if n_points < 1:
        return np.arange(0), np.empty(0)

    first_center = w_size // 2
    xs = np.arange(first_center, first_center + n_points)
    ys = np.convolve(x, np.ones(w_size), 'valid') / w_size

    return xs, ys

# Make sure the output folder exists, otherwise `savefig` fails
os.makedirs("Plots/2", exist_ok=True)

# One figure per saved run, processed in a deterministic (sorted) order
for file in sorted(glob("SavedModels/2/*.result")):
    with open(file, 'rb') as f:
        # NOTE: pickle can execute arbitrary code when loading; only open
        # `.result` files produced by this notebook.
        run_info = pickle.load(f)

    with mpl2latex(True):
        fig, ax1 = plt.subplots(figsize=latex_figsize(wf=1., columnwidth=COLUMNWIDTH))

        # Left axis: score of each episode (raw and averaged over 10 episodes)
        color = 'tab:red'
        ax1.plot(run_info['score'], color=color, label='Score (Raw)', alpha=.4)
        ax1.plot(*moving_average(run_info['score'], 10), '--', color=color, label='Score (Avg. 10)')
        ax1.set_xlabel('Episode')
        ax1.set_ylabel('Score', color=color)
        ax1.tick_params(axis='y', labelcolor=color)

        # Right axis: recorded epsilon/temperature history
        ax2 = ax1.twinx()

        color = 'tab:blue'
        ax2.set_ylabel('Epsilon', color=color, rotation=270, labelpad=15)
        ax2.plot(run_info['temp'], color=color, label='Epsilon')
        ax2.tick_params(axis='y', labelcolor=color)

        ax1.set_title("CartPole-v1-Pixels - Training")

        # Summary box with the main hyper-parameters of the run.
        # Underscores are escaped ("\_") because the text is rendered by LaTeX.
        textstr = "\n".join((
            f"replay\\_capacity: {run_info['replay_memory_capacity']}",
            f"gamma: {run_info['gamma']}",
            f"lr: {run_info['learning_rate']}",
            f"batch: {run_info['batch_size']}",
            f"update\\_every: {run_info['target_net_update_steps']}",
        ))

        props = dict(boxstyle='round, pad=.3', facecolor='gray', alpha=.1)
        ax1.text(0.35, 0.95, textstr, transform=ax1.transAxes, fontsize=10, verticalalignment='top', bbox=props)
        fig.tight_layout()

        ax1.legend(loc=(0.05, 0.1))
        ax2.legend(loc=(0.05, 0.3))

        # The figure is saved under the same base name as the `.result` file
        filename = os.path.splitext(os.path.basename(file))[0]
        fig.savefig(f"Plots/2/{filename}.pdf", transparent=True, bbox_inches='tight')


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [12]:
from model import DeepQLearner
from utils import wrap_env, CartpolePixels
from gym.wrappers import FrameStack
from tqdm.notebook import tqdm
import cv2

def preprocess_env(env):
    """Wrap a CartPole env so that it yields stacks of pixel frames.

    NOTE: this duplicates the `preprocess_env` already defined in an earlier cell.
    """

    # Return pixels instead of the state
    pixel_env = CartpolePixels(env)

    # Stack 4 frames in each observation,
    # so that the network can infer velocity/acceleration
    return FrameStack(pixel_env, 4)

#---Test the final training---#

model = DeepQLearner.load_from_checkpoint("SavedModels/2/25_06_21-02h24m16.ckpt", min_samples_for_training=0)
model.eval()

# Initialize the Gym environment, with the same preprocessing defined above
env = gym.make('CartPole-v1')
env = preprocess_env(env)
env.seed(1) # Set a random seed for the environment (reproducible results)

# This is for creating the output video in Colab, not required outside Colab
env = wrap_env(env, video_callable=lambda episode_id: True) # Save a video every episode

# Folder for the "what the network sees" videos; without it the writer cannot create its files
os.makedirs("video", exist_ok=True)

n_test_episodes = 10
scores = []

for num_episode in tqdm(range(n_test_episodes)):
    state = env.reset()
    states = []  # Observations of THIS episode only, so each video shows a single episode
    score = 0
    done = False

    while not done:
        states.append(state)

        # Greedy policy: pick the action with the highest predicted Q-value
        with torch.no_grad():
            q_values = model.policy_net(torch.tensor(state, dtype=torch.float32).unsqueeze(0))
        action = int(q_values.argmax())

        state, reward, done, info = env.step(action)
        env.render()
        score += reward

    # Print the final score
    scores.append(score)
    print(f"EPISODE {num_episode + 1} - FINAL SCORE: {score}")

    # Save a video of what the network sees
    size = states[0][0].shape
    fps = 25
    out = cv2.VideoWriter(f'video/cnn_{num_episode}.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (size[1], size[0]), False)
    for stacked_frames in states:
        # Only the last frame of each stack is written
        # (presumably the most recent one -- confirm FrameStack ordering)
        out.write(np.array(stacked_frames[-1] * 255, dtype=np.uint8))
    out.release()

env.close()
print(f"Average score ({n_test_episodes} trials): {np.mean(scores):.3f}")

0it [00:00, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

EPISODE 1 - FINAL SCORE: 500.0
EPISODE 2 - FINAL SCORE: 392.0
EPISODE 3 - FINAL SCORE: 500.0
EPISODE 4 - FINAL SCORE: 500.0
EPISODE 5 - FINAL SCORE: 500.0
EPISODE 6 - FINAL SCORE: 500.0
EPISODE 7 - FINAL SCORE: 408.0
EPISODE 8 - FINAL SCORE: 500.0
EPISODE 9 - FINAL SCORE: 454.0
EPISODE 10 - FINAL SCORE: 500.0
Average score (10 trials): 475.400
