# Homework 3 - Deep Reinforcement Learning
## 3. Pong with pixels

In [1]:
#Autoreload imported functions
%load_ext autoreload
%autoreload 2

%matplotlib widget 
#Or use %matplotlib notebook
#I'm running jupyter inside of Visual Studio Code, so %matplotlib widget is needed for me.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl2latex import mpl2latex, latex_figsize
from plotting import COLUMNWIDTH

from pathlib import Path
import json
import random

import logging 
logging.basicConfig(filename='03_pong.log', encoding='utf-8', level=logging.INFO, filemode='w')

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split
import torchvision

import pytorch_lightning as pl

# Report the installed PyTorch version
print("Torch version:", torch.__version__)

# Select the device for training: CUDA when it is available, otherwise CPU.
# To force CPU, replace the next statement with: device = torch.device("cpu")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Record the selected device both through `logging` and in the cell output
logging.info(f"Using {device} for training")
print(f'Using "{device}" for training')

Torch version: 1.9.0
Using "cuda" for training


In [4]:
#---Rendering videos---#
import gym
import glob
import io
import base64
import os
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
from gym.wrappers import Monitor

# Start a non-visible 700x450 virtual display.
# NOTE(review): presumably needed so that env.render() / video recording work
# on a machine without a physical screen -- confirm.
display = Display(visible=0, size=(700, 450))
display.start()

<pyvirtualdisplay.display.Display at 0x7f4154e84850>

# Different environment
3 pt: train a deep RL agent on a different Gym environment. You are free to choose whatever Gym environment you like from the available list (https://gym.openai.com/envs), or even explore other simulation platforms.

In [5]:
# [INSTALL NOTE] Download ROMs from: https://github.com/openai/atari-py#roms
#Then run this command:
#python -m atari_py.import_roms <full path to ROM folder>
#e.g. python -m atari_py.import_roms C:\Users\franc\Documents\GitHub\DLNAssignments\Homeworks\3\ROMS\

In [6]:
from utils import wrap_env

#---Test the Atari environment---#
# Smoke test: play one full episode with uniformly random actions and report
# how many steps it took and the total reward collected.
env = gym.make('PongNoFrameskip-v4')
# NOTE: PongNoFrameskip-v4 is chosen instead of Pong-v0, because the latter
# by default adds randomness to actions, by repeating them for [2,4] frames,
# which makes learning more difficult.

# The callable always returns True, i.e. every episode is selected for video capture.
# NOTE(review): wrap_env comes from the project's `utils` module (not shown here);
# presumably it wraps the environment in a gym Monitor -- confirm.
env = wrap_env(env, video_callable=lambda episode_id: True)

state = env.reset()

done = False
n = 0 # Number of env.step calls performed
total_reward = 0 # Sum of the rewards returned by every step
while not done:
    # Sample a random action from the action space and advance the environment
    state, reward, done, info = env.step(env.action_space.sample())
    env.render()
    
    n += 1
    total_reward += reward

env.close()

print(f"Run for {n} frames")
print(f"Final reward: {total_reward}")

Run for 3648 frames
Final reward: -21.0


In [7]:
import functools
import operator

class DuelingConvDQN(nn.Module):

    def __init__(self,
                 state_space_dim : tuple,
                 action_space_dim : int):
        """Dueling convolutional Q-network for learning Pong from pixels.

        The convolutional trunk (16 filters 8x8 stride 4, then 32 filters 4x4
        stride 2, followed by a 256-unit hidden layer) is taken from the Atari
        paper by Deepmind:
        https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf

        "Dueling networks" are an improvement of RL shown in this paper: 
        https://arxiv.org/abs/1511.06581
        The flattened convolutional features feed two separate heads, one
        estimating the state value V(s) and one estimating the advantages
        A(s, a); they are recombined in `forward`.

        Parameters
        ----------
        state_space_dim : tuple of int
            Dimensions of a state, as (channels, height, width)
        action_space_dim : int
            How many actions are available
        """
        super().__init__()

        self.input_shape = state_space_dim

        self.conv = nn.Sequential(
            nn.Conv2d(state_space_dim[0], 16, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2),
            nn.ReLU(),
        )

        # Infer the size of the flattened features by pushing a dummy batch of
        # one state through the convolutional trunk (no gradients needed).
        with torch.no_grad():
            num_features = self.conv(torch.rand(1, *state_space_dim)).numel()

        self.advantage = nn.Sequential(
            nn.Linear(num_features, 256), 
            nn.ReLU(),
            nn.Linear(256, action_space_dim) 
        )

        self.value = nn.Sequential(
            nn.Linear(num_features, 256), 
            nn.ReLU(),
            nn.Linear(256, 1) 
        )

    def forward(self, x : "torch.tensor"):
        """Compute Q(s, a) for a batch of states.

        Parameters
        ----------
        x : torch.tensor
            Batch of states, shape (batch, *state_space_dim)

        Returns
        -------
        torch.tensor
            Q-values, shape (batch, action_space_dim)
        """
        features = self.conv(x)
        features = features.view(features.size(0), -1) #Flatten

        advantage = self.advantage(features) #(batch, n_actions)
        value = self.value(features)         #(batch, 1)

        # Q(s,a) = V(s) + A(s,a) - mean_a' A(s,a').
        # The mean must be taken over the actions of EACH sample (dim=1):
        # a global `.mean()` would also average over the batch dimension, making
        # the Q-values of one state depend on the other states in the batch.
        # For a batch of a single state both forms give the same result.
        return value + advantage - advantage.mean(dim=1, keepdim=True)


In [8]:
#---Wrappers---#
import random
import cv2 

class PongPixels(gym.ObservationWrapper):

    def __init__(self, env):
        """
        Observation wrapper turning raw RGB Pong frames into single-channel
        84x84 float images in [0, 1], following the preprocessing of the Atari
        paper by Deepmind:
        https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf

        Parameters
        ----------
        env : "gym.Env"
            Instance of Pong environment
        """

        super().__init__(env)

        side = 84
        self.image_width = side
        self.image_height = side

        # Observations are channel-first: (1, height, width)
        obs_shape = (1, self.image_height, self.image_width)
        self.observation_space = gym.spaces.Box(low=0., high=1., shape=obs_shape, dtype=np.float32)

    def observation(self, frame):
        """Convert one RGB frame to a normalized grayscale array of shape (1, 84, 84)."""
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)

        # cv2.resize takes the target size as (width, height)
        target_size = (self.image_width, self.image_height)
        small = cv2.resize(gray, target_size, interpolation=cv2.INTER_AREA)

        # Add the leading channel axis and rescale pixel values to [0, 1]
        scaled = small[np.newaxis, :, :] / 255.
        return scaled.astype(np.float32)

#---Print an image of what the network sees---#
env = gym.make("PongNoFrameskip-v4")
env = PongPixels(env)

state = env.reset()
for _ in range(161): #Step a bit (always action 0) so that the frame shows the game in progress
    state, reward, done, info = env.step(0)

print(state.shape)
fig = plt.figure()
plt.imshow(state[0], cmap='gist_gray')
plt.show()

#---Save a video of an episode---#
state = env.reset()
states = [state]

# Play one episode with random actions. The valid action indices are taken
# from the environment instead of being hard-coded.
num_actions = env.action_space.n
done = False
while not done:
    state, reward, done, info = env.step(random.randint(0, num_actions - 1))
    states.append(state)

# The environment is not needed any more: release its resources
env.close()

fps = 25
frame_size = (env.image_width, env.image_height) #cv2 expects (width, height)
out = cv2.VideoWriter('cnn_pong.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, frame_size, False)
try:
    for frame in states:
        #Frames are floats in [0, 1]: convert back to 8-bit grayscale
        out.write(np.array(frame[0] * 255, dtype=np.uint8))
finally:
    out.release() #Always finalize the file, even if writing a frame fails

(1, 84, 84)


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [9]:
class EndEpisodeOnLosingLife(gym.Wrapper):
    def __init__(self, env : "gym.Env"):
        """
        An episode is terminated when a life is lost, and not on game over
        (all lives lost). This makes episodes shorter, and so learning faster.
        Moreover, it helps the RL agent understand that "losing a life = bad".

        It was originally done in the Nature paper by DeepMind researchers:
        https://www.nature.com/articles/nature14236

        However, its usefulness is debated: for instance,
        https://arxiv.org/abs/1709.06009 recommends against it. In this
        homework, it is mainly done for performance reasons (shorter episodes).

        NOTE(review): an episode is cut only when the number of lives decreases
        to a value that is still > 0. If the emulator reports 0 lives during
        the whole game, this wrapper never ends an episode early -- verify
        what `ale.lives()` returns for Pong.

        Parameters
        ----------
        env : "gym.Env"
            Pong Environment
        """

        super().__init__(env)

        self.env = env

        self.previous_lives = 0 #Lives seen after the last step/reset
        self.final_done = True  #`done` of the wrapped env at the last step (True = real game over)

    def get_lives(self):
        """Return how many lives the player currently has."""
        return self.env.unwrapped.ale.lives()

    def step(self, action):
        """Step the wrapped env, reporting `done=True` also when a life is lost."""
        state, reward, done, info = self.env.step(action)
        self.final_done = done #Remember whether the underlying game is really over

        current_lives = self.get_lives()
        if 0 < current_lives < self.previous_lives: #If the player has lost a life, but there are other lives left
            done = True #Terminate the episode

        self.previous_lives = current_lives

        return state, reward, done, info
    
    def reset(self, **kwargs):
        """Start a new episode.

        The wrapped environment is really reset only after a game over.
        After a lost life, the game simply continues from a no-op step.
        """
        if self.final_done: #Reset only on losing the last life
            state = self.env.reset(**kwargs)
        else:
            state, reward, done, info = self.env.step(0) #Skip an action
            #(The game has a reset frame after losing a life)

            if done:
                #The no-op step itself can end the game: in that case a real
                #reset is needed, otherwise the next `step` would act on an
                #environment that has already terminated.
                state = self.env.reset(**kwargs)
        
        self.previous_lives = self.get_lives()

        return state


In [10]:
class SkipFrames(gym.Wrapper):
    def __init__(self, env : "gym.Env", skip : int = 4):
        """
        Frame-skipping wrapper: every call to `step` applies the same action to
        the wrapped environment up to `skip` times in a row. Only the last
        observation is handed back, together with the sum of the rewards
        collected over the repeated frames.
        """

        super().__init__(env)
        self.env = env
        self.skip = skip

    def reset(self, **kwargs):
        """Reset the wrapped environment and return its first observation."""
        first_observation = self.env.reset(**kwargs)
        return first_observation

    def step(self, action):
        """Repeat `action`, stopping early as soon as the episode ends."""
        reward_sum = 0.0
        episode_over = False

        for _ in range(self.skip):
            state, frame_reward, episode_over, info = self.env.step(action)
            reward_sum += frame_reward

            if episode_over:
                # Do not step an environment that has already terminated
                break

        return state, reward_sum, episode_over, info

In [11]:
from model import DeepQLearner
import numpy as np
import math

#---Make environment---#
# Wrappers are applied from the innermost to the outermost: frame skipping,
# then end-of-episode on a lost life, then pixel preprocessing.
env = gym.make("PongNoFrameskip-v4")
env = SkipFrames(env, 4)
env = EndEpisodeOnLosingLife(env)
env = PongPixels(env)

# Parameters of the exploration schedule (see `epsilon_per_frame`)
epsilon_start = 1.0   # value at frame 0
epsilon_final = 0.01  # asymptotic value
epsilon_decay = 30000 # decay constant of the exponential, in frames

#---Define epsilon decay---#
def epsilon_per_frame(xs):
    """Exponentially decaying epsilon.

    Computes epsilon_final + (epsilon_start - epsilon_final) * exp(-xs / epsilon_decay),
    reading the three constants from the module-level variables.

    Parameters
    ----------
    xs : number or np.ndarray
        Frame index (or array of frame indices)

    Returns
    -------
    Value(s) of epsilon at the given frame(s), same shape as `xs`
    """
    return epsilon_final + (epsilon_start - epsilon_final) * np.exp(-1 * xs / epsilon_decay)

num_frames = 1000000
# One epsilon value for every frame index 0 .. num_frames - 1
temperature = epsilon_per_frame(np.arange(num_frames))

# NOTE(review): these entries are consumed by `DeepQLearner` (project module
# `model`, not shown here); their exact semantics should be checked there.
hyper_parameters = {
    "gamma" : .99,
    "replay_memory_capacity" : 100000,
    "min_samples_for_training" : 10000,
    "batch_size" : 32,
    "target_net_update_steps" : 1000,
    "loss_function" : "SmoothL1Loss",
    "temperature_policy" : temperature,
    "steps_per_epoch" : 10000,
    "learning_rate" : 1e-4
    
}
# RL_net = DeepQLearner(env=env, 
#                       Network=DuelingConvDQN, 
#                       hyper_parameters=hyper_parameters,
#                       chosen_policy="epsilon-greedy",
#                       count_steps=False,
#                       update_temp_every_frame=True,
#                       update_target_every_frame=True,
#                       save_video_every_n_episodes=1,
#                       ) 
# [UNCOMMENT] the previous block (the whole `RL_net = DeepQLearner(...)` call)
# if you want to initialize the model (for re-training it)

In [12]:
from callbacks import NotebookProgressBar

bar = NotebookProgressBar()

# Train on one GPU when CUDA is available, otherwise on the CPU (gpus=0).
# A hard-coded `gpus=1` would make the Trainer fail on machines without a GPU.
# Training is capped at `num_frames` steps and gradients are clipped at 2.
trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0,
                     max_steps=num_frames,
                     callbacks=[bar],
                     gradient_clip_val=2)

#trainer.fit(RL_net) 
# [UNCOMMENT] the previous line to re-run training. Otherwise, the next cell will load a saved checkpoint.
# During training, scores of episodes are saved in `03_pong.log`. 

In [13]:
import pickle
from copy import deepcopy
from datetime import datetime

save = False
# [SET] save to True if you want to save a checkpoint for the trained model
# (unnecessary if training is not re-executed)
# NOTE: the branch below reads `RL_net` and `trainer`, so it can only run after
# `RL_net` has been created (its initialization is commented out by default).

if save:
    # Bundle the hyper-parameters and the recorded histories in a single dict
    all_info = deepcopy(RL_net.hyper_parameters)
    all_info["score"] = deepcopy(RL_net.episode_history) # presumably one score per episode -- confirm in DeepQLearner
    all_info["temp"]  = deepcopy(RL_net.temp_history)

    # Timestamp used as the common file stem of the two saved files
    DATE_FMT = "%d_%m_%y-%Hh%Mm%S"
    now = datetime.now()
    date = now.strftime(DATE_FMT) #Current time

    os.makedirs("SavedModels/3/", exist_ok=True)

    # <date>.result : pickled dict with hyper-parameters and histories
    with open(f"SavedModels/3/{date}.result", 'wb') as file:
        file.write(pickle.dumps(all_info))

    # <date>.ckpt : checkpoint written by the PyTorch Lightning trainer
    trainer.save_checkpoint(f"SavedModels/3/{date}.ckpt")

In [16]:
from glob import glob
import pickle

#---Plot learning curves---#

def moving_average(x : "np.ndarray", w_size : int):
    """Compute the rolling average of a 1D array `x`, averaging the values
    within a window of length `w_size`.

    Only full windows are used, so the result has `len(x) - w_size + 1`
    points. If `x` is shorter than the window (or empty), two empty arrays
    are returned, so that `xs` and `ys` always have the same length.

    Parameters
    ----------
    x : array-like
        1D sequence of values
    w_size : int
        Length of the averaging window (must be >= 1)

    Returns
    -------
    xs : np.ndarray
        Indices of the original array representing the "centers" of convolved points
    ys : np.ndarray
        Convolved points

    Raises
    ------
    ValueError
        If `w_size` is smaller than 1
    """

    if w_size < 1:
        raise ValueError(f"w_size must be >= 1, got {w_size}")

    values = np.asarray(x, dtype=float)
    num_points = len(values) - w_size + 1

    if num_points <= 0:
        # No full window fits. np.convolve(..., 'valid') would swap the roles of
        # its arguments here and return points unrelated to the (empty) xs.
        return np.arange(0), np.empty(0)

    start = w_size // 2 #Center of the first window
    xs = np.arange(start, start + num_points)
    ys = np.convolve(values, np.ones(w_size), 'valid') / w_size

    return xs, ys

# Make sure that the output folder exists before saving any figure
os.makedirs("Plots/3", exist_ok=True)

# One figure for every saved training result (sorted for a reproducible order)
for file in sorted(glob("SavedModels/3/*.result")):
    # NOTE: pickle can execute arbitrary code while loading. Only open
    # `.result` files produced by this notebook, never untrusted ones.
    with open(file, 'rb') as f:
        results = pickle.load(f)

    with mpl2latex(True):
        fig, ax1 = plt.subplots(figsize=latex_figsize(wf=1., columnwidth=COLUMNWIDTH))

        # Left axis: score of every episode (raw + moving average over 10 episodes)
        color = 'tab:red'
        ax1.plot(results['score'], color=color, label='Score (Raw)', alpha=.4)
        ax1.plot(*moving_average(results['score'], 10), '--', color=color, label='Score (Avg. 10)')
        ax1.set_xlabel('Episode')
        ax1.set_ylabel('Score', color=color)
        ax1.tick_params(axis='y', labelcolor=color)

        # Right axis (sharing the same x): exploration parameter
        ax2 = ax1.twinx()
        
        color = 'tab:blue'
        ax2.set_ylabel('Epsilon', color=color, rotation=270, labelpad=15)
        ax2.plot(results['temp'], color=color, label='Epsilon')
        ax2.tick_params(axis='y', labelcolor=color)

        ax1.set_title("Pong-Pixels - Training")

        # Box summarizing the main hyper-parameters. Underscores are escaped
        # for LaTeX; the backslash is doubled so that Python does not see the
        # invalid escape sequence "\_" (the resulting text is unchanged).
        textstr = "\n".join([
            f"replay\\_capacity: {results['replay_memory_capacity']}",
            f"gamma: {results['gamma']}",
            f"lr: {results['learning_rate']}",
            f"batch: {results['batch_size']}",
            f"update\\_every: {results['target_net_update_steps']}",
        ])

        props = dict(boxstyle='round, pad=.3', facecolor='gray', alpha=.1)
        ax1.text(0.65, 0.3, textstr, transform=ax1.transAxes, fontsize=10, verticalalignment='top', bbox=props)
        fig.tight_layout()

        ax1.legend(loc=(0.1, 0.7))
        ax2.legend(loc=(0.1, 0.9))

        # Save the figure with the same name as the result file
        filename = os.path.splitext(os.path.basename(file))[0]
        fig.savefig(f"Plots/3/{filename}.pdf", transparent=True, bbox_inches='tight')


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [15]:
#---Test---#

from model import DeepQLearner
from utils import wrap_env
from tqdm.notebook import tqdm

#---Test the final training---#

NUM_TEST_EPISODES = 10 # How many episodes are played for the evaluation
VIDEO_DIR = "video"    # Folder for the videos of what the network sees

model = DeepQLearner.load_from_checkpoint("SavedModels/3/25_06_21-22h54m13.ckpt", min_samples_for_training=0) 

# Initialize the Gym environment
env = gym.make("PongNoFrameskip-v4")
env = SkipFrames(env, 4)
#env = EndEpisodeOnLosingLife(env)
env = PongPixels(env)
env.seed(1) # Set a random seed for the environment (reproducible results)

env = wrap_env(env, video_callable=lambda episode_id: True) # Save a video every episode

# cv2.VideoWriter does not create missing folders
os.makedirs(VIDEO_DIR, exist_ok=True)

scores = []
model.eval()

for num_episode in tqdm(range(NUM_TEST_EPISODES)): 
    state = env.reset()
    states = [] # Frames of THIS episode only (otherwise every video would also contain all the previous episodes)
    score = 0
    done = False

    while not done:
        states.append(state)

        # Greedy policy: choose the action with the highest predicted Q-value
        with torch.no_grad():
            q_values = model.policy_net(torch.tensor(state, dtype=torch.float32).unsqueeze(0))
            action = int(q_values.argmax())

        state, reward, done, info = env.step(action)

        env.render()
        score += reward 

    print(f"EPISODE {num_episode + 1} - FINAL SCORE: {score}") 
    scores.append(score)

    # Save a video of what the network sees
    height, width = states[0][0].shape
    fps = 25
    out = cv2.VideoWriter(f'{VIDEO_DIR}/cnn_{num_episode}.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height), False)
    try:
        for frame in states:
            # Frames are floats in [0, 1]: convert back to 8-bit grayscale
            out.write(np.array(frame[-1] * 255, dtype=np.uint8))
    finally:
        out.release() # Always finalize the file, even if writing a frame fails
  
print(f"Average score ({len(scores)} trials): {np.mean(scores):.3f}")
env.close()



0it [00:00, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

EPISODE 1 - FINAL SCORE: 20.0
EPISODE 2 - FINAL SCORE: 20.0
EPISODE 3 - FINAL SCORE: 20.0
EPISODE 4 - FINAL SCORE: 20.0
EPISODE 5 - FINAL SCORE: 20.0
EPISODE 6 - FINAL SCORE: 20.0
EPISODE 7 - FINAL SCORE: 20.0
EPISODE 8 - FINAL SCORE: 20.0
EPISODE 9 - FINAL SCORE: 20.0
EPISODE 10 - FINAL SCORE: 20.0
Average score (10 trials): 20.000
