Note : This module requires GPU

## CellStrat Hub Pack - Reinforcement Learning
### RL5 - Deep Q Networks

#### Instructions to be followed:
* This kernel must require **GPU** and with CPU training will take  longer time
* Change the kernel to **PyTorch 1.9**


In [1]:
#==============================================================================
# If any library needs to be installed, install with following command :-
# pip install <library-name>
# This pip command should be in an independent cell with no other code or comments in this cell.
#==============================================================================


In [None]:
## This could take few minutes to run

# -*- coding: utf-8 -*-

#==============================================================================
# Install packages
#==============================================================================
!pip install gym > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!pip install tensorboardX > /dev/null 2>&1
!python -m atari_py.import_roms '/home/ec2-user/SageMaker/RL_Pack/RL5 - Deep Q Newtworks/Roms/Roms' > /dev/null 2>&1
!pip install torch > /dev/null 2>&1

#### Install Packages

In [1]:
pip install gym > /dev/null 2>&1

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install gym[atari] > /dev/null 2>&1

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install git+https://github.com/Kojoley/atari-py.git

Collecting git+https://github.com/Kojoley/atari-py.git
  Cloning https://github.com/Kojoley/atari-py.git to /tmp/pip-req-build-8uwthymt
  Running command git clone -q https://github.com/Kojoley/atari-py.git /tmp/pip-req-build-8uwthymt
Building wheels for collected packages: atari-py
  Building wheel for atari-py (setup.py) ... [?25ldone
[?25h  Created wheel for atari-py: filename=atari_py-1.2.2-cp38-cp38-linux_x86_64.whl size=4080204 sha256=48608e47cf36452618ad360bebb5f8c7b55252c9880c53ac3129075e5372b4ad
  Stored in directory: /tmp/pip-ephem-wheel-cache-vboigsiu/wheels/25/11/0c/d37ea19ecec588ab95c4199a485d3a4de5284e9a08b89c8f3f
Successfully built atari-py
Installing collected packages: atari-py
Successfully installed atari-py-1.2.2
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install tensorboardX 

Collecting tensorboardX
  Using cached tensorboardX-2.4-py2.py3-none-any.whl (124 kB)
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.4
Note: you may need to restart the kernel to use updated packages.


In [5]:
!python -m atari_py.import_roms '/home/ec2-user/SageMaker/RL_Pack/RL5 - Deep Q Newtworks/Roms/Roms' > /dev/null 2>&1

In [6]:
pip install torch > /dev/null 2>&1

Note: you may need to restart the kernel to use updated packages.


In [7]:

#==============================================================================
# Import packages
#==============================================================================
import os
import gc
import warnings
from lib import wrappers
from lib import dqn_model
import argparse
import time
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.optim as optim

from tensorboardX import SummaryWriter

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [8]:
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
MEAN_REWARD_BOUND = 10 # We will train the model until this reward is achieved
#MEAN_REWARD_BOUND = 19.5
GAMMA = 0.99  # Gamma value
BATCH_SIZE = 32 
REPLAY_SIZE = 10000 #Maximum Capacity of the buffer
LEARNING_RATE = 1e-4
SYNC_TARGET_FRAMES = 1000 # How frequently we update target model weights with actual model weights
REPLAY_START_SIZE = 10000

EPSILON_DECAY_LAST_FRAME = 10**5
EPSILON_START = 1.0
EPSILON_FINAL = 0.02

In [9]:
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

In [10]:
class ExperienceBuffer:
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])
        return np.array(states), np.array(actions), np.array(rewards, dtype=np.float32), \
               np.array(dones, dtype=np.uint8), np.array(next_states)

In [11]:
class Agent:
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        self.state = env.reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0, device="cpu"):
      # Play a step using epsilon greedy Policy and return the reward.
        done_reward = None
        # Epsilon Greesy Policy
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            state_a = np.array([self.state], copy=False)
            state_v = torch.tensor(state_a).to(device)
            q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

In [12]:
def calc_loss(batch, net, tgt_net, device="cpu"):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.ByteTensor(dones).to(device)

    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
    # apply the target network to our next state observations and calculate the maximum Q-value along the same action dimension 1.
    # Function max() returns both maximum values and indices of those values (so it calculates both max and argmax),
    # which is very convenient. However, in this case, we're interested only in values, so we take the first entry of the result.
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    # detach the value from its computation graph to prevent gradients from flowing into the neural network used to calculate 
    # Q approximation for next states.
    next_state_values = next_state_values.detach()

    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)

In [13]:
device = torch.device("cpu")
device

device(type='cpu')

In [14]:
env = wrappers.make_env(DEFAULT_ENV_NAME)

In [15]:
env.action_space.n

6

In [16]:
env.observation_space.shape

(4, 84, 84)

In [None]:
# Training take 1 - 1.5 hours. This Step can be skipped and you can directly go to the next step. We will use a pre trained model.
env = wrappers.make_env(DEFAULT_ENV_NAME)
# Create two networks - Training network (net) and target network (tgt_net)
net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
writer = SummaryWriter(comment="-" + DEFAULT_ENV_NAME)
print(net)

buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []
frame_idx = 0
ts_frame = 0
ts = time.time()
best_mean_reward = None

while True:
    frame_idx += 1
    # Reduce the value of Epsilon for every timestep
    epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
    # Agent plays a Step and Collects the reward
    reward = agent.play_step(net, epsilon, device=device)
    if reward is not None:
        total_rewards.append(reward)
        speed = (frame_idx - ts_frame) / (time.time() - ts)
        ts_frame = frame_idx
        ts = time.time()
        mean_reward = np.mean(total_rewards[-100:])
        print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
            frame_idx, len(total_rewards), mean_reward, epsilon,
            speed
        ))
        writer.add_scalar("epsilon", epsilon, frame_idx)
        writer.add_scalar("speed", speed, frame_idx)
        writer.add_scalar("reward_100", mean_reward, frame_idx)
        writer.add_scalar("reward", reward, frame_idx)
        if best_mean_reward is None or best_mean_reward < mean_reward:
            torch.save(net.state_dict(), DEFAULT_ENV_NAME + "mean_score10.dat")
            if best_mean_reward is not None:
                print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
            best_mean_reward = mean_reward
            if mean_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break

    if len(buffer) < REPLAY_START_SIZE:
        continue
    # Fixed Q Targets : For every 1000 steps update the Target weights with the weigths from training network
    if frame_idx % SYNC_TARGET_FRAMES == 0:
        tgt_net.load_state_dict(net.state_dict())

    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = calc_loss(batch, net, tgt_net, device=device)
    loss_t.backward()
    optimizer.step()
writer.close()


DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)
882: done 1 games, mean reward -21.000, eps 0.99, speed 610.70 f/s
1944: done 2 games, mean reward -21.000, eps 0.98, speed 517.12 f/s
3045: done 3 games, mean reward -20.667, eps 0.97, speed 811.87 f/s
Best mean reward updated -21.000 -> -20.667, model saved
3897: done 4 games, mean reward -20.750, eps 0.96, speed 790.37 f/s
4808: done 5 games, mean reward -20.800, eps 0.95, speed 788.55 f/s
5764: done 6 games, mean reward -20.667, eps 0.94, speed 778.46 f/s
6526: done 7 games, mean reward -20.714, eps 0.93, speed 757.78 f/s
7487: done 8 games, mean reward -20.625, eps 0.9

In [17]:
import gym
import time
import argparse
import numpy as np

import torch

from lib import wrappers
from lib import dqn_model

import collections

DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
FPS = 25

In [18]:
pip install gym pyvirtualdisplay > /dev/null 2>&1


Note: you may need to restart the kernel to use updated packages.


In [19]:
#!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [20]:
pip install xvfbwrapper

Collecting xvfbwrapper
  Downloading xvfbwrapper-0.2.9.tar.gz (5.6 kB)
Building wheels for collected packages: xvfbwrapper
  Building wheel for xvfbwrapper (setup.py) ... [?25ldone
[?25h  Created wheel for xvfbwrapper: filename=xvfbwrapper-0.2.9-py3-none-any.whl size=5010 sha256=b52604245fd14882db195a2d5696313e711e633842b89f611636e223b4c7fd06
  Stored in directory: /home/ubuntu/.cache/pip/wheels/e1/da/b4/57ac130c024104997ae76f389fe0e7e43922ec3cfdffaf1b1e
Successfully built xvfbwrapper
Installing collected packages: xvfbwrapper
Successfully installed xvfbwrapper-0.2.9
Note: you may need to restart the kernel to use updated packages.


In [21]:

pip install gym pyvirtualdisplay > /dev/null 2>&1


Note: you may need to restart the kernel to use updated packages.


In [24]:
#!conda install -c conda-forge ffmpeg

In [25]:
!sudo yum install -y Xvfb

sudo: yum: command not found


In [26]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only

import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

In [27]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7faa957c9b50>

In [28]:
def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")

In [29]:
def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env

In [30]:
env = wrap_env(wrappers.make_env(DEFAULT_ENV_NAME))

In [31]:
net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
net.load_state_dict(torch.load(DEFAULT_ENV_NAME + "-best.dat"))
state = env.reset()
total_reward = 0.0
c = collections.Counter()

while True:
  start_ts = time.time()  
  state_v = torch.tensor(np.array([state], copy=False))
  q_vals = net(state_v).data.numpy()[0]
  env.render()
  action = np.argmax(q_vals)
  c[action] += 1
  state, reward, done, _ = env.step(action)
  total_reward += reward
  if done:
    break
env.close()
show_video()  
print("Total reward: %.2f" % total_reward)
print("Action counts:", c)
if True:
  env.env.close()

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.