In [1]:
!pip install wandb -qq
import wandb

wandb.init(project='ml', entity='b1boid')

config = wandb.config
config.learning_rate = 0.01

[K     |████████████████████████████████| 2.1MB 7.6MB/s 
[K     |████████████████████████████████| 102kB 11.5MB/s 
[K     |████████████████████████████████| 163kB 34.6MB/s 
[K     |████████████████████████████████| 133kB 32.4MB/s 
[K     |████████████████████████████████| 71kB 10.8MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [2]:
import numpy as np
import gym
import torch
import torchvision
import torch.nn as nn
from tqdm import tqdm
from torch.nn.functional import smooth_l1_loss, leaky_relu
from collections import deque
from skimage.color import rgb2grey
from skimage.transform import rescale
import random
import time

In [3]:
class DeepQNetwork(nn.Module):
    """Convolutional Q-network: two conv layers followed by two linear layers.

    The input is a batch of stacked frames with `num_frames` channels; the
    output has one value per action (`num_actions` columns).

    The first linear layer expects 3200 flattened features, i.e. the 32 output
    channels of `conv2` over a 10x10 map, which is what the two convolutions
    produce for an 80x80 input.
    """

    def __init__(self, num_frames, num_actions):
        super().__init__()
        self.num_frames = num_frames
        self.num_actions = num_actions

        # Feature extractor: 80x80 -> 20x20 -> 10x10 spatially.
        self.conv1 = nn.Conv2d(num_frames, 16, kernel_size=8, stride=4, padding=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=4, stride=2, padding=1)

        # Head: flattened features -> hidden layer -> one output per action.
        self.fc1 = nn.Linear(3200, 256)
        self.fc2 = nn.Linear(256, num_actions)

    def flatten(self, x):
        """Collapse every dimension after the first (batch) one into a single axis."""
        batch = x.size(0)
        return x.view(batch, -1)

    def forward(self, x, k_relu=0.02):
        """Return the network output for `x`.

        `k_relu` is the negative slope passed to every leaky ReLU; the final
        linear layer has no activation.
        """
        for conv in (self.conv1, self.conv2):
            x = leaky_relu(conv(x), k_relu)
        hidden = leaky_relu(self.fc1(self.flatten(x)), k_relu)
        return self.fc2(hidden)

In [10]:
class DeepQLearningAtari:
    """Deep Q-learning agent that trains or evaluates a `DeepQNetwork` on a gym env.

    A state is a stack of `num_frames` preprocessed frames concatenated along
    the channel axis. Training uses an experience-replay buffer and a
    smooth-L1 loss on one-step bootstrapped targets computed with the same
    network (there is no separate target network).
    """

    def __init__(self, env, lr=0.0001):
        """Build the network and optimizer for `env`.

        Args:
            env: gym environment with a discrete action space.
            lr: learning rate for the Adam optimizer.
        """
        self.env = env
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.action_num = env.action_space.n
        self.state_num = env.observation_space.shape[0]
        self.num_frames = 4
        # `info` dict assumed for the step preceding the first one; compared
        # against the live `info` in `play` to detect a change between steps.
        self.last_info = {'ale.lives': 5}
        self.model = DeepQNetwork(self.num_frames, self.action_num).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def save(self, model_path):
        """Write the network weights (state dict) to `model_path`."""
        torch.save(self.model.state_dict(), model_path)

    def load(self, model_path):
        """Load network weights from `model_path` onto this agent's device.

        `map_location` lets a checkpoint saved on a GPU be loaded on a
        CPU-only machine (and vice versa).
        """
        state_dict = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(state_dict)

    def process_image(self, state):
        """Turn a raw RGB frame into a (1, 1, H, W) float tensor on the agent's device.

        Rows 35..194 are kept, the crop is converted to greyscale and
        downscaled by a factor of 0.5. The network expects 80x80 inputs, so
        this presumably assumes 160-pixel-wide frames -- TODO confirm for
        environments other than Breakout.
        """
        # NOTE(review): newer scikit-image releases renamed `rgb2grey` to
        # `rgb2gray`; confirm against the installed version.
        state = rgb2grey(state[35:195, :, :])
        state = rescale(state, scale=0.5)[np.newaxis, np.newaxis, :, :]
        return torch.tensor(state).float().to(self.device)

    def _train_step(self, memory, batch_size, gamma):
        """Run one optimizer step on a random minibatch drawn from `memory`.

        Each entry of `memory` is `(state, next_state, reward, done, action)`.
        Returns the detached scalar loss tensor.
        """
        states, next_states, rewards, dones, actions = zip(
            *random.sample(list(memory), batch_size)
        )
        state_batch = torch.cat(states, 0).to(self.device)
        next_state_batch = torch.cat(next_states, 0).to(self.device)
        # Explicit dtypes so the batch does not depend on the Python/numpy
        # types the environment happens to return.
        reward_batch = torch.tensor(rewards, dtype=torch.float32, device=self.device)
        done_batch = torch.tensor(dones, dtype=torch.bool, device=self.device)
        action_batch = torch.tensor(actions, dtype=torch.long, device=self.device)

        with torch.no_grad():
            next_q, _ = self.model(next_state_batch).max(dim=1)
            # No bootstrapping from terminal transitions.
            next_q[done_batch] = 0
            target_q = reward_batch + gamma * next_q

        # squeeze(dim=1) rather than squeeze(): a batch of one must stay 1-D
        # so its shape matches `target_q`.
        q = self.model(state_batch).gather(dim=1, index=action_batch.unsqueeze(dim=1)).squeeze(dim=1)

        self.optimizer.zero_grad()
        loss = smooth_l1_loss(q, target_q)
        loss.backward()
        self.optimizer.step()
        return loss.detach()

    def play(self, train=True, gamma=0.99, eps=0.9, batch_size=32, memory_size=50000, games=3000, model_path="model2"):
        """Play `games` episodes, training the network or evaluating saved weights.

        Args:
            train: if True, learn from replayed transitions, log each episode's
                total reward to wandb and save the weights at the end. If
                False, load the weights from `model_path` first, render every
                step and do not learn.
            gamma: discount factor of the bootstrapped target.
            eps: initial probability of taking a uniformly random action.
            batch_size: minibatch size; learning starts once the buffer holds
                more than this many transitions.
            memory_size: maximum number of transitions kept in the buffer.
            games: number of episodes to play.
            model_path: file the weights are loaded from / saved to.
        """
        def choose_action():
            # In evaluation, a change in `info` since the previous step (for
            # the initial `last_info`, a change in 'ale.lives') triggers
            # action 1 -- presumably FIRE, to relaunch the ball; confirm
            # against the environment's action meanings.
            if not train and info != self.last_info:
                return 1
            if random.random() < eps:
                return random.randint(0, self.action_num - 1)
            return output.argmax().item()

        if not train:
            self.load(model_path)

        memory = deque(maxlen=memory_size)

        for game in tqdm(range(games), position=0, leave=True):
            # Start every game from a fresh episode, in evaluation as well as
            # in training, so a finished environment is never stepped.
            state = self.process_image(self.env.reset())
            total_reward = 0
            done = False
            info = self.last_info
            while not done:
                if not train:
                    self.env.render()
                    time.sleep(0.1)
                # After a reset the state holds a single frame: take action 1
                # until the stack is `num_frames` deep.
                while state.size()[1] < self.num_frames:
                    action = 1
                    new_frame, reward, done, _ = self.env.step(action)
                    new_frame = self.process_image(new_frame)
                    state = torch.cat([state, new_frame], 1)

                # `state` is already a float tensor on the right device; no
                # gradient is needed to pick an action.
                with torch.no_grad():
                    output = self.model(state)

                action = choose_action()

                self.last_info = info
                next_frame, reward, done, info = self.env.step(action)
                next_frame = self.process_image(next_frame)
                # Slide the window: append the new frame, drop the oldest.
                next_state = torch.cat([state, next_frame], 1)[:, 1:, :, :]

                total_reward += reward
                if train:
                    memory.append((state, next_state, reward, done, action))
                    if done:
                        wandb.log({"total_reward": total_reward})

                state = next_state

                if train and len(memory) > batch_size:
                    self._train_step(memory, batch_size, gamma)

            # Exploration decays only while learning, so an evaluation run
            # with eps=0 stays greedy for every game. The decay factor
            # (1 - game / games) compounds from game to game, floored at 0.05.
            if train:
                eps = max(0.05, eps - eps * (game / games))

        if train:
            self.save(model_path)

In [6]:
# Train the agent on Breakout with the default hyperparameters.
env = gym.make('BreakoutDeterministic-v4')
try:
    dq = DeepQLearningAtari(env)
    dq.play()
finally:
    # Release the emulator even if the long training run fails or is interrupted.
    env.close()

100%|██████████| 3000/3000 [2:28:32<00:00,  2.97s/it]


In [7]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import tensorflow as tf
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

from pyvirtualdisplay import Display

def show_video():
  """Embed the first .mp4 found in ./video into the notebook output.

  The file is base64-encoded into an inline HTML <video> element. If the
  directory holds no .mp4 file, a message is printed instead.
  """
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only mode, and a context manager so the handle is always closed.
    with open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  """Return `env` wrapped in a gym `Monitor` that writes to ./video.

  `force=True` is passed through to `Monitor` unchanged.
  """
  return Monitor(env, './video', force=True)

Collecting setuptools
[?25l  Downloading https://files.pythonhosted.org/packages/11/1b/7012b145cb228aed20f9b2b8b259df49e7963d900799ea44791f54d06ab9/setuptools-56.1.0-py3-none-any.whl (785kB)
[K     |▍                               | 10kB 21.0MB/s eta 0:00:01[K     |▉                               | 20kB 17.3MB/s eta 0:00:01[K     |█▎                              | 30kB 14.4MB/s eta 0:00:01[K     |█▊                              | 40kB 13.1MB/s eta 0:00:01[K     |██                              | 51kB 9.2MB/s eta 0:00:01[K     |██▌                             | 61kB 9.6MB/s eta 0:00:01[K     |███                             | 71kB 10.0MB/s eta 0:00:01[K     |███▍                            | 81kB 11.2MB/s eta 0:00:01[K     |███▊                            | 92kB 10.3MB/s eta 0:00:01[K     |████▏                           | 102kB 9.0MB/s eta 0:00:01[K     |████▋                           | 112kB 9.0MB/s eta 0:00:01[K     |█████                           | 122kB 9

In [11]:
# Evaluate the saved model for one greedy game on a virtual display and show the recording.
display = Display(visible=0, size=(1400, 900))
display.start()
env = wrap_env(gym.make('BreakoutDeterministic-v4'))
try:
    dq = DeepQLearningAtari(env)
    dq.play(train=False, games=1, eps=0)
finally:
    # Always close the wrapped env, even if playback fails.
    env.close()
show_video()

100%|██████████| 1/1 [01:03<00:00, 63.73s/it]
