## Watch a Smart Agent!

### 1. Start the Environment for the Trained Agent

In [1]:
import numpy as np
import torch
import gym
import os
import time

from agent import Agent

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def rgb2gray(rgb, norm=True):
    """
    Convert an RGB(A) image to a single-channel grayscale image.

    Args:
        rgb: array whose last axis holds the colour channels. Only the first
            three channels are used, so an extra alpha channel is ignored.
        norm: when True, rescale the result with ``gray / 128 - 1`` so that
            8-bit input (0..255) lands in roughly [-1, 1).

    Returns:
        np.ndarray: array with the channel axis removed; raw weighted
        luminance when ``norm`` is False, rescaled values otherwise.
    """
    # Weighted sum of R, G, B; slicing to :3 drops any alpha channel.
    gray = np.dot(rgb[..., :3], [0.299, 0.587, 0.114])
    if norm:
        # Centre around zero: 0 -> -1, 128 -> 0, 255 -> ~0.99.
        gray = gray / 128. - 1.
    return gray

# Evaluation settings shared by the cells below.
seed = 0  # NOTE(review): defined but never passed to the environment or torch in the visible cells
img_stack = 4  # number of grayscale frames stacked into one observation
action_repeat = 10  # maximum number of environment steps each chosen action is repeated for
# Create the raw environment and reset it once.
env = gym.make('CarRacing-v0', verbose=0)
state = env.reset()
reward_threshold = env.spec.reward_threshold



In [2]:
class Wrapper():
    """
    Environment wrapper for CarRacing.

    Converts frames to grayscale, keeps a stack of the most recent
    ``img_stack`` frames as the observation, repeats each action for up to
    ``action_repeat`` environment steps and shapes the reward.

    Relies on the module-level names ``img_stack``, ``action_repeat`` and
    ``rgb2gray``.
    """

    def __init__(self, env):
        """Store the underlying environment that reset() and step() delegate to."""
        self.env = env

    def reset(self):
        """
        Start a new episode on the wrapped environment.

        Returns:
            np.ndarray: ``img_stack`` copies of the first grayscale frame,
            stacked along axis 0.
        """
        self.counter = 0
        self.av_r = self.reward_memory()

        self.die = False
        # Delegate to the wrapped environment rather than a notebook global.
        img_rgb = self.env.reset()
        img_gray = rgb2gray(img_rgb)
        # Fill the whole stack with the first frame.
        self.stack = [img_gray] * img_stack
        return np.array(self.stack)

    def step(self, action):
        """
        Apply ``action`` for up to ``action_repeat`` consecutive steps.

        Args:
            action: action forwarded unchanged to the wrapped environment.

        Returns:
            tuple: ``(stack, total_reward, done, die)`` where ``stack`` is the
            updated frame stack as an array, ``total_reward`` is the sum of
            the shaped rewards over the repeated steps, ``done`` is True when
            the running mean reward fell to -0.1 or below, and ``die`` is the
            termination flag returned by the wrapped environment.
        """
        total_reward = 0
        for _repeat in range(action_repeat):
            img_rgb, reward, die, _info = self.env.step(action)
            # don't penalize "die state"
            # (presumably offsets the environment's terminal penalty; confirm)
            if die:
                reward += 100
            # green penalty: a mostly-green frame costs a little reward
            # (presumably the car has left the track; confirm)
            if np.mean(img_rgb[:, :, 1]) > 185.0:
                reward -= 0.05
            total_reward += reward
            # if no reward recently, end the episode
            done = bool(self.av_r(reward) <= -0.1)
            if done or die:
                break
        img_gray = rgb2gray(img_rgb)
        # Slide the window: drop the oldest frame, append the newest.
        self.stack.pop(0)
        self.stack.append(img_gray)
        assert len(self.stack) == img_stack
        return np.array(self.stack), total_reward, done, die

    @staticmethod
    def reward_memory():
        """
        Build a closure that tracks the mean of the last 100 rewards.

        The buffer starts zero-filled and is overwritten cyclically, so the
        mean is always taken over all 100 slots, including slots that have
        not been written yet.

        Returns:
            callable: ``memory(reward)`` records ``reward`` and returns the
            current buffer mean.
        """
        count = 0
        length = 100
        history = np.zeros(length)

        def memory(reward):
            nonlocal count
            history[count] = reward
            count = (count + 1) % length
            return np.mean(history)

        return memory
    
# Policy object from the local `agent` module, constructed with the selected torch device.
agent = Agent(device)

# Wrapped environment; play() uses this module-level object for resets and steps.
env_wrap = Wrapper(env)    

### 2. Prepare Load

In [3]:
def load(agent, directory, filename):
    """
    Load saved network weights into ``agent.net``.

    Args:
        agent: object whose ``net`` attribute is a ``torch.nn.Module``.
        directory: folder containing the checkpoint file.
        filename: name of the checkpoint file holding a state dict.
    """
    path = os.path.join(directory, filename)
    # Map tensors onto the device the network already lives on, so a
    # checkpoint saved on a GPU still loads on a CPU-only machine.
    first_param = next(agent.net.parameters(), None)
    map_location = first_param.device if first_param is not None else None
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    agent.net.load_state_dict(torch.load(path, map_location=map_location))

### 3. Prepare Player

In [4]:
from collections import deque
import os

def play(env, agent, n_episodes):
    """
    Roll out rendered episodes with ``agent`` and print one score line each.

    Resets and steps go through the module-level ``env_wrap``; ``env`` is
    used only for rendering.

    Args:
        env: environment whose ``render()`` is called once per agent decision.
        agent: object exposing ``select_action(state)`` that returns a pair
            whose first element is the action.
        n_episodes: number of episodes to play.
    """
    # Rolling window for the printed average (at most the last 100 episodes).
    scores_deque = deque(maxlen=100)

    for i_episode in range(1, n_episodes + 1):
        # Reset exactly once per episode.
        state = env_wrap.reset()
        score = 0

        time_start = time.time()

        while True:
            action, _ = agent.select_action(state)
            env.render()
            # Only the first component is rescaled (x2, then -1); the other
            # two pass through unchanged. Presumably this maps steering from
            # [0, 1] to [-1, 1]; confirm against the Agent's output range.
            env_action = action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.])
            state, reward, done, die = env_wrap.step(env_action)
            score += reward

            if done or die:
                break

        # Whole seconds of wall-clock time spent on this episode.
        s = int(time.time() - time_start)

        scores_deque.append(score)

        print('Episode {}\tAverage Score: {:.2f},\tScore: {:.2f} \tTime: {:02}:{:02}:{:02}'
              .format(i_episode, np.mean(scores_deque), score, s//3600, s%3600//60, s%60))


### 3. Load and Play: Score = 350-550

In [5]:
# Restore the 'model_weights_350-550.pth' checkpoint from dir_chk, then play 5 rendered episodes.
load(agent, 'dir_chk', 'model_weights_350-550.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 664.18,	Score: 664.18 	Time: 00:00:10
Episode 2	Average Score: 441.62,	Score: 219.07 	Time: 00:00:06
Episode 3	Average Score: 497.46,	Score: 609.12 	Time: 00:00:09
Episode 4	Average Score: 523.27,	Score: 600.73 	Time: 00:00:09
Episode 5	Average Score: 538.48,	Score: 599.30 	Time: 00:00:09


### 4. Load and Play: Score = 480-660

In [6]:
# Restore the 'model_weights_480-660.pth' checkpoint from dir_chk, then play 5 rendered episodes.
load(agent, 'dir_chk', 'model_weights_480-660.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 603.72,	Score: 603.72 	Time: 00:00:12
Episode 2	Average Score: 593.94,	Score: 584.16 	Time: 00:00:11
Episode 3	Average Score: 432.31,	Score: 109.06 	Time: 00:00:08
Episode 4	Average Score: 480.99,	Score: 627.01 	Time: 00:00:11
Episode 5	Average Score: 517.67,	Score: 664.38 	Time: 00:00:11


### 5. Load and Play: Score = 820-980

In [6]:
# Restore the 'model_weights_820-980.pth' checkpoint from dir_chk, then play 5 rendered episodes.
load(agent, 'dir_chk', 'model_weights_820-980.pth')
play(env, agent, n_episodes=5)

Episode 1	Average Score: 858.52,	Score: 858.52 	Time: 00:00:09
Episode 2	Average Score: 890.43,	Score: 922.33 	Time: 00:00:09
Episode 3	Average Score: 927.22,	Score: 1000.80 	Time: 00:00:09
Episode 4	Average Score: 889.14,	Score: 774.92 	Time: 00:00:10
Episode 5	Average Score: 891.12,	Score: 899.02 	Time: 00:00:09


In [5]:
# Restore the 'model_weights_final2.pth' checkpoint from dir_chk, then play 2 rendered episodes.
load(agent, 'dir_chk', 'model_weights_final2.pth')
play(env, agent, n_episodes=2)

Episode 1	Average Score: 1003.20,	Score: 1003.20 	Time: 00:00:13
Episode 2	Average Score: 999.96,	Score: 996.71 	Time: 00:00:10


In [6]:
# Close the gym environment now that playback is finished.
env.close()

In [31]:
# Count trainable parameters: keep tensors that require gradients,
# then add up the element count of each one.
model_parameters = (p for p in agent.net.parameters() if p.requires_grad)
params = sum(np.prod(p.size()) for p in model_parameters)

In [32]:
params

445955

In [27]:
# Move the network to the CPU as float32; the cell output is the module's repr.
agent.net.to(torch.device("cpu"),dtype = torch.float)

Net(
  (cnn_base): Sequential(
    (0): Conv2d(4, 8, kernel_size=(4, 4), stride=(2, 2))
    (1): ReLU()
    (2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2))
    (5): ReLU()
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2))
    (7): ReLU()
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
    (9): ReLU()
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
    (11): ReLU()
  )
  (v): Sequential(
    (0): Linear(in_features=256, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=1, bias=True)
  )
  (fc): Sequential(
    (0): Linear(in_features=256, out_features=100, bias=True)
    (1): ReLU()
  )
  (alpha_head): Sequential(
    (0): Linear(in_features=100, out_features=3, bias=True)
    (1): Softplus(beta=1, threshold=20)
  )
  (beta_head): Sequential(
    (0): Linear(in_features=100, out_features=3, bias=True)
    (1): Softplus(beta=1, thres

In [28]:
# Print a per-layer summary of the network for a (4, 96, 96) input on the CPU.
# torchsummary is a third-party package imported only for this cell.
from torchsummary import summary
summary(agent.net, (4,96,96),device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 47, 47]             520
              ReLU-2            [-1, 8, 47, 47]               0
            Conv2d-3           [-1, 16, 23, 23]           1,168
              ReLU-4           [-1, 16, 23, 23]               0
            Conv2d-5           [-1, 32, 11, 11]           4,640
              ReLU-6           [-1, 32, 11, 11]               0
            Conv2d-7             [-1, 64, 5, 5]          18,496
              ReLU-8             [-1, 64, 5, 5]               0
            Conv2d-9            [-1, 128, 3, 3]          73,856
             ReLU-10            [-1, 128, 3, 3]               0
           Conv2d-11            [-1, 256, 1, 1]         295,168
             ReLU-12            [-1, 256, 1, 1]               0
           Linear-13                  [-1, 100]          25,700
             ReLU-14                  [