## Watch a Smart Agent!

### 1. Start the Environment for the Trained Agent

In [1]:
import numpy as np
import torch
import gym
import argparse
import os
import time

from td3_agent import TD3

# Build the BipedalWalker environment that the trained agent is replayed in.
env = gym.make('BipedalWalker-v3')

# Seed the environment, PyTorch and NumPy with one shared value.
seed = 12345
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Problem dimensions are read from the environment's spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
action_high = float(env.action_space.high[0])
print(
    'state_size: ', state_size,
    ', action_size: ', action_size,
    ', action_high: ', action_high,
)

# Agent instance that later cells pass to load(...) and play(...).
agent = TD3(
    state_dim=state_size,
    action_dim=action_size,
    max_action=action_high,
    hidden_dim=[400, 300],
)




state_size:  24 , action_size:  4 , action_high:  1.0


### 2. Prepare Load

In [2]:
def load(agent, dir, prefix):
    """Restore the four networks of ``agent`` from checkpoint files.

    The files ``<prefix>_actor.pth``, ``<prefix>_critic.pth``,
    ``<prefix>_actor_t.pth`` and ``<prefix>_critic_t.pth`` are read from
    ``dir`` and loaded, in that order, into ``agent.actor``,
    ``agent.critic``, ``agent.actor_target`` and ``agent.critic_target``.

    Args:
        agent: object exposing the four network attributes listed above.
        dir: directory that contains the checkpoint files.
        prefix: file-name prefix shared by the four checkpoint files.

    Raises:
        Whatever ``torch.load`` / ``load_state_dict`` raise when a file is
        missing or its keys do not match the network.
    """
    targets = (
        ('actor', agent.actor),
        ('critic', agent.critic),
        ('actor_t', agent.actor_target),
        ('critic_t', agent.critic_target),
    )
    for suffix, network in targets:
        path = os.path.join(dir, '%s_%s.pth' % (prefix, suffix))
        # Deserialize onto the CPU so checkpoints written on a GPU machine can
        # still be opened without CUDA; load_state_dict copies the values into
        # the network's existing parameters.
        network.load_state_dict(torch.load(path, map_location='cpu'))
# Checkpoint directories: `base_old` is the source directory named in the
# commented-out convert_base2 call, `base_dir` is what the load(...) cells read.
base_old, base_dir = "dir_chk_td3_old", "dir_chk_td3"

In [5]:
import torch.nn as nn
class Critic(nn.Module):
    """Twin Q-network: two independent MLP heads over (state, action).

    Each head is ``Linear(state_dim + action_dim, 400) -> ReLU ->
    Linear(400, 300) -> ReLU -> Linear(300, 1)``. The layer attribute names
    ``l1``..``l6`` become the ``state_dict`` keys, so they have to match the
    checkpoint files this class is loaded from.
    """

    def __init__(self, state_dim, action_dim):
        """Create both Q heads.

        Args:
            state_dim: number of state features.
            action_dim: number of action features.
        """
        super(Critic, self).__init__()

        # Q1 architecture
        self.l1 = nn.Linear(state_dim + action_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, 1)

        # Q2 architecture
        self.l4 = nn.Linear(state_dim + action_dim, 400)
        self.l5 = nn.Linear(400, 300)
        self.l6 = nn.Linear(300, 1)

    def forward(self, x, u):
        """Return the two Q estimates for state ``x`` and action ``u``.

        ``x`` and ``u`` are concatenated along dimension 1, so both must be
        2-D with the same first dimension.

        Returns:
            Tuple ``(q1, q2)``, one tensor per head.
        """
        xu = torch.cat([x, u], 1)

        # torch.relu is used instead of F.relu: the notebook never imports
        # torch.nn.functional as F, so the old code raised NameError here.
        x1 = torch.relu(self.l1(xu))
        x1 = torch.relu(self.l2(x1))
        x1 = self.l3(x1)

        x2 = torch.relu(self.l4(xu))
        x2 = torch.relu(self.l5(x2))
        x2 = self.l6(x2)
        return x1, x2
class DeterministicActor(nn.Module):
    """Deterministic policy network mapping a state to a bounded action.

    Architecture: ``Linear(state_dim, 400) -> ReLU -> Linear(400, 300) ->
    ReLU -> Linear(300, action_dim) -> tanh``, scaled by ``max_action``.
    The layer attribute names ``l1``..``l3`` become the ``state_dict`` keys,
    so they have to match the checkpoint files this class is loaded from.
    """

    def __init__(self, state_dim, action_dim, max_action):
        """Create the policy layers.

        Args:
            state_dim: number of state features.
            action_dim: number of action components produced.
            max_action: scale applied to the tanh output.
        """
        super(DeterministicActor, self).__init__()

        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)

        self.max_action = max_action

    def forward(self, x):
        """Return the action for state ``x``, within ``[-max_action, max_action]``."""
        # torch.relu is used instead of F.relu: the notebook never imports
        # torch.nn.functional as F, so the old code raised NameError here.
        x = torch.relu(self.l1(x))
        x = torch.relu(self.l2(x))
        x = self.max_action * torch.tanh(self.l3(x))
        return x


In [6]:
def convert_baseq(model):
    """Rename the attributes ``l1``..``l6`` of ``model`` to ``linear1``..``linear6``.

    The object is modified in place: every layer is first made reachable
    under its new name, and only afterwards are the old names removed.
    """
    layer_numbers = range(1, 7)

    # Publish each layer under the new attribute name.
    for number in layer_numbers:
        setattr(model, 'linear%d' % number, getattr(model, 'l%d' % number))

    # Remove the old attribute names.
    for number in layer_numbers:
        delattr(model, 'l%d' % number)

def convert_basea(model):
    """Rename the attributes ``l1``..``l3`` of ``model`` to ``linear1``..``linear3``.

    The object is modified in place: every layer is first made reachable
    under its new name, and only afterwards are the old names removed.
    """
    layer_numbers = range(1, 4)

    # Publish each layer under the new attribute name.
    for number in layer_numbers:
        setattr(model, 'linear%d' % number, getattr(model, 'l%d' % number))

    # Remove the old attribute names.
    for number in layer_numbers:
        delattr(model, 'l%d' % number)

In [7]:

def convert_base2(agent, dir, prefix, out_dir='dir_chk_td3',
                  state_dim=24, action_dim=4, max_action=1):
    """Re-save a checkpoint with ``l*`` layer names renamed to ``linear*``.

    The four files ``<prefix>_critic.pth``, ``<prefix>_critic_t.pth``,
    ``<prefix>_actor.pth`` and ``<prefix>_actor_t.pth`` are read from ``dir``
    into freshly built ``Critic`` / ``DeterministicActor`` networks, the
    layers are renamed with ``convert_baseq`` / ``convert_basea``, and the
    resulting state dicts are written under the same file names in
    ``out_dir``.

    Args:
        agent: unused; kept so existing calls keep working.
        dir: directory holding the checkpoint files to convert.
        prefix: file-name prefix shared by the four checkpoint files.
        out_dir: directory the converted files are written to; created if
            it does not exist yet.
        state_dim: state size used to build the networks.
        action_dim: action size used to build the networks.
        max_action: action scale passed to ``DeterministicActor``.
    """
    # (file suffix, network to fill, layer-renaming function)
    jobs = (
        ('critic', Critic(state_dim, action_dim), convert_baseq),
        ('critic_t', Critic(state_dim, action_dim), convert_baseq),
        ('actor', DeterministicActor(state_dim, action_dim, max_action),
         convert_basea),
        ('actor_t', DeterministicActor(state_dim, action_dim, max_action),
         convert_basea),
    )

    # Load everything first so a missing or mismatched input file aborts
    # before anything is written.
    for suffix, network, _ in jobs:
        source = os.path.join(dir, '%s_%s.pth' % (prefix, suffix))
        # map_location='cpu' lets checkpoints saved on a GPU be converted on
        # a machine without CUDA.
        network.load_state_dict(torch.load(source, map_location='cpu'))

    for _, network, rename_layers in jobs:
        rename_layers(network)

    # torch.save does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)
    for suffix, network, _ in jobs:
        target = os.path.join(out_dir, '%s_%s.pth' % (prefix, suffix))
        torch.save(network.state_dict(), target)

In [8]:
# convert_base2(0,base_old,'checkpoint_293')

### 3. Prepare Player

In [3]:
from collections import deque
import os

def play(env, agent, n_episodes):
    """Run ``agent`` in ``env`` for ``n_episodes`` rendered episodes.

    After every episode one line is printed with the episode number, the mean
    score over the most recent episodes (at most 100), the episode score and
    the wall-clock duration. The environment is closed before returning,
    including when an episode is interrupted by an exception.

    Args:
        env: environment providing ``reset``, ``step``, ``render`` and
            ``close``; ``step`` must return a 4-tuple
            ``(next_state, reward, done, info)``.
        agent: object providing ``select_action(state)``.
        n_episodes: number of episodes to play.
    """
    # NOTE(review): this reset looks redundant because every episode resets
    # again below; it is kept so the sequence of calls made on `env` does not
    # change.
    state = env.reset()

    scores_deque = deque(maxlen=100)
    scores = []

    try:
        for i_episode in range(1, n_episodes + 1):
            state = env.reset()
            score = 0

            time_start = time.time()

            done = False
            while not done:
                action = agent.select_action(np.array(state))
                env.render()
                state, reward, done, _ = env.step(action)
                score += reward

            elapsed = int(time.time() - time_start)

            scores_deque.append(score)
            scores.append(score)

            print('Episode {}\tAverage Score: {:.2f},\tScore: {:.2f} \tTime: {:02}:{:02}:{:02}'
                  .format(i_episode, np.mean(scores_deque), score,
                          elapsed // 3600, elapsed % 3600 // 60, elapsed % 60))
    finally:
        # Always release the render window / simulator resources, even when
        # an episode is interrupted or raises.
        env.close()

### 3. Load and Play: Score = 293

In [4]:
# Restore the 'checkpoint_293' weights from base_dir, then render one episode.
load(agent, dir=base_dir, prefix='checkpoint_293')
play(env, agent, n_episodes=1)

Episode 1	Average Score: 295.46,	Score: 295.46 	Time: 00:00:13


### 4. Load and Play: Score = 300.5, training time = 9h 44m

In [17]:
# Restore the 'chpnt_88seed_300-5sc_9h44m' weights from base_dir, then render
# one episode.
load(agent, dir=base_dir, prefix='chpnt_88seed_300-5sc_9h44m')
play(env, agent, n_episodes=1)

Episode 1	Average Score: 305.03,	Score: 305.03 	Time: 00:00:12


### 5. Load and Play: Score = 306

In [20]:
# Restore the 'checkpoint_306' weights from base_dir, then render one episode.
load(agent, dir=base_dir, prefix='checkpoint_306')
play(env, agent, n_episodes=1)

Episode 1	Average Score: 305.48,	Score: 305.48 	Time: 00:00:11


In [9]:
# Close the environment explicitly; play() also calls env.close() at the end
# of each run.
env.close()