In [1]:
from gym import make
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import copy
from collections import deque

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Actor

In [3]:
class Actor(nn.Module):
    def __init__(self, state_size, action_size):
        super(Actor, self).__init__()
        self.linear1 = nn.Linear(state_size, 32)
        self.linear2 = nn.Linear(32, 32)
        self.linear3 = nn.Linear(32, action_size)
        self.reset_parameters()
        
    def reset_parameters(self):
        self.linear1.weight.data.normal_(0, 1e-1)
        self.linear2.weight.data.normal_(0, 1e-1)
        self.linear3.weight.data.normal_(0, 1e-2)
    
    def forward(self, state):
        x = state
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        x = self.linear3(x)
        return torch.tanh(x)

## Critic

In [4]:
class Critic(nn.Module):
    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        self.linear1 = nn.Linear(state_size + action_size, 64)
        self.linear2 = nn.Linear(64, 64)
        self.linear3 = nn.Linear(64, 1)
        self.reset_parameters()
        
    def reset_parameters(self):
        self.linear1.weight.data.normal_(0, 1e-1)
        self.linear2.weight.data.normal_(0, 1e-1)
        self.linear3.weight.data.normal_(0, 1e-2)
    
    def forward(self, state, action):
        x = torch.cat((state, action), dim=1)        
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = self.linear3(x)
        return x

## Buffer

In [5]:
class Memory:
    def __init__(self, buffer_size, batch_size):
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.memory = []
        self.position = 0

    def push(self, element):
        if len(self.memory) < self.buffer_size:
            self.memory.append(None)
        self.memory[self.position] = element
        self.position = (self.position + 1) % self.buffer_size

    def sample(self):
        return list(zip(*random.sample(self.memory, self.batch_size)))

    def __len__(self):
        return len(self.memory)

## Agent

In [6]:

class Agent:
    def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau):
         # Actor Network and Target Network
        self.actor = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-3)

        # Critic Network and Target Network
        self.critic = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)
        
        # copy weights
        self.hard_update(self.actor_target, self.actor)
        self.hard_update(self.critic_target, self.critic)
        
        self.memory = Memory(buffer_size, batch_size)
        self.gamma = gamma
        self.tau = tau
        self.sd = 1
        
    def hard_update(self, target, network):
        for target_param, param in zip(target.parameters(), network.parameters()):
            target_param.data.copy_(param.data)
            
    def soft_update(self, target, network):
        for target_param, param in zip(target.parameters(), network.parameters()):
            target_param.data.copy_(self.tau*param.data + (1-self.tau)*target_param.data)
            
    def learn(self, batch):
        
        state, action, reward, next_state, done = batch

        state = torch.tensor(state).to(device).float()
        next_state = torch.tensor(next_state).to(device).float()
        reward = torch.tensor(reward).to(device).float()
        action = torch.tensor(action).to(device)
        done = torch.tensor(done).to(device).int()
        
        # update critic
        next_action = self.actor_target(next_state)

        Q_target = self.critic_target(next_state, next_action).detach()
        Q_target = reward.unsqueeze(1) + (self.gamma*Q_target*((1-done).unsqueeze(1)))

        
        critic_loss = F.mse_loss(self.critic(state, action), Q_target)        
        
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        
        # update actor
        
        action_prediction = self.actor(state)
        actor_loss = -self.critic(state, action_prediction).mean()
        
        
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        
        # update actor_target and critic_target
        
        self.soft_update(self.critic_target, self.critic)
        self.soft_update(self.actor_target, self.actor)
        
    def act(self, state, noise = True):
        state =  torch.tensor(state).to(device).float()
        action = self.actor(state).cpu().data.numpy()
        
        if noise:
            noise = np.random.normal(0, self.sd)
            action = action + noise
        
        if action[0] > 1:
            action[0] = 1
        if action[0] < -1:
            action[0] = -1
        return action
    
    def step(self, state, action, reward, next_state, done):
        self.memory.push((state, action, reward, next_state, done))
        if len(self.memory) >= self.memory.batch_size:
            self.learn(self.memory.sample())
        
    def save(self):
        torch.save(self.actor, "actor.pkl")
        torch.save(self.critic, "critic.pkl")
        
    def test(self):
        new_env = make("MountainCarContinuous-v0")
        new_env.seed(9)
        reward = []
        for i in range(50):
            state = new_env.reset()
            local_reward = 0
            done = False
            while not done:
                action = self.act(state, noise = False)
                state, r, done, _ = new_env.step(action)
                local_reward += r
            reward.append(local_reward)
        return reward
            


## MountainCarContinuous-v0

In [7]:
env = make("MountainCarContinuous-v0")
np.random.seed(9)
env.seed(9)

action_size = env.action_space.shape[0]
print(f'size of eche action = {action_size}')
state_size = env.observation_space.shape[0]
print(f'size of state = {state_size}')

size of eche action = 1
size of state = 2




## Train the Agent

In [8]:
BUFFER_SIZE = int(1e6)  
BATCH_SIZE = 64
GAMMA = 0.99            
TAU = 1e-3                    
EPISODES = 500

In [9]:
def ddpg(episodes):
    agent = Agent(state_size = state_size, action_size = action_size,
              buffer_size = BUFFER_SIZE, batch_size = BATCH_SIZE,
              gamma = GAMMA, tau = TAU)
    reward_list = []
    mean_reward = -20000
    for i in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            total_reward +=reward
            state = next_state
            
        reward_list.append(total_reward)
        agent.sd = max(agent.sd - 0.01, 0.1)
        if total_reward > 50:
            r = agent.test()
            local_mean = np.mean(r)
            print(f"episode: {i+1}, current reward: {total_reward}, max reward: {np.max(r)}, mean reward: {local_mean}")
            if local_mean > mean_reward:
                mean_reward = local_mean
                agent.save()
                print("Saved")
        else:
            print(f"episode: {i+1}, current reward: {total_reward}")
            
            
    return reward_list

reward = ddpg(EPISODES)           

episode: 1, current reward: -53.14649849925293
episode: 2, current reward: -51.18537799228434
episode: 3, current reward: -47.6784317369832
episode: 4, current reward: -50.41504956254138
episode: 5, current reward: -50.20164968504307
episode: 6, current reward: -48.04421274303699
episode: 7, current reward: -49.10299902205487
episode: 8, current reward: -47.90219743382987
episode: 9, current reward: -45.90659546515292
episode: 10, current reward: 70.31940161171292, max reward: -0.1671011956508624, mean reward: -0.1675511717809639
Saved


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


episode: 11, current reward: -45.481127945608485
episode: 12, current reward: -46.77662850898423
episode: 13, current reward: -46.07504396321992
episode: 14, current reward: -46.70766253888688
episode: 15, current reward: -45.73336123741364
episode: 16, current reward: -44.46730489070606
episode: 17, current reward: -45.18101737811812
episode: 18, current reward: -44.54450868555374
episode: 19, current reward: -42.575676645489615
episode: 20, current reward: -42.66587927884428
episode: 21, current reward: -42.07691309894329
episode: 22, current reward: -40.610491788458255
episode: 23, current reward: -41.81188579535066
episode: 24, current reward: -43.177421322604765
episode: 25, current reward: -40.12162912171294
episode: 26, current reward: -39.050530328081834
episode: 27, current reward: -39.88279051708597
episode: 28, current reward: -40.3046859177759
episode: 29, current reward: -39.71056518473935
episode: 30, current reward: -42.096173301496364
episode: 31, current reward: -39.29

episode: 106, current reward: 93.85514439342722, max reward: 95.04757696229204, mean reward: 94.52736844128205
episode: 107, current reward: 95.18805179756325, max reward: 95.08555149768864, mean reward: 94.30867155191494
episode: 108, current reward: 95.00574102825938, max reward: 95.09466500597135, mean reward: 94.54465952061629
episode: 109, current reward: 93.7933504665332, max reward: 94.97931338055065, mean reward: 94.45459951254335
episode: 110, current reward: 94.59527192049231, max reward: 94.82623057922397, mean reward: 94.50218744144335
episode: 111, current reward: 94.79850622844322, max reward: 95.12354240943755, mean reward: 93.80464740737442
episode: 112, current reward: 92.63657588156033, max reward: 94.91657993003935, mean reward: 94.54551991375145
episode: 113, current reward: 94.63586067059025, max reward: 94.95865358785576, mean reward: 94.54322213763959
episode: 114, current reward: 94.82679571721349, max reward: 95.16155175880314, mean reward: 94.48725319027352
ep

episode: 181, current reward: 91.97057642355544, max reward: 94.45282251206905, mean reward: 93.89331885798023
episode: 182, current reward: 91.9001571718646, max reward: 94.4352181365683, mean reward: 93.94289190542477
episode: 183, current reward: 94.01373760064558, max reward: 94.45463641070359, mean reward: 93.8295784164128
episode: 184, current reward: 94.22424138737885, max reward: 94.54140757414393, mean reward: 94.14822240887015
episode: 185, current reward: 94.8476464099151, max reward: 94.66796386859554, mean reward: 94.07759357872365
episode: 186, current reward: 93.90717124471107, max reward: 94.68394847314511, mean reward: 93.76187104994398
episode: 187, current reward: 94.44970678640136, max reward: 94.47507941303972, mean reward: 93.69156374803931
episode: 188, current reward: 94.60687568673436, max reward: 94.4385216557855, mean reward: 93.41940868202924
episode: 189, current reward: 91.76909100192341, max reward: 94.4760345790131, mean reward: 93.43999936871232
episode

episode: 256, current reward: 92.48726180031379, max reward: 93.4768340863681, mean reward: 90.65969965393933
episode: 257, current reward: 90.35226023674224, max reward: 93.53932694260972, mean reward: 90.56801656469668
episode: 258, current reward: 87.38243606931096, max reward: 93.4529083644023, mean reward: 90.57776021042699
episode: 259, current reward: 90.83738289941954, max reward: 93.41240254877964, mean reward: 90.1688160644587
episode: 260, current reward: 90.97066878478256, max reward: 93.49524083793462, mean reward: 90.46840736043558
episode: 261, current reward: 89.61737385118477, max reward: 93.42717788145171, mean reward: 90.64614797055819
episode: 262, current reward: 90.27650887202225, max reward: 93.36126421297972, mean reward: 89.99512471867854
episode: 263, current reward: 90.67945416558167, max reward: 93.35437310288285, mean reward: 90.45573427592382
episode: 264, current reward: 90.38481250918122, max reward: 93.4104235547272, mean reward: 90.06934698449112
episo

episode: 331, current reward: 94.01425703335073, max reward: 93.67487440789574, mean reward: 93.43827080896075
episode: 332, current reward: 94.04282775975055, max reward: 93.67193964484801, mean reward: 93.43450551420287
episode: 333, current reward: 93.01919970187086, max reward: 93.68184335889966, mean reward: 92.89917198662596
episode: 334, current reward: 91.03496244746036, max reward: 93.6778208857416, mean reward: 92.68724011792521
episode: 335, current reward: 90.86340768758481, max reward: 93.67560765316772, mean reward: 92.63579728449744
episode: 336, current reward: 93.81541841470562, max reward: 93.66314213436084, mean reward: 93.42989697797222
episode: 337, current reward: 93.91694830860067, max reward: 93.67677152405157, mean reward: 93.57009186610858
episode: 338, current reward: 94.16391108450708, max reward: 93.67617323088577, mean reward: 92.62875437176433
episode: 339, current reward: 93.86520387541617, max reward: 93.66358348647523, mean reward: 92.90589593592445
ep

episode: 405, current reward: 93.12177864879013, max reward: 92.99794326101375, mean reward: 92.89034317976575
episode: 406, current reward: 93.47983674390464, max reward: 92.96439930089313, mean reward: 92.84747028088809
episode: 407, current reward: 93.44185337660659, max reward: 92.88765284042826, mean reward: 92.7981903910325
episode: 408, current reward: 93.14486790106514, max reward: 92.9600625540471, mean reward: 92.84414059624649
episode: 409, current reward: 93.12859830854873, max reward: 92.84792645628391, mean reward: 92.7654898229603
episode: 410, current reward: 93.30318805488864, max reward: 93.00211036718173, mean reward: 92.91247732234919
episode: 411, current reward: 93.22592613849177, max reward: 92.91103593653813, mean reward: 92.83294532171718
episode: 412, current reward: 93.26851345576215, max reward: 92.89704340069059, mean reward: 92.81000709302518
episode: 413, current reward: 93.19662019700705, max reward: 92.75248314682545, mean reward: 92.66357433110646
epis

episode: 480, current reward: 92.98666137772902, max reward: 93.38232026835136, mean reward: 93.11551749196724
episode: 481, current reward: 92.89148431314982, max reward: 93.32976922790458, mean reward: 93.08702208586622
episode: 482, current reward: 93.12957008027564, max reward: 93.26068654834208, mean reward: 93.04382475977359
episode: 483, current reward: 93.2385391693501, max reward: 93.25465405859075, mean reward: 93.04121866263908
episode: 484, current reward: 93.50955685635287, max reward: 93.0774029200683, mean reward: 92.8824791708951
episode: 485, current reward: 93.43955922490106, max reward: 93.17894643500365, mean reward: 92.9786476426964
episode: 486, current reward: 93.04143899422188, max reward: 93.13067548045673, mean reward: 92.95705515612727
episode: 487, current reward: 93.35856787486951, max reward: 93.09025551646964, mean reward: 92.93401881635619
episode: 488, current reward: 93.45258550529094, max reward: 93.17085798471516, mean reward: 93.01856001707161
episo

In [10]:
actor = torch.load("actor.pkl")
actor.to(device)

#critic = torch.load("critic.pkl")
#critic.to(device)

Actor(
  (linear1): Linear(in_features=2, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=32, bias=True)
  (linear3): Linear(in_features=32, out_features=1, bias=True)
)

In [11]:
def game_act(state):
        state =  torch.tensor(state).to(device).float()
        action = actor(state).cpu().data.numpy()
        return action

## Test the Agent

In [17]:
reward = []
for i in range(100):
    state = env.reset()
    local_reward = 0
    done = False
    while not done:
        action = game_act(state)
        state, r, done, _ = env.step(action)
        local_reward += r
    reward.append(local_reward)
print("max reward: ", np.max(reward))
print("mean reward: ", np.mean(reward))

max reward:  96.27186511366071
mean reward:  92.64992503483003
