In [0]:
!pip install torch
!pip install gym
!pip install box2d-py
!pip install gym[Box_2D]
!pip install torch torchvision tqdm gym pyvirtualdisplay
!apt-get install xvfb
!apt-get install python-opengl

import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from functools import reduce
from matplotlib import pyplot as plt
import Box2D

from IPython.display import display as idisplay
from IPython.display import HTML
from matplotlib import animation, rc
from pyvirtualdisplay import Display

import matplotlib.pyplot as plt
import os

# Cast the magic incantations to make it possible to render
# animations in Colab.
display = Display(visible=0, size=(1400, 900))
display.start()
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
  %env DISPLAY=:1    
%matplotlib notebook

Collecting torch
[?25l  Downloading https://files.pythonhosted.org/packages/49/0e/e382bcf1a6ae8225f50b99cc26effa2d4cc6d66975ccf3fa9590efcbedce/torch-0.4.1-cp36-cp36m-manylinux1_x86_64.whl (519.5MB)
[K    100% |████████████████████████████████| 519.5MB 27kB/s 
tcmalloc: large alloc 1073750016 bytes == 0x58c26000 @  0x7f0df3ed62a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641
[?25hInstalling collected packages: torch
Successfully installed torch-0.4.1
Collecting gym
[?25l  Downloading https://files.pythonhosted.org/packages/d4/22/4ff09745ade385ffe707fb5f053548f0f6a6e7d5e98a2b9d6c07f5b931a7/gym-0.10.9.tar.gz (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 7.1MB/s 
Collecting pyglet>=1.2.0 (from gym)
[?25l  Downloading https://files.pythonhosted.org/pa

In [0]:
class PolicyNetwork(nn.Module):
    
  def __init__(self, state_dim, action_dim):
    """
    Takes in a state and decides which action to take based on the given state
    """
    super(PolicyNetwork, self).__init__()
    self.net = nn.Sequential(
        nn.Linear(state_dim, 12),
        nn.ReLU(),
        nn.Linear(12, 12),
        nn.ReLU(),
        nn.Linear(12, 12),
        nn.ReLU(),
        nn.Linear(12, action_dim)
    )

    self.soft_max =nn.Softmax(dim=1)
      
  def forward(self, x):
    return self.soft_max(self.net(x))
  
class ValueNetwork(nn.Module):
  """
  Takes in a state and gives a value for that state
  """
  def __init__(self, state_dim):
    super(ValueNetwork, self).__init__()
    self.net = nn.Sequential(
        nn.Linear(state_dim, 12),
        nn.ReLU(),
        nn.Linear(12, 12),
        nn.ReLU(),
        nn.Linear(12, 12),
        nn.ReLU(),
        nn.Linear(12, 1)
    )

    
  def forward(self, x):
    return self.net(x)

class AdvantageDataset(Dataset):   
  """
  Takes care of the Data set for the advantages
  """
  def __init__(self, experience):                                                                                                                 
    super(AdvantageDataset, self).__init__()                                                                                                    
    self._exp = experience                                                                                                                      
    self._num_runs = len(experience)                                                                                                            
    self._length = reduce(lambda acc, x: acc + len(x), experience, 0)                                                                           

  def __getitem__(self, index):                                                                                                                   
    idx = 0                                                                                                                                     
    seen_data = 0                                                                                                                               
    current_exp = self._exp[0]                                                                                                                  
    while seen_data + len(current_exp) - 1 < index:                                                                                             
        seen_data += len(current_exp)                                                                                                           
        idx += 1                                                                                                                                
        current_exp = self._exp[idx]                                                                                                            
    chosen_exp = current_exp[index - seen_data]                                                                                                 
    return chosen_exp[0], chosen_exp[4]                                                                                                         

  def __len__(self):                                                                                                                              
    return self._length                                                                                                                         
  
class PolicyDataset(Dataset):   
  """
  Takes care of the policys 
  """
  def __init__(self, experience):                                                                                                                 
    super(PolicyDataset, self).__init__()                                                                                                       
    self._exp = experience                                                                                                                      
    self._num_runs = len(experience)                                                                                                            
    self._length = reduce(lambda acc, x: acc + len(x), experience, 0)                                                                           

  def __getitem__(self, index):                                                                                                                   
    idx = 0                                                                                                                                     
    seen_data = 0                                                                                                                               
    current_exp = self._exp[0]                                                                                                                  
    while seen_data + len(current_exp) - 1 < index:                                                                                             
        seen_data += len(current_exp)                                                                                                           
        idx += 1                                                                                                                                
        current_exp = self._exp[idx]                                                                                                            
    chosen_exp = current_exp[index - seen_data]                                                                                                 
    return chosen_exp                                                                                                                           

  def __len__(self):                                                                                                                              
    return self._length 
  

class Calculate:
  """
  Calculates the the returns and the advantages for each rollout
  """
  def __init__(self):
    pass
  
  def returns(self, rollouts, gamma):
    for i, rollout in enumerate(rollouts):
      projected_reward = 0
      for j in range(len(rollout))[::-1]:
        state, probabilities, action_id, reward = rollout[j]
        projected_reward = reward + (gamma * projected_reward)
        rollout[j] = [state, probabilities, action_id, reward, projected_reward]
  
  def advantages(self, rollouts, value):
    for i, rollout in enumerate(rollouts):
      for j, expererience in enumerate(rollout):
        expected_reward = expererience[4]
        state = expererience[0]
        advantage = expected_reward - value(torch.from_numpy(state).float().unsqueeze(0)).squeeze(0).item()
        rollout[j] = expererience + [advantage]
        
class Animation():
  """
  Animates the output of an OpenAI Gym environment.  Example usage:
  
  anim = Animation()
  for _ in steps:
    # Do some machine learning...

    # Add a frame to the animation.
    anim.add(env.render(mode = 'rgb_array'))

    # Do some more machine learning...
    
  # Render the animation in Colab.
  anim.show()
  """
  
  def __init__(self, fps = 30):
    self.frames = []
    self.fps = fps
  
  def add(self, frame):
    self.frames.append(frame)
    
  def show(self):
    frames = self.frames
    plt.figure(figsize = (frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0),
               dpi = 72)
    patch = plt.imshow(frames[0])
    plt.axis('off')
    anim = animation.FuncAnimation(plt.gcf(),
                                   lambda i: patch.set_data(frames[i]),
                                   frames=len(frames),
                                   interval = 1000 / self.fps,
                                   blit=True)
    
    rc('animation', html='jshtml')
    idisplay(HTML(anim.to_jshtml()))
    plt.close()

    
def show_vid(policy):
  anim = Animation()
  state = env.reset()
  while True:
    action = policy(torch.from_numpy(state).float().view((1,-1))).detach().numpy()
    action_id = np.argmax(np.random.multinomial(1, action.reshape((action_dim))))
    new_state, reward, done, _ = env.step(action_id)
    state = new_state
    if(done):
      break

    # Add a frame to the animation.
    anim.add(env.render(mode = 'rgb_array'))
    
  # Render the animation in Colab.
  anim.show()

In [0]:
def train(epochs, env, policy, value):

  calculate = Calculate()
  policy_optim = optim.Adam(policy.parameters(), lr=1e-2, weight_decay=0.01)
  value_optim = optim.Adam(value.parameters(), lr=1e-3, weight_decay=1)

  value_loss = nn.MSELoss()

  # Hyperparameters
  num_rollouts = 100
  episode_length = 200
  gamma = 0.9
  value_epochs = 2
  policy_epochs = 5
  batch_size = 32
  policy_batch_size = 256
  epsilon = 0.2

  epoch_losses = [game]
  for _ in range(epochs):
    # generate rollouts
    print("Epoch: " + str(_+1))
    rollouts = []
    steps = 0
    state = env.reset()
    for _ in range(num_rollouts):
      cur_rollout = []
      state = env.reset()
#       print(state)
#       for _ in range(episode_length):
      done = False
      while(not done):
        # don't forget to reset the environment at the beginning of each episode!
        # rollout for a certain number of steps!
        action = policy(torch.from_numpy(state).float().view((1,-1))).detach().numpy()
        action_id = np.argmax(np.random.multinomial(1, action.reshape((action_dim))))
        new_state, reward, done, _ = env.step(action_id)

        cur_rollout.append([state, action.reshape(-1), action_id, reward])
        steps += 1
        state = new_state

      rollouts.append(cur_rollout)
      
    avg_steps = steps / num_rollouts
    if(game == 'CartPole-v1'):
      print('Average Standing time:')
    elif(game == 'Acrobot-v1'):
      print("Number of steps to get above the line")
    elif(game == 'MountainCar-v0'):
      print("Number of steps to get over the hill")
    elif(game == 'MountainCar-v0'):
      print("Number of steps to get over the hill")
    print(avg_steps)
      
      
    calculate.returns(rollouts, gamma)

    # Approximate the value function
    value_dataset = AdvantageDataset(rollouts)
    value_loader = DataLoader(value_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    value_losses = []
#     print("Train Value")
    for _ in range(value_epochs):
        # train value network
        total_loss = 0
        for state, returns in value_loader:
          value_optim.zero_grad()
          returns = returns.unsqueeze(1).float()
          expected_returns = value(state.float())
          loss = value_loss(expected_returns, returns)
          loss.backward()
          total_loss += loss.item()
          value_optim.step()
        value_losses.append(total_loss)
#         print("Value Loss", total_loss/value_epochs)
        
    calculate.advantages(rollouts, value)

    # Learn a policy
    policy_dataset = PolicyDataset(rollouts)
    policy_loader = DataLoader(policy_dataset, batch_size=policy_batch_size, shuffle=True, pin_memory=True)
    policy_losses = []
    
#     print("Train Policy")
    for _ in range(policy_epochs):
      # train policy network
      total_loss = 0
      for state, old_policy_dist, action_id, reward, return_reward, advantage in policy_loader:
        policy_optim.zero_grad()
        cur_batch_size = reward.size()[0]
        advantage = return_reward.float()

        new_policy_dist = policy(state.float())

        ratio = new_policy_dist[range(cur_batch_size), action_id] / old_policy_dist[range(cur_batch_size), action_id] 

        left = ratio * advantage
        right = torch.clamp(ratio*advantage, 1 - epsilon, 1 + epsilon ) * advantage
        loss = -torch.mean(torch.min(left, right))
        loss.backward()
        total_loss += loss.item()
        policy_optim.step()
        
      policy_losses.append(total_loss)
#       print("Policy Loss", total_loss/policy_epochs)
      
    epoch_losses.append([avg_steps, np.mean(value_losses), np.mean(policy_losses)])
  return policy, epoch_losses

        

In [0]:
def plot(losses):
  game = losses[0]
  losses = losses[1:]
  standing = []
  policy_losses = []
  value_losses = []

  for loss in losses:
    standing.append(loss[0])
    policy_losses.append(loss[1])
    value_losses.append(loss[2])
  
  plt.title("LOSSES")
  plt.plot(range(len(losses)), policy_losses, label='Policy Loss')
  plt.plot(range(len(losses)), value_losses, label='Value Loss')
  plt.legend()
  plt.xlabel("Epochs")
  plt.ylabel("Loss")
  plt.show()
  
  if(game == 'MountainCar-v0'):
    plt.title("MountainCar")
    plt.ylabel("Actions to get over the hill")
  elif(game == 'Acrobot-v1'):
    plt.title("Acrobot")
    plt.ylabel("Acrobot Actions to get Above the line")
  elif(game == 'CartPole-v1'):
    plt.title("CartPole")
    plt.ylabel("CartPole Average Standing")
  elif(game == 'LunarLander-v2'):
    plt.title("LunarLander")
    plt.ylabel("CartPole Average Standing")

  plt.plot(range(len(losses)), standing, label='Standing')
  plt.xlabel("Epochs")
  plt.show();


In [0]:
games = ['CartPole-v1', 'Acrobot-v1', 'LunarLander-v2']
game = games[0]
env = gym.make(game)
state_dim = len(env.reset())
action_dim = env.action_space.n
policy = PolicyNetwork(state_dim, action_dim)
value = ValueNetwork(state_dim)

  result = entry_point.load(False)


In [0]:
# Get Before training video
show_vid(policy)

<IPython.core.display.Javascript object>

In [0]:
# Train the network
epochs = 100
policy, losses = train(epochs, env, policy, value)
plot(losses)

Epoch: 1
Average Standing time:
21.83
Epoch: 2
Average Standing time:
25.57
Epoch: 3
Average Standing time:
33.87
Epoch: 4
Average Standing time:
52.65
Epoch: 5
Average Standing time:
92.83
Epoch: 6
Average Standing time:
180.83
Epoch: 7
Average Standing time:
253.12
Epoch: 8
Average Standing time:
219.53
Epoch: 9
Average Standing time:
167.64
Epoch: 10
Average Standing time:
214.34
Epoch: 11
Average Standing time:
292.02
Epoch: 12
Average Standing time:
325.57
Epoch: 13
Average Standing time:
341.84
Epoch: 14
Average Standing time:
376.2
Epoch: 15
Average Standing time:
269.95
Epoch: 16
Average Standing time:
198.48
Epoch: 17
Average Standing time:
215.26
Epoch: 18
Average Standing time:
268.92
Epoch: 19
Average Standing time:
271.35
Epoch: 20
Average Standing time:
241.2
Epoch: 21
Average Standing time:
279.14
Epoch: 22
Average Standing time:
247.66
Epoch: 23
Average Standing time:
264.37
Epoch: 24
Average Standing time:
286.41
Epoch: 25
Average Standing time:
275.77
Epoch: 26
Averag

<IPython.core.display.Javascript object>

In [0]:
plot(losses)

In [0]:
# Show Trained network video
show_vid(policy)

<IPython.core.display.Javascript object>

In [0]:
# LUNAR LANDER GAME
game = games[2]
env = gym.make(game)
state_dim = len(env.reset())
action_dim = env.action_space.n
policy = PolicyNetwork(state_dim, action_dim)
value = ValueNetwork(state_dim)
# Get Before training video
show_vid(policy)

  result = entry_point.load(False)


<IPython.core.display.Javascript object>

In [0]:
# Train the network
epochs = 50
policy, losses = train(epochs, env, policy, value)
plot(losses)

Epoch: 1
110.89
Epoch: 2
104.65
Epoch: 3
107.6
Epoch: 4
134.54
Epoch: 5
175.54
Epoch: 6
174.37
Epoch: 7
169.07
Epoch: 8
265.4
Epoch: 9
391.49
Epoch: 10
552.55
Epoch: 11
636.86
Epoch: 12
669.23
Epoch: 13
717.2
Epoch: 14
835.38
Epoch: 15
797.14
Epoch: 16
811.17
Epoch: 17
875.85
Epoch: 18
991.73
Epoch: 19
1000.0
Epoch: 20
1000.0
Epoch: 21
1000.0
Epoch: 22
1000.0
Epoch: 23
994.55
Epoch: 24
989.68
Epoch: 25
1000.0
Epoch: 26
994.77
Epoch: 27
972.92
Epoch: 28
1000.0
Epoch: 29
991.01
Epoch: 30
992.24
Epoch: 31
1000.0
Epoch: 32
1000.0
Epoch: 33
999.82
Epoch: 34
999.38
Epoch: 35
998.15
Epoch: 36
1000.0
Epoch: 37
982.61
Epoch: 38
979.17
Epoch: 39
978.33
Epoch: 40
991.46
Epoch: 41
998.19
Epoch: 42
1000.0
Epoch: 43
1000.0
Epoch: 44
1000.0
Epoch: 45
1000.0
Epoch: 46
1000.0
Epoch: 47
991.28
Epoch: 48
992.76
Epoch: 49
1000.0
Epoch: 50
994.26


In [0]:
plot(losses)

In [0]:
# Show Trained network video
show_vid(policy)

<IPython.core.display.Javascript object>