### Actor-Critic version Lunar Lander

In [0]:
!pip install torch
!pip install 'gym[all]'

Collecting mujoco-py<2.0,>=1.50; extra == "all"
  Using cached https://files.pythonhosted.org/packages/cf/8c/64e0630b3d450244feef0688d90eab2448631e40ba6bdbd90a70b84898e7/mujoco-py-1.50.1.68.tar.gz
Building wheels for collected packages: mujoco-py
  Building wheel for mujoco-py (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for mujoco-py[0m
[?25h  Running setup.py clean for mujoco-py
Failed to build mujoco-py
Installing collected packages: mujoco-py
    Running setup.py install for mujoco-py ... [?25l[?25herror
[31mERROR: Command errored out with exit status 1: /usr/bin/python3 -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-install-f37kpk8y/mujoco-py/setup.py'"'"'; __file__='"'"'/tmp/pip-install-f37kpk8y/mujoco-py/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' install --record /tmp/pip-record-djuyd3t8/install-record.tx

In [0]:
import torch
import gym
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.affine = nn.Linear(8, 128)
        
        self.action_layer = nn.Linear(128, 4)
        self.value_layer = nn.Linear(128, 1)
        
        self.logprobs = []
        self.state_values = []
        self.rewards = []

    def forward(self, state):
        state = torch.from_numpy(state).double()
        state = F.relu(self.affine(state))
        
        state_value = self.value_layer(state)
        
        action_probs = F.softmax(self.action_layer(state))
        action_distribution = Categorical(action_probs)
        action = action_distribution.sample()
        
        self.logprobs.append(action_distribution.log_prob(action))
        self.state_values.append(state_value)
        
        return action.item()
    
    def calculateLoss(self, gamma=0.99):
        
        # calculating discounted rewards:
        rewards = []
        dis_reward = 0
        for reward in self.rewards[::-1]:
            dis_reward = reward + gamma * dis_reward
            rewards.insert(0, dis_reward)
                
        # normalizing the rewards:
        rewards = torch.tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std())
        
        loss = 0
        for logprob, value, reward in zip(self.logprobs, self.state_values, rewards):
            advantage = reward  - value.item()
            action_loss = -logprob * advantage
            value_loss = F.smooth_l1_loss(value, reward)
            loss += (action_loss + value_loss)   
        return loss
    
    def clearMemory(self):
        del self.logprobs[:]
        del self.state_values[:]
        del self.rewards[:]

In [0]:
from PIL import Image

def test(n_episodes=5, name='LunarLander_v1.pth'):
    env = gym.make('LunarLander-v2')
    policy = ActorCritic()
    
    policy.load_state_dict(torch.load(mydrive+'{}'.format(name)))
    
    render = True
    save_gif = True

    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        running_reward = 0
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            running_reward += reward
            if render:
                 env.render()
                 if save_gif:
                     img = env.render(mode = 'rgb_array')
                     img = Image.fromarray(img)
                     img.save(mydrive+'gif_{}.jpg'.format(t))
            if done:
                break
        print('Episode {}\tReward: {}'.format(i_episode, running_reward))
    env.close()

In [0]:
import torch.optim as optim
from google.colab import drive
drive.mount('/content/drive')
mydrive ="/content/drive/My Drive/Colab Notebooks/DS_hw4_lunar_lander/"

def train():

    render = False
    gamma = 0.99
    lr = 0.02
    betas = (0.9, 0.999)
    random_seed = 543
    
    torch.manual_seed(random_seed)
    
    env = gym.make('LunarLander-v2')
    env.seed(random_seed)
    
    policy = ActorCritic()
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    print(lr,betas)
    
    running_reward = 0
    for i_episode in range(0, 10000):
        state = env.reset()
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            policy.rewards.append(reward)
            running_reward += reward
            if render and i_episode > 1000:
                env.render()
            if done:
                break
                    
        # Updating the policy :
        optimizer.zero_grad()
        loss = policy.calculateLoss(gamma)
        loss.backward()
        optimizer.step()        
        policy.clearMemory()
        
        # saving the model if episodes > 999 OR avg reward > 200 
        #if i_episode > 999:
        #    torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
        
        if running_reward > 4000:
            #torch.save(policy.state_dict(), mydrive+'LunarLander_v1.pth'.format(lr, betas[0], betas[1]))
            print("########## Solved! ##########")
            test(name='LunarLander_v1.pth'.format(lr, betas[0], betas[1]))
            break
        
        if i_episode % 20 == 0:
            running_reward = running_reward/20
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
            running_reward = 0
            


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Below is for Colab to have the gym display window

In [0]:
!apt-get install xvfb
!pip install pyvirtualdisplay
!pip install Pillow
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.4).
0 upgraded, 0 newly installed, 0 to remove and 31 not upgraded.


<pyvirtualdisplay.display.Display at 0x7f870ea9cac8>

### Train the model

In [0]:
torch.set_default_tensor_type(torch.DoubleTensor)
train()

0.02 (0.9, 0.999)
Episode 0	length: 81	reward: -6.557279878072909




Episode 20	length: 154	reward: -396.04188633687227
Episode 40	length: 118	reward: -267.82300554392606
Episode 60	length: 150	reward: -517.5924279392121
Episode 80	length: 153	reward: -423.0069357130834
Episode 100	length: 113	reward: -326.1209005643042
Episode 120	length: 98	reward: -242.49694728043406
Episode 140	length: 89	reward: -449.4194637776465
Episode 160	length: 87	reward: -482.7429554841181
Episode 180	length: 69	reward: -202.55485482418368
Episode 200	length: 81	reward: -72.84484324908954
Episode 220	length: 153	reward: -67.53457492940767
Episode 240	length: 98	reward: -55.19903008630845
Episode 260	length: 142	reward: -71.64755129128199
Episode 280	length: 114	reward: -154.9368873101258
Episode 300	length: 164	reward: -94.65030034212754
Episode 320	length: 114	reward: -65.86661435346787
Episode 340	length: 240	reward: -142.0397544048803
Episode 360	length: 113	reward: -7.190940598384961
Episode 380	length: 90	reward: -13.850279174129543
Episode 400	length: 999	reward: -31.8

### Clear out all 

In [0]:
import os, signal
os.kill(os.getpid(), signal.SIGKILL)