In [1]:
!pip install torch
!pip install 'gym[all]'

Collecting mujoco-py<2.0,>=1.50; extra == "all"
  Using cached https://files.pythonhosted.org/packages/cf/8c/64e0630b3d450244feef0688d90eab2448631e40ba6bdbd90a70b84898e7/mujoco-py-1.50.1.68.tar.gz
Building wheels for collected packages: mujoco-py
  Building wheel for mujoco-py (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for mujoco-py[0m
[?25h  Running setup.py clean for mujoco-py
Failed to build mujoco-py
Installing collected packages: mujoco-py
    Running setup.py install for mujoco-py ... [?25l[?25herror
[31mERROR: Command errored out with exit status 1: /usr/bin/python3 -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-install-x2m0vugo/mujoco-py/setup.py'"'"'; __file__='"'"'/tmp/pip-install-x2m0vugo/mujoco-py/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' install --record /tmp/pip-record-x8ei1mq3/install-record.tx

In [2]:
import torch
import torch.nn as nn
from torch.distributions import Categorical
import gym
from google.colab import drive
# Attach Google Drive to the Colab filesystem; checkpoints are read from and
# written to the folder below.
drive.mount('/content/drive')

# Base folder (trailing slash included) that file names are appended to.
mydrive = "/content/drive/My Drive/Colab Notebooks/DS_hw4_lunar_lander/"

# Run on the first CUDA GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
class AC(nn.Module):
    """Actor-critic network made of two independent MLPs.

    The actor (``action_layer``) maps a state to a probability distribution
    over discrete actions; the critic (``value_layer``) maps a state to a
    single value estimate. Each has two hidden ``Tanh`` layers of width
    ``n_latent``.

    Args:
        state_dim: Size of the state vector fed to both networks.
        action_dim: Number of discrete actions (actor output size).
        n_latent: Width of every hidden layer.
    """

    def __init__(self, state_dim, action_dim, n_latent):
        super(AC, self).__init__()
        # Attribute names and layer order define the state_dict keys, so they
        # must stay as they are for saved checkpoints to keep loading.
        self.action_layer = nn.Sequential(nn.Linear(state_dim, n_latent), nn.Tanh(),  # actor
                                          nn.Linear(n_latent, n_latent), nn.Tanh(),
                                          nn.Linear(n_latent, action_dim), nn.Softmax(dim=-1))
        self.value_layer = nn.Sequential(nn.Linear(state_dim, n_latent), nn.Tanh(),  # critic
                                         nn.Linear(n_latent, n_latent), nn.Tanh(),
                                         nn.Linear(n_latent, 1))

    def action(self, state, ex):
        """Sample an action for ``state`` and record the step in ``ex``.

        Args:
            state: NumPy array holding one observation; it is converted to a
                float64 tensor on the module-level ``device``.
            ex: Rollout buffer exposing ``states``, ``actions`` and
                ``logprobability`` lists. One entry is appended to each.

        Returns:
            The sampled action as a plain Python number (``Tensor.item()``).
        """
        state = torch.from_numpy(state).double().to(device)

        # Rollout collection is never back-propagated through (PPO.update
        # detaches everything it reads from the buffer), so skip building an
        # autograd graph for every environment step.
        with torch.no_grad():
            action_probability = self.action_layer(state)
            dist = Categorical(action_probability)
            action = dist.sample()
            log_prob = dist.log_prob(action)

        ex.states.append(state)  # add conditions in experience replay
        ex.actions.append(action)
        ex.logprobability.append(log_prob)

        return action.item()

    def evaluation(self, state, action):
        """Score ``action`` under the current actor and evaluate the critic.

        Args:
            state: Tensor of states accepted by both networks.
            action: Tensor of action indices to score against the actor's
                distribution for ``state``.

        Returns:
            Tuple ``(log_prob, state_value, entropy)``: the log-probability of
            ``action``, the critic output with its trailing size-1 dimension
            removed, and the entropy of the action distribution.
        """
        action_probability = self.action_layer(state)
        dist = Categorical(action_probability)
        action_logprobability = dist.log_prob(action)
        d_entropy = dist.entropy()
        state_value = self.value_layer(state)

        # Squeeze only the last dimension: a bare squeeze() would also drop
        # the batch axis when the batch holds a single state.
        return action_logprobability, state_value.squeeze(-1), d_entropy
    

                               

In [0]:
class Experience:
    """Rollout buffer made of five parallel Python lists.

    The lists are ``actions``, ``rewards``, ``terminal_state``, ``states``
    and ``logprobability``; callers append to them directly.
    """

    # Names of the per-step lists, in the order they are created.
    _FIELDS = ("actions", "rewards", "terminal_state", "states", "logprobability")

    def __init__(self):
        for field in self._FIELDS:
            setattr(self, field, [])

    def clear_out_ex(self):
        """Empty every list in place (existing references stay valid)."""
        for field in self._FIELDS:
            getattr(self, field).clear()

In [0]:
class PPO:
    """PPO trainer using the clipped surrogate objective.

    Two copies of the actor-critic network are kept: ``policy`` is the one
    optimised by ``update``; ``old_policy`` is a snapshot that is re-synced
    from ``policy`` at the end of every ``update`` call.

    Args:
        state_dim: Size of the state vector.
        action_dim: Number of discrete actions.
        n_latent: Hidden-layer width of the actor-critic networks.
        lr: Adam learning rate.
        betas: Adam ``betas`` pair.
        gamma: Discount factor used for the returns.
        K_epochs: Number of optimisation passes per ``update`` call.
        clip: Clipping range; probability ratios are clamped to
            ``[1 - clip, 1 + clip]``.
    """

    def __init__(self, state_dim, action_dim, n_latent, lr, betas, gamma, K_epochs, clip):
        self.lr = lr
        self.gamma = gamma
        self.clip = clip
        self.betas = betas
        self.K_epochs = K_epochs

        self.policy = AC(state_dim, action_dim, n_latent).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.old_policy = AC(state_dim, action_dim, n_latent).to(device)
        self.old_policy.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def update(self, ex):
        """Run ``K_epochs`` optimisation passes over the buffered rollout.

        Args:
            ex: Rollout buffer with ``rewards``, ``terminal_state``,
                ``states``, ``actions`` and ``logprobability`` lists, which
                are expected to hold one entry per collected step.

        An empty buffer is a no-op. The buffer itself is not cleared here.
        """
        if not ex.rewards:
            # Nothing collected: torch.stack() below would raise on empty lists.
            return

        # Discounted returns, computed back-to-front so each step only needs
        # the return of its successor. The running value restarts at every
        # terminal step so returns never cross an episode boundary.
        # Built with append + reverse (linear) rather than insert(0, ...),
        # which shifts the whole list on every step.
        returns = []
        running_return = 0
        for reward, is_terminal in zip(reversed(ex.rewards), reversed(ex.terminal_state)):
            if is_terminal:
                running_return = 0
            running_return = reward + (self.gamma * running_return)
            returns.append(running_return)
        returns.reverse()

        rewards = torch.tensor(returns).to(device)  # normalized the rewards
        centered = rewards - rewards.mean()
        if rewards.numel() > 1:
            rewards = centered / (rewards.std() + 0.00001)
        else:
            # std() of a single sample is NaN; centring alone keeps it finite.
            rewards = centered

        old_states = torch.stack(ex.states).to(device).detach()  # convert list to tensor
        old_actions = torch.stack(ex.actions).to(device).detach()
        old_logprob = torch.stack(ex.logprobability).to(device).detach()

        for _ in range(self.K_epochs):
            logprobs, state_val, d_entropy = self.policy.evaluation(old_states, old_actions)

            # Probability ratio new/old, computed in log space.
            ratios = torch.exp(logprobs - old_logprob)

            # The critic is trained through the MSE term only, so its output
            # is detached when used as the advantage baseline.
            advantages = rewards - state_val.detach()
            surrogate1 = ratios * advantages
            surrogate2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * advantages

            # Clipped policy loss + 0.5 * value loss - 0.01 * entropy bonus.
            # MseLoss is a scalar and broadcasts over the per-step terms.
            loss = (-torch.min(surrogate1, surrogate2)
                    + 0.5 * self.MseLoss(state_val, rewards)
                    - 0.01 * d_entropy)

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Sync the snapshot with the freshly optimised weights.
        self.old_policy.load_state_dict(self.policy.state_dict())


In [0]:
def main():
    """Train a PPO agent on LunarLander-v2.

    Actions are sampled from ``ppo.old_policy``; every ``update_timestep``
    environment steps the buffered rollout is passed to ``ppo.update`` and
    then cleared. Every ``log_interval`` episodes the average episode length
    and reward are printed and the accumulators reset. Training stops once
    the reward accumulated since the last log line exceeds
    ``log_interval * solve_reward``; the policy weights are then saved to
    ``mydrive + 'PPO_v2.pth'``. Otherwise it stops after ``max_epoch``
    episodes without saving.
    """
    env_name = "LunarLander-v2"
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    # Read the action count from the environment instead of hard-coding it.
    action_dim = int(env.action_space.n)
    render = False
    solve_reward = 150
    log_interval = 20
    max_epoch = 50000
    max_timestep = 300
    update_timestep = 2000

    ex = Experience()
    ppo = PPO(state_dim, action_dim, 64, 0.002, (0.9, 0.999), 0.99, 4, 0.2)

    running_reward, avg_length, timestep = 0, 0, 0

    try:
        for i in range(1, max_epoch+1):
            state = env.reset()
            for t in range(max_timestep):
                timestep += 1
                action = ppo.old_policy.action(state, ex)
                state, reward, done, _ = env.step(action)

                ex.rewards.append(reward)
                # An episode cut off by the step limit must also be flagged as
                # an episode boundary. Otherwise the discounted returns of its
                # last steps would absorb rewards from the next episode, which
                # starts from a fresh env.reset().
                episode_over = done or t == max_timestep - 1
                ex.terminal_state.append(episode_over)

                if timestep % update_timestep == 0:
                    ppo.update(ex)
                    ex.clear_out_ex()
                    timestep = 0

                running_reward += reward
                if render:
                    env.render()
                if done:
                    break
            avg_length += t

            if running_reward > (log_interval*solve_reward):
                print("solveeeee")
                torch.save(ppo.policy.state_dict(), mydrive+'PPO_v2.pth')
                break

            if i % log_interval == 0:
                avg_length = int(avg_length/log_interval)
                running_reward = int((running_reward/log_interval))
                print('Episode {} \t avg length: {} \t reward: {}'.format(i, avg_length, running_reward))
                running_reward = 0
                avg_length = 0
    finally:
        # Release the environment even when training stops early or raises.
        env.close()



In [7]:
!apt-get install xvfb
!pip install pyvirtualdisplay
!pip install Pillow
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.4).
0 upgraded, 0 newly installed, 0 to remove and 32 not upgraded.


<pyvirtualdisplay.display.Display at 0x7f2bf2bfd6a0>

In [0]:
# Make float64 the default floating-point dtype so that networks built after
# this cell match the .double() state tensors that AC.action feeds them.
# This must run before any PPO(...) is constructed.
# torch.set_default_dtype is the supported replacement for the deprecated
# torch.set_default_tensor_type(torch.DoubleTensor).
torch.set_default_dtype(torch.float64)
# Training entry point, left disabled; uncomment to train from scratch.
#main()

In [0]:
from PIL import Image
import gym

def test():
    """Run three evaluation episodes with the saved PPO policy.

    Loads the weights stored at ``mydrive + "PPO_v2.pth"`` into
    ``ppo.old_policy``, plays episodes of at most ``max_timestep`` steps on
    LunarLander-v2 and prints the integer-truncated total reward of each one.
    When ``save_gif`` is enabled, a frame per step is written to
    ``mydrive + 'gif_<t>.jpg'``; file names depend only on the step index, so
    later episodes overwrite the frames of earlier ones.
    """
    env_name = "LunarLander-v2"
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]

    max_timestep = 500
    render = True
    save_gif = False

    ex = Experience()
    ppo = PPO(state_dim, 4, 64, 0.0007, (0.9, 0.999), 0.99, 4, 0.2)
    # map_location lets a checkpoint saved on a GPU runtime load on a
    # CPU-only runtime (and vice versa).
    ppo.old_policy.load_state_dict(torch.load(mydrive+"PPO_v2.pth", map_location=device))

    try:
        for ep in range(1, 4):
            ep_reward = 0
            state = env.reset()
            for t in range(max_timestep):
                action = ppo.old_policy.action(state, ex)
                state, reward, done, _ = env.step(action)
                ep_reward += reward
                if render:
                    env.render()
                if save_gif:
                    img = env.render(mode = 'rgb_array')
                    img = Image.fromarray(img)
                    img.save(mydrive+'gif_{}.jpg'.format(t))
                if done:
                    break

            print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
            # action() appends to the buffer on every step and nothing here
            # reads it back; empty it so it cannot grow across episodes.
            ex.clear_out_ex()
    finally:
        # Close once, after the last episode. Closing inside the episode loop
        # shut the environment down while it was still going to be reused.
        env.close()

In [10]:
# Evaluate the saved policy; prints one reward line per episode.
test()

Episode: 1	Reward: 166
Episode: 2	Reward: 124
Episode: 3	Reward: 176


In [0]:
import os, signal
# Send SIGKILL to this very process (the notebook kernel). SIGKILL cannot be
# caught, so no cleanup code runs and all in-memory state is lost.
# NOTE(review): presumably used to force a runtime restart and release its
# resources; confirm the intent before keeping this cell in a shared notebook.
os.kill(os.getpid(), signal.SIGKILL)