# Initialization

In [None]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!sudo apt-get install xvfb
!pip install swig
!pip install gym[box2d]

In [3]:
import gym
from gym import logger as gymlogger
from gym.wrappers.record_video import RecordVideo
from IPython.display import HTML
from IPython import display as ipythondisplay
import glob
import io
import base64
import pygame
import numpy as np
import random
from IPython.display import clear_output
import matplotlib.pyplot as plt
import warnings
# Silence every Python warning for the rest of the session.
warnings.simplefilter('ignore')
# Render matplotlib figures inside the notebook output.
%matplotlib inline
import seaborn as sns
# Apply seaborn's default plot theme.
sns.set()
gymlogger.set_level(40) #error only
# Initialise pygame and its font module.
pygame.init()
pygame.font.init()

In [2]:
from pyvirtualdisplay import Display
# Create a non-visible 1400x900 virtual display and start it.
# NOTE(review): presumably needed so environments can render on a headless host - confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to record a gym environment to video and display the recording inline.
To enable recording, wrap the environment: "env = wrap_env(env)"
"""

def show_video():
  """Embed the first recorded episode video in the notebook output.

  Looks for ``*.mp4`` files in the ``video`` directory (relative to the current
  working directory). When at least one exists, the first path returned by
  ``glob`` is read, base64-encoded and displayed as an inline HTML ``<video>``
  element. When none exists, a message is printed instead.

  Returns:
    None.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode is enough here, and the context manager guarantees
  # the file handle is released even if reading fails.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

def wrap_env(env):
  """Return `env` wrapped in a RecordVideo that records every episode to ./video."""
  def record_every_episode(episode_number):
    # Trigger recording regardless of the episode index.
    return True

  return RecordVideo(env, './video', episode_trigger=record_every_episode)

# Test random agent

In [3]:
# Play one LunarLander episode with uniformly sampled actions while recording
# it, then show the recording and the episode return.
env = wrap_env(gym.make("LunarLander-v2"))
observation = env.reset()
total_reward, done = 0, False
while True:
    if done:
        break
    env.render()
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    total_reward = total_reward + reward

# The environment is closed before the video is shown.
env.close()
show_video()
print(total_reward)

-149.61095243545253


# Advantage Actor-Critic

In [1]:
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class ActorCriticNetwork(nn.Module):
    """Two-hidden-layer MLP with a shared trunk and separate policy and value heads.

    Args:
        lr: Learning rate for the Adam optimizer owned by this module.
        input_dims: Size of the input feature vector (passed to ``nn.Linear``).
        n_actions: Number of outputs of the policy head.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.

    Attributes:
        optimizer: Adam optimizer over all parameters of this module.
        device: ``cuda:0`` when CUDA is available, otherwise ``cpu``.
    """

    def __init__(self, lr, input_dims, n_actions, fc1_dims=256, fc2_dims=256):
        super().__init__()
        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.pi = nn.Linear(fc2_dims, n_actions)  # policy head: one raw score per action
        self.v = nn.Linear(fc2_dims, 1)  # value head: a single scalar
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        # Move the parameters to their final device before handing them to the
        # optimizer, as the PyTorch optimizer documentation recommends.
        self.to(self.device)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, state):
        """Run the shared trunk and both heads.

        Args:
            state: Input tensor whose last dimension equals ``input_dims``.

        Returns:
            A tuple ``(pi, v)``: the raw policy-head outputs (no softmax is
            applied here) and the value-head output.
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        pi = self.pi(x)
        v = self.v(x)

        return (pi, v)

class Agent():
    """One-step (TD(0)) advantage actor-critic agent built on a shared network.

    The agent samples an action in ``choose_action`` and remembers the
    log-probability of that action; ``learn`` then consumes that
    log-probability together with the observed transition to update both heads.

    Args:
        lr: Learning rate forwarded to the network's optimizer.
        input_dims: Size of one observation vector.
        fc1_dims: Width of the network's first hidden layer.
        fc2_dims: Width of the network's second hidden layer.
        n_actions: Number of discrete actions.
        gamma: Discount factor applied to the next state's value.
    """

    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions, gamma=0.99):
        self.gamma = gamma
        self.lr = lr
        self.fc1 = fc1_dims
        self.fc2 = fc2_dims
        self.actor_critic = ActorCriticNetwork(lr, input_dims, n_actions, fc1_dims, fc2_dims)
        # Log-probability of the most recently sampled action; set by choose_action.
        self.log_prob = None

    def _as_batch(self, observation):
        """Convert one observation into a float tensor of batch size 1 on the network's device."""
        # Going through a single float32 array avoids building a tensor from a
        # Python list of numpy arrays, which PyTorch warns is very slow.
        array = np.asarray(observation, dtype=np.float32)
        return T.as_tensor(array, dtype=T.float, device=self.actor_critic.device).unsqueeze(0)

    def choose_action(self, observation):
        """Sample an action from the current policy for one observation.

        Stores the log-probability of the sampled action on ``self.log_prob``
        (with its autograd graph) for the following ``learn`` call.

        Args:
            observation: A single observation vector.

        Returns:
            The sampled action as a Python int.
        """
        state = self._as_batch(observation)
        logits, _ = self.actor_critic(state)
        # Passing logits lets Categorical normalise them itself, which is
        # equivalent to softmax-then-probs without the extra rounding step.
        action_dist = T.distributions.Categorical(logits=logits)
        action = action_dist.sample()
        self.log_prob = action_dist.log_prob(action)

        return action.item()

    def learn(self, state, reward, state_, done):
        """Perform one actor-critic update from a single transition.

        Args:
            state: Observation the last action was chosen from.
            reward: Scalar reward received for that action.
            state_: Observation reached after the action.
            done: Whether ``state_`` is terminal; if so its value is not bootstrapped.

        Raises:
            RuntimeError: If no action has been sampled yet, so there is no
                stored log-probability to learn from.
        """
        if self.log_prob is None:
            raise RuntimeError("learn() called before choose_action(): no log-probability stored")

        self.actor_critic.optimizer.zero_grad()

        device = self.actor_critic.device
        state = self._as_batch(state)
        state_ = self._as_batch(state_)
        reward = T.tensor(reward, dtype=T.float, device=device)

        _, critic_value = self.actor_critic(state)
        # The bootstrapped value is a fixed regression target, so it is
        # evaluated without tracking gradients (semi-gradient TD).
        with T.no_grad():
            _, critic_value_ = self.actor_critic(state_)

        target = reward + self.gamma * critic_value_ * (1 - int(done))
        delta = target - critic_value

        # In the policy term the TD error is only a weight on the
        # log-probability; detaching it keeps the actor loss from pushing
        # gradients into the value estimate.
        actor_loss = -self.log_prob * delta.detach()
        critic_loss = delta ** 2

        (actor_loss + critic_loss).sum().backward()
        self.actor_critic.optimizer.step()

# Training the agent

In [4]:
# Train the actor-critic agent online: one learning step after every
# environment step, for n_games episodes.
env = gym.make('LunarLander-v2')
agent = Agent(gamma=0.99, lr=5e-6, input_dims=8, n_actions=4, fc1_dims=2048, fc2_dims=1536)
n_games = 3000

scores = []
try:
    for i in range(n_games):
        done = False
        observation = env.reset()
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.learn(observation, reward, observation_, done)
            observation = observation_
        scores.append(score)

        # Mean over at most the 100 most recent episodes.
        avg_scores = np.mean(scores[-100:])
        print(f"episode {i}, score {score}, avg_scores {avg_scores}")
finally:
    # Release the environment even when training is interrupted or fails.
    env.close()

episode 0, score -97.20126517386937, avg_scores -97.20126517386937
episode 1, score -470.4805672865419, avg_scores -283.8409162302056
episode 2, score -291.4291650429718, avg_scores -286.3703325011277
episode 3, score -236.11016702139182, avg_scores -273.8052911311937
episode 4, score -34.96040415370608, avg_scores -226.03631373569618
episode 5, score -57.93896119043231, avg_scores -198.02008831148555
episode 6, score -154.31867827642276, avg_scores -191.77702973504802
episode 7, score -53.0242003125763, avg_scores -174.43292605723903
episode 8, score -36.13876077862251, avg_scores -159.06690769294832
episode 9, score -103.3621446983352, avg_scores -153.496431393487
episode 10, score -381.5306026929589, avg_scores -174.2268106025299
episode 11, score -90.39594712253312, avg_scores -167.24090531253015
episode 12, score -511.4048281390647, avg_scores -193.71505322226358
episode 13, score -86.5614041783488, avg_scores -186.06122114769823
episode 14, score -41.98904067821475, avg_scores -1

KeyboardInterrupt: 