# **Initialization**

In [None]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!sudo apt-get install xvfb
!pip install swig
!pip install gym[box2d]

In [1]:
import gym
from gym import logger as gymlogger
from gym.wrappers.record_video import RecordVideo
from IPython.display import HTML
from IPython import display as ipythondisplay
import glob
import io
import base64
import pygame
import numpy as np
import random
from IPython.display import clear_output
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
%matplotlib inline
import seaborn as sns
sns.set()
gymlogger.set_level(40) #error only
pygame.init()
pygame.font.init()

In [2]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)""
"""

def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else:
    print("Could not find video")

def wrap_env(env):
  env = RecordVideo(env, './video',  episode_trigger = lambda episode_number: True)
  return env

# **Test a random agent**

In [8]:
env = wrap_env(gym.make("LunarLander-v2"))
observation = env.reset()
total_reward = 0
done = False
while not done:

    env.render()
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    total_reward += reward

env.close()
show_video()
print(total_reward)

-240.0313352399656


# Import necessary libraries

In [9]:
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Policy network

In [10]:
T.cuda.is_available()

False

In [11]:
class PolicyNetwork(nn.Module):
    def __init__(self, lr, input_dims, n_actions):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dims, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

        # CPU ==> GPU
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)

        return x

# Agent

In [17]:
class PolicyGradientAgent():
    def __init__(self, lr, input_dims, gamma=0.99, n_actions=4):
        self.gamma = gamma
        self.lr = lr
        self.reward_memory = []
        self.action_memory = []

        self.policy = PolicyNetwork(self.lr, input_dims, n_actions)

    def choose_action(self, observation):
        state = T.Tensor([observation]).to(self.policy.device)
        probabilities = F.softmax(self.policy.forward(state))
        action_probs = T.distributions.Categorical(probabilities)
        action = action_probs.sample()
        log_probs = action_probs.log_prob(action)
        self.action_memory.append(log_probs)

        return action.item()

    def srote_rewards(self, reward):
        self.reward_memory.append(reward)

    def learn(self):
        self.policy.optimizer.zero_grad()

        # G_t = R_t+1 + gamma * R_t+2 + gamma**2 * R_t+3 + ....
        # G_t = sum from k=0 to k=T {gamma**k * R_t+k+1}

        G = np.zeros_like(self.reward_memory, dtype=np.float64)
        for t in range(len(self.reward_memory)):
            G_sum = 0
            discount = 1
            for k in range(t, len(self.reward_memory)):
                G_sum += self.reward_memory[k] * discount
                discount *= self.gamma
            G[t] = G_sum
        G = T.tensor(G, dtype=T.float).to(self.policy.device)

        loss = 0
        for g, logprob in zip(G, self.action_memory):
            loss += -g * logprob
        loss.backward()
        self.policy.optimizer.step()

        self.reward_memory = []
        self.action_memory = []


# Main code

In [21]:
env = gym.make("LunarLander-v2")
num_epiosde = 10
# agent = PolicyGradientAgent(gamma=0.99, lr=0.0005, input_dims=8, n_actions=4)

avg_rewards = []
total_rewards = []

for episode in range(num_epiosde):
    done = False
    observation = env.reset()
    episode_total_reward = 0
    while not done:
        action = agent.choose_action(observation)
        next_observation, reward, done, _ = env.step(action)
        episode_total_reward += reward
        agent.srote_rewards(reward)
        observation = next_observation
    # agent.learn()

    total_rewards.append(episode_total_reward)
    avg_reward = np.mean(total_rewards[-100:])
    avg_rewards.append(avg_reward)
    print(f"episode: {episode} reward: {episode_total_reward}, avg_rewards: {avg_reward}")

episode: 0 reward: 90.97413956317841, avg_rewards: 90.97413956317841
episode: 1 reward: 91.23205691971984, avg_rewards: 91.10309824144912
episode: 2 reward: 125.27921252972907, avg_rewards: 102.49513633754243
episode: 3 reward: 50.218334970470295, avg_rewards: 89.42593599577441
episode: 4 reward: 120.64689375727892, avg_rewards: 95.67012754807531
episode: 5 reward: 179.4784262076426, avg_rewards: 109.63817732466987
episode: 6 reward: 49.0993748004272, avg_rewards: 100.98977696406378
episode: 7 reward: 123.6749220841917, avg_rewards: 103.82542010407974
episode: 8 reward: 89.44459686713508, avg_rewards: 102.22755085553032
episode: 9 reward: 129.88628169046996, avg_rewards: 104.9934239390243


In [19]:
agent.policy.load_state_dict(T.load('reinforce_policy.pth'))

<All keys matched successfully>

# Test our agent

In [25]:
env = wrap_env(gym.make("LunarLander-v2"))
observation = env.reset()
total_reward = 0
done = False
while not done:

    env.render()
    action = agent.choose_action(observation)
    observation, reward, done, info = env.step(action)
    total_reward += reward

env.close()
show_video()
print(total_reward)

118.76620501635881


# Save agent parameters

In [26]:
T.save(agent.policy.state_dict(), 'TA_claa.pth')