In [None]:
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip install pyvirtualdisplay
!pip install pyglet==1.5.1

In [1]:
# Virtual display
# Start a headless display of 1400x900 so frames can be rendered without a
# physical screen (presumably required by the `rgb_array` rendering used for
# video recording further down — confirm on your platform).
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7fd5c056ac40>

In [None]:
!pip install git+https://github.com/ntasfi/PyGame-Learning-Environment.git
!pip install git+https://github.com/simoninithomas/gym-games
!pip install huggingface_hub
!pip install imageio-ffmpeg
!pip install pyyaml==6.0

In [None]:
!pip install -U gym==0.25.2

In [2]:
import numpy as np

from collections import deque

import matplotlib.pyplot as plt
%matplotlib inline

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Gym
import gym
import gym_pygame

# Hugging Face Hub
from huggingface_hub import notebook_login
import imageio

In [3]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
# Show which device was selected.
print(device)

cuda:0


## First Agent: CartPole

In [5]:
# Two independent CartPole instances: `env` is the module-level environment
# that reinforce() trains on, `eval_env` is passed to evaluate_agent() and
# record_video().
env_id = "CartPole-v1"
env = gym.make(env_id)

eval_env = gym.make(env_id)

# Length of the observation vector and number of discrete actions; these size
# the input and output layers of the policy network.
s_size = env.observation_space.shape[0]
a_size = env.action_space.n

  deprecation(
  deprecation(


In [6]:
# Print the observation dimensionality and one randomly sampled observation.
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample())

_____OBSERVATION SPACE_____ 

The State Space is:  4
Sample observation [ 4.6159973e+00 -4.2668840e+37 -7.9091087e-02  9.7825327e+37]


In [7]:
# Print the number of discrete actions and one randomly sampled action.
print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env.action_space.sample())


 _____ACTION SPACE_____ 

The Action Space is:  2
Action Space Sample 1


In [8]:
class Policy(nn.Module):
    """Stochastic policy network with one hidden layer (used for CartPole).

    Maps a state vector of length ``s_size`` to a probability distribution
    over ``a_size`` discrete actions through a hidden layer of width ``h_size``.
    """

    def __init__(self, s_size, a_size, h_size):
        """
        :param s_size: length of the observation vector
        :param a_size: number of discrete actions
        :param h_size: number of hidden units
        """
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        """Return action probabilities for a 2-D batch of states (batch, s_size)."""
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        # Softmax over dim=1, i.e. across the actions of each batch row.
        return F.softmax(x, dim=1)

    def act(self, state):
        """Sample an action for a single state.

        :param state: 1-D NumPy array holding one observation
        :return: ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is a tensor of shape ``(1,)`` that still carries the
            autograd graph needed for the policy-gradient loss
        """
        # Follow the device the weights actually live on rather than relying
        # on a notebook-global `device` variable being defined and in sync.
        model_device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(model_device)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

In [9]:
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every, train_env=None):
    """Train ``policy`` with the REINFORCE (Monte-Carlo policy gradient) algorithm.

    One full episode is collected per iteration, the discounted return of every
    step is computed, and a single optimizer step is taken on
    ``sum_t(-log_prob_t * return_t)``.

    :param policy: agent exposing ``act(state) -> (action, log_prob)``
    :param optimizer: torch optimizer over the policy's parameters
    :param n_training_episodes: number of episodes to train for
    :param max_t: maximum number of steps per episode
    :param gamma: discount factor applied to future rewards
    :param print_every: print the running average score every this many episodes
    :param train_env: environment to train on; when ``None`` the module-level
        ``env`` is used, which keeps existing six-argument calls working
    :return: list with the undiscounted total reward of every episode
    """
    # Fall back to the notebook-global `env` for backward compatibility.
    environment = train_env if train_env is not None else env
    # Small constant guarding the division below; it does not change per episode.
    eps = np.finfo(np.float32).eps.item()

    scores_deque = deque(maxlen=100)  # window for the printed running average
    scores = []
    for i_episode in range(1, n_training_episodes + 1):
        saved_log_probs = []
        rewards = []
        state = environment.reset()
        for _ in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _info = environment.step(action)
            rewards.append(reward)
            if done:
                break
        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Build G_t = r_t + gamma * G_{t+1} from the last step backwards, so
        # each return reuses the one computed just before it.
        returns = deque(maxlen=max_t)
        for t in reversed(range(len(rewards))):
            next_return = returns[0] if returns else 0
            returns.appendleft(gamma * next_return + rewards[t])

        # Explicit float32 so integer rewards do not produce an integer tensor
        # (on which mean()/std() are undefined).
        returns = torch.tensor(list(returns), dtype=torch.float32)
        # Standardise returns to reduce gradient variance. With a single step
        # the sample std is NaN, which would turn the loss and then every
        # weight into NaN, so the raw return is kept in that case.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + eps)

        policy_loss = [
            -log_prob * disc_return
            for log_prob, disc_return in zip(saved_log_probs, returns)
        ]
        policy_loss = torch.cat(policy_loss).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

    return scores

In [10]:
# Hyperparameters for the CartPole agent; read when building the policy and
# optimizer and when calling reinforce() and evaluate_agent().
cartpole_hyperparameters = {
    "h_size": 32,  # hidden layer width of Policy
    "n_training_episodes": 50_000,  # episodes passed to reinforce()
    "n_evaluation_episodes": 10,  # episodes passed to evaluate_agent()
    "max_t": 1000,  # per-episode step cap for training and evaluation
    "gamma": 0.99,  # discount factor for returns
    "lr": 1e-2,  # Adam learning rate
    "env_id": env_id,
    "state_space": s_size,
    "action_space": a_size,
}

In [11]:
# Build the CartPole policy network on `device` and an Adam optimizer over its parameters.
cartpole_policy = Policy(cartpole_hyperparameters["state_space"], cartpole_hyperparameters["action_space"], cartpole_hyperparameters["h_size"]).to(device)
cartpole_optimizer = optim.Adam(cartpole_policy.parameters(), lr=cartpole_hyperparameters["lr"])

In [12]:
# Train the CartPole policy; `scores` receives the total reward of every
# episode and the running average is printed every 100 episodes.
scores = reinforce(
    policy=cartpole_policy,
    optimizer=cartpole_optimizer,
    n_training_episodes=cartpole_hyperparameters["n_training_episodes"],
    max_t=cartpole_hyperparameters["max_t"],
    gamma=cartpole_hyperparameters["gamma"],
    print_every=100,
)

Episode 100	Average Score: 155.14
Episode 200	Average Score: 110.21
Episode 300	Average Score: 377.55
Episode 400	Average Score: 465.76
Episode 500	Average Score: 445.65
Episode 600	Average Score: 101.47
Episode 700	Average Score: 115.25
Episode 800	Average Score: 102.12
Episode 900	Average Score: 124.23
Episode 1000	Average Score: 188.68
Episode 1100	Average Score: 488.21
Episode 1200	Average Score: 472.66
Episode 1300	Average Score: 500.00
Episode 1400	Average Score: 331.85
Episode 1500	Average Score: 500.00
Episode 1600	Average Score: 500.00
Episode 1700	Average Score: 500.00
Episode 1800	Average Score: 500.00
Episode 1900	Average Score: 494.05
Episode 2000	Average Score: 500.00
Episode 2100	Average Score: 500.00
Episode 2200	Average Score: 497.14
Episode 2300	Average Score: 384.01
Episode 2400	Average Score: 491.81
Episode 2500	Average Score: 498.53
Episode 2600	Average Score: 187.32
Episode 2700	Average Score: 112.71
Episode 2800	Average Score: 207.98
Episode 2900	Average Score: 5

In [13]:
def evaluate_agent(env, max_steps, n_eval_episodes, policy):
    """
    Evaluate the agent for ``n_eval_episodes`` episodes and return the mean and
    standard deviation of the total episode reward.

    Actions come from ``policy.act``; with the ``Policy`` classes of this
    notebook that means they are sampled, so results vary between calls.

    :param env: The evaluation environment
    :param max_steps: Maximum number of steps per evaluation episode
    :param n_eval_episodes: Number of episode to evaluate the agent
    :param policy: The Reinforce agent, exposing ``act(state) -> (action, log_prob)``
    :return: ``(mean_reward, std_reward)`` over the evaluated episodes
    """
    episode_rewards = []
    # Evaluation never backpropagates, so skip building autograd graphs.
    with torch.no_grad():
        for _episode in range(n_eval_episodes):
            state = env.reset()
            total_rewards_ep = 0

            for _step in range(max_steps):
                action, _log_prob = policy.act(state)
                state, reward, done, _info = env.step(action)
                total_rewards_ep += reward

                if done:
                    break
            episode_rewards.append(total_rewards_ep)

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward

In [14]:
# Mean and standard deviation of the total reward of the trained CartPole
# policy on the separate evaluation environment.
evaluate_agent(
    env=eval_env,
    max_steps=cartpole_hyperparameters["max_t"],
    n_eval_episodes=cartpole_hyperparameters["n_evaluation_episodes"],
    policy=cartpole_policy,
)

(500.0, 0.0)

In [15]:
# Persist the trained CartPole policy.
# NOTE(review): this pickles the whole module object rather than its
# state_dict, so loading presumably needs a matching `Policy` class to be
# defined — confirm before relying on the file from other code.
torch.save(cartpole_policy, "./4-cart-pole.pt")

In [16]:
from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.repocard import metadata_eval_result, metadata_save

from pathlib import Path
import datetime
import json
import imageio

import tempfile

import os

In [17]:
def record_video(env, policy, out_directory, fps=30, max_steps=None):
    """
    Generate a replay video of the agent playing one episode.

    :param env: environment to roll out; must support ``render(mode='rgb_array')``
    :param policy: agent exposing ``act(state) -> (action, log_prob)``
    :param out_directory: path of the video file to write (handed to ``imageio.mimsave``)
    :param fps: frames per second of the written video
    :param max_steps: optional cap on the number of environment steps; ``None``
        (the default) records until the environment reports ``done``
    """
    images = []
    done = False
    state = env.reset()
    # Capture the initial frame before any action is taken.
    images.append(env.render(mode='rgb_array'))
    steps_taken = 0
    # Recording never backpropagates, so skip building autograd graphs.
    with torch.no_grad():
        while not done and (max_steps is None or steps_taken < max_steps):
            # The action is whatever policy.act returns for the current state.
            action, _log_prob = policy.act(state)
            state, reward, done, info = env.step(action)
            images.append(env.render(mode='rgb_array'))
            steps_taken += 1
    imageio.mimsave(out_directory, [np.array(img) for img in images], fps=fps)

In [18]:
# Record one episode of the trained CartPole policy on the evaluation environment.
record_video(eval_env, cartpole_policy, "./4-cart-pole.mp4")

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


In [31]:
%%html
<video controls autoplay><source src="./4-cart-pole.mp4" type="video/mp4"></video>

## Second Agent: Pixelcopter

In [20]:
# Switch to the Pixelcopter environment. This rebinds the module-level `env`
# that reinforce() uses, as well as `eval_env`, `s_size` and `a_size`, so the
# CartPole values are no longer available after this cell.
env_id = "Pixelcopter-PLE-v0"
env = gym.make(env_id)
eval_env = gym.make(env_id)
s_size = env.observation_space.shape[0]
a_size = env.action_space.n

couldn't import doomish
Couldn't import doom


In [21]:
# Print the Pixelcopter observation dimensionality and one sampled observation.
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample()) # Get a random observation

_____OBSERVATION SPACE_____ 

The State Space is:  7
Sample observation [-0.6999531  -0.99876493 -0.21001318  1.0711119  -2.3947988  -1.2707206
 -0.96637326]


In [22]:
# Print the number of discrete Pixelcopter actions and one sampled action.
print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env.action_space.sample()) # Take a random action


 _____ACTION SPACE_____ 

The Action Space is:  2
Action Space Sample 0


In [23]:
class Policy(nn.Module):
    """Stochastic policy network with two hidden layers (used for Pixelcopter).

    Maps a state vector of length ``s_size`` to a probability distribution
    over ``a_size`` discrete actions through hidden layers of width ``h_size``
    and ``2 * h_size``.

    NOTE(review): this reuses the name ``Policy`` of the single-hidden-layer
    class defined earlier in the notebook and therefore shadows it from here
    on; objects created before this cell keep the earlier class.
    """

    def __init__(self, s_size, a_size, h_size):
        """
        :param s_size: length of the observation vector
        :param a_size: number of discrete actions
        :param h_size: width of the first hidden layer (the second is twice as wide)
        """
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, h_size*2)
        self.fc3 = nn.Linear(h_size*2, a_size)

    def forward(self, x):
        """Return action probabilities for a 2-D batch of states (batch, s_size)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        # Softmax over dim=1, i.e. across the actions of each batch row.
        return F.softmax(x, dim=1)

    def act(self, state):
        """Sample an action for a single state.

        :param state: 1-D NumPy array holding one observation
        :return: ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is a tensor of shape ``(1,)`` that still carries the
            autograd graph needed for the policy-gradient loss
        """
        # Follow the device the weights actually live on rather than relying
        # on a notebook-global `device` variable being defined and in sync.
        model_device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(model_device)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

In [24]:
# Hyperparameters for the Pixelcopter agent; read when building the policy and
# optimizer and when calling reinforce() and evaluate_agent().
pixelcopter_hyperparameters = {
    "h_size": 64,  # width of the first hidden layer of Policy
    "n_training_episodes": 50000,  # episodes passed to reinforce()
    "n_evaluation_episodes": 10,  # episodes passed to evaluate_agent()
    "max_t": 10000,  # per-episode step cap for training and evaluation
    "gamma": 0.99,  # discount factor for returns
    "lr": 1e-4,  # Adam learning rate
    "env_id": env_id,
    "state_space": s_size,
    "action_space": a_size,
}

In [25]:
# Seeding is left disabled, so the initial weights differ between runs;
# uncomment the next line to fix PyTorch's random number generator.
# torch.manual_seed(50)
# Build the Pixelcopter policy network on `device` and an Adam optimizer over its parameters.
pixelcopter_policy = Policy(pixelcopter_hyperparameters["state_space"], pixelcopter_hyperparameters["action_space"], pixelcopter_hyperparameters["h_size"]).to(device)
pixelcopter_optimizer = optim.Adam(pixelcopter_policy.parameters(), lr=pixelcopter_hyperparameters["lr"])

In [26]:
# Train the Pixelcopter policy; `scores` receives the total reward of every
# episode and the running average is printed every 1000 episodes.
scores = reinforce(
    policy=pixelcopter_policy,
    optimizer=pixelcopter_optimizer,
    n_training_episodes=pixelcopter_hyperparameters["n_training_episodes"],
    max_t=pixelcopter_hyperparameters["max_t"],
    gamma=pixelcopter_hyperparameters["gamma"],
    print_every=1000,
)

  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.deprecation(
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Episode 1000	Average Score: 4.32
Episode 2000	Average Score: 4.45
Episode 3000	Average Score: 7.15
Episode 4000	Average Score: 12.55
Episode 5000	Average Score: 13.12
Episode 6000	Average Score: 12.67
Episode 7000	Average Score: 16.58
Episode 8000	Average Score: 21.05
Episode 9000	Average Score: 23.51
Episode 10000	Average Score: 26.23
Episode 11000	Average Score: 21.49
Episode 12000	Average Score: 26.59
Episode 13000	Average Score: 21.72
Episode 14000	Average Score: 20.74
Episode 15000	Average Score: 27.50
Episode 16000	Average Score: 22.38
Episode 17000	Average Score: 25.81
Episode 18000	Average Score: 25.81
Episode 19000	Average Score: 17.84
Episode 20000	Average Score: 30.01
Episode 21000	Average Score: 29.25
Episode 22000	Average Score: 35.58
Episode 23000	Average Score: 30.52
Episode 24000	Average Score: 33.40
Episode 25000	Average Score: 25.08
Episode 26000	Average Score: 26.02
Episode 27000	Average Score: 26.13
Episode 28000	Average Score: 40.24
Episode 29000	Average Score: 17.

In [27]:
# Mean and standard deviation of the total reward of the trained Pixelcopter
# policy on the separate evaluation environment.
evaluate_agent(
    env=eval_env,
    max_steps=pixelcopter_hyperparameters["max_t"],
    n_eval_episodes=pixelcopter_hyperparameters["n_evaluation_episodes"],
    policy=pixelcopter_policy,
)

(34.1, 22.77037549097511)

In [28]:
# Persist the trained Pixelcopter policy.
# NOTE(review): this pickles the whole module object rather than its
# state_dict, so loading presumably needs a matching `Policy` class to be
# defined — confirm before relying on the file from other code.
torch.save(pixelcopter_policy, "./4-pixel-copter.pt")

In [29]:
# Record one episode of the trained Pixelcopter policy on the evaluation environment.
record_video(eval_env, pixelcopter_policy, "./4-pixel-copter.mp4")

  logger.warn(


In [32]:
%%html
<video controls autoplay><source src="./4-pixel-copter.mp4" type="video/mp4"></video>