#  RL Course Hugging-Face

### Unit 4 -> Policy Gradient

In [2]:
%%capture
!apt install python-opengl

In [4]:
from pyvirtualdisplay import Display

vd = Display(visible=0, size=(1400, 900))
vd.start()

<pyvirtualdisplay.display.Display at 0x7feb9c274940>

In [5]:
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gymnasium as gym 
import imageio


In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [13]:
env = gym.make('CartPole-v1')

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

In [32]:
env.reset()[0]

array([ 0.00430577, -0.04738127,  0.03564657,  0.04864531], dtype=float32)

In [27]:
s = env.observation_space.sample()

In [30]:
torch.from_numpy(s).float().unsqueeze(0).to(device)

tensor([[4.2173e+00, 2.7516e+38, 1.6021e-01, 5.5015e+37]], device='cuda:0')

In [None]:
eval_env = gym.make('CartPole-v1')


### REINFORCE Algorithm

In [20]:
class Policy(nn.Module):
    def __init__(self, state_dim, action_dim, h_size):
        super(Policy, self).__init__()
        
        self.fc1 = nn.Linear(state_dim, h_size)
        self.fc2 = nn.Linear(h_size, action_dim)
        
    def forward(self, state):
        # x = state.reshape(1, -1)
        x = F.relu(self.fc1(state))
        x = self.fc2(x)
        return F.softmax(x, dim=1)
    
    def act(self, state):
        ''' Select an action from the input state '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        
        action = m.sample()
        return action.item(), m.log_prob(action)        
        

In [33]:
def reinforce(policy: Policy, optimizer, n_training_episodes, max_t, gamma, log):
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_training_episodes+1):
        saved_log_probs = []
        rewards = []
        state, _ = env.reset()
        
        for t in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            if terminated:
                break
        scores.append(sum(rewards))
        scores_deque.append(sum(rewards))
        
        
        returns = deque(maxlen=max_t)
        n_steps = len(rewards)
        
        for t in range(n_steps)[::-1]:
            disc_return_t = returns[0] if len(returns) > 0 else 0
            returns.appendleft(disc_return_t * gamma + rewards[t])
            
        eps = np.finfo(np.float32).eps.item()
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)
        
        policy_loss = []
        for log_prob, disc_return in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * disc_return)
        policy_loss = torch.cat(policy_loss).sum()
        
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        
        if i_episode % log == 0:
            print(f'Episode {i_episode}\tAverage Score: {np.mean(scores_deque):.2f}')
    
    return scores
        
        
        

In [17]:
env_id = 'CartPole-v1'

cartpole_hyperparams = {
    "h_size": 16,
    "n_training_episodes": 1000,
    "max_t": 1000,
    "n_evaluation_episodes": 10,
    "gamma": 0.99,
    "lr": 0.01,
    "env_id": env_id,
    "state_dim": state_dim,
    "action_dim": action_dim,
}

In [34]:
cartpole_policy = Policy(
    state_dim=cartpole_hyperparams["state_dim"],
    action_dim=cartpole_hyperparams["action_dim"],
    h_size=cartpole_hyperparams["h_size"],
).to(device)
cartpole_optimizer = optim.Adam(cartpole_policy.parameters(), lr=cartpole_hyperparams["lr"])


In [35]:
scores = reinforce(
    policy=cartpole_policy,
    optimizer=cartpole_optimizer,
    n_training_episodes=cartpole_hyperparams["n_training_episodes"],
    max_t=cartpole_hyperparams["max_t"],
    gamma=cartpole_hyperparams["gamma"],
    log=100,
)

Episode 100	Average Score: 37.90
Episode 200	Average Score: 146.89
Episode 300	Average Score: 342.20
Episode 400	Average Score: 698.21
Episode 500	Average Score: 735.36
Episode 600	Average Score: 789.30
Episode 700	Average Score: 978.14
Episode 800	Average Score: 1000.00
Episode 900	Average Score: 1000.00
Episode 1000	Average Score: 994.67


In [40]:
def evaluate_agent(env, max_steps, n_eval_episodes, policy):
    scores = []
    for i in range(1, n_eval_episodes+1):
        state = env.reset()[0]
        score = 0
        for t in range(max_steps):
            action, _ = policy.act(state)
            state, reward, terminated, truncated, _ = env.step(action)
            score += reward
            if terminated or truncated:
                break
        scores.append(score)
    mean_score = np.mean(scores)
    std_scores = np.std(scores)
    return mean_score, std_scores

In [38]:
eval_env = gym.make('CartPole-v1')

In [41]:
evaluate_agent(
    env=eval_env,
    max_steps=cartpole_hyperparams["max_t"],
    n_eval_episodes=cartpole_hyperparams["n_evaluation_episodes"],
    policy=cartpole_policy,
)

(500.0, 0.0)

In [42]:
from huggingface_hub import notebook_login
from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.repocard import metadata_eval_result, metadata_save

from pathlib import Path
import datetime
import json
import imageio

import tempfile

import os


def record_video(env, policy, out_directory, fps=30):
    """
    Generate a replay video of the agent
    :param env
    :param Qtable: Qtable of our agent
    :param out_directory
    :param fps: how many frame per seconds (with taxi-v3 and frozenlake-v1 we use 1)
    """
    images = []
    done = False
    state = env.reset()
    img = env.render(mode="rgb_array")
    images.append(img)
    while not done:
        # Take the action (index) that have the maximum expected future reward given that state
        action, _ = policy.act(state)
        # We directly put next_state = state for recording logic
        state, reward, done, info = env.step(action)
        img = env.render(mode="rgb_array")
        images.append(img)
    imageio.mimsave(out_directory, [np.array(img)
                    for i, img in enumerate(images)], fps=fps)


def push_to_hub(repo_id,
                model,
                hyperparameters,
                eval_env,
                video_fps=30
                ):
  """
  Evaluate, Generate a video and Upload a model to Hugging Face Hub.
  This method does the complete pipeline:
  - It evaluates the model
  - It generates the model card
  - It generates a replay video of the agent
  - It pushes everything to the Hub

  :param repo_id: repo_id: id of the model repository from the Hugging Face Hub
  :param model: the pytorch model we want to save
  :param hyperparameters: training hyperparameters
  :param eval_env: evaluation environment
  :param video_fps: how many frame per seconds to record our video replay
  """

  _, repo_name = repo_id.split("/")
  api = HfApi()

  # Step 1: Create the repo
  repo_url = api.create_repo(
      repo_id=repo_id,
      exist_ok=True,
  )

  with tempfile.TemporaryDirectory() as tmpdirname:
    local_directory = Path(tmpdirname)

    # Step 2: Save the model
    torch.save(model, local_directory / "model.pt")

    # Step 3: Save the hyperparameters to JSON
    with open(local_directory / "hyperparameters.json", "w") as outfile:
      json.dump(hyperparameters, outfile)

    # Step 4: Evaluate the model and build JSON
    mean_reward, std_reward = evaluate_agent(eval_env,
                                             hyperparameters["max_t"],
                                             hyperparameters["n_evaluation_episodes"],
                                             model)
    # Get datetime
    eval_datetime = datetime.datetime.now()
    eval_form_datetime = eval_datetime.isoformat()

    evaluate_data = {
        "env_id": hyperparameters["env_id"],
        "mean_reward": mean_reward,
        "n_evaluation_episodes": hyperparameters["n_evaluation_episodes"],
        "eval_datetime": eval_form_datetime,
    }

    # Write a JSON file
    with open(local_directory / "results.json", "w") as outfile:
        json.dump(evaluate_data, outfile)

    # Step 5: Create the model card
    env_name = hyperparameters["env_id"]

    metadata = {}
    metadata["tags"] = [
        env_name,
        "reinforce",
        "reinforcement-learning",
        "custom-implementation",
        "deep-rl-class"
    ]

    # Add metrics
    eval = metadata_eval_result(
        model_pretty_name=repo_name,
        task_pretty_name="reinforcement-learning",
        task_id="reinforcement-learning",
        metrics_pretty_name="mean_reward",
        metrics_id="mean_reward",
        metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
        dataset_pretty_name=env_name,
        dataset_id=env_name,
    )

    # Merges both dictionaries
    metadata = {**metadata, **eval}

    model_card = f"""
  # **Reinforce** Agent playing **{env_id}**
  This is a trained model of a **Reinforce** agent playing **{env_id}** .
  To learn to use this model and train yours check Unit 4 of the Deep Reinforcement Learning Course: https://huggingface.co/deep-rl-course/unit4/introduction
  """

    readme_path = local_directory / "README.md"
    readme = ""
    if readme_path.exists():
        with readme_path.open("r", encoding="utf8") as f:
          readme = f.read()
    else:
      readme = model_card

    with readme_path.open("w", encoding="utf-8") as f:
      f.write(readme)

    # Save our metrics to Readme metadata
    metadata_save(readme_path, metadata)

    # Step 6: Record a video
    video_path = local_directory / "replay.mp4"
    record_video(env, model, video_path, video_fps)

    # Step 7. Push everything to the Hub
    api.upload_folder(
        repo_id=repo_id,
        folder_path=local_directory,
        path_in_repo=".",
    )

    print(
        f"Your model is pushed to the Hub. You can view your model here: {repo_url}")


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
notebook_login()

In [None]:
repo_id = "blackeys/Reinforce-cartpoleV1"  # TODO Define your repo id {username/Reinforce-{model-id}}
push_to_hub(
    repo_id,
    cartpole_policy,  # The model we want to save
    cartpole_hyperparams,  # Hyperparameters
    eval_env,  # Evaluation environment
    video_fps=30
)


: 