<a href="https://colab.research.google.com/github/AGKhalil/RL_implements/blob/master/VPG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%%capture
!pip install numpy tqdm gym matplotlib argparse torch wandb scipy

## Restart Runtime
This is done to ensure the installed dependencies and game are loaded.

In [0]:
import os

def restart_runtime():
  os.kill(os.getpid(), 9)
  
restart_runtime()

In [0]:
import os
import numpy as np
import random
from tqdm import tqdm
import gym
import time
import copy
import matplotlib.pyplot as plt
import argparse
from collections import namedtuple

import torch
import torch.tensor as Tensor
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from scipy.special import softmax

import logging
logging.propagate = False 
logging.getLogger().setLevel(logging.ERROR)

import wandb

%matplotlib inline

In [11]:
# WandB – Login to your wandb account so you can log all your metrics
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://app.wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: fd1a686e3fb538374e472fc536037d249adef19f
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[32mSuccessfully logged in to Weights & Biases![0m


In [0]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(env.observation_space.shape[0], 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, env.action_space.n)
            
    def forward(self, x, softmax=False):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [0]:
def get_action(obs):
    current_policy = get_current_policy(obs)
    probs = F.softmax(current_policy)
    dist = torch.distributions.Categorical(probs)
    act = dist.sample().item()
    return act, F.log_softmax(current_policy).squeeze(0)[act]

def get_current_policy(obs):
    return policy.forward(torch.from_numpy(obs).float().unsqueeze(0).to(gpu))

def reward_to_go(a):
    return np.sum([a[i] * np.power(gamma, i) for i in range(len(a))]) 

In [35]:
env = gym.make('CartPole-v0')



In [0]:
wandb.init(entity="agkhalil", project="pytorch-vpg-cartpole")
wandb.watch_called = False

config = wandb.config
config.batch_size = 50
config.episodes = 2000
config.lr = 0.0005
config.seed = 42
config.gamma = 0.99
eps = np.finfo(np.float32).eps.item()

gpu = torch.device('cuda:0')
torch.manual_seed(config.seed)
learning_rate = config.lr
batch_size = config.batch_size
policy = MLP().to(gpu)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

EPISODES = config.episodes
gamma = config.gamma

wandb.watch(policy, log="all")

for episode in tqdm(range(0, EPISODES)):
    old_rewards = []
    rewards = []
    log_soft = []
    obs = env.reset()
    done = False
    step = 0
    reward = 0
    while not done:
        action, log_prob = get_action(obs)
        new_obs, rew, done, _ = env.step(action)
        reward += rew * np.power(gamma, step)
        old_rewards.append(reward)        
        rewards.append(rew)
        log_soft.append(log_prob)
        step += 1
        obs = new_obs

    discounted_rewards = [reward_to_go(rewards[i:]) for i in range(len(rewards))]
    optimizer.zero_grad()
    discounted_rewards = torch.tensor(discounted_rewards).to(gpu)
    advantage = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + eps)
    loss = [-advantage[i] * log_soft[i] for i in range(len(advantage))]
    loss = torch.stack(loss)
    loss.to(gpu)
    loss.sum().backward()
    optimizer.step()
    wandb.log({
        "Episode reward": step,
        "Loss": loss.cpu(),
        }, step=episode)

torch.save(policy.state_dict(), "model.h5")
wandb.save('model.h5')

  This is separate from the ipykernel package so we can avoid doing imports until
  
 66%|██████▋   | 1326/2000 [03:34<02:27,  4.58it/s]

In [0]:
old_rewards = torch.tensor(old_rewards)
old_rewards

In [0]:
discounted_rewards

In [0]:
tot_per = []
epsilon = 0

for ep in tqdm(range(0, 100)):
    done = False
    obs = env.reset()
    tot_rew = 0
    while not done:
        act = get_action(obs)
        obs, rew, done, _ = env.step(act)
        tot_rew += rew
#         env.render()
    tot_per.append(tot_rew)
np.mean(tot_per)

100%|██████████| 100/100 [00:06<00:00, 13.78it/s]


199.11