In [22]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
%matplotlib inline

In [23]:
# Create the CartPole environment, then seed the environment and PyTorch RNGs.
env = gym.make('CartPole-v1')
env.seed(1)
torch.manual_seed(1);  # trailing ';' keeps the call's return value out of the cell output

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [24]:
class Policy(torch.nn.Module):
    """Two-layer softmax policy network that also carries its training state.

    Args:
        n_inputs: Size of the state vector fed to the first linear layer.
        n_hidden: Number of hidden units between the two linear layers.
        n_outputs: Number of actions; the length of the returned probability
            vector.
        learning_rate: Step size. Stored on the instance and also used as the
            learning rate of ``self.optimizer``.
        gamma: Discount factor, stored on the instance.

    Attributes:
        episode_data: List that callers fill with per-step data for the
            current episode; starts empty.
        optimizer: Adam optimizer over this module's parameters.
        eps: float32 machine epsilon, available as a small constant to avoid
            division by zero.
    """

    def __init__(self, n_inputs=4, n_hidden=10, n_outputs=2, learning_rate=0.01, gamma=0.99):
        super().__init__()
        self.layer1 = torch.nn.Linear(n_inputs, n_hidden)
        self.layer2 = torch.nn.Linear(n_hidden, n_outputs)
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.episode_data = []
        # Honour the caller's learning rate instead of a hard-coded 1e-2
        # (the default of 0.01 keeps the previous default behaviour).
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        self.eps = np.finfo(np.float32).eps.item()

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Tensor whose last dimension has ``n_inputs`` entries; either a
                single state or a batch of states.

        Returns:
            Tensor of probabilities with ``n_outputs`` entries along the last
            dimension, summing to 1 along that dimension.
        """
        hidden = F.relu(self.layer1(x))
        logits = self.layer2(hidden)
        # Normalize over the action axis. For a 1-D state this is the same as
        # dim=0; for a batch it normalizes each row rather than each column.
        return F.softmax(logits, dim=-1)
        


In [25]:
def learn(policy):
    """Apply one policy-gradient update from the episode stored on ``policy``.

    Reads ``policy.episode_data`` (a list of ``(log_prob, reward)`` pairs in
    step order), computes discounted returns with ``policy.gamma``,
    standardizes them, and takes a plain gradient-descent step of size
    ``policy.learning_rate`` on ``-sum(log_prob * return)``.

    Side effects:
        * Updates the parameters of ``policy`` in place.
        * Stores the loss tensor on ``policy.loss``.
        * Resets ``policy.episode_data`` to an empty list.

    If no steps were recorded the function returns without touching the
    parameters or ``policy.loss``.

    Args:
        policy: A ``torch.nn.Module`` exposing ``episode_data``, ``gamma``,
            ``eps`` and ``learning_rate``.
    """
    episode = policy.episode_data
    if not episode:
        # Nothing recorded, so there is no gradient signal to apply.
        policy.episode_data = []
        return

    # Keep the log-probabilities in torch so autograd tracks them; routing
    # tensors that require grad through NumPy arrays is fragile.
    # reshape(-1) tolerates log-probs stored as shape (1,) instead of scalars.
    log_probs = torch.stack([log_prob for log_prob, _ in episode]).reshape(-1)

    # Discounted return-to-go for each step, accumulated back to front and
    # reversed once (avoids the quadratic cost of inserting at index 0).
    discounted = []
    running = 0.0
    for _, reward in reversed(episode):
        running = reward + policy.gamma * running
        discounted.append(running)
    discounted.reverse()
    returns = torch.tensor(discounted, dtype=log_probs.dtype, device=log_probs.device)

    # Standardize the returns. With a single step the sample standard
    # deviation is NaN, which would poison every parameter, so the raw
    # return is used in that case.
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + policy.eps)

    loss = -(log_probs * returns).sum()
    policy.zero_grad()
    loss.backward()
    # Manual SGD step; policy.optimizer is intentionally not used here.
    with torch.no_grad():
        for param in policy.parameters():
            if param.grad is not None:
                param -= policy.learning_rate * param.grad
    policy.loss = loss
    policy.episode_data = []

In [34]:
def main(env, policy, episode_num=1000, render_num=100, log_interval=100, max_steps=1000):
    """Train ``policy`` on ``env`` for ``episode_num`` episodes.

    Each episode samples actions from the policy's output distribution,
    records ``(log_prob, reward)`` for every step in ``policy.episode_data``,
    and then calls ``learn(policy)`` once the episode ends or the step cap is
    reached. A single status line is rewritten in place after every episode.

    Args:
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        policy: Callable module mapping a state tensor to action
            probabilities; must expose an ``episode_data`` list.
        episode_num: Number of episodes to run.
        render_num: Currently unused; kept so existing calls stay valid.
        log_interval: Currently unused; kept so existing calls stay valid.
        max_steps: Maximum number of steps per episode (previously a
            hard-coded 1000).

    Raises:
        ValueError: If ``max_steps`` is less than 1.
    """
    if max_steps < 1:
        raise ValueError('max_steps must be at least 1')

    for ep_idx in range(episode_num):
        state = torch.tensor(env.reset(), dtype=torch.float32)
        for step_idx in range(max_steps):
            # Call the module itself rather than .forward() so that
            # nn.Module's call machinery (hooks etc.) is not bypassed.
            action_probs = policy(state)
            m = Categorical(action_probs)
            action = m.sample()
            next_s, reward, done, _ = env.step(action.item())
            # The log-probability keeps its autograd graph; learn() needs it.
            policy.episode_data.append((m.log_prob(action), reward))
            if done:
                break
            state = torch.tensor(next_s, dtype=torch.float32)

        learn(policy)
        # '\r' with end='' rewrites the same output line every episode.
        print('\rEpisode {}\tLast length: {:5d}'.format(ep_idx, step_idx+1), end='')


In [35]:

# policy = Policy()
# x = Variable(torch.Tensor([1, 2, 3, 4]))

# probs = policy.forward(x)
# m = Categorical(probs)
# action = m.sample()
# print(probs, m, action, m.log_prob(action))

# action_pr = probs.detach().numpy()
# action = np.random.choice(len(action_pr), p=action_pr)
# print(action_pr, action, np.log(action_pr[action]), np.log(1-action_pr[action]))

# # for param in policy.parameters():
# #     print (param)
    

In [36]:
# Fresh policy network built with the constructor's default arguments.
policy = Policy()

In [37]:
# Train the policy for 300 episodes.
main(env, policy, episode_num=300)

Episode 299	Last length:   144

In [38]:
# Display the loss stored on the policy by the most recent learn() call.
policy.loss

tensor(5.3851, grad_fn=<ThAddBackward>)