In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import sys

import torch
from torch import nn
from torch import optim

print(sys.version)
print(torch.__version__)
print(torch.version.cuda)

3.7.7 (default, May 21 2020, 14:57:43) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-39)]
1.6.0
10.2


In [2]:
class policy_estimator():
    def __init__(self, env):
        self.n_inputs = env.observation_space.shape[0]
        self.n_outputs = env.action_space.n
        
        # Define network
        self.network = nn.Sequential(
            nn.Linear(self.n_inputs, 16), 
            nn.ReLU(), 
            nn.Linear(16, self.n_outputs),
            nn.Softmax(dim=-1))
    
    def predict(self, state):
        action_probs = self.network(torch.FloatTensor(state))
        return action_probs

In [3]:
env = gym.make('CartPole-v0')
s = env.reset()
pe = policy_estimator(env)
print(pe.predict(s))
print(pe.network(torch.FloatTensor(s)))

tensor([0.4861, 0.5139], grad_fn=<SoftmaxBackward>)
tensor([0.4861, 0.5139], grad_fn=<SoftmaxBackward>)


In [4]:
def discount_rewards(rewards, gamma=0.99):
    r = np.array([gamma**i * rewards[i] 
                  for i in range(len(rewards))])
    # Reverse the array direction for cumsum and then
    # revert back to the original order
    r = r[::-1].cumsum()[::-1]
    return r - r.mean()

In [12]:
def reinforce(env, policy_estimator, num_episodes=2000,
              batch_size=10, gamma=0.99):

    # Set up lists to hold results
    total_rewards = []
    batch_rewards = []
    batch_actions = []
    batch_states = []
    batch_counter = 1
    
    # Define optimizer
    optimizer = optim.Adam(policy_estimator.network.parameters(), 
                           lr=0.01)
    
    action_space = np.arange(env.action_space.n)
    for ep in range(num_episodes):
        s_0 = env.reset()
        states = []
        rewards = []
        actions = []
        complete = False
        while complete == False:
            # Get actions and convert to numpy array
            action_probs = policy_estimator.predict(s_0).detach().numpy()
            action = np.random.choice(action_space, p=action_probs)
            s_1, r, complete, _ = env.step(action)
            
            states.append(s_0)
            rewards.append(r)
            actions.append(action)
            s_0 = s_1
            
            # If complete, batch data
            if complete:
                batch_rewards.extend(discount_rewards(rewards, gamma))
                batch_states.extend(states)
                batch_actions.extend(actions)
                batch_counter += 1
                total_rewards.append(sum(rewards))
                
                # If batch is complete, update network
                if batch_counter == batch_size:
                    optimizer.zero_grad()
                    state_tensor = torch.FloatTensor(batch_states)
                    reward_tensor = torch.FloatTensor(batch_rewards)
                    # Actions are used as indices, must be LongTensor
                    action_tensor = torch.LongTensor(batch_actions)
                    
                    print("state_tensor: ", state_tensor)
                    print("reward_tensor: ", reward_tensor)
                    print("action_tensor: ", action_tensor)
                    
                    # Calculate loss
                    logprob = torch.log(
                        policy_estimator.predict(state_tensor))
                    selected_logprobs = reward_tensor * \
                        logprob[np.arange(len(action_tensor)), action_tensor]
                    loss = -selected_logprobs.mean()
                    
                    # Calculate gradients
                    loss.backward()
                    # Apply gradients
                    optimizer.step()
                    
                    batch_rewards = []
                    batch_actions = []
                    batch_states = []
                    batch_counter = 1
                    
                # Print running average
                #print("\rEp: {} Average of last 10: {:.2f}".format(
                #    ep + 1, np.mean(total_rewards[-10:])), end="")
                
    return total_rewards

In [13]:
rewards = reinforce(env, pe)
window = 10
smoothed_rewards = [np.mean(rewards[i-window:i+1]) if i > window 
                    else np.mean(rewards[:i+1]) for i in range(len(rewards))]

plt.figure(figsize=(12,8))
plt.plot(rewards)
plt.plot(smoothed_rewards)
plt.ylabel('Total Rewards')
plt.xlabel('Episodes')
plt.show()

state_tensor:  tensor([[-0.0428, -0.0263,  0.0209, -0.0289],
        [-0.0434,  0.1685,  0.0203, -0.3149],
        [-0.0400, -0.0269,  0.0140, -0.0159],
        ...,
        [ 1.1978, -0.5550, -0.0770,  0.2334],
        [ 1.1867, -0.7490, -0.0723,  0.5009],
        [ 1.1717, -0.5529, -0.0623,  0.1863]])
reward_tensor:  tensor([ 56.6990,  55.6990,  54.7090,  ..., -29.4929, -29.6310, -29.7677])
action_tensor:  tensor([1, 0, 1,  ..., 0, 1, 0])
state_tensor:  tensor([[ 0.0359,  0.0432,  0.0045, -0.0279],
        [ 0.0367, -0.1520,  0.0039,  0.2662],
        [ 0.0337, -0.3472,  0.0092,  0.5601],
        ...,
        [ 1.6718,  0.5271,  0.0468,  0.0112],
        [ 1.6824,  0.7215,  0.0470, -0.2664],
        [ 1.6968,  0.5258,  0.0416,  0.0408]])
reward_tensor:  tensor([ 53.7110,  52.7110,  51.7210,  ..., -29.4929, -29.6310, -29.7677])
action_tensor:  tensor([0, 0, 1,  ..., 1, 0, 0])
state_tensor:  tensor([[-0.0327, -0.0145, -0.0256, -0.0068],
        [-0.0330, -0.2092, -0.0258,  0.2777],
   

KeyboardInterrupt: 