In [None]:
# import argparse
import gym
import numpy as np
from itertools import count #  iterators which helps in getting faster execution time and writing memory-efficient code

import torch
import torch.nn as nn # used for nn.Linear, nn.dropout
import torch.nn.functional as F # used for F.relu, F.softmax
import torch.optim as optim # used for optim.Adam
from torch.distributions import Categorical # used to create a Categorical distribution with our actions and their probs

In [None]:
# --- Tunable network settings ---------------------------------------------
# Width of the policy network's single hidden layer.
HIDDEN_LAYER_SIZE = 128

# Dropout probability applied to the hidden layer's output.
DROPOUT_RATE = 0.6

# --- Environment and reproducibility --------------------------------------
env = gym.make('CartPole-v1')

# Fix the environment's and PyTorch's random streams so runs are repeatable.
env.seed(0)
torch.manual_seed(0)

# --- Sizes derived from the environment -----------------------------------
# Number of discrete actions (2 for CartPole-v1: push left or push right).
ACTION_SPACE = env.action_space.n

# Length of the observation vector (4 for CartPole-v1: cart position,
# cart velocity, pole angle, pole angular velocity).
OBSERVATION_SPACE = env.observation_space.shape[0]

In [None]:
# Hyperparameters
render = False         # when True, main() draws the environment after every step
gamma = 0.99           # discount factor applied to future rewards in finish_episode()
learning_rate = 0.01   # optimizer step size


class Policy(nn.Module):
    """Stochastic policy network mapping an observation batch to action probabilities.

    Pipeline: Linear(OBSERVATION_SPACE -> HIDDEN_LAYER_SIZE), dropout with
    probability DROPOUT_RATE, ReLU, Linear(HIDDEN_LAYER_SIZE -> ACTION_SPACE),
    and finally a softmax so each row sums to one.

    Besides the layers, an instance carries two per-episode buffers that the
    training helpers fill and later clear:
      * ``rewards``         -- reward received at each step
      * ``saved_log_probs`` -- log-probability of each sampled action
    """

    def __init__(self):
        super().__init__()

        # Layers are created in input-to-output order.
        self.layer1 = nn.Linear(OBSERVATION_SPACE, HIDDEN_LAYER_SIZE)
        self.dropout = nn.Dropout(p=DROPOUT_RATE)
        self.layer2 = nn.Linear(HIDDEN_LAYER_SIZE, ACTION_SPACE)

        # Episode buffers (see class docstring).
        self.rewards = []
        self.saved_log_probs = []

    def forward(self, x):
        """Return action probabilities for ``x``.

        The softmax is taken over dimension 1, so ``x`` must carry a leading
        batch dimension (one row per observation).
        """
        hidden = F.relu(self.dropout(self.layer1(x)))
        logits = self.layer2(hidden)
        return F.softmax(logits, dim=1)


# The policy network; the training helpers below read it as a module-level global.
policy = Policy()

# Adam over every policy parameter. The step size is taken from the
# `learning_rate` hyperparameter (rather than a repeated literal) so that
# changing the hyperparameter actually changes training.
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

# float32 machine epsilon: the gap between 1.0 and the next representable
# float32. Added to the standard deviation in finish_episode() so the
# normalisation never divides by exactly zero.
eps = np.finfo(np.float32).eps.item()

def select_action(state):
    """Sample an action for ``state`` from the current policy.

    Args:
        state: NumPy array holding a single environment observation.

    Returns:
        The sampled action as a plain Python number.

    Side effect:
        Appends the log-probability of the sampled action to
        ``policy.saved_log_probs`` for use in the next policy update.
    """
    # Convert to a float tensor and add a leading batch dimension of size one,
    # which is what the network's softmax over dim=1 expects.
    batch = torch.from_numpy(state).float().unsqueeze(0)

    # Build a categorical distribution from the network's action probabilities.
    distribution = Categorical(policy(batch))

    # Draw an action according to those probabilities and remember how likely it was.
    chosen = distribution.sample()
    policy.saved_log_probs.append(distribution.log_prob(chosen))

    # .item() unwraps the single-element tensor.
    return chosen.item()

def finish_episode():
    """Apply one REINFORCE update using the episode buffered on ``policy``.

    Reads ``policy.rewards`` and ``policy.saved_log_probs``, computes the
    discounted return of every step, standardises the returns, takes one
    optimizer step on ``sum(-log_prob * return)``, and finally empties both
    buffers so the next episode starts clean.

    Edge cases:
        * No rewards buffered: there is nothing to learn from, so the buffers
          are cleared and no optimizer step is taken.
        * A single buffered step: the sample standard deviation of one value
          is NaN, which would poison every parameter, so standardisation is
          skipped and the raw return is used.
    """
    if not policy.rewards:
        del policy.saved_log_probs[:]
        return

    # Discounted return per step, accumulated backwards in time:
    #   G_t = r_t + gamma * G_{t+1}
    discounted = 0.0
    returns = []
    for reward in reversed(policy.rewards):
        discounted = reward + gamma * discounted
        returns.append(discounted)
    # Appending and reversing once keeps this linear in the episode length.
    returns.reverse()

    returns = torch.tensor(returns, dtype=torch.float32)

    # Standardise: z = (G - mean) / (std + eps). eps guards against a zero
    # standard deviation; the size check guards against the NaN that the
    # sample standard deviation yields for a single element.
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    # Each saved log-prob is a one-element tensor, so cat() gives one entry
    # per step. The minus sign turns "maximise expected return" into a loss.
    log_probs = torch.cat(policy.saved_log_probs)
    policy_loss = -(log_probs * returns).sum()

    # Clear stale gradients, back-propagate, and update the parameters.
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

    # The episode has been consumed; reset the buffers in place.
    del policy.rewards[:]
    del policy.saved_log_probs[:]

def main():
    """Train the policy episode by episode until the environment is solved.

    Each iteration plays one episode with actions from select_action(),
    records the rewards on ``policy``, updates an exponential moving average
    of the episode reward, and calls finish_episode() to update the policy.
    A progress line is printed every 10 episodes. Training stops as soon as
    the moving average exceeds ``env.spec.reward_threshold``.

    The loop expects ``env.reset()`` to return the observation directly and
    ``env.step()`` to return a ``(state, reward, done, info)`` 4-tuple.
    """
    running_reward = 10
    for i_episode in count(1):
        # Unbounded episode counter; the loop ends via the "solved" break below.

        state, ep_reward = env.reset(), 0

        for t in range(1, 10000):
            # Hard cap on episode length so a never-failing policy cannot loop forever.

            action = select_action(state)

            # info is not needed, hence `_`.
            state, reward, done, _ = env.step(action)

            if render:
                env.render()

            policy.rewards.append(reward)
            ep_reward += reward

            if done:
                break

        # Exponential moving average of the episode reward (weight 0.05 on the newest).
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

        # Update the policy parameters from the episode just played.
        finish_episode()

        if i_episode % 10 == 0:
            # Progress report every 10 episodes.
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))

        # Checked after every episode (not only on logging episodes) so that
        # training stops as soon as the threshold is crossed.
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break

In [None]:
# Entry-point guard: start training only when this code runs as the main
# program, not when it is imported as a module.
if __name__ == '__main__':
    main()

Episode 10	Last reward: 69.00	Average reward: 16.74
Episode 20	Last reward: 23.00	Average reward: 24.05
Episode 30	Last reward: 85.00	Average reward: 33.88
Episode 40	Last reward: 37.00	Average reward: 45.31
Episode 50	Last reward: 16.00	Average reward: 59.90
Episode 60	Last reward: 162.00	Average reward: 95.08
Episode 70	Last reward: 108.00	Average reward: 99.79
Episode 80	Last reward: 277.00	Average reward: 133.26
Episode 90	Last reward: 99.00	Average reward: 165.36
Episode 100	Last reward: 86.00	Average reward: 136.98
Episode 110	Last reward: 18.00	Average reward: 127.73
Episode 120	Last reward: 84.00	Average reward: 125.35
Episode 130	Last reward: 126.00	Average reward: 119.86
Episode 140	Last reward: 462.00	Average reward: 156.19
Episode 150	Last reward: 123.00	Average reward: 158.56
Episode 160	Last reward: 88.00	Average reward: 144.48
Episode 170	Last reward: 176.00	Average reward: 151.33
Episode 180	Last reward: 500.00	Average reward: 204.67
Episode 190	Last reward: 201.00	Aver