In [9]:
import sys
import torch  
import gym
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
from tqdm import tqdm
#from scipy.special import softmax

In [10]:
def softmax(x):
    """Return the softmax of ``x``, normalised over every element of the input.

    The global maximum is subtracted before exponentiating so that large
    scores do not overflow; this shift cancels in the ratio and leaves the
    result unchanged. The output has the same shape as ``x`` and its entries
    sum to one across the whole array (not per row).
    """
    shifted_scores = x - np.max(x)
    exponentials = np.exp(shifted_scores)
    normaliser = exponentials.sum()
    return exponentials / normaliser

def softmax_grad(s):
    """Return the Jacobian of the softmax function at softmax output ``s``.

    Entry ``[i, j]`` is ``s[i] * (1 - s[i])`` when ``i == j`` and
    ``-s[i] * s[j]`` otherwise.

    Parameters
    ----------
    s : array_like
        Softmax output. Any shape is accepted and flattened, so a ``(1, n)``
        row vector is treated the same as a length-``n`` vector.

    Returns
    -------
    numpy.ndarray
        Square ``(n, n)`` Jacobian, where ``n`` is the number of elements in
        ``s``. Integer or boolean input is promoted to float so the entries
        are not truncated.
    """
    probs = np.asarray(s)
    if not np.issubdtype(probs.dtype, np.inexact):
        # An integer result array would silently round every entry.
        probs = probs.astype(float)
    probs = probs.ravel()

    # Off-diagonal entries: -s[i] * s[j] for every pair at once.
    jacobian_m = -np.outer(probs, probs)

    # Diagonal entries: s[i] * (1 - s[i]).
    diag_idx = np.arange(probs.size)
    jacobian_m[diag_idx, diag_idx] = probs * (1 - probs)
    return jacobian_m

In [41]:
def update_policy(policy, rewards, log_probs):
    discounted_rewards = []

    for t in range(len(rewards)):
        Gt = 0 
        pw = 0
        for r in rewards[t:]:
            Gt = Gt + GAMMA**pw * r
            pw = pw + 1
        discounted_rewards.append(Gt)
        
    discounted_rewards = (discounted_rewards - np.mean(discounted_rewards)) / (np.std(discounted_rewards) + 1e-9) # normalize discounted rewards

    policy_gradient = []
    for log_prob, Gt in zip(log_probs, discounted_rewards):
        policy_gradient.append(-log_prob * Gt)
    
    #print(policy_gradient)

In [None]:
GAMMA = 0.99
env = gym.make('CartPole-v0')

num_actions = env.action_space.n
max_episode_num = 3000
max_steps = 10000
numsteps = []
avg_numsteps = []
all_rewards = []
policy = np.random.random_sample((env.observation_space.shape[0], env.action_space.n))

for episode in range(max_episode_num):
    state = env.reset().reshape(-1, 1).T
    log_probs = []
    rewards = []

    for steps in range(max_steps):
        #env.render()
        probs = softmax(np.dot(state, policy))
        action = np.random.choice(num_actions, p = np.squeeze(probs))
        log_prob = np.log(np.squeeze(probs)[action])
        new_state, reward, done, _ = env.step(action)
        log_probs.append(log_prob)
        rewards.append(reward)

        if done:
            update_policy(policy, rewards, log_probs)
            numsteps.append(steps)
            avg_numsteps.append(np.mean(numsteps[-10:]))
            all_rewards.append(np.sum(rewards))
            if episode % 10 == 0:
                sys.stdout.write("episode: {}, total reward: {}, average_reward: {}, length: {}\n".format(episode, np.round(np.sum(rewards), decimals = 3),  np.round(np.mean(all_rewards[-10:]), decimals = 3), steps))
            break
            
        state = new_state

episode: 0, total reward: 13.0, average_reward: 13.0, length: 12
episode: 10, total reward: 16.0, average_reward: 32.6, length: 15
episode: 20, total reward: 16.0, average_reward: 22.8, length: 15
episode: 30, total reward: 26.0, average_reward: 20.2, length: 25
episode: 40, total reward: 13.0, average_reward: 23.7, length: 12
episode: 50, total reward: 17.0, average_reward: 23.5, length: 16
episode: 60, total reward: 27.0, average_reward: 27.8, length: 26
episode: 70, total reward: 12.0, average_reward: 17.2, length: 11
episode: 80, total reward: 15.0, average_reward: 21.0, length: 14
episode: 90, total reward: 14.0, average_reward: 21.4, length: 13
episode: 100, total reward: 18.0, average_reward: 18.0, length: 17
episode: 110, total reward: 12.0, average_reward: 18.0, length: 11
episode: 120, total reward: 48.0, average_reward: 28.2, length: 47
episode: 130, total reward: 15.0, average_reward: 20.3, length: 14
episode: 140, total reward: 13.0, average_reward: 24.6, length: 12
episod

episode: 1280, total reward: 9.0, average_reward: 23.7, length: 8
episode: 1290, total reward: 15.0, average_reward: 18.7, length: 14
episode: 1300, total reward: 11.0, average_reward: 21.7, length: 10
episode: 1310, total reward: 31.0, average_reward: 22.9, length: 30
episode: 1320, total reward: 42.0, average_reward: 25.1, length: 41
episode: 1330, total reward: 26.0, average_reward: 19.1, length: 25
episode: 1340, total reward: 42.0, average_reward: 24.3, length: 41
episode: 1350, total reward: 13.0, average_reward: 17.6, length: 12
episode: 1360, total reward: 25.0, average_reward: 23.5, length: 24
episode: 1370, total reward: 14.0, average_reward: 20.0, length: 13
episode: 1380, total reward: 8.0, average_reward: 26.9, length: 7
episode: 1390, total reward: 18.0, average_reward: 20.6, length: 17
episode: 1400, total reward: 26.0, average_reward: 28.2, length: 25
episode: 1410, total reward: 12.0, average_reward: 23.5, length: 11
episode: 1420, total reward: 19.0, average_reward: 2

episode: 2500, total reward: 9.0, average_reward: 19.0, length: 8
episode: 2510, total reward: 15.0, average_reward: 29.0, length: 14
episode: 2520, total reward: 23.0, average_reward: 24.9, length: 22
episode: 2530, total reward: 11.0, average_reward: 17.0, length: 10
episode: 2540, total reward: 19.0, average_reward: 22.7, length: 18
episode: 2550, total reward: 15.0, average_reward: 17.7, length: 14
episode: 2560, total reward: 14.0, average_reward: 24.0, length: 13
episode: 2570, total reward: 15.0, average_reward: 23.2, length: 14
episode: 2580, total reward: 33.0, average_reward: 19.5, length: 32
episode: 2590, total reward: 10.0, average_reward: 21.3, length: 9
episode: 2600, total reward: 33.0, average_reward: 21.4, length: 32
episode: 2610, total reward: 50.0, average_reward: 21.7, length: 49
episode: 2620, total reward: 19.0, average_reward: 23.3, length: 18
episode: 2630, total reward: 10.0, average_reward: 27.3, length: 9
episode: 2640, total reward: 61.0, average_reward: 2

In [17]:
# Display the current policy weight matrix.
# NOTE(review): the saved output below is a 2x4 array, while the training cell
# creates `policy` with shape (env.observation_space.shape[0], env.action_space.n),
# and this cell's execution count (17) is lower than that of the update_policy
# cell (41). The output is presumably stale - re-run this cell after training.
policy

array([[0.44759897, 0.96131853, 0.65431246, 0.74558678],
       [0.90286341, 0.76139292, 0.23279973, 0.69732   ]])