In [0]:
import gym
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import sys
from torch.autograd import Variable

GAMMA = 0.99

In [0]:
class PolicyNetwork(nn.Module):
    def __init__(self, num_inputs, num_actions, learning_rate=2e-2):
        super(PolicyNetwork, self).__init__()
        self.num_actions = num_actions
        self.linear = nn.Linear(num_inputs, num_actions, bias = False)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, state):
        x = F.softmax(self.linear(state), dim=1)
        #print('x after softmax = {}'.format(x))
        return x 
    
    def get_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.forward(Variable(state))
        #print('probs = {}'.format(probs))
        #Choose action with regard to policy
        highest_prob_action = np.random.choice(self.num_actions, p=np.squeeze(probs.detach().numpy()))
        log_prob = torch.log(probs.squeeze(0)[highest_prob_action]) #log for gradient
        return highest_prob_action, log_prob



def count_gradient_for_policy(policy_network, rewards, log_probs):
    discounted_rewards = []

    for t in range(len(rewards)):
        Gt = 0 
        pw = 0
        for r in rewards[t:]:
            Gt = Gt + GAMMA**pw * r
            pw = pw + 1
        discounted_rewards.append(Gt)
        
    discounted_rewards = torch.tensor(discounted_rewards)
    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9) # normalize discounted rewards

    policy_gradient = []
    for log_prob, Gt in zip(log_probs, discounted_rewards):
        policy_gradient.append(-log_prob * Gt)
    
    policy_gradient = torch.stack(policy_gradient).sum()
    policy_network.optimizer.zero_grad()
    policy_gradient.backward()
    policy_network.optimizer.step()
    return [param.grad.numpy() for param in policy_network.parameters()]

In [0]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

def backward(policy, prob, action, state): 

    grad = np.zeros(policy.shape)


    grad = -np.exp(prob[action])*np.exp(prob[:,None])*state[None,:]/(np.sum(np.exp(prob)))**2
    grad[action,:] = (np.exp(prob[action]*state[None,:]*(np.sum(np.exp(prob))-np.exp(prob[action]))))/(np.sum(np.exp(prob)))**2

    return grad


def align_columns_zeros(array):
    max_length = max(list(map(lambda x: len(x), array)))
    for col in range(len(array)):
        array[col] = np.pad(array[col], max_length - len(array[col]), 'constant', constant_values = 0)[max_length - len(array[col]):]
    return array

In [0]:
def trajectory_policy(policy, probs, rewards, actions, states):
    discounted_rewards = []

    for t in range(len(rewards)):
        Gt = 0 
        pw = 0
        for r in rewards[t:]:
            Gt = Gt + GAMMA**pw * r
            pw = pw + 1
        discounted_rewards.append(Gt)
        
    discounted_rewards = (discounted_rewards - np.mean(discounted_rewards)) / (np.std(discounted_rewards) + 1e-9) # normalize discounted rewards

    policy_gradient = []
    for prob, Gt, action, state in zip(probs, discounted_rewards, actions, states):
        policy_gradient.append(1/prob[action] * backward(policy, prob, action, state) * Gt)
    policy_gradient = np.sum(policy_gradient, axis = 0)
    return policy_gradient

In [7]:
%%time
# GAMMA = 0.99
# learning_rate = 0.003
env = gym.make('CartPole-v0')

num_actions = env.action_space.n
max_episode_num = 2000
max_steps = 10000
numsteps = []
avg_numsteps = []
all_rewards = []
policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.n)

for episode in range(max_episode_num):
    state = env.reset()
    ps = []
    log_probs = []
    rewards = []
    actions = []
    states = []
    #Каждый эпизод инициализируем собственную политику весами нейронной сети
    policy = []
    for param in policy_net.parameters():
      policy.append((param.detach().numpy()))
    policy = policy[0]
    for steps in range(max_steps):
        probs = softmax(np.dot(policy, state))
        action, log_prob = policy_net.get_action(state)
        new_state, reward, done, _ = env.step(action)
        actions.append(action)
        ps.append(np.squeeze(probs))
        states.append(np.squeeze(state))
        log_probs.append(log_prob)
        rewards.append(reward)

        if done:
            traj_sm = trajectory_policy(policy, ps, rewards, actions, states)
            traj_torch = count_gradient_for_policy(policy_net, rewards, log_probs)
            print('----------------------------------TORCH---------------------------- {} --------------------------------------------'.format(traj_torch[0]))
            print('-----------------------------------SM------------------------------ {} --------------------------------------------'.format(traj_sm))
            numsteps.append(steps)
            avg_numsteps.append(np.mean(numsteps[-10:]))
            all_rewards.append(np.sum(rewards))
            if episode % 10 == 0:
                print("episode: {}, total reward: {}, average_reward: {}, length: {}\n".format(episode, np.round(np.sum(rewards), decimals = 3),  np.round(np.mean(all_rewards[-10:]), decimals = 3), steps))
            break 
        state = new_state

----------------------------------TORCH---------------------------- [[-0.07396783 -1.0552615   0.12487228  1.6743059 ]
 [ 0.07396784  1.0552617  -0.12487231 -1.6743066 ]] --------------------------------------------
-----------------------------------SM------------------------------ [[ 0.08645509  0.05550772  0.02431923 -2.04332641]
 [ 0.13695598 -1.56897437  0.46608108  3.0309314 ]] --------------------------------------------
episode: 0, total reward: 14.0, average_reward: 14.0, length: 13

----------------------------------TORCH---------------------------- [[-0.08227038 -3.600146    0.08309889  4.769887  ]
 [ 0.0822704   3.6001465  -0.0830989  -4.7698874 ]] --------------------------------------------
-----------------------------------SM------------------------------ [[ 1.29508729  5.74494361  0.89229566 -5.12525251]
 [-0.2498248  -3.90829966 -0.24969556  1.11035166]] --------------------------------------------
----------------------------------TORCH---------------------------- [[

KeyboardInterrupt: ignored