In [1]:
import pandas as pd
import numpy as np
import gym

import torch

from torch.nn import Module
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

import matplotlib.pyplot as plt
from itertools import count


In [2]:
# Create the CartPole-v0 environment used by the training loop in main().
env = gym.make("CartPole-v0")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
class Policy(Module):
    """Single-hidden-layer policy network: 4 state features -> 2 action probabilities.

    Besides its layers, every instance carries two plain Python lists,
    ``saved_log_probs`` and ``rewards``, which the training code appends to
    during an episode and empties after each update.
    """

    def __init__(self):
        super().__init__()

        ### Network architecture
        self.Linear1 = nn.Linear(4, 64)
        self.output_layer = nn.Linear(64, 2)
        self.Dropout = nn.Dropout(p=0.1)

        ### Per-episode history
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension has size 4, either a single
                state of shape ``(4,)`` or a batch of shape ``(N, 4)``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 2 whose entries sum to 1.
        """
        hidden = F.relu(self.Linear1(x))
        hidden = self.Dropout(hidden)
        logits = self.output_layer(hidden)

        # Normalise over the last axis rather than axis 1: identical for
        # batched (N, 4) input, and also valid for an unbatched (4,) state.
        return F.softmax(logits, dim=-1)
        

    

In [4]:
### Instantiate the policy network and the Adam optimiser that updates it
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=0.01)

# float32 machine epsilon as a plain Python float; finish_episode adds it to
# the standard deviation it divides by.
eps = float(np.finfo(np.float32).eps)

In [5]:
def select_action(state):
    """Sample an action for ``state`` from the global ``policy`` network.

    ``state`` is a NumPy array; it is converted to a float tensor with a
    leading batch dimension and passed through ``policy``. An action is drawn
    from the resulting categorical distribution, and the log-probability of
    that action is appended to ``policy.saved_log_probs`` for the later
    gradient step.

    Returns:
        The sampled action as a plain Python number.
    """
    observation = torch.from_numpy(state).float()
    batched_observation = observation.unsqueeze(0)

    # Distribution over actions given by the network's output probabilities
    distribution = Categorical(policy(batched_observation))
    chosen = distribution.sample()

    # Keep the log-probability so the loss can be built after the episode
    policy.saved_log_probs.append(distribution.log_prob(chosen))

    return chosen.item()

In [6]:
def finish_episode(gamma=0.8):
    """Apply one policy-gradient update from the episode stored on ``policy``.

    Reads ``policy.rewards`` and ``policy.saved_log_probs`` (module-level
    ``policy``), computes the discounted return for every step, standardises
    the returns, and minimises ``sum(-log_prob * return)`` with the
    module-level ``optimizer``. Both buffers are emptied before returning.

    Args:
        gamma: Discount factor applied to future rewards.

    Returns:
        None.
    """
    # Nothing to learn from an empty episode; torch.cat([]) would raise.
    if not policy.rewards or not policy.saved_log_probs:
        del policy.rewards[:]
        del policy.saved_log_probs[:]
        return

    # Accumulate returns from the last step backwards, then flip once.
    # (Appending + one reverse replaces the quadratic insert(0, ...) pattern.)
    discounted = []
    running = 0
    for reward in reversed(policy.rewards):
        running = reward + gamma * running
        discounted.append(running)
    discounted.reverse()

    returns = torch.Tensor(discounted)

    # The sample standard deviation of a single value is NaN, which would
    # turn the loss and then every network weight into NaN. Only standardise
    # when there are at least two steps.
    if len(discounted) > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_loss = [
        -log_prob * step_return
        for log_prob, step_return in zip(policy.saved_log_probs, returns)
    ]

    optimizer.zero_grad()

    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()

    optimizer.step()

    # Reset the per-episode buffers for the next rollout
    del policy.rewards[:]
    del policy.saved_log_probs[:]
    
    

In [7]:
def main(max_episodes=None, render=True):
    """Train the global ``policy`` on the global ``env`` until it is solved.

    Each episode runs for at most 200 steps: actions come from
    ``select_action``, rewards are appended to ``policy.rewards``, and
    ``finish_episode`` performs the update. An exponential moving average of
    the episode reward is tracked; training stops once it exceeds
    ``env.spec.reward_threshold``. Progress is printed every 100 episodes.

    Args:
        max_episodes: Optional cap on the number of episodes. ``None`` (the
            default) keeps training until the reward threshold is exceeded.
        render: Whether to call ``env.render()`` after every step.

    Returns:
        None.
    """
    T = 200
    log_interval = 100
    running_reward = 1

    # try/finally so the environment is closed even if training is
    # interrupted (e.g. stopping the cell) or an exception is raised.
    try:
        for i_episode in count(1):
            if max_episodes is not None and i_episode > max_episodes:
                break

            state = env.reset()
            ep_reward = 0

            for t in range(T):
                ### Selecting action
                action = select_action(state)
                state, reward, done, _ = env.step(action)

                if render:
                    env.render()
                # Saving rewards
                policy.rewards.append(reward)
                # Updating episode reward
                ep_reward += reward

                if done:
                    break

            # Exponential moving average of the episode reward
            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

            finish_episode()

            if i_episode % log_interval == 0:
                print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                      i_episode, ep_reward, running_reward))

            if running_reward > env.spec.reward_threshold:
                # t is the zero-based index of the last step taken
                print("Solved! Running reward is now {} and "
                      "the last episode runs to {} time steps!".format(running_reward, t))
                break
    finally:
        env.close()

In [8]:
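# Training and checkpointing are commented out; uncomment both lines to retrain and re-save the model.
#main()

#torch.save(policy, "./models/policy_June9")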

Episode 100	Last reward: 42.00	Average reward: 132.09
Episode 200	Last reward: 200.00	Average reward: 186.63
Solved! Running reward is now 195.207966266448 and the last episode runs to 199 time steps!


In [10]:
# Restore the saved model from disk. torch.load unpickles the file, which can
# execute arbitrary code, so only load checkpoints from a trusted source.
model = torch.load("./models/policy_June9")

In [11]:
def run(model, env=None, n_episodes=10, max_steps=200, render=True):
    """Roll out ``model`` for a few episodes and print each episode's reward.

    Actions are sampled from the categorical distribution given by the
    model's output probabilities. When ``model`` is an ``nn.Module`` it is
    switched to evaluation mode for the rollout (so dropout is inactive) and
    its previous mode is restored afterwards; no gradients are recorded.

    Args:
        model: Callable mapping a ``(1, n_features)`` float tensor to a
            ``(1, n_actions)`` tensor of action probabilities.
        env: Environment exposing ``reset``/``step``/``render``/``close``.
            When ``None`` a fresh ``CartPole-v0`` environment is created and
            closed by this function; a caller-supplied environment is left
            open.
        n_episodes: Number of episodes to play.
        max_steps: Maximum number of steps per episode.
        render: Whether to call ``env.render()`` after every step.

    Returns:
        None.
    """
    owns_env = env is None
    if owns_env:
        env = gym.make('CartPole-v0')

    # Remember the current mode so evaluation does not silently leave a
    # model that was being trained in eval mode.
    is_module = isinstance(model, torch.nn.Module)
    was_training = model.training if is_module else False
    if is_module:
        model.eval()

    try:
        with torch.no_grad():
            for i_episode in range(n_episodes):
                state = env.reset()
                ep_reward = 0

                for t in range(max_steps):
                    ### Converting to tensor and adding a batch dimension
                    state_tensor = torch.from_numpy(state).float().unsqueeze(0)

                    ### Passing state through network to get probabilities
                    probs = model(state_tensor)

                    ### Sampling action
                    action = Categorical(probs).sample().item()

                    state, reward, done, _ = env.step(action)

                    if render:
                        env.render()
                    # Updating episode reward
                    ep_reward += reward

                    if done:
                        break

                print('Episode {}\tEpisode reward: {:.2f}'.format(
                      i_episode, ep_reward))
    finally:
        if is_module:
            model.train(was_training)
        if owns_env:
            env.close()

In [12]:
# Play the loaded model and print the reward of each episode.
run(model=model)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode 0	Episode reward: 200.00
Episode 1	Episode reward: 200.00
Episode 2	Episode reward: 200.00
Episode 3	Episode reward: 200.00
Episode 4	Episode reward: 200.00
Episode 5	Episode reward: 200.00
Episode 6	Episode reward: 200.00
Episode 7	Episode reward: 200.00
Episode 8	Episode reward: 200.00
Episode 9	Episode reward: 200.00
