<a href="https://colab.research.google.com/github/Arya-Raj/Cartpole-problem-using-Pytorch-and-RL-algorithms/blob/main/pytorch_implementation_of_cartpole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Actor-Critic algorithm in Cartpole (A2C)

In [1]:
import torch
import gym
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import math
from torch import optim
from torch.distributions import Categorical

eps = np.finfo(np.float32).eps.item()
seed = 42
torch.manual_seed(seed)
gamma = 0.99  # Discount factor for past rewards
max_steps_per_episode = 10000
env = gym.make("CartPole-v0")  # Create the environment
env.seed(seed)

[42]

In [2]:
class Actor_critic(nn.Module):
    def __init__(self,num_actions,num_value):
        super().__init__()
        self.layer1=nn.Linear(4,128)
        self.actor=nn.Linear(128,num_actions)
        self.critic=nn.Linear(128,num_value)
    def forward(self,state):
        common=F.relu(self.layer1(state))
        actor_prob=F.softmax(self.actor(common),dim=-1)
        critic_value=self.critic(common)
        
        return(actor_prob,critic_value)
        
       
model=Actor_critic(2,1)
opt= optim.Adam(model.parameters(), lr=0.01)
lr_scheduler = optim.lr_scheduler.ExponentialLR(opt, gamma=1. - 2e-5) 

In [3]:
action_prob_hist=[]
critic_value_hist=[]
reward_hist=[]
episode=0
running_reward=0




while True:
    episode_reward=0
    state=env.reset()
    for i in range(0,10000):
        #env.render()
        #state=torch.FloatTensor(state).unsqueeze(0)
        state=torch.from_numpy(state).float()
        actor_prob,critic_value=model(state)

        m=Categorical(actor_prob)
        action=m.sample()
        action_prob_hist.append(m.log_prob(action))
        critic_value_hist.append(critic_value)

        #one_d_tensor=actor_prob.squeeze(dim=0)
        #sampled_prob=random.choice(actor_prob)
        #action_prob_hist.append(math.log(sampled_prob))
        #action=one_d_tensor.tolist().index(sampled_prob)
        #action=actor_prob.tolist().index(sampled_prob)
        #critic_value_hist.append(critic_value)

        state,reward,done,info=env.step(action.item())
        reward_hist.append(reward)
        episode_reward+=reward
        #print("for timestep {} action {}, critic_value {},sampled_prob {},episode_reward {}".format(i,action,critic_value,sampled_prob,episode_reward))
        if done:
            break
    
    returns=[]
    actor_loss=[]
    critic_loss=[]
    running_reward=0.05*episode_reward+(1-0.05)*running_reward
    
    ds=0
    for r in reward_hist[::-1]:
        ds=r+0.99*ds
        returns.insert(0,ds)
    returns=torch.tensor(returns)
    returns=returns-returns.mean()/returns.std()+eps
    
    #print('running_reward {},returns {}'.format(running_reward,returns))    
    #tensaction_prob_hist=torch.tensor(action_prob_hist)
    #tenscritic_value_hist=torch.tensor(critic_value_hist)
    #tensreturns=torch.tensor(returns)

    hist=zip(action_prob_hist,critic_value_hist,returns)
    for log_prob,value,ret in hist:
        loss=ret-value.item()
        actor_loss.append(-log_prob*loss)
        critic_loss.append(F.smooth_l1_loss(value, torch.tensor([ret])))
    opt.zero_grad()
    #sum_loss=torch.tensor(sum(actor_loss)+sum(critic_loss),requires_grad=True,dtype=torch.float64)
    loss = torch.stack(actor_loss).sum() + torch.stack(critic_loss).sum()
    loss.backward()
    opt.step()

    
    #print("loss {} actor_loss {} critic_loss".format(sum_loss,actor_loss,critic_loss))
    
    action_prob_hist.clear()
    critic_value_hist.clear()
    reward_hist.clear()
    
    episode+=1
    
    if episode%10==0:
        print("Episode no {} running_reward {}".format(episode,running_reward))
        

    if running_reward>=195:
        print("Solved at episode {} with reward {}".format(episode,running_reward))
        break



Episode no 10 running_reward 9.037832377223241
Episode no 20 running_reward 14.305686971200565
Episode no 30 running_reward 29.274414309736237
Episode no 40 running_reward 44.6505708753231
Episode no 50 running_reward 58.098968627992896
Episode no 60 running_reward 55.55741588311191
Episode no 70 running_reward 75.22276790510894
Episode no 80 running_reward 82.93959462367081
Episode no 90 running_reward 59.42397709400871
Episode no 100 running_reward 45.93302228421416
Episode no 110 running_reward 41.51756600592898
Episode no 120 running_reward 52.76398120800984
Episode no 130 running_reward 81.96509438435643
Episode no 140 running_reward 105.63927630636577
Episode no 150 running_reward 135.7414091105352
Episode no 160 running_reward 146.35459484294566
Episode no 170 running_reward 140.18255158494748
Episode no 180 running_reward 157.6583409557919
Episode no 190 running_reward 163.32190910256608
Episode no 200 running_reward 166.69749847175288
Episode no 210 running_reward 167.20618716

##Policy gradient algorithm in Cartpole

In [7]:
import torch
import gym
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import math
from torch import optim
from torch.distributions import Categorical

eps = np.finfo(np.float32).eps.item()
seed = 42
torch.manual_seed(seed)
gamma = 0.99  # Discount factor for past rewards
max_steps_per_episode = 10000
env = gym.make("CartPole-v0")  # Create the environment
env.seed(seed)

[42]

In [8]:
class Actor_critic(nn.Module):
    def __init__(self,num_actions):
        super().__init__()
        self.layer1=nn.Linear(4,128)
        self.actor=nn.Linear(128,num_actions)
      
    def forward(self,state):
        common=F.relu(self.layer1(state))
        actor_prob=F.softmax(self.actor(common),dim=-1)
        
        
        return(actor_prob)
        
       
model=Actor_critic(2)
opt= optim.Adam(model.parameters(), lr=0.01)
lr_scheduler = optim.lr_scheduler.ExponentialLR(opt, gamma=1. - 2e-5) 

In [9]:
action_prob_hist=[]
reward_hist=[]
episode=0
running_reward=0




while True:
    episode_reward=0
    state=env.reset()
    for i in range(0,10000):
        state=torch.from_numpy(state).float()
        actor_prob=model(state)

        m=Categorical(actor_prob)
        action=m.sample()
        action_prob_hist.append(m.log_prob(action))
        



        state,reward,done,info=env.step(action.item())
        reward_hist.append(reward)
        episode_reward+=reward
        if done:
            break
    
    returns=[]
    actor_loss=[]
    running_reward=0.05*episode_reward+(1-0.05)*running_reward
    
    ds=0
    for r in reward_hist[::-1]:
        ds=r+0.99*ds
        returns.insert(0,ds)
    returns=torch.tensor(returns)
    returns=returns-returns.mean()/returns.std()+eps
    

    hist=zip(action_prob_hist,returns)
    for log_prob,ret in hist:
        actor_loss.append(-log_prob*ret)
    opt.zero_grad()
    loss = torch.stack(actor_loss).sum()
    loss.backward()
    opt.step()

    
    
    action_prob_hist.clear()
    reward_hist.clear()
    
    episode+=1
    
    if episode%10==0:
        print("Episode no {} running_reward {}".format(episode,running_reward))
        

    if running_reward>=195:
        print("Solved at episode {} with reward {}".format(episode,running_reward))
        break

Episode no 10 running_reward 9.720432464886816
Episode no 20 running_reward 20.894507145365033
Episode no 30 running_reward 36.91340877352152
Episode no 40 running_reward 33.96233240013956
Episode no 50 running_reward 36.42983999271926
Episode no 60 running_reward 49.82747565796303
Episode no 70 running_reward 80.96882361591129
Episode no 80 running_reward 117.5897973222537
Episode no 90 running_reward 121.28246998873071
Episode no 100 running_reward 101.704619110349
Episode no 110 running_reward 87.26433467600441
Episode no 120 running_reward 113.0092700837242
Episode no 130 running_reward 124.42349838153926
Episode no 140 running_reward 132.58349385855277
Episode no 150 running_reward 148.09520610624284
Episode no 160 running_reward 158.8141757454579
Episode no 170 running_reward 171.12281655363816
Episode no 180 running_reward 182.71016356950005
Episode no 190 running_reward 189.09793625567016
Episode no 200 running_reward 171.01510173485383
Episode no 210 running_reward 148.1286086