In [1]:
import os
from itertools import count

import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.nn import Module


In [2]:
# Create the LunarLander-v2 environment that the rest of the notebook interacts with.
env = gym.make('LunarLander-v2')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
class Policy(Module):
    """Fully connected softmax policy for a discrete action space.

    The network is a stack of ``nn.Linear`` layers with sizes
    ``num_states -> *hidden_states -> num_actions``. Every layer except the
    last is followed by a ReLU; the last layer's output is turned into action
    probabilities with a softmax over the final dimension.

    Args:
        num_states: Size of the input (observation) vector. Required.
        hidden_states: Iterable with the width of each hidden layer. ``None``
            (the default) or an empty iterable builds a single linear layer.
        num_actions: Number of discrete actions (size of the output). Required.

    Raises:
        TypeError: If ``num_states`` or ``num_actions`` is ``None``.
    """

    def __init__(self,num_states=None,
                 hidden_states=None,
                 num_actions=None):

        super().__init__()

        if num_states is None or num_actions is None:
            raise TypeError("Policy requires both num_states and num_actions")

        # The default of None used to crash on `*hidden_states`; treat it as
        # "no hidden layers" instead.
        hidden_states=[] if hidden_states is None else list(hidden_states)

        ###Network Architecture
        neurons=[num_states,
                 *hidden_states,
                 num_actions]

        # Pair consecutive sizes to get (in_features, out_features) per layer.
        self.layers=nn.ModuleList([nn.Linear(in_n,out_n)
            for in_n,out_n in zip(neurons,neurons[1:])])

        ###Saving history
        # Per-step buffers that callers fill during an episode.
        self.saved_log_probs=[]
        self.rewards=[]

    def forward(self,x):
        """Return action probabilities for ``x``.

        Args:
            x: Tensor whose last dimension has size ``num_states``; leading
                batch dimensions are optional.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``num_actions`` that sums to 1.
        """
        last=len(self.layers)-1

        ###Network Architecture flow
        for i,layer in enumerate(self.layers):
            x=layer(x)

            # No non-linearity after the output layer: softmax handles it.
            if i!=last:
                x=torch.relu(x)

        # dim=-1 (rather than dim=1) gives the same result for batched 2-D
        # input and additionally supports an unbatched 1-D state.
        return F.softmax(x,dim=-1)
    
def weight_init(m):
    """Xavier-normal initialise an ``nn.Linear`` layer and zero its bias.

    Intended to be passed to ``Module.apply``; modules whose type is not
    exactly ``nn.Linear`` are left untouched.

    Args:
        m: The module to (possibly) re-initialise in place.
    """
    if type(m) is nn.Linear:
        nn.init.xavier_normal_(m.weight)

        # nn.Linear(..., bias=False) stores bias as None; skip it instead of
        # failing with AttributeError.
        if m.bias is not None:
            m.bias.data.fill_(0)
    
    
        


In [4]:
###Initializing the network
# Read both the input and the output size from the environment so the network
# always matches it (the input size was previously hard-coded to 8).
policy=Policy(num_states=env.observation_space.shape[0],
              hidden_states=[32,32],
              num_actions=env.action_space.n)
# Optional: uncomment to replace PyTorch's default initialisation with weight_init.
#policy.apply(weight_init)
optimizer=optim.Adam(policy.parameters(),lr=1e-2,amsgrad=True)
# float32 machine epsilon; finish_episode adds it to the std of the returns
# to avoid dividing by zero.
eps = np.finfo(np.float32).eps.item()

In [5]:
def select_action(state):
    """Sample an action for ``state`` from the global ``policy``.

    The log-probability of the sampled action is appended to
    ``policy.saved_log_probs`` so the gradient can be computed later.

    Args:
        state: NumPy array holding a single observation.

    Returns:
        The sampled action as a plain Python number.
    """
    # NumPy observation -> float tensor with a leading batch dimension of 1.
    observation = torch.from_numpy(state)
    batch = observation.float().unsqueeze(0)

    # Action probabilities from the network define the sampling distribution.
    distribution = Categorical(policy(batch))
    chosen = distribution.sample()

    # Remember log pi(a|s) for the policy-gradient update.
    policy.saved_log_probs.append(distribution.log_prob(chosen))

    return chosen.item()

In [6]:
def finish_episode(gamma=0.8):
    """Run one REINFORCE update from the episode stored on the global ``policy``.

    Reads ``policy.rewards`` and ``policy.saved_log_probs``, computes the
    discounted return of every step, standardises the returns, takes one
    optimiser step on ``sum(-log_prob * return)`` and finally clears both
    buffers.

    Args:
        gamma: Discount factor applied to future rewards.

    Notes:
        * If no rewards were recorded the function only clears the buffers.
        * With a single recorded step the return is used unnormalised, because
          the (Bessel-corrected) standard deviation of one value is NaN and
          would turn every network weight into NaN.
    """
    if not policy.rewards:
        # Nothing to learn from; torch.stack/cat on an empty list would raise.
        del policy.saved_log_probs[:]
        return

    # Accumulate discounted returns back-to-front, then restore step order.
    # (append + reverse is linear, unlike repeated insert(0, ...).)
    discounted=0.0
    returns=[]
    for r in reversed(policy.rewards):
        discounted=r+gamma*discounted
        returns.append(discounted)
    returns.reverse()

    returns=torch.tensor(returns,dtype=torch.float32)

    # Standardising the returns reduces gradient variance; it needs at least
    # two values for std() to be defined.
    if returns.numel()>1:
        returns=(returns-returns.mean())/(returns.std()+eps)

    policy_loss=[-log_prob*R
                 for log_prob,R in zip(policy.saved_log_probs,returns)]

    optimizer.zero_grad()

    # stack (unlike cat) also accepts 0-dim log-probs; the sum is the same.
    policy_loss=torch.stack(policy_loss).sum()
    policy_loss.backward()

    optimizer.step()

    # Reset the episode buffers for the next rollout.
    del policy.rewards[:]
    del policy.saved_log_probs[:]
    
    

In [7]:
def main():
    """Train the global ``policy`` on the global ``env`` until it is solved.

    Each iteration plays one rendered episode with ``select_action``, updates
    an exponential moving average of the episode reward and calls
    ``finish_episode(0.99)``. Every ``log_interval`` episodes the progress is
    printed and, if the running reward exceeds 50, the policy is saved under
    ``./models``. Training stops once the running reward exceeds
    ``env.spec.reward_threshold``; if the environment defines no threshold the
    loop runs until it is interrupted.

    The environment is closed on every exit path, including an interrupted
    kernel.
    """
    log_interval=100
    smoothing=0.05
    running_reward=0
    model_dir="./models"
    reward_threshold=env.spec.reward_threshold

    try:
        for i_episode in count(1):
            state=env.reset()
            ep_reward=0

            # t is the number of steps taken; it is reported once solved
            # (it was previously undefined, raising NameError at that point).
            for t in count(1):
                ###Selecting action
                action=select_action(state)
                state,reward,done,_=env.step(action)

                env.render()
                #Saving Rewards
                policy.rewards.append(reward)
                #Updating episode reward
                ep_reward+=reward

                if done:
                    break

            # Exponential moving average of the episode reward.
            running_reward = smoothing * ep_reward + (1 - smoothing) * running_reward

            finish_episode(0.99)

            if i_episode % log_interval == 0:
                print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                      i_episode, ep_reward, running_reward))
                if running_reward>50:
                    # torch.save does not create missing directories.
                    os.makedirs(model_dir, exist_ok=True)
                    torch.save(policy, "./models/model_%s"%i_episode)

            # reward_threshold may be None; comparing a float with None raises.
            if reward_threshold is not None and running_reward > reward_threshold:
                print("Solved! Running reward is now {} and "
                      "the last episode runs to {} time steps!".format(running_reward, t))
                break
    finally:
        # Release the environment (and its render window) even when training
        # is interrupted.
        env.close()

In [8]:
# Start training: main() keeps playing episodes until the running reward
# exceeds the environment's reward threshold.
main()

Episode 100	Last reward: -142.72	Average reward: -147.65
Episode 200	Last reward: -41.97	Average reward: -65.55
Episode 300	Last reward: -142.00	Average reward: -65.49
Episode 400	Last reward: -143.40	Average reward: -202.08
Episode 500	Last reward: -196.19	Average reward: -286.54
Episode 600	Last reward: -521.15	Average reward: -555.00


KeyboardInterrupt: 

In [None]:
# Save the whole Policy module; create the target directory first because
# torch.save does not create missing directories.
os.makedirs("./models", exist_ok=True)
torch.save(policy, "./models/policy_Nov24")

In [None]:
# model=torch.load("./models/policy_Nov24")

In [None]:
# def run(model):
    
#     env=gym.make('CartPole-v0')
#     T=200
#     for i_episode in range(10):
#         state=env.reset()
#         ep_reward=0
        
#         for t in range(T):
#             #####Selecting action
#             ###Converting to tensor and adding a dimension
#             state=torch.from_numpy(state).float().unsqueeze(0)

#             ###Passing state through network to get probability
#             probs=model(state)

#             ###Sampling action
#             m=Categorical(probs)
#             action=m.sample().item()
            
#             state,reward,done,_=env.step(action)
            
#             env.render()
#             #Updating episode reward
#             ep_reward+=reward
            
#             if done:
#                 break
      
#         print('Episode {}\tEpisode reward: {:.2f}'.format(
#               i_episode, ep_reward))

#     env.close()

In [None]:
# run(model)