In [1]:
import pandas as pd
import numpy as np
import gym

import torch

from torch.nn import Module
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

import matplotlib.pyplot as plt
from itertools import count


In [2]:
# Setting up the environment: create the Gym 'LunarLander-v2' environment.
# This module-level `env` is the one read by the network setup and by main().
env=gym.make('LunarLander-v2')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
class Policy(Module):
    """Actor-critic network: one shared hidden layer feeding two heads.

    The actor head outputs a probability distribution over the actions and
    the critic head outputs a single state-value estimate. The module also
    carries two plain Python lists that callers fill during an episode.

    Args:
        num_states: Size of the observation vector fed to the shared layer.
        hidden_states: Width of the shared hidden layer.
        num_actions: Number of discrete actions (size of the actor output).
    """

    def __init__(self,
                 num_states=None,
                 hidden_states=None,
                 num_actions=None):

        super().__init__()

        ###Network Architecture

        # Shared layer
        self.shared_layer = nn.Linear(num_states, hidden_states)
        # Actor layer
        self.action_layer = nn.Linear(hidden_states, num_actions)
        # Critic layer
        self.value_layer = nn.Linear(hidden_states, 1)

        ###Saving history
        # Both lists start empty; they are not touched by forward().
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """Compute action probabilities and the state value for ``x``.

        Args:
            x: Float tensor whose last dimension has size ``num_states``.
                Both a batched ``(N, num_states)`` tensor and a single
                unbatched ``(num_states,)`` observation are accepted.

        Returns:
            A tuple ``(action_probs, critic_val)``: the softmax over the
            actor head's output and the critic head's output.
        """
        hidden = F.relu(self.shared_layer(x))

        critic_val = self.value_layer(hidden)

        # Normalise over the action dimension, which is always the last one.
        # For 2-D input this is identical to dim=1, but it also works for an
        # unbatched 1-D observation where dim=1 does not exist.
        action_probs = F.softmax(self.action_layer(hidden), dim=-1)

        return action_probs, critic_val
    
    
def weight_init(m):
    """Initialise a linear layer in place.

    Modules whose type is exactly ``nn.Linear`` get Xavier-normal weights and
    a zero bias; every other module is left untouched. The function takes a
    single module, so it can be passed to ``Module.apply``.

    Args:
        m: The module to (possibly) initialise.
    """
    if type(m) is not nn.Linear:
        return

    nn.init.xavier_normal_(m.weight)

    # A layer built with ``bias=False`` has ``bias is None``; skip it rather
    # than failing on the attribute access.
    if m.bias is not None:
        m.bias.data.fill_(0)
    
    
        


In [4]:
###Initializing the network
# Both the input and output sizes are read from the environment so the
# network always matches its observation vector and discrete action set.
policy=Policy(num_states=env.observation_space.shape[0],
              hidden_states=256,
              num_actions=env.action_space.n)
#policy.apply(weight_init)
optimizer=optim.Adam(policy.parameters(),lr=3e-3,amsgrad=True)

# Machine epsilon for float64, as a plain Python float; added to the standard
# deviation of the returns so the normalisation never divides by zero.
eps = np.finfo(np.float64).eps.item()

In [5]:
def select_action(state):
    """Sample an action for ``state`` from the current policy.

    The observation is turned into a float tensor with a leading dimension
    of size one, passed through the module-level ``policy`` network, and an
    action is drawn from the resulting categorical distribution. The
    log-probability of the drawn action and the critic's value estimate are
    appended to ``policy.saved_actions``.

    Args:
        state: Observation as a NumPy array.

    Returns:
        The sampled action as a plain Python number (``Tensor.item()``).
    """
    # NumPy array -> float tensor with an extra leading dimension
    obs = torch.from_numpy(state).float().unsqueeze(0)

    # Actor probabilities and critic value for this observation
    action_probs, state_value = policy(obs)

    # Draw one action from the categorical distribution
    dist = Categorical(action_probs)
    chosen = dist.sample()

    # Keep what the gradient computation needs later
    record = (dist.log_prob(chosen), state_value)
    policy.saved_actions.append(record)

    return chosen.item()

In [6]:
def finish_episode(gamma=0.99):
    """Run one actor-critic update from the episode stored on ``policy``.

    Discounted returns are computed from ``policy.rewards``, standardised,
    and paired with the ``(log_prob, value)`` tuples in
    ``policy.saved_actions``. The actor loss is ``-log_prob * advantage`` and
    the critic loss is a smooth-L1 loss between the value estimate and the
    return. Both are summed, back-propagated and applied with the
    module-level ``optimizer``. The episode buffers are cleared afterwards.

    Args:
        gamma: Discount factor applied to future rewards.
    """
    saved_actions = policy.saved_actions

    # With either buffer empty no loss term can be formed (torch.stack would
    # raise on an empty list), so just reset the buffers and skip the update.
    if not policy.rewards or not saved_actions:
        del policy.rewards[:]
        del policy.saved_actions[:]
        return

    # Discounted returns: accumulate backwards with append, then reverse once
    # (linear time, unlike inserting at the front of the list on every step).
    returns = []
    running_return = 0.0
    for r in reversed(policy.rewards):
        running_return = r + gamma * running_return
        returns.append(running_return)
    returns.reverse()

    # Fix the dtype explicitly so the result does not depend on whether the
    # rewards are Python floats or NumPy scalars.
    returns = torch.tensor(returns, dtype=torch.float32)

    # The default (unbiased) standard deviation of a single element is NaN,
    # which would turn the loss and then every weight into NaN. Treat the
    # spread of a one-step episode as zero, like any constant return vector.
    if returns.numel() > 1:
        spread = returns.std()
    else:
        spread = torch.zeros(())
    returns = (returns - returns.mean()) / (spread + eps)

    policy_losses = []  # actor (policy) loss terms
    value_losses = []   # critic (value) loss terms
    for (log_prob, value), ret in zip(saved_actions, returns):
        # value.item() detaches the baseline: the actor term must not
        # back-propagate into the critic head.
        advantage = ret - value.item()

        # actor (policy) loss
        policy_losses.append(-log_prob * advantage)

        # critic (value) loss using smooth L1, target broadcast to value's shape
        value_losses.append(F.smooth_l1_loss(value, ret.expand_as(value)))

    # reset gradients
    optimizer.zero_grad()

    # sum up all the values of policy_losses and value_losses
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()

    # perform backprop
    loss.backward()
    optimizer.step()

    # reset rewards and action buffer
    del policy.rewards[:]
    del policy.saved_actions[:]

    

In [7]:
def main():
    """Train the module-level ``policy`` on the module-level ``env``.

    Each iteration plays one episode with ``select_action``, records the
    rewards on ``policy.rewards``, updates an exponential running reward and
    calls ``finish_episode``. Every ``log_interval`` episodes progress is
    printed, and the model is saved when the running reward is above 50 and
    better than the best one saved so far. Training stops once the running
    reward exceeds ``env.spec.reward_threshold``. The environment is closed
    on exit, including when training is interrupted.
    """
    import os  # local import: only needed to create the checkpoint directory

    #T=200
    log_interval=10
    running_reward=0
    past_reward=0

    try:
        for i_episode in count(1):
            state=env.reset()
            ep_reward=0
            steps=0

            while True:
                ###Selecting action
                action=select_action(state)
                state,reward,done,_=env.step(action)

                #env.render()
                #Saving Rewards
                policy.rewards.append(reward)
                #Updating episode reward
                ep_reward+=reward
                steps+=1

                if done:
                    break

            # Exponential moving average of the episode reward
            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

            finish_episode()

            if i_episode % log_interval == 0:
                print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                      i_episode, ep_reward, running_reward))
                if running_reward>50 and running_reward>past_reward:
                    past_reward=running_reward
                    checkpoint_path="./models/model_Nov25_ex1_%s"%i_episode
                    # torch.save does not create missing directories
                    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
                    torch.save(policy, checkpoint_path)

            if running_reward > env.spec.reward_threshold:
                # Report the length of the last episode in steps, as the
                # message says, not the episode index.
                print("Solved! Running reward is now {} and "
                      "the last episode runs to {} time steps!".format(running_reward, steps))
                break
    finally:
        # Release the environment even if training raises or is interrupted
        env.close()

In [8]:
#main()

#torch.save(policy, "./models/policy_")

In [9]:
# Load a previously saved model object from disk.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints from a trusted source.
# NOTE(review): the path matches the pattern written by torch.save(policy, ...)
# in main(), so this is presumably a whole pickled Policy and the class must
# already be defined when this runs -- confirm.
model=torch.load("./models/model_Nov25_ex1_1300")

In [10]:
def run(model, num_episodes=10, render=True):
    """Play episodes of 'LunarLander-v2' with ``model`` and print the rewards.

    A fresh environment is created for the run and closed when it ends, even
    if an episode raises. Actions are sampled from the categorical
    distribution given by the model's first output.

    Args:
        model: Callable taking a float tensor of shape ``(1, obs_dim)`` and
            returning ``(action_probs, value)``.
        num_episodes: Number of episodes to play.
        render: Whether to call ``env.render()`` after every step.
    """
    env=gym.make('LunarLander-v2')
    #T=200
    try:
        for i_episode in range(num_episodes):
            state=env.reset()
            ep_reward=0

            while True:
                #####Selecting action
                ###Converting to tensor and adding a dimension
                state_tensor=torch.from_numpy(state).float().unsqueeze(0)

                ###Passing state through network to get probability
                # Pure inference: no gradients are needed, so skip building
                # the autograd graph on every step.
                with torch.no_grad():
                    probs,_value=model(state_tensor)

                ###Sampling action
                action=Categorical(probs).sample().item()

                #action =torch.argmax(probs).item()

                state,reward,done,_=env.step(action)

                if render:
                    env.render()
                #Updating episode reward
                ep_reward+=reward

                if done:
                    break

            print('Episode {}\tEpisode reward: {:.2f}'.format(
                  i_episode, ep_reward))
    finally:
        # Always release the environment (and its render window, if any)
        env.close()

In [None]:
# Roll out the loaded model in the environment and print each episode's reward.
run(model)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode 0	Episode reward: 202.72
Episode 1	Episode reward: 198.56
Episode 2	Episode reward: 214.19
Episode 3	Episode reward: 198.07
Episode 4	Episode reward: 156.82
Episode 5	Episode reward: 199.56
Episode 6	Episode reward: 119.56
Episode 7	Episode reward: 168.52
Episode 8	Episode reward: 160.33
