In [25]:
# Here we import all libraries
import numpy as np
import gym
import matplotlib.pyplot as plt
import os
import torch
import random
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from collections import deque 
import sys
env = gym.make("CliffWalking-v0")

In [26]:
#Hyperparameters
episodes = 5000
eps = 1.0
learning_rate = 0.1
discount_factor = 0.99
tot_rewards = []
decay_val = 0.001
mem_size = 50000
batch_size = 2
gamma = 0.99

In [27]:
class NeuralNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(NeuralNetwork, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(1, 30),
            nn.ReLU(),
            nn.Linear(30, 30),
            nn.ReLU(),
            nn.Linear(30, action_size)
        )
    def forward(self, x):
        x = self.linear_relu_stack(x)
        return x

In [28]:
model = NeuralNetwork(env.observation_space.n, env.action_space.n)
opt = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
loss = nn.MSELoss()
replay_buffer = deque(maxlen=mem_size)

In [29]:
state = torch.tensor(env.reset(), dtype=torch.float32)
state = state.unsqueeze(dim=0)
out = model(state)

In [30]:
def compute_td_loss(batch_size):
    state, next_state, reward, done, action = zip(*random.sample(replay_buffer, batch_size))
    state = torch.from_numpy(np.array(state).reshape(-1, 1)).unsqueeze(dim = 0).type(torch.float32)
    next_state = torch.from_numpy(np.array(next_state).reshape(-1, 1)).unsqueeze(dim = 0).type(torch.float32)
    reward = torch.from_numpy(np.array(reward))
    done = torch.from_numpy(np.array(done))
    action = torch.from_numpy(np.array(action)).type(torch.int64)
    q_values = model(state)
    next_q_values = model(next_state)
    q_vals = q_values.squeeze().gather(dim=-1, index=action.reshape(-1,1)).reshape(1, -1)
    max_next_q_values = torch.max(next_q_values,2)[0].detach()

    loss = 0.5*((reward + gamma*max_next_q_values - q_vals)**2).mean()

    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss
    

In [31]:
for i in range(episodes):
    print("Episode = ", i)
    state = env.reset()
    done = False
    steps = 0
    eps_rew = 0 
    while not done and steps<50:
        if np.random.uniform(0,1)<eps:
            action = env.action_space.sample()
        else:
            state = torch.tensor(state, dtype=torch.float32)
            state = state.unsqueeze(dim=0)
            action = np.argmax(model(state).detach().numpy())
        next_state, reward, done, info = env.step(action)
        replay_buffer.append((state, next_state, reward, done, action))
        if len(replay_buffer)>batch_size:
            loss = compute_td_loss(batch_size)
        eps = eps/(1 + 0.001)
        eps_rew += reward 
        if done:
            break
        state = next_state
        steps += steps
    tot_rewards.append(eps_rew)

Episode =  0
Episode =  1
Episode =  2
Episode =  3
Episode =  4
Episode =  5
Episode =  6
Episode =  7
Episode =  8
Episode =  9
Episode =  10
Episode =  11
Episode =  12
Episode =  13
Episode =  14
Episode =  15
Episode =  16
Episode =  17
Episode =  18
Episode =  19
Episode =  20
Episode =  21
Episode =  22
Episode =  23
Episode =  24
Episode =  25
Episode =  26
Episode =  27
Episode =  28
Episode =  29
Episode =  30
Episode =  31
Episode =  32
Episode =  33
Episode =  34
Episode =  35
Episode =  36
Episode =  37
Episode =  38
Episode =  39
Episode =  40
Episode =  41
Episode =  42
Episode =  43
Episode =  44
Episode =  45
Episode =  46
Episode =  47
Episode =  48
Episode =  49
Episode =  50
Episode =  51
Episode =  52
Episode =  53
Episode =  54
Episode =  55
Episode =  56
Episode =  57
Episode =  58
Episode =  59
Episode =  60
Episode =  61
Episode =  62
Episode =  63
Episode =  64
Episode =  65
Episode =  66
Episode =  67
Episode =  68
Episode =  69
Episode =  70
Episode =  71
Ep

KeyboardInterrupt: 

In [34]:
#Render the final environment
state, info = env.reset(seed=42, return_info=True)
import time
for i in range(100):
    
    print("i = ", i)
    env.render()
    
    state = torch.tensor(state, dtype=torch.float32)
    state = state.unsqueeze(dim=0)
    action = np.argmax(model(state).detach().numpy())
    print("action = ", action)
    state, reward, done, info = env.step(action)

    if done:
        state, info = env.reset(return_info=True)
env.close()

i =  0
action =  0
i =  1
action =  0
i =  2
action =  0
i =  3
action =  0
i =  4
action =  0
i =  5
action =  0
i =  6
action =  0
i =  7
action =  0
i =  8
action =  0
i =  9
action =  0
i =  10
action =  0
i =  11
action =  0
i =  12
action =  0
i =  13
action =  0
i =  14
action =  0
i =  15
action =  0
i =  16
action =  0
i =  17
action =  0
i =  18
action =  0
i =  19
action =  0
i =  20
action =  0
i =  21
action =  0
i =  22
action =  0
i =  23
action =  0
i =  24
action =  0
i =  25
action =  0
i =  26
action =  0
i =  27
action =  0
i =  28
action =  0
i =  29
action =  0
i =  30
action =  0
i =  31
action =  0
i =  32
action =  0
i =  33
action =  0
i =  34
action =  0
i =  35
action =  0
i =  36
action =  0
i =  37
action =  0
i =  38
action =  0
i =  39
action =  0
i =  40
action =  0
i =  41
action =  0
i =  42
action =  0
i =  43
action =  0
i =  44
action =  0
i =  45
action =  0
i =  46
action =  0
i =  47
action =  0
i =  48
action =  0
i =  49
action =  0
i =  50
ac

KeyboardInterrupt: 