# Cart Pole

In [1]:
import torch
from torch import nn, optim, tensor
# from torchsummary import summary

import gym

from collections import deque, namedtuple

import matplotlib
import matplotlib.pyplot as plt

import random

from math import exp

import numpy as np
from itertools import compress
import time
import json

## Setup

In [2]:
# So we can run off of the GPU for our tensors
# NOTE: the CUDA branch below is commented out, so `device` is always "cpu".
# if torch.cuda.is_available():
#     device = "cuda"
# else:
device = "cpu"

# Live plots
# `display` is imported only when the matplotlib backend name contains
# 'inline'; with any other backend the name `display` is left undefined.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Switch matplotlib to interactive mode.
plt.ion()

# Default figure size as [width, height].
plt.rcParams['figure.figsize'] = [10, 8]

print(device)

cpu


## Environment

In [3]:
# Single module-level environment instance shared by the training and test loops.
env = gym.make("CartPole-v1")
# Alternative environment, currently disabled.
# env = gym.make("LunarLander-v2")

## Parameters

In [4]:
# Parameters
# Network input/output sizes are read from the environment's spaces.
nS = env.observation_space.shape[0]  # first dimension of the observation space
nA = env.action_space.n              # number of discrete actions

# Hyperparameters
BATCH_SIZE = 2**8  # transitions sampled from the replay buffer per train() call

LAYER1_SIZE = 2**8  # width of the first hidden layer of the DQN
LAYER2_SIZE = 2**8  # width of the second hidden layer of the DQN

EPISODES_TRAINING = 2000  # number of episodes passed to DQN_network

ALPHA = 1e-4  # learning rate handed to the optimizer
GAMMA = 0.99  # discount applied to the target network's next-state value
# TAU = 0.005
TAU = 0.01  # weight of the policy network in the soft target-network update
EPSILON_MAX = 1.0    # exploration rate returned by the first getEpsilon() call
EPSILON_MIN = 0.05   # value the exploration rate decays towards
EPSILON_DECAY = 150  # divisor of the call counter inside getEpsilon's exponential
# EPSILON_DATA = [EPSILON_MAX, EPSILON_MIN, EPSILON_DECAY]

BUFFER_SIZE = 10000  # maximum number of transitions kept by the replay buffer

## Replay Buffer

In [5]:
# Record type for one stored transition; fields are filled positionally by push().
SARST = namedtuple("SARST", ["S", "A", "R", "S_prime", "T"])


class ReplayBuffer:
    """Bounded FIFO store of SARST records with random sampling."""

    def __init__(self, size):
        # A deque with a maximum length drops its oldest entry when full.
        self.buffer = deque(maxlen=size)

    def __len__(self):
        """Number of records currently stored."""
        return len(self.buffer)

    def push(self, *args):
        """Pack the positional fields into a SARST record and append it."""
        record = SARST(*args)
        self.buffer.append(record)

    def sample(self, sample_size):
        """Return a list of `sample_size` distinct stored records picked at random."""
        return random.sample(self.buffer, sample_size)

## DQN

In [6]:
class DQN(nn.Module):
    """Fully connected Q-network: two ReLU hidden layers and a linear output layer."""

    def __init__(self, INPUT_LAYER, LAYER1_SIZE, LAYER2_SIZE, OUTPUT_LAYER):
        super().__init__()
        # Layers are created in input-to-output order and then wrapped in a
        # Sequential that is moved to the module-level `device`.
        layers = [
            nn.Linear(INPUT_LAYER, LAYER1_SIZE),
            nn.ReLU(),
            nn.Linear(LAYER1_SIZE, LAYER2_SIZE),
            nn.ReLU(),
            nn.Linear(LAYER2_SIZE, OUTPUT_LAYER),
        ]
        stack = nn.Sequential(*layers)
        self.linear_relu_stack = stack.to(device)

    def forward(self, x):
        """Feed `x` through the layer stack and return its output."""
        output = self.linear_relu_stack(x)
        return output

## Epsilon Greedy

In [14]:
def epsilonGreedy(state, network, nA, epsilon):
    """Pick an action with an epsilon-greedy rule.

    Args:
        state: observation accepted by `torch.as_tensor` (list, NumPy array or
            tensor); it is converted to float32 before being fed to `network`.
        network: callable mapping the state tensor to one value per action.
        nA: number of available actions; random actions are drawn from
            0 .. nA-1.
        epsilon: probability threshold for exploring. A value below 0 always
            acts greedily, a value of 1 or more always explores.

    Returns:
        The chosen action index as a Python int.
    """
    # Decide if we are going to be greedy or not
    if random.random() > epsilon:
        # Run the forward pass on whatever device the network's parameters
        # live on; plain callables without parameters use the default device.
        try:
            net_device = next(network.parameters()).device
        except (AttributeError, StopIteration):
            net_device = None

        # Pick best action, if tie, use lowest index
        with torch.no_grad():   # no gradients are needed for action selection
            state_t = torch.as_tensor(state, dtype=torch.float32, device=net_device)
            return network(state_t).argmax().item()

    # Explore: a uniformly random action (no tensor round-trip required)
    return random.randint(0, nA - 1)


## Networks, Optimizer, and Replay Buffer

In [8]:
# Two networks with identical architecture: `policy_net` is the one the
# optimizer updates, `target_net` supplies the next-state values in train().
# DQN.__init__ already moves its layers to `device`; the extra .to(device)
# here repeats that move.
policy_net = DQN(nS, LAYER1_SIZE, LAYER2_SIZE, nA).to(device)
target_net = DQN(nS, LAYER1_SIZE, LAYER2_SIZE, nA).to(device)
# Start the target network as an exact copy of the policy network's weights.
target_net.load_state_dict(policy_net.state_dict())

# Only the policy network's parameters are handed to the optimizer.
optimizer = optim.AdamW(policy_net.parameters(), lr=ALPHA, amsgrad=True)
# Shared replay buffer filled by DQN_network and sampled by train().
RB = ReplayBuffer(BUFFER_SIZE)


In [9]:
# Number of getEpsilon() calls made so far; drives the decay schedule.
steps = 0


def getEpsilon():
    """Return the exploration rate for the current call count, then advance the counter.

    The value starts at EPSILON_MAX and decays exponentially towards
    EPSILON_MIN, with EPSILON_DECAY as the divisor of the call count.
    """
    global steps
    span = EPSILON_MAX - EPSILON_MIN
    decay = exp(-steps/EPSILON_DECAY)
    current = EPSILON_MIN + span*decay
    steps = steps + 1
    return current

## Stolen Plot Function

In [10]:
def plot_multi(title, axis, args, save_string=""):
    """Draw one stacked subplot per data series, sharing the x axis.

    Args:
        title: sequence of subplot titles, indexed like `args`.
        axis: sequence of y-axis labels, indexed like `args`.
        args: sequence of numeric series (one per subplot). A series with at
            least 100 points also gets its 100-point moving average drawn,
            left-padded with 99 zeros so it lines up with the raw data.
        save_string: when non-empty, the figure is also written to
            "..\\P2_Data\\model1\\<save_string>.png".
    """
    n_plots = len(args)
    if n_plots == 0:
        # Nothing to draw; plt.subplots cannot build a grid with zero rows.
        return

    # Close figures left over from earlier calls so repeated live updates do
    # not accumulate open figures.
    plt.close('all')
    # squeeze=False always returns a 2-D array of Axes, so a single series
    # is handled the same way as several.
    fig, axes = plt.subplots(n_plots, 1, sharex=True, squeeze=False)
    for argi, arg in enumerate(args):
        ax = axes[argi, 0]
        data = torch.tensor(arg, dtype=torch.float)
        ax.set_title(title[argi])
        ax.set_ylabel(axis[argi])
        ax.plot(data)

        # Take 100 episode averages and plot them too
        if len(arg) >= 100:
            means = data.unfold(0, 100, 1).mean(1).view(-1)
            means = torch.cat((torch.zeros(99), means))
            ax.plot(means.numpy())

    plt.xlabel('Episode')
    plt.pause(0.001)  # pause a bit so that plots are updated
    # `display` is only imported when the inline backend is active.
    if is_ipython:
        display.clear_output(wait=True)
    if save_string != "":
        fig.savefig("..\\P2_Data\\model1\\"+save_string+".png")
        


## Training

In [11]:
def train():
    """Run one optimization step of `policy_net` on a batch from the replay buffer.

    Samples BATCH_SIZE transitions from `RB`, builds the targets
    `R + GAMMA * max_a target_net(S')` (the bootstrap term stays 0 for
    transitions whose stored flag `T` is false), and minimizes the Huber
    loss between those targets and `policy_net`'s value for the taken action.

    Returns:
        The batch loss as a Python float.
    """
    transitions = RB.sample(BATCH_SIZE)
    # Transpose a list of SARST records into one SARST of per-field tuples.
    minibatch = SARST(*zip(*transitions))

    N = len(minibatch.S)

    S = torch.cat(minibatch.S)
    batch_device = S.device  # keep every helper tensor next to the states
    A = torch.as_tensor(minibatch.A, dtype=torch.long, device=batch_device).reshape(N, 1)
    torch_R = torch.cat(minibatch.R).to(batch_device)

    # True where the stored flag says the next state may be bootstrapped from.
    nonterm_flags = [bool(flag) for flag in minibatch.T]
    nonterm_mask = torch.as_tensor(nonterm_flags, dtype=torch.bool, device=batch_device)

    Q_SA = policy_net(S).gather(1, A)

    torch_maxQ = torch.zeros(N, 1, device=batch_device)
    with torch.no_grad():
        # A batch made only of terminal transitions has no next states to
        # evaluate; torch.cat would fail on the empty list.
        if any(nonterm_flags):
            S_prime_masked = torch.cat(list(compress(minibatch.S_prime, nonterm_flags)))
            torch_maxQ[nonterm_mask] = target_net(S_prime_masked).max(1)[0].reshape(-1, 1)

    y = (torch_maxQ * GAMMA) + torch_R

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(Q_SA, y)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

    return loss.item()
    

In [12]:
# Per-episode training history, appended to by DQN_network.
episodic_rewards = []
episodic_loss = []
episodic_epsilon = []


def DQN_network(episodes):
    """Train `policy_net` on `env` for `episodes` episodes.

    For every environment step the transition is pushed to `RB`, one
    train() step is run once the buffer holds more than BATCH_SIZE
    transitions, and every 10th step (counted across episodes) the target
    network is moved towards the policy network with weight TAU.
    Epsilon is fetched once per episode via getEpsilon().

    Per-episode reward, mean training loss and epsilon are appended to the
    module-level history lists, plotted every 10 episodes, and finally
    written together with the elapsed time to
    "..\\P2_Data\\model1\\data.json".

    Args:
        episodes: number of episodes to run.
    """
    start_time = time.time()
    total_steps = 0  # environment steps over all episodes; paces the soft update
    for epi in range(episodes):

        S, _ = env.reset()
        # Copy the observation into a (1, nS) tensor; wrapping the array in a
        # Python list first would take torch's slow list-of-arrays path.
        S = torch.tensor(S, dtype=torch.float32, device=device).unsqueeze(0)

        episodic_reward = 0.0
        episodic_loss_sum = 0.0
        episodic_updates = 0  # train() calls made during this episode
        done = False

        epsilon = getEpsilon()

        while not done:
            total_steps += 1
            if epi % 10 == 0:
                env.render()

            # Choose action
            A = epsilonGreedy(S, policy_net, nA, epsilon)
            # Take step
            S_prime, reward, terminated, truncated, _ = env.step(A)
            # A truncated episode (e.g. a step limit) must also end the loop,
            # otherwise stepping continues past the end of the episode.
            done = bool(terminated or truncated)

            # Only a real terminal state gets the placeholder; on truncation
            # the next state is kept so train() can still bootstrap from it.
            if terminated:
                S_prime = [0]
            else:
                S_prime = torch.tensor(S_prime, dtype=torch.float32, device=device).unsqueeze(0)

            # Store the transition
            RB.push(S, A, torch.tensor([[reward]], dtype=torch.float32, device=device),
                    S_prime, torch.tensor(not terminated, device=device, dtype=torch.bool))

            S = S_prime

            # Update the policy network
            if len(RB) > BATCH_SIZE:
                episodic_loss_sum += train()
                episodic_updates += 1

            # float() keeps the history JSON-serializable for any numeric reward type
            episodic_reward += float(reward)

            if total_steps % 10 == 0:
                # Soft update of the target network's weights
                target_net_state_dict = target_net.state_dict()
                policy_net_state_dict = policy_net.state_dict()
                for key in policy_net_state_dict:
                    target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
                target_net.load_state_dict(target_net_state_dict)

        episodic_epsilon.append(epsilon)
        # Mean over the updates of this episode only (0.0 before training starts).
        episodic_loss.append(episodic_loss_sum / episodic_updates if episodic_updates else 0.0)
        episodic_rewards.append(episodic_reward)

        if epi % 10 == 0:
            print(epsilon)
            plot_multi(["Training Rewards...", "Training Loss...", "Epsilon Value..."],
                       ["Reward", "Mean Episode Loss", "Epsilon"],
                       [episodic_rewards, episodic_loss, episodic_epsilon])

    delta_time = time.time()-start_time
    plot_multi(["Reward History", "Loss History", "Epsilon History"],
               ["Reward", "Loss", "Epsilon"],
               [episodic_rewards, episodic_loss, episodic_epsilon], save_string="model1")

    plt.ioff()
    plt.show()

    # Save data
    json_data = {"episodic_rewards": episodic_rewards,
                 "episodic_loss": episodic_loss,
                 "episodic_epsilon": episodic_epsilon,
                 "training_time": delta_time
                 }
    # The context manager guarantees the file is flushed and closed.
    with open("..\\P2_Data\\model1\\data.json", 'w') as data_file:
        json.dump(json_data, data_file)

In [15]:
# Run the full training loop, then close the environment.
DQN_network(EPISODES_TRAINING)
env.close()

KeyboardInterrupt: 

In [16]:
# Close the environment explicitly; presumably needed because the training
# cell's recorded output is a KeyboardInterrupt, so its own env.close() may
# not have run.
# NOTE(review): test() below reuses this same `env` object after it has been closed — confirm whether it must be re-created first.
env.close()

In [None]:
# from tqdm import tqdm
def test(episodes):
    episodic_rewards = []
    episodic_durations = []

    for epi in range(episodes):

        
        S = env.reset()
        S = torch.tensor([S], dtype=torch.float32, device=device)

        episodic_reward = 0
        episodic_duration = 0
        T = 0
        terminated = False
        while not terminated:
            episodic_duration += 1
            env.render()

            # Choose action
            A = epsilonGreedy(S, policy_net, nA, -1)
            # Take step
            S_prime, reward, terminated, _ = env.step(A)
            S = S_prime

            episodic_reward += reward

        episodic_rewards.append(episodic_reward)
        episodic_durations.append(episodic_duration)
        # if epi % 10 == 0:
        print(epi, "of", episodes)
        plot_multi(["Training Rewards...", "Training Durations..."], 
                    ["Reward", "Episode Duration"], 
                    [episodic_rewards, episodic_durations])
            
        