In [None]:
!pip3 install gym==0.26.2

^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
import gym
import numpy as np
import time
import matplotlib.pyplot as plt
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
from collections import deque
from mpl_toolkits import mplot3d
from matplotlib import cm
import pandas as pd
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Fix the seed of every random number generator used below (Python, NumPy,
# PyTorch CPU and CUDA) so that runs are repeatable.
seed = 7
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

### Simple Deep RL Algorithm

- Episodic - the agent acts in the environment for a fixed number of time steps per episode, rather than continuously

- Online - the agent has access to the environment and interacts with it in real time

- Model-free - we don't try to build an internal model of the environment

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network: one observation in, one Q-value per action out.

    Layer widths are ``input -> 128 -> 64 -> 32 -> n_actions``. The first hidden
    layer uses ReLU, the next two use tanh, and the output layer is linear.
    The network also owns its Adam optimizer, exposed as ``self.optimizer``.

    Args:
        env: Object exposing ``observation_space.shape`` and ``action_space.n``.
        learning_rate: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, env, learning_rate):
        super().__init__()

        # Network input size = number of observation features,
        # network output size = number of discrete actions.
        n_inputs = env.observation_space.shape[0]
        n_actions = env.action_space.n

        self.dense1 = nn.Linear(n_inputs, 128)
        self.dense2 = nn.Linear(128, 64)
        self.dense3 = nn.Linear(64, 32)
        self.dense4 = nn.Linear(32, n_actions)

        # Created last so that it sees every parameter registered above.
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        """Return the Q-values of every action for the observation(s) ``x``."""
        hidden = self.dense1(x).relu()
        hidden = self.dense2(hidden).tanh()
        hidden = self.dense3(hidden).tanh()
        return self.dense4(hidden)


### Experience Replay

- Gradient-based training of a DNN assumes the data is i.i.d. (independent and identically distributed) - this is not true for reinforcement learning (why?)

- Because the data is highly correlated: the next state of the agent and the reward depend on the actions taken in the previous state

- This can cause the DQN to become unstable

- To get around this we use the experience replay technique, which breaks the correlation between subsequent transitions by 1) saving experiences in memory and 2) sampling randomly from the stored transitions when we make Q-value updates.

- It is important to note that the experience replay buffer has a fixed size. Therefore, once the buffer is full, we only keep the most recent experiences: older experiences are overwritten / discarded.

- Look up papers for using dynamic experience replay memory

- This is one of the tricks Vincent was referring to

In [None]:
class ExperienceReplay:
    """Fixed-capacity replay memory, pre-filled with random-action transitions.

    Transitions are stored as ``(obs, action, reward, terminal, new_obs)`` tuples
    in a bounded deque, so the oldest entries are dropped once ``buffer_size``
    is reached. A second deque keeps the last 100 episode returns for progress
    reporting.

    Args:
        env: Environment following the gym 0.26 API (``reset`` returns
            ``(obs, info)``, ``step`` returns a 5-tuple).
        buffer_size: Maximum number of transitions kept in memory.
        min_replay_size: Number of random transitions collected up front.
        seed: Seed passed to the first ``env.reset`` call only.
    """

    def __init__(self, env, buffer_size, min_replay_size = 1000, seed = 123):
        self.env = env
        self.min_replay_size = min_replay_size
        self.replay_buffer = deque(maxlen=buffer_size)
        # Returns of the most recent episodes (at most 100), averaged for logging.
        # Starts with a -200.0 placeholder so the mean is defined before the
        # first episode finishes (presumably the worst MountainCar-v0 return - confirm).
        self.reward_buffer = deque([-200.0], maxlen=100)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print('Please wait, the experience replay buffer will be filled with random transitions')

        # Seed the environment once. Re-passing the same seed on every reset
        # would restart its RNG and make every episode begin in the same state.
        obs, _ = self.env.reset(seed=seed)
        for _ in range(self.min_replay_size):
            action = self.env.action_space.sample()
            new_obs, reward, terminated, truncated, _ = self.env.step(action)

            # Store only true termination as the terminal flag: a time-limit
            # truncation is not a terminal state, so targets must still
            # bootstrap from `new_obs` in that case.
            self.replay_buffer.append((obs, action, reward, terminated, new_obs))
            obs = new_obs

            if terminated or truncated:
                obs, _ = self.env.reset()

        print('Initialization with random transitions is done!')

    def add_data(self, data):
        """Append one ``(obs, action, reward, terminal, new_obs)`` transition."""
        self.replay_buffer.append(data)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly, without replacement.

        Returns:
            Tuple ``(observations, actions, rewards, terminals, new_observations)``
            of tensors on ``self.device``. Actions are int64, everything else is
            float32; actions, rewards and terminals get a trailing singleton
            dimension.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored transitions
                (raised by ``random.sample``).
        """
        transitions = random.sample(self.replay_buffer, batch_size)
        observations, actions, rewards, terminals, new_observations = (
            np.asarray(column) for column in zip(*transitions)
        )

        def to_tensor(array, dtype):
            return torch.as_tensor(array, dtype=dtype, device=self.device)

        observations_t = to_tensor(observations, torch.float32)
        actions_t = to_tensor(actions, torch.int64).unsqueeze(-1)
        rewards_t = to_tensor(rewards, torch.float32).unsqueeze(-1)
        dones_t = to_tensor(terminals, torch.float32).unsqueeze(-1)
        new_observations_t = to_tensor(new_observations, torch.float32)

        return observations_t, actions_t, rewards_t, dones_t, new_observations_t

    def add_reward(self, reward):
        """Record the total reward of a finished episode."""
        self.reward_buffer.append(reward)

### Vanilla DQN Agent

In [None]:
class vanilla_DQNAgent:
    """DQN agent that uses a single (online) network for both acting and targets.

    Args:
        env_name: Gym environment id passed to ``gym.make``.
        device: Torch device the network and observation tensors live on.
        epsilon_decay: Number of steps over which epsilon is linearly
            interpolated from ``epsilon_start`` to ``epsilon_end``.
        epsilon_start: Exploration rate at step 0.
        epsilon_end: Exploration rate from step ``epsilon_decay`` onwards.
        discount_rate: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate of the network's optimizer.
        buffer_size: Capacity of the replay memory.
        seed: Seed for the agent's environment (action sampling and first reset).
    """

    def __init__(self, env_name, device, epsilon_decay, 
                 epsilon_start, epsilon_end, discount_rate, lr, buffer_size, seed = 123):
        self.env_name = env_name
        self.env = gym.make(self.env_name, render_mode = None)
        # `env.reset(seed=...)` does not seed the action space, so seed it
        # explicitly: both the replay prefill and the exploration actions in
        # `choose_action` sample from it.
        self.env.action_space.seed(seed)
        self.device = device
        self.epsilon_decay = epsilon_decay
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.discount_rate = discount_rate
        self.learning_rate = lr
        self.buffer_size = buffer_size

        # NOTE: min_replay_size is not forwarded, so ExperienceReplay's default applies.
        self.replay_memory = ExperienceReplay(self.env, self.buffer_size, seed = seed)
        self.online_network = DQN(self.env, self.learning_rate).to(self.device)

    def choose_action(self, step, observation, greedy=False):
        """Pick an action epsilon-greedily.

        Args:
            step: Global step counter; drives the linear epsilon schedule.
            observation: Current observation (array-like).
            greedy: When true, always take the highest-Q action.

        Returns:
            Tuple ``(action, epsilon)`` with the chosen action and the epsilon
            value of the schedule at ``step``.
        """
        # Linear schedule; np.interp clamps to the end values outside [0, epsilon_decay].
        epsilon = np.interp(step, [0, self.epsilon_decay], [self.epsilon_start, self.epsilon_end])

        # Drawn unconditionally so the random stream does not depend on `greedy`.
        random_sample = random.random()

        if (random_sample <= epsilon) and not greedy:
            # Explore: uniformly random action.
            action = self.env.action_space.sample()
        else:
            # Exploit: action with the largest predicted Q-value. No gradients
            # are needed for action selection.
            obs_t = torch.as_tensor(observation, dtype=torch.float32, device=self.device)
            with torch.no_grad():
                # unsqueeze(0) adds the batch dimension the network is called with.
                q_values = self.online_network(obs_t.unsqueeze(0))
            action = torch.argmax(q_values, dim = 1)[0].item()

        return action, epsilon

    def learn(self, batch_size):
        """Run one gradient step on a random minibatch from the replay memory.

        The TD target is ``reward + discount_rate * (1 - done) * max_a Q(new_obs, a)``,
        with Q evaluated by the online network itself and treated as a constant.
        """
        observations_t, actions_t, rewards_t, dones_t, new_observations_t = self.replay_memory.sample(batch_size)

        # Targets are constants for the loss, so skip building their autograd graph.
        with torch.no_grad():
            next_q_values = self.online_network(new_observations_t)
            max_next_q_values = next_q_values.max(dim=1, keepdim=True)[0]
            targets = rewards_t + self.discount_rate * (1-dones_t) * max_next_q_values

        # Q-value of the action that was actually taken in each transition.
        q_values = self.online_network(observations_t)
        action_q_values = torch.gather(input=q_values, dim=1, index=actions_t)

        # Huber (smooth L1) loss; F.mse_loss is a drop-in alternative to try.
        loss = F.smooth_l1_loss(action_q_values, targets)

        self.online_network.optimizer.zero_grad()
        loss.backward()
        self.online_network.optimizer.step()

### HyperParameters

In [None]:
# Discount factor applied to the bootstrapped next-state value
discount_rate = 0.99
# Number of transitions sampled from the replay buffer for each learning step
batch_size = 32
# Capacity of the replay buffer (maximum number of stored transitions)
buffer_size = 50_000
# Number of random transitions used to pre-fill the replay buffer
min_replay_size = 1_000
# Exploration rate at the first step
epsilon_start = 1.0
# Lowest exploration rate, reached at the end of the decay period
epsilon_end = 0.05
# Number of steps over which epsilon goes from epsilon_start to epsilon_end
epsilon_decay = 1_000

# Passed to training_loop as `max_episodes`, which iterates this many environment steps
max_episodes = 250_000

# Learning rate of the optimizer
lr = 0.0005

### Initialize the agent

In [None]:
# Environment to solve and the device (GPU when available) the network runs on
env_name = 'MountainCar-v0'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the agent from the hyperparameters defined above; constructing it also
# pre-fills its replay buffer with random transitions.
vanilla_agent = vanilla_DQNAgent(
    env_name=env_name,
    device=device,
    epsilon_decay=epsilon_decay,
    epsilon_start=epsilon_start,
    epsilon_end=epsilon_end,
    discount_rate=discount_rate,
    lr=lr,
    buffer_size=buffer_size,
)

Please wait, the experience replay buffer will be filled with random transitions
Initialization with random transitions is done!


In [None]:
def training_loop(env_name, agent, max_episodes, target_=False, seed=42):
    """Train ``agent`` online on a fresh environment and log its average return.

    Each iteration takes one environment step, stores the transition in the
    agent's replay memory and performs one learning step with the module-level
    ``batch_size``.

    Args:
        env_name: Gym environment id passed to ``gym.make``.
        agent: Object exposing ``choose_action(step, obs)``, ``learn(batch_size)``
            and a ``replay_memory`` with ``add_data``, ``add_reward`` and
            ``reward_buffer``. When ``target_`` is true it must also expose
            ``update_target_network()``.
        max_episodes: Number of environment *steps* to run. Despite its name,
            the loop counts steps, not episodes.
        target_: When true, call ``agent.update_target_network()`` every 250 steps.
        seed: Seed for the environment's action space and its first reset.

    Returns:
        list: ``-200`` followed by the mean of ``agent.replay_memory.reward_buffer``
        sampled every 100 steps.
    """
    env = gym.make(env_name, render_mode=None)
    env.action_space.seed(seed)
    # Seed the environment once. Re-passing the same seed on every reset would
    # restart its RNG and make every episode begin in the same state.
    obs, _ = env.reset(seed=seed)
    # Starts with the same -200 placeholder that the reward buffer is initialised with.
    average_reward_list = [-200]
    episode_reward = 0.0
    target_update_frequence = 250

    for step in range(max_episodes):

        # Epsilon-greedy action; `step` drives the epsilon decay schedule.
        action, epsilon = agent.choose_action(step, obs)

        new_obs, reward, terminated, truncated, _ = env.step(action)

        # Store only true termination as the terminal flag: a time-limit
        # truncation is not a terminal state, so targets must still bootstrap
        # from `new_obs` in that case.
        agent.replay_memory.add_data((obs, action, reward, terminated, new_obs))

        obs = new_obs
        episode_reward += reward

        # Episode over (terminated or truncated): log its return and start a new one.
        if terminated or truncated:
            obs, _ = env.reset()
            agent.replay_memory.add_reward(episode_reward)
            episode_reward = 0.0

        # Learning stage: one gradient step per environment step.
        agent.learn(batch_size)

        # Record the running average return every 100 steps.
        if (step + 1) % 100 == 0:
            average_reward_list.append(np.mean(agent.replay_memory.reward_buffer))

        # Periodic target-network sync for agents that have one.
        if target_ and step % target_update_frequence == 0:
            agent.update_target_network()

        if (step+1) % 10000 == 0:
            print(20*'--')
            print('Step', step)
            print('Epsilon', epsilon)
            print('Avg Rew', np.mean(agent.replay_memory.reward_buffer))
            print()

    return average_reward_list

In [None]:
# Train the vanilla agent; the returned list holds the logged average rewards.
average_rewards_vanilla_dqn = training_loop(
    env_name=env_name,
    agent=vanilla_agent,
    max_episodes=max_episodes,
)

----------------------------------------
Step 9999
Epsilon 0.05
Avg Rew -191.83018867924528

----------------------------------------
Step 19999
Epsilon 0.05
Avg Rew -195.67

----------------------------------------
Step 29999
Epsilon 0.05
Avg Rew -199.38

----------------------------------------
Step 39999
Epsilon 0.05
Avg Rew -192.79

----------------------------------------
Step 49999
Epsilon 0.05
Avg Rew -190.63

----------------------------------------
Step 59999
Epsilon 0.05
Avg Rew -193.7

----------------------------------------
Step 69999
Epsilon 0.05
Avg Rew -190.54

----------------------------------------
Step 79999
Epsilon 0.05
Avg Rew -189.91

----------------------------------------
Step 89999
Epsilon 0.05
Avg Rew -192.99

----------------------------------------
Step 99999
Epsilon 0.05
Avg Rew -191.52

----------------------------------------
Step 109999
Epsilon 0.05
Avg Rew -192.03

----------------------------------------
Step 119999
Epsilon 0.05
Avg Rew -194.22

----

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3a1fff61-e37b-4e25-8498-6d22064c7401' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>