In [2]:
import copy
import random
import pickle
import os
import sys
import gym
from gym import wrappers
import numpy as np
import matplotlib.pyplot as plt
import sys

import torch
from torch import nn
import torch.nn.functional as F
import torchvision.transforms as T
from PIL import Image

from utilities.data_structures.Config import Config
from agents.DQN_agents.DDQN import DDQN
from agents.DQN_agents.DQN_With_Fixed_Q_Targets import DQN_With_Fixed_Q_Targets

# Run-level settings stored on a Config object; the agent built in a later
# cell receives this object.
config = Config()
config.seed = 1
config.num_episodes_to_run = 1000

# Output locations for results data and graph.
# NOTE(review): the file names still say "Cart_Pole" although the environment
# selected further down is Breakout -- confirm this is intended.
config.file_to_save_data_results = "results/data_and_graphs/Cart_Pole_Results_Data.pkl"
config.file_to_save_results_graph = "results/data_and_graphs/Cart_Pole_Results_Graph.png"
config.show_solution_score = False
config.visualise_individual_results = False
config.visualise_overall_agent_results = True
config.standard_deviation_results = 1.0
config.runs_per_agent = 1

# Request the GPU only when CUDA is actually usable, so the notebook still
# runs on a CPU-only machine instead of asking for a device that is missing.
config.use_GPU = torch.cuda.is_available()
config.overwrite_existing_results_file = False
config.randomise_random_seed = True
config.save_model = True


# Hyperparameters grouped by agent family. A later cell selects one group
# (hyperparameters["DQN_Agents"]) and assigns it to config.hyperparameters.
hyperparameters = {
    # Settings for the DQN family of agents.
    "DQN_Agents": {
        "learning_rate": 1e-5,
        "batch_size": 32,
        "buffer_size": 400000,
        "epsilon": 0.01,
        "epsilon_decay_rate_denominator": 1,
        "discount_rate": 0.99,
        "tau": 1e-4,
        "alpha_prioritised_replay": 0.6,
        "beta_prioritised_replay": 0.1,
        "incremental_td_error": 1e-8,
        "update_every_n_steps": 1,
        "learning_iterations": 1,
        # NOTE(review): this is the string "None", not the None object --
        # confirm which one the consuming code expects.
        "final_layer_activation": "None",
        "batch_norm": False,
        "gradient_clipping_norm": 0.7,
        "clip_rewards": False
    },
    # Settings for stochastic policy search agents.
    "Stochastic_Policy_Search_Agents": {
        "policy_network_type": "Linear",
        "noise_scale_start": 1e-2,
        "noise_scale_min": 1e-3,
        "noise_scale_max": 2.0,
        "noise_scale_growth_factor": 2.0,
        "stochastic_action_decision": False,
        "num_policies": 10,
        "episodes_per_policy": 1,
        "num_policies_to_keep": 5,
        "clip_rewards": False
    },
    # Settings for policy gradient agents.
    "Policy_Gradient_Agents": {
        "learning_rate": 0.05,
        "linear_hidden_units": [20, 20],
        "final_layer_activation": "SOFTMAX",
        "learning_iterations_per_round": 5,
        "discount_rate": 0.99,
        "batch_norm": False,
        "clip_epsilon": 0.1,
        "episodes_per_learning_round": 4,
        "normalise_rewards": True,
        "gradient_clipping_norm": 7.0,
        "mu": 0.0,  # only required for continuous action games
        "theta": 0.0,  # only required for continuous action games
        "sigma": 0.0,  # only required for continuous action games
        "epsilon_decay_rate_denominator": 1.0,
        "clip_rewards": False
    },
    # Settings for actor-critic agents: shared values first, then the
    # per-network "Actor" / "Critic" sub-dictionaries, then training options.
    "Actor_Critic_Agents": {
        "learning_rate": 0.005,
        "linear_hidden_units": [20, 10],
        "final_layer_activation": ["SOFTMAX", None],
        "gradient_clipping_norm": 5.0,
        # Declared once only: the original literal repeated this key further
        # down with the same value, and a repeated key silently overrides.
        "discount_rate": 0.99,
        "epsilon_decay_rate_denominator": 1.0,
        "normalise_rewards": True,
        "exploration_worker_difference": 2.0,
        "clip_rewards": False,

        "Actor": {
            "learning_rate": 0.0003,
            "linear_hidden_units": [64, 64],
            "final_layer_activation": "Softmax",
            "batch_norm": False,
            "tau": 0.005,
            "gradient_clipping_norm": 5,
            "initialiser": "Xavier"
        },

        "Critic": {
            "learning_rate": 0.0003,
            "linear_hidden_units": [64, 64],
            "final_layer_activation": None,
            "batch_norm": False,
            "buffer_size": 1000000,
            "tau": 0.005,
            "gradient_clipping_norm": 5,
            "initialiser": "Xavier"
        },

        "min_steps_before_learning": 400,
        "batch_size": 256,
        "mu": 0.0,  # for O-H noise
        "theta": 0.15,  # for O-H noise
        "sigma": 0.25,  # for O-H noise
        "action_noise_std": 0.2,  # for TD3
        "action_noise_clipping_range": 0.5,  # for TD3
        "update_every_n_steps": 1,
        "learning_updates_per_learning_session": 1,
        "automatically_tune_entropy_hyperparameter": True,
        "entropy_term_weight": None,
        "add_extra_noise": False,
        "do_evaluation_iterations": True
    }
}



# Model and Env wrapper, currently Breakout and CartPole implemented

In [3]:
class CNN(nn.Module):
    """Convolutional network for single-channel 84x84 frames.

    Takes a batch shaped (batch, 1, 84, 84) and returns 4 values per sample.
    The ``config`` constructor argument is accepted but not used.
    """

    def __init__(self, config):
        super().__init__()
        # Spatial size through the unpadded convolutions: 84 -> 20 -> 9 -> 7.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        # 64 feature maps of 7x7 are flattened before the dense layers.
        self.fc1 = nn.Linear(in_features=7 * 7 * 64, out_features=1024)
        self.fc2 = nn.Linear(in_features=1024, out_features=4)

    def forward(self, x):
        """Run the three conv+ReLU stages, flatten, then the two dense layers."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))
        flat = x.view(-1, self.fc1.in_features)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

class NN(nn.Module):
    """Two-layer fully connected network: 4 inputs -> 32 hidden (ReLU) -> 2 outputs.

    The ``config`` constructor argument is accepted but not used.
    """

    def __init__(self, config):
        super().__init__()
        self.fc1 = nn.Linear(in_features=4, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=2)

    def forward(self, x):
        """Apply the hidden layer with ReLU, then the linear output layer."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)
    
class Wrapper(gym.ObservationWrapper):
    """Environment wrapper that preprocesses frames and ends episodes on life loss.

    Frames are cropped, converted to greyscale, resized to 84x84 and returned
    with a leading channel axis, i.e. shape (1, 84, 84). ``step`` reports
    ``done=True`` whenever ``info["ale.lives"]`` is anything other than 5.
    """

    def __init__(self, environment):
        super().__init__(environment)

    def observation(self, state):
        """Observation hook; delegates to ``process``."""
        return self.process(state)

    def step(self, action):
        """Step the parent wrapper and force ``done`` once the life count is not 5."""
        # The parent step presumably already routes the frame through
        # observation() -- verify against the installed gym version.
        state, reward, done, info = super().step(action)
        # The key "ale.lives" must be present in info; 5 is the hard-coded
        # full life count this wrapper compares against.
        episode_over = True if info["ale.lives"] != 5 else done
        return state, reward, episode_over, info

    def process(self, state):
        """Crop, greyscale and resize a raw frame; return an array shaped (1, 84, 84)."""
        frame = (
            Image.fromarray(state)
            .crop(box=[0, 34, 160, 194])
            .convert(mode="L")
            .resize((84, 84))
        )
        return np.array(frame).reshape(1, 84, 84)

# Initialization

In [4]:
# Wire the run together: DQN hyperparameters, the CNN as the model class and
# the wrapped Breakout environment, then construct the DDQN agent from config.
config.hyperparameters = hyperparameters["DQN_Agents"]
config.model_class = CNN
config.environment = Wrapper(gym.make("BreakoutDeterministic-v4"))
# CartPole alternative, left disabled (presumably to be paired with the NN
# model class rather than CNN -- confirm before enabling):
#config.environment = gym.make('CartPole-v0')
agent = DDQN(config)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


TITLE  BreakoutDeterministic
Score required to win set to infinity therefore no learning rate annealing will happen


# Train

In [None]:
# Do not reset the episode number (that would reset the eps-greedy schedule);
# to train for longer, increase the number of episodes passed here instead.
game_scores, rolling_scores, time_taken = agent.run_n_episodes(50000)

 Episode 110, Score:  4.00, Max score seen:  7.00, Rolling score:  3.46, Max rolling score seen:  3.47"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

In [12]:
# Replace the agent's optimizer with a fresh Adam over the local Q-network
# parameters at lr=1e-4. Note this differs from the 1e-5 "learning_rate"
# listed under "DQN_Agents" in the hyperparameters dict.
from torch import optim

agent.q_network_optimizer = optim.Adam(
    params=agent.q_network_local.parameters(),
    lr=1e-4,
)

# Save / Load checkpoint

In [5]:
# Restore saved weights into both the local and the target Q-network.
# map_location="cpu" lets a checkpoint that was written on a GPU machine be
# read on a CPU-only one; load_state_dict then copies the values into each
# network's existing parameters.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load("./checkpoint6", map_location="cpu")
agent.q_network_local.load_state_dict(checkpoint['model_dict'])
agent.q_network_target.load_state_dict(checkpoint['model_dict'])

<All keys matched successfully>

In [6]:
# Write the local Q-network's weights to the file "checkpoint6", stored under
# the "model_dict" key that the loading cell reads back.
torch.save(
    obj={"model_dict": agent.q_network_local.state_dict()},
    f="checkpoint6",
)

# Discussion

Refer to https://openai.com/blog/openai-baselines-dqn/ for best practices.

The MNIST and CIFAR of RL

CartPole, with its 4-dimensional state vector, can be solved (keeping balance for more than 200 frames on average) within 500 episodes.

For image input we use BreakoutDeterministic-v4. Training for 10M transitions is a common setting, and it is okay to see no improvement during the first 1M transitions (perhaps because eps is still high).

Changes
Implemented real eps-greedy