In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from DQNagent import DQNagent
import gym
import random




# DQN
I have put the whole cartpole training into a function so it is easier to use. The parameters we have to tune are (in order of importance):
 - network_params. Most importantly the number of nodes, as we can just assume that 'relu' activation is optimal
 - batch_size
 - learning_rate 
 - exploration_parameter
 - future_reward_discount_factor 
 - action_selection_method

When using a target network (target = True) the extra parameters to tune are:
 - target_network_update_frequency

When using experience replay (buffer = True), batch_size has a different meaning, so it will most likely have to be retuned.

Parameters that are not tuned:
 - target
 - buffer
 - total_episodes

In [4]:

def Cartpole(total_episodes,target, target_network_update_frequency, buffer, batch_size, learning_rate, future_reward_discount_factor, exploration_parameter, network_params, action_selection_method):
    """Train a DQNagent on CartPole-v1 and return the score of every episode.

    Parameters
    ----------
    total_episodes : int
        Number of episodes (reset -> done) to run.
    target : bool
        If truthy, ``agent.update_target_network()`` is called periodically.
    target_network_update_frequency : int
        The target network is refreshed on every step ``t`` of an episode for
        which ``t % target_network_update_frequency == 0``. Only read when
        ``target`` is truthy.
    buffer, learning_rate, future_reward_discount_factor, exploration_parameter, network_params
        Forwarded unchanged to the ``DQNagent`` constructor.
    batch_size : int
        Forwarded to ``DQNagent``; in addition, ``agent.train()`` is only
        called once ``len(agent.memory) >= batch_size``.
    action_selection_method
        Passed as ``method`` to ``agent.action_selection``.

    Returns
    -------
    list of int
        One entry per episode: the number of steps the episode lasted
        (at most 500).
    """
    #Cartpole-v1 has a maximum episode length of 500
    max_steps = 500

    scores = []
    env = gym.make("CartPole-v1")
    try:
        agent = DQNagent(env.observation_space.shape, env.action_space.n, target, buffer, batch_size, learning_rate, future_reward_discount_factor, exploration_parameter, network_params)

        for i in range(total_episodes):
            #reset the environment
            reset_result = env.reset()
            #Newer gym versions return (observation, info); older ones return the observation only
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result

            for t in range(max_steps):
                #select action based on exploration method
                action = agent.action_selection(state, method = action_selection_method)

                #Get example, and save to replay buffer
                step_result = env.step(action)
                if len(step_result) == 5:
                    #Newer gym API: (obs, reward, terminated, truncated, info)
                    next_state, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    #Older gym API: (obs, reward, done, info)
                    next_state, reward, done, _ = step_result

                agent.memorize(state, action, reward, next_state, done)

                #Train network once enough transitions have been collected
                if len(agent.memory) >= batch_size:
                    agent.train()

                #Check `target` first so the frequency is never evaluated (and can
                #never divide by zero) when the target network is disabled.
                #NOTE(review): `t` restarts at 0 every episode, so the update cadence is
                #per-episode rather than over global training steps - confirm this is intended.
                if target and t % target_network_update_frequency == 0:
                    agent.update_target_network()

                #The score is how long the cart stayed upright, this can be a maximum of 500
                if done or t == max_steps - 1:
                    print(f"Episode {i}: Score {t+1}/{max_steps}")
                    break

                state = next_state
            scores.append(t+1)
    finally:
        #Always release the environment, also when training raises
        env.close()
    return scores

    

# Tuning Example
Here is an example where I try out 3 different learning rates, $[0.1, 0.01, 0.001]$. Note that I am only running $10$ episodes, which is far too few to draw any conclusions; you would need at least $100$. For this example, however, I use $10$ so that it runs quickly.

In [5]:
#Hyperparameter under study: one complete training run is done per value
learning_rates = [0.1, 0.01, 0.001]

#Run-level settings (not tuned)
total_episodes = 10 #Number of reset -> done episodes per run (an episode is cut off after 500 steps, so training cannot get stuck)
target = False #Whether DQN uses a target network
buffer = False #Whether DQN uses an experience buffer

#Tunable hyperparameters that are held fixed during this sweep
network_params = [(24,'relu'),(24,'relu')]
batch_size = 32 #Without a buffer the last batch_size examples are used for training; with buffer=True, batch_size random samples are drawn from the memory
future_reward_discount_factor = 0.95
exploration_parameter = 0.1
action_selection_method = 'egreedy'
target_network_update_frequency = 10 #Number of steps between target network updates

#Cartpole returns the list of per-episode scores; collect one such list per learning rate
score_per_lr = []
for learning_rate in learning_rates:
    score = Cartpole(
        total_episodes=total_episodes,
        target=target,
        target_network_update_frequency=target_network_update_frequency,
        buffer=buffer,
        batch_size=batch_size,
        learning_rate=learning_rate,
        future_reward_discount_factor=future_reward_discount_factor,
        exploration_parameter=exploration_parameter,
        network_params=network_params,
        action_selection_method=action_selection_method,
    )
    score_per_lr.append(score)

Episode 0: Score 35/500
Episode 1: Score 15/500
Episode 2: Score 27/500


# Plot

In [None]:
#Plot the score of every episode, one curve per learning rate.
#The x axis is the 1-based episode number; np.arange (not "arrange") builds it once for all curves.
episode_numbers = np.arange(1, total_episodes + 1)
for lr, lr_scores in zip(learning_rates, score_per_lr):
    plt.plot(episode_numbers, lr_scores, label = f"learning rate = {lr}")
plt.title("Score per episode for each learning rate")
plt.xlabel("Episode")
plt.ylabel("Score")
plt.legend()
plt.show()