In [1]:
import numpy as np
import tensorflow as tf
from DQNagent import DQNagent
import gym
import random
import pickle as pkl
import gym


# DQN

In [2]:
# Hyperparameters that are candidates for tuning.
learning_rate = 0.01
future_reward_discount_factor = 0.95
exploration_parameter = 0.1
tune_params = {
    'input_units': 32,
    'layers': 1,
    'layer_units': 32,
    'learning_rate': 1e-2,
}
action_selection_method = 'egreedy'
batch_size = 32  # only consulted when `buffer` is True
target_network_update_frequency = 10  # update steps between target-network updates

# Switches selecting which DQN variant is trained.
buffer = False  # True -> train from an experience replay buffer
target = False  # True -> train against a separate target network
total_episodes = 1000


env = gym.make("CartPole-v1")
agent = DQNagent(
    env.observation_space.shape,
    env.action_space.n,
    learning_rate,
    future_reward_discount_factor,
    exploration_parameter,
    tune_params,
)
score = []  # one entry per episode: number of steps the pole stayed up
for i in range(total_episodes):
    # Observations are reshaped to a single row, the layout DQNagent is fed with.
    state = env.reset().reshape(1, -1)

    # CartPole-v1 episodes are capped at 500 steps.
    for t in range(500):
        # Pick the next action with the configured exploration strategy.
        action = agent.action_selection(state, method=action_selection_method)

        # Advance the environment by one step.
        next_state, reward, done, _ = env.step(action)
        next_state = next_state.reshape(1, -1)
        # (A -10 penalty on early termination was tried here and is currently disabled.)

        if not buffer:
            # Plain online update from the transition that was just observed.
            agent.train(state, action, reward, next_state, done, target)
            if t % target_network_update_frequency == 0:
                agent.update_target_network()
        else:
            # Store the transition, then learn from a sampled batch once
            # enough experience has been collected.
            agent.memorize(state, action, reward, next_state, done)
            if len(agent.memory) >= batch_size:
                agent.experience_replay_train(t, target_network_update_frequency, batch_size, target)

        # Episode score = number of steps survived (at most 500).
        if done or t == 499:
            print(f"Episode {i}: Score {t+1}/500")
            score.append(t + 1)
            break

        state = next_state
    

Episode 0: Score 9/500
Episode 1: Score 9/500
Episode 2: Score 9/500
Episode 3: Score 10/500
Episode 4: Score 10/500
Episode 5: Score 11/500
Episode 6: Score 9/500
Episode 7: Score 9/500
Episode 8: Score 11/500
Episode 9: Score 10/500
Episode 10: Score 10/500
Episode 11: Score 10/500
Episode 12: Score 10/500
Episode 13: Score 9/500
Episode 14: Score 9/500
Episode 15: Score 8/500
Episode 16: Score 10/500
Episode 17: Score 11/500
Episode 18: Score 12/500
Episode 19: Score 10/500
Episode 20: Score 9/500
Episode 21: Score 12/500
Episode 22: Score 12/500
Episode 23: Score 11/500
Episode 24: Score 26/500
Episode 25: Score 9/500
Episode 26: Score 9/500
Episode 27: Score 22/500
Episode 28: Score 10/500
Episode 29: Score 21/500
Episode 30: Score 18/500
Episode 31: Score 12/500
Episode 32: Score 9/500
Episode 33: Score 10/500
Episode 34: Score 8/500
Episode 35: Score 9/500
Episode 36: Score 10/500
Episode 37: Score 8/500
Episode 38: Score 11/500
Episode 39: Score 11/500
Episode 40: Score 11/500


## Tuning

In [None]:
# Grid search over the Q-network architecture and its learning rate.
#
# For every combination in `tune_params` a fresh DQNagent is trained for
# `total_episodes` episodes of CartPole-v1. The outcome is stored in `perf` as
#     (input_units, layers, layer_units, learning_rate) -> [last_episode_score, summed_score]
# and the whole `perf` dict is re-written to tuning.pkl after each configuration,
# so a single `pkl.load` always yields the most recent complete results.

#Optimizable parameters
learning_rate = 0.01
future_reward_discount_factor = 0.95
exploration_parameter = 0.1
tune_params = {
    'input_units': [x * 32 for x in range(1, 9)],
    'layers': [x for x in range(1, 5)],
    'layer_units': [x * 32 for x in range(1, 9)],
    'learning_rate': [1e-2, 1e-3, 1e-4],
}
action_selection_method = 'egreedy'
batch_size = 32 #if buffer is true
target_network_update_frequency = 10 #After how many update steps do we update the target network

#Other parameters
buffer = False #If we want to include experience buffer to DQN
target = False #If we want to include target network to DQN
total_episodes = 1000 #How many times the agent goes from reset -> done (the inner loop stops after 500 steps, so an episode cannot run forever)


env = gym.make("CartPole-v1")
perf = {}

# Start from a valid (empty) result file so that stale results from an earlier
# run are never mistaken for results of this one, and the file is always loadable.
with open("tuning.pkl", "wb") as fp:
    pkl.dump(perf, fp)

for i_u in tune_params['input_units']:
    for l in tune_params['layers']:
        for l_u in tune_params['layer_units']:
            for l_r in tune_params['learning_rate']:
                network_params = {'input_units' : i_u, 'layers': l, 'layer_units' : l_u, 'learning_rate' : l_r}
                print(f"Tuning configuration: {network_params}")
                # NOTE(review): the fixed global `learning_rate` is passed positionally while the
                # swept value `l_r` only travels inside `network_params`; confirm DQNagent reads
                # the latter, otherwise the learning-rate sweep has no effect.
                agent = DQNagent(env.observation_space.shape, env.action_space.n,learning_rate, future_reward_discount_factor, exploration_parameter, network_params)
                score = []
                for i in range(total_episodes):
                    #reset and reshape to work with the DQNagent
                    state = env.reset().reshape(1,-1)

                    #Cartpole-v1 has a maximum episode length of 500
                    for t in range(500):
                        #select action based on exploration method
                        action = agent.action_selection(state, method = action_selection_method)

                        #Take one environment step
                        next_state, reward, done, _ = env.step(action)
                        next_state = next_state.reshape(1,-1)

                        #Train network
                        if buffer:
                            #Save the transition and learn from the replay buffer once it holds a full batch
                            agent.memorize(state, action, reward, next_state, done)
                            if len(agent.memory) >= batch_size:
                                agent.experience_replay_train(t, target_network_update_frequency, batch_size, target)
                        else:
                            agent.train(state, action, reward, next_state, done, target)
                            if t%target_network_update_frequency == 0:
                                agent.update_target_network()

                        #The score is how long the cart stayed upright, this can be a maximum of 500
                        if done or t==499:
                            print(f"Episode {i}: Score {t+1}/500")
                            score.append(t+1)
                            break

                        state = next_state

                # A dict is unhashable and cannot be used as a dictionary key, so the
                # configuration is identified by a tuple of its values instead.
                perf[(i_u, l, l_u, l_r)] = [score[-1], sum(score)]

                # Overwrite rather than append: appending the cumulative dict each time would
                # leave many concatenated pickles in the file, of which a plain pkl.load only
                # returns the first (oldest, smallest) one.
                with open('tuning.pkl', 'wb') as fp:
                    pkl.dump(perf, fp)