In [1]:
import numpy as np
import tensorflow as tf
from DQNagent import DQNagent
import gym
import random




# DQN

In [3]:
#Optimizable parameters
learning_rate = 0.01
future_reward_discount_factor = 0.95
exploration_parameter = 0.1
network_params = [(24,'relu'),(24,'relu')] #presumably one (units, activation) pair per hidden layer - verify against DQNagent
action_selection_method = 'egreedy'
batch_size = 32 #Only used when buffer is True
target_network_update_frequency = 10 #After how many update steps do we update the target network

#Other parameters
buffer = False #If we want to include experience buffer to DQN
target = False #If we want to include target network to DQN
total_episodes = 1000 #How many times the agent goes from reset -> done (the environment's step limit guarantees every episode terminates)


env = gym.make("CartPole-v1")
#Read the episode step limit from the environment registration (500 for CartPole-v1)
#instead of hard-coding it, so the loop and the printed score stay correct for other environments
max_episode_steps = env.spec.max_episode_steps
agent = DQNagent(env.observation_space.shape, env.action_space.n,learning_rate, future_reward_discount_factor, exploration_parameter, network_params)

#Number of training updates performed so far, counted across ALL episodes.
#The per-episode step index t restarts at 0 every episode, so using it for the target
#network schedule would sync the target network at the start of every (short) episode
#rather than every target_network_update_frequency updates.
update_step = 0

try:
    for i in range(total_episodes):
        #reset and reshape to a (1, n_features) row to work with the DQNagent
        state = env.reset().reshape(1,-1)

        for t in range(max_episode_steps):
            #select action based on exploration method
            action = agent.action_selection(state, method = action_selection_method)

            #Take one environment step and bring the observation into the same (1, n_features) layout
            next_state, reward, done, _ = env.step(action)
            next_state = next_state.reshape(1,-1)
            #Optional reward shaping (disabled): if done: reward = -10
            #Finishing before the step limit means the cartpole fell, thus it could be given a -10 reward

            #Train network
            if buffer:
                #Save the transition to the replay buffer, then train once enough samples are stored
                agent.memorize(state, action, reward, next_state, done)
                if len(agent.memory) >= batch_size:
                    #NOTE(review): the first argument is presumably the step counter used for the
                    #target network schedule inside DQNagent - confirm against its implementation
                    agent.experience_replay_train(update_step, target_network_update_frequency, batch_size, target)
                    update_step += 1
            else:
                agent.train(state, action, reward, next_state, done, target)
                #Only sync the target network when it is actually in use
                if target and update_step % target_network_update_frequency == 0:
                    agent.update_target_network()
                update_step += 1

            #The score is how long the cart stayed upright, this can be a maximum of max_episode_steps
            if done or t == max_episode_steps - 1:
                print(f"Episode {i}: Score {t+1}/{max_episode_steps}")
                break

            state = next_state
finally:
    #Release the environment's resources even if training is interrupted or raises
    env.close()
    


    

Episode 0: Score 11/500
Episode 1: Score 9/500
Episode 2: Score 8/500
Episode 3: Score 9/500
Episode 4: Score 12/500
Episode 5: Score 11/500
Episode 6: Score 9/500
Episode 7: Score 10/500
Episode 8: Score 12/500
Episode 9: Score 14/500
