# Imports

In [6]:
import numpy as np
import gymnasium as gym
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from collections import deque
import matplotlib.pyplot as plt
import random

# Creating the environment

In [4]:
# Create the environment
env = gym.make('CartPole-v1')

# Set seed for reproducibility.
# The agent draws exploratory actions with `random.randrange` and samples
# replay mini-batches with `random.sample`, so the stdlib `random` module has
# to be seeded alongside NumPy and TensorFlow for runs to be repeatable.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)
# Seeds the sampler behind `env.action_space.sample()`.
env.action_space.seed(42)
# NOTE(review): the environment's own dynamics are seeded via
# `env.reset(seed=...)`; pass a seed on the first reset in the training loop
# if fully deterministic episodes are required.


# Defining the Hyperparameters

In [5]:
# Hyperparameters for the agent.

# Problem dimensions are read from the environment's spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Optimisation settings.
batch_size = 64
learning_rate = 0.001
gamma = 0.95  # Discount rate

# Replay buffer: a bounded deque, so the oldest entries are dropped once full.
memory_size = 100000  # Size of the replay buffer
memory = deque(maxlen=memory_size)

# Epsilon-greedy exploration schedule.
epsilon = 1.0  # Exploration rate
epsilon_decay = 0.995
epsilon_min = 0.01

# Training schedule.
num_episodes = 1000
update_target_network_freq = 10  # Frequency to update target network

# Echo every setting as "name: value", one per line.
print('\n'.join(
    label + ': ' + str(value)
    for label, value in (
        ('state_size', state_size),
        ('action_size', action_size),
        ('batch_size', batch_size),
        ('learning_rate', learning_rate),
        ('gamma', gamma),
        ('memory_size', memory_size),
        ('epsilon', epsilon),
        ('epsilon_decay', epsilon_decay),
        ('epsilon_min', epsilon_min),
        ('num_episodes', num_episodes),
        ('update_target_network_freq', update_target_network_freq),
    )
))


state_size: 4
action_size: 2
batch_size: 64
learning_rate: 0.001
gamma: 0.95
memory_size: 100000
epsilon: 1.0
epsilon_decay: 0.995
epsilon_min: 0.01
num_episodes: 1000
update_target_network_freq: 10


In [7]:
# Build the neural network model
def build_model(hidden_layers,num_layers):
    """Build and compile a fully connected Q-network.

    The network takes a vector of length `state_size` and outputs one linear
    value per action (`action_size` outputs). It is compiled with mean-squared
    error loss and an Adam optimizer using the module-level `learning_rate`.

    Args:
        hidden_layers: Number of units in each hidden Dense layer (despite the
            name, this is a width, not a layer count).
        num_layers: Number of hidden ReLU layers placed before the output layer.

    Returns:
        The compiled `Sequential` model.
    """
    model = Sequential()
    # Declare the input shape explicitly instead of passing the deprecated
    # `input_dim` kwarg to the first Dense layer. This also gives the model a
    # defined input shape when `num_layers` is 0.
    model.add(tf.keras.Input(shape=(state_size,)))
    for _ in range(num_layers):
        model.add(Dense(hidden_layers, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    # `lr` is a deprecated alias that current Keras releases reject;
    # `learning_rate` is the supported argument name.
    model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return model

In [9]:
# Create main model and target model
main_model = build_model(32,3)
target_model = build_model(32,3)
# Each build_model() call initialises its own random weights. Copy the main
# network's weights into the target network so both start identical and the
# first bootstrapped targets do not come from an unrelated random network.
target_model.set_weights(main_model.get_weights())



In [10]:
# Function to update the target network
def update_target_model():
    """Overwrite the target network's weights with the main network's current weights."""
    online_weights = main_model.get_weights()
    target_model.set_weights(online_weights)

# Function to choose an action using epsilon-greedy policy
def choose_action(state, epsilon):
    """Pick an action with an epsilon-greedy policy over the main network.

    Args:
        state: Observation to act on. May be a 1-D vector or an already
            batched 2-D array; only the first row's Q-values are used.
        epsilon: Exploration threshold. A random action is taken when a
            uniform draw from [0, 1) is less than or equal to this value.

    Returns:
        int: Index of the chosen action, in `range(action_size)`.
    """
    # Explore: uniformly random action.
    if np.random.rand() <= epsilon:
        return random.randrange(action_size)

    # Exploit: greedy action under the main network.
    # `atleast_2d` lets callers pass an unbatched observation; an input that is
    # already 2-D passes through unchanged.
    # `verbose=0` stops Keras printing a progress bar on every single step.
    q_values = main_model.predict(np.atleast_2d(state), verbose=0)
    return int(np.argmax(q_values[0]))

# Function to store experience in replay memory
def store_transition(state, action, reward, next_state, done):
    """Append one (state, action, reward, next_state, done) tuple to the replay memory."""
    transition = (state, action, reward, next_state, done)
    memory.append(transition)


In [None]:
train_start = 1000  # Minimum number of stored transitions before learning starts

# Function to train the agent with a sampled batch from the replay buffer
def train_replay():
    """Run one Double-DQN training step on a random mini-batch from replay memory.

    Does nothing until the replay memory holds at least `train_start`
    transitions (and at least `batch_size`, which sampling needs). Otherwise
    it samples `batch_size` transitions, builds the regression targets and
    fits `main_model` for one epoch on that batch.

    Target for the action taken in each sampled transition:
        * terminal transition:  reward
        * otherwise:            reward + gamma * Q_target(s', argmax_a Q_main(s', a))
    The Q-values of the other actions are left at the main network's own
    predictions.

    Returns:
        None.
    """
    # random.sample raises if asked for more items than exist, so guard on
    # batch_size as well in case train_start is ever tuned below it.
    if len(memory) < max(train_start, batch_size):
        return

    # Sample a mini-batch from the memory
    minibatch = random.sample(memory, batch_size)

    # Unpack the sampled transitions into dense arrays.
    states = np.zeros((batch_size, state_size))
    next_states = np.zeros((batch_size, state_size))
    actions = np.zeros(batch_size, dtype=int)
    rewards = np.zeros(batch_size)
    dones = np.zeros(batch_size, dtype=bool)
    for i, (state, action, reward, next_state, done) in enumerate(minibatch):
        states[i] = state
        actions[i] = action
        rewards[i] = reward
        next_states[i] = next_state
        dones[i] = done

    # verbose=0: without it Keras prints a progress bar for every predict call,
    # i.e. three per training step.
    q_current = main_model.predict(states, verbose=0)
    q_next_online = main_model.predict(next_states, verbose=0)
    q_next_target = target_model.predict(next_states, verbose=0)

    # The key DDQN update step: the main network selects the next action and
    # the target network supplies the Q-value for that action.
    rows = np.arange(batch_size)
    best_next_actions = np.argmax(q_next_online, axis=1)
    bootstrapped = rewards + gamma * q_next_target[rows, best_next_actions]
    # If the episode is done, just use the observed reward.
    q_current[rows, actions] = np.where(dones, rewards, bootstrapped)

    # Train the main model with the target Q-value
    main_model.fit(states, q_current, batch_size=batch_size, epochs=1, verbose=0)
