# Building a Deep Q-Network with keras

## Set up the environment

In [4]:
import gym
import numpy as np

# Create the environment
env = gym.make('CartPole-v1')

# Set Random seed for reproducibility
np.random.seed(42)
env.reset(seed=42)

(array([ 0.0273956 , -0.00611216,  0.03585979,  0.0197368 ], dtype=float32),
 {})

- CartPole-v1 is an envronment where a pole is balanced on a cart and the goal si to prevent the pole from falling over.
- Setting random seeds ensures that can reproduce the result

## Define the DQN Model

In [5]:
import warnings
warnings.filterwarnings('ignore')

def warn (*args, **kwargs):
    pass
warnings.warn = warn

# Import necessary libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

def build_model(state_size, action_size):
    model = Sequential()
    model.add(Dense(24, input_dim=state_size, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
    return model

state_size = env.observation_space.shape[0]
action_size = env.action_space.n
model = build_model(state_size, action_size)

## Implement the replay buffer

a replay buffer stores the agent's experiences for training. will implement a replay buffer using a deque

In [8]:
from collections import deque
import random

memory = deque(maxlen=2000)
# stores experiences in memory
def remember(state, action, reward, next_state, done):
    memory.append((state, action, reward, next_state, done))

## Implement the epsilon-greedy policy

will balances explorations and exploitation by choosing random actions with probability epsilon

In [13]:
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995

def act(state): 
    if np.random.rand() <= epsilon:
        return random.randrange(action_size)
    q_values = model.predict(state)
    return np.argmax(q_values[0])  # returns action with highest Q-value

## Implement the Q-learning update

In [14]:
def replay(batch_size):
    global epsilon
    # sample random batch from memory (breaks correlation)
    minibatch = random.sample(memory, batch_size)
    
    
    for state, action, reward, next_state, done in minibatch:
        target = reward # Start with immediate reward
        
        if not done:
            # If episode didn't end, add discounted future reward
            # gamma = 0.95 (discount factor)
            target = reward + gamma * np.amax(model.predict(next_state)[0])
        
        # Get current Q-values for this state
        target_f = model.predict(state)
        
        # Update Q-value for the action taken
        target_f[0][action] = target
        
        # Train model on this single sample
        model.fit(state, target_f, epochs=1, verbose=0)
    
    # Gradually reduce exploration
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

## Train the DQN

train the DQN agent by interaction with the environment and updateing the Q-Values using the replay buffer

In [21]:
episodes = 50   # More episodes to ensure sufficient training
batch_size = 32     # mini batch size for replay training
gamma = 0.95    # discount factor for future rewards

for e in range(episodes):
    state = env.reset()
    if  isinstance(state, tuple):  # For compatibility with Gym v0.26+
        state = state[0]
    state = np.reshape(state, [1, state_size])
    
    for time in range(500): # Max steps per episode
        # Choose action using epsilon-greedy policy
        action = act(state)
        
        # Perform action in the environment
        result = env.step(action)
        if len(result) == 4:  # Handle 4-value output
            next_state, reward, done, _ = result
        else:   # Handle 5-value output
            next_state, reward, done, _, _ = result
            
        if  isinstance(next_state, tuple):  # Handle tuple next_state
            next_state = next_state[0]
        next_state = np.reshape(next_state, [1, state_size])
        
        # Store experience in memory
        remember(state, action, reward, next_state, done)
        
        # Update state
        state = next_state
        
        if done:
            print(f"Episode: {e+1}/{episodes}, score: {time}, e: {epsilon:.2}")
            break
        
    if len(memory) > batch_size:
        replay(batch_size)

        

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Episode: 1/50, score: 24, e: 0.91
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━

## Evaluate the Performance

In [22]:
# Evaluation loop
evaluation_episodes = 10  # Number of evaluation episodes
scores = []  # Track scores for performance metrics
 
for e in range(evaluation_episodes):
    state = env.reset()
    if isinstance(state, tuple):  # Handle tuple output
        state = state[0]
    state = np.reshape(state, [1, state_size])
 
    total_reward = 0  # Track total reward per episode
 
    for time in range(200):  # Max steps per episode
        # Choose the greedy action
        action = np.argmax(model.predict(state)[0])
 
        # Perform action in the environment
        result = env.step(action)
        if len(result) == 4:  # Handle 4-value output
            next_state, reward, done, _ = result
        else:  # Handle 5-value output
            next_state, reward, terminated, truncated, _ = result
            done = terminated or truncated
 
        if isinstance(next_state, tuple):  # Handle tuple next_state
            next_state = next_state[0]
        next_state = np.reshape(next_state, [1, state_size])
 
        state = next_state
        total_reward += reward
 
        if done:  # If episode ends
            print(f"Evaluation Episode: {e+1}/{evaluation_episodes}, Score: {time}, Total Reward: {total_reward}")
            scores.append(total_reward)
            break
 
# Summary of evaluation performance
print(f"Average Reward: {np.mean(scores):.2f}, Max Reward: {np.max(scores)}, Min Reward: {np.min(scores)}")
 
env.close()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Evaluation Episode: 1/10, Score: 8, Total Reward: 9.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0

in the 10th evaluatoin episode, agent survives 9 steps (variable time), and total rewards collected is 10.