# Import Libraries

gymnasium: Library for creating and interacting with environments. 

numpy: For numerical operations.

torch: PyTorch library for building and training neural networks.

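random: Python's pseudo-random number generator, used for epsilon-greedy exploration and for sampling mini-batches from the replay buffer.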

deque: A double-ended queue to store experiences (replay buffer).

In [8]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

# Random Seed 

Setting a random seed ensures reproducibility, making experiments consistent across runs.

In [9]:
# One fixed seed is applied to every random number generator the notebook relies on.
seed = 42
random.seed(seed)  # Python's `random` (replay-buffer sampling, epsilon-greedy coin flip)
np.random.seed(seed)  # NumPy's global generator
torch.manual_seed(seed)  # PyTorch's default generator; the returned Generator is what the cell echoes

<torch._C.Generator at 0x2264819a930>

# Replay Buffer

A data structure to store experiences from the environment for training.

push: Adds experiences to the buffer.

sample: Returns a random sample of experiences for training.

__len__: Returns the current size of the buffer.

In [10]:
class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions."""

    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry once `capacity` is reached.
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Return how many transitions are currently stored."""
        return len(self.buffer)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions and return them column-wise.

        Returns a 5-tuple of NumPy arrays:
        (states, actions, rewards, next_states, dones).
        """
        batch = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one sequence per field.
        states, actions, rewards, next_states, dones = zip(*batch)
        columns = (states, actions, rewards, next_states, dones)
        return tuple(map(np.array, columns))

# Neural Network 

A simple neural network with two hidden layers (36 and 48 neurons).

ReLU activation functions add non-linearity.

input_dim and output_dim correspond to the state and action spaces of the environment, respectively.

In [11]:
def create_network(input_dim, output_dim):
    """Build the Q-network: input_dim -> 36 -> 48 -> output_dim with ReLU between layers."""
    widths = [input_dim, 36, 48, output_dim]
    modules = []
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
        modules.append(nn.Linear(fan_in, fan_out))
        modules.append(nn.ReLU())
    # The output layer is linear, so drop the activation appended after it.
    modules.pop()
    return nn.Sequential(*modules)

# DQN Agent

Initializes the agent with key parameters for training and exploration.

env: The environment.
\
buffer_capacity: Maximum size of the replay buffer.
\
batch_size: Number of experiences sampled for training.
\
gamma: Discount factor for future rewards.
\
epsilon_start, epsilon_end, epsilon_decay: Parameters for the epsilon-greedy policy, controlling exploration.
\
target_update: Frequency of target network updates.
\
lr: Learning rate for the optimizer.



Network and Optimizer 

train_network: The primary network for learning.
\
target_network: Used to compute target Q-values.
\
to(self.device): Moves the network to GPU or CPU.
\
optimizer: Optimizes the network weights using the Adam optimizer.
\
target_network.load_state_dict: Copies weights from the training network.
\
target_network.eval(): Puts the target network in evaluation mode.

In [28]:
class DQNAgent:
    """DQN agent with an experience replay buffer and a periodically synced target network.

    Args:
        env: Environment exposing `observation_space.shape`, `action_space.n`,
            `action_space.sample()`, `reset(seed=...)`, `step(action)` and `render()`.
        buffer_capacity: Maximum number of transitions kept in the replay buffer.
        batch_size: Number of transitions sampled per gradient step.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon_start: Initial exploration probability.
        epsilon_end: Lower bound for the exploration probability.
        epsilon_decay: Multiplicative decay applied to epsilon after each episode.
        target_update: The target network is synced every `target_update` episodes.
        lr: Learning rate of the Adam optimizer.
    """

    def __init__(
        self,
        env,
        buffer_capacity=10000,
        batch_size=50,
        gamma=0.99,
        epsilon_start=1.0,
        epsilon_end=0.01,
        epsilon_decay=0.995,
        target_update=10,
        lr=0.001,
    ):
        self.env = env
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.target_update = target_update
        self.replay_buffer = ReplayBuffer(buffer_capacity)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Print the device information
        if self.device.type == 'cuda':
            print("Using GPU for training")
        else:
            print("Using CPU for training")

        obs_dim = env.observation_space.shape[0]
        n_actions = env.action_space.n
        self.train_network = create_network(obs_dim, n_actions).to(self.device)
        self.target_network = create_network(obs_dim, n_actions).to(self.device)
        self.optimizer = optim.Adam(self.train_network.parameters(), lr=lr)

        # Start both networks from identical weights; the target network is never trained directly.
        self.target_network.load_state_dict(self.train_network.state_dict())
        self.target_network.eval()

    def select_action(self, state, epsilon=0.0):
        """Pick an action with an epsilon-greedy policy.

        With probability `epsilon` a random action is sampled from the action
        space; otherwise the action with the highest Q-value under the training
        network is returned.
        """
        if random.random() < epsilon:
            return self.env.action_space.sample()
        state = torch.as_tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        with torch.no_grad():
            q_values = self.train_network(state)
        return q_values.cpu().numpy().argmax()

    def update(self):
        """Run one gradient step on a mini-batch sampled from the replay buffer.

        Does nothing until the buffer holds at least `batch_size` transitions.
        The regression target is `reward + gamma * max_a Q_target(next_state, a)`,
        with the bootstrap term removed for transitions whose done flag is set.
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)

        # as_tensor converts bool/int/float64 arrays explicitly instead of relying on
        # the legacy typed-tensor constructors to coerce them.
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.int64, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(next_states, dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # Q-value of the action that was actually taken in each sampled state.
        q_values = self.train_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Targets are constants for the regression, so no autograd graph is built for them.
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(1)[0]
            target_q_values = rewards + (self.gamma * next_q_values * (1 - dones))

        loss = nn.MSELoss()(q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self, num_episodes, save_path="dqn_model1.pth", max_steps=200):
        """Train for `num_episodes` episodes and save the training network's weights.

        Each episode interacts with the environment for at most `max_steps` steps,
        stores every transition, and performs one `update()` per step. Epsilon is
        decayed after every episode and the target network is synced every
        `target_update` episodes.

        Args:
            num_episodes: Number of episodes to run.
            save_path: File the training network's state dict is written to.
            max_steps: Hard cap on the number of steps per episode.
        """
        # Rendering is only requested when the environment was built with a render mode;
        # calling render() on an env created with render_mode=None only emits a warning.
        should_render = getattr(self.env, "render_mode", None) is not None

        for episode in range(num_episodes):
            state, _ = self.env.reset(seed=episode)
            cumulative_reward = 0
            steps = 0
            for _ in range(max_steps):
                if should_render:
                    self.env.render()
                action = self.select_action(state, self.epsilon)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                # Only true termination is stored as "done": a time-limit truncation is not
                # a terminal state, so its target must still bootstrap from next_state.
                self.replay_buffer.push(state, action, reward, next_state, terminated)
                state = next_state
                cumulative_reward += reward
                steps += 1
                self.update()

                # The episode is over in both cases; stepping past it is undefined.
                if terminated or truncated:
                    break

            print(f"Episode {episode} finished after {steps} steps with cumulative reward {cumulative_reward}")
            if self.epsilon > self.epsilon_min:
                # Clamp so the decay never undershoots the configured floor.
                self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

            if episode % self.target_update == 0:
                self.target_network.load_state_dict(self.train_network.state_dict())

        # Save the model at the end of training
        torch.save(self.train_network.state_dict(), save_path)
        print(f"Model saved to {save_path}")

# Main

gym.make('MountainCar-v0', render_mode=None): Initializes the environment with rendering disabled, which keeps training fast. Pass render_mode='human' instead to watch the agent.

DQNAgent: Initializes the agent with the specified environment and parameters.

agent.train(num_episodes=2500): Trains the agent for 2500 episodes.

In [26]:
# Build the environment without a render mode, create an agent with its default
# hyperparameters, and train it. `env` is reused by the evaluation cell further down.
env = gym.make('MountainCar-v0', render_mode=None)
agent = DQNAgent(env)
agent.train(num_episodes=2500)  # weights are written to train()'s default save_path

Using GPU for training


  gym.logger.warn(


Episode 0 finished after 200 steps with cumulative reward -200.0
Episode 1 finished after 200 steps with cumulative reward -200.0
Episode 2 finished after 200 steps with cumulative reward -200.0
Episode 3 finished after 200 steps with cumulative reward -200.0
Episode 4 finished after 200 steps with cumulative reward -200.0
Episode 5 finished after 200 steps with cumulative reward -200.0
Episode 6 finished after 200 steps with cumulative reward -200.0
Episode 7 finished after 200 steps with cumulative reward -200.0
Episode 8 finished after 200 steps with cumulative reward -200.0
Episode 9 finished after 200 steps with cumulative reward -200.0
Episode 10 finished after 200 steps with cumulative reward -200.0
Episode 11 finished after 200 steps with cumulative reward -200.0
Episode 12 finished after 200 steps with cumulative reward -200.0
Episode 13 finished after 200 steps with cumulative reward -200.0
Episode 14 finished after 200 steps with cumulative reward -200.0
Episode 15 finished 

# Testing

After training, the model is tested and evaluated. The test involves running the agent in the environment and calculating metrics like the number of successes, failures, average reward, and more.



In [27]:
class DQNAgentTest:
    """Evaluate a saved Q-network by acting greedily in an environment.

    Args:
        env: Environment exposing `observation_space.shape`, `action_space.n`,
            `reset(seed=...)`, `step(action)` and `render()`.
        model_path: Path to a state dict saved from a network built by `create_network`.
    """

    def __init__(self, env, model_path):
        self.env = env
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = create_network(env.observation_space.shape[0], env.action_space.n).to(self.device)
        # weights_only=True restricts unpickling to tensors and plain containers, which is
        # all a state dict holds, and avoids the FutureWarning raised by the default.
        state_dict = torch.load(model_path, map_location=self.device, weights_only=True)
        self.model.load_state_dict(state_dict)
        self.model.eval()

    def select_action(self, state):
        """Return the action with the highest Q-value for `state` (no exploration)."""
        state = torch.as_tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        with torch.no_grad():
            q_values = self.model(state)
        return q_values.cpu().numpy().argmax()

    def test(self, num_episodes=10, max_steps=200):
        """Run greedy evaluation episodes and print aggregate metrics.

        An episode counts as a success when the environment reports `terminated`
        within `max_steps` steps, and as a failure otherwise (truncated by the
        environment or cut off by the step cap). This assumes termination means the
        goal was reached, as in MountainCar.

        Args:
            num_episodes: Number of evaluation episodes; must be at least 1.
            max_steps: Hard cap on the number of steps per episode.

        Raises:
            ValueError: If `num_episodes` is smaller than 1.
        """
        if num_episodes < 1:
            raise ValueError("num_episodes must be at least 1")

        total_rewards = 0
        successes = 0
        total_steps = 0
        failures = 0
        # Rendering is only requested when the environment was built with a render mode.
        should_render = getattr(self.env, "render_mode", None) is not None

        for episode in range(num_episodes):
            state, _ = self.env.reset(seed=episode)
            cumulative_reward = 0
            steps = 0
            reached_goal = False

            while steps < max_steps:
                if should_render:
                    self.env.render()
                action = self.select_action(state)
                state, reward, terminated, truncated, _ = self.env.step(action)
                cumulative_reward += reward
                steps += 1

                if terminated:
                    reached_goal = True
                if terminated or truncated:
                    break

            total_rewards += cumulative_reward
            total_steps += steps
            # Exactly one of the two counters advances, so they always sum to num_episodes.
            if reached_goal:
                successes += 1
            else:
                failures += 1

            print(f"Test Episode {episode} finished with cumulative reward {cumulative_reward} in {steps} steps.")

        average_reward = total_rewards / num_episodes
        success_rate = (successes / num_episodes) * 100
        average_steps = total_steps / num_episodes

        print(f"\nEvaluation Metrics over {num_episodes} episodes:")
        print(f"Average Reward: {average_reward}")
        print(f"Success Rate: {success_rate}%")
        print(f"Average Steps to Completion: {average_steps}")
        print(f"Failures: {failures}")

# Path to the saved model
# NOTE(review): DQNAgent.train() saves to "dqn_model1.pth" by default, which differs from
# this file name — confirm this is the checkpoint that is meant to be evaluated.
model_path = "Moutain_Car_DQN_Model.pth"

# Create the test agent and load the trained model
# `env` is the environment object created in the training cell.
test_agent = DQNAgentTest(env, model_path)

# Run the test and evaluate
test_agent.test(num_episodes=10)


  self.model.load_state_dict(torch.load(model_path, map_location=self.device))


Test Episode 0 finished with cumulative reward -149.0 in 149 steps.
Test Episode 1 finished with cumulative reward -142.0 in 142 steps.
Test Episode 2 finished with cumulative reward -162.0 in 162 steps.
Test Episode 3 finished with cumulative reward -114.0 in 114 steps.
Test Episode 4 finished with cumulative reward -84.0 in 84 steps.
Test Episode 5 finished with cumulative reward -89.0 in 89 steps.
Test Episode 6 finished with cumulative reward -145.0 in 145 steps.
Test Episode 7 finished with cumulative reward -148.0 in 148 steps.
Test Episode 8 finished with cumulative reward -161.0 in 161 steps.
Test Episode 9 finished with cumulative reward -86.0 in 86 steps.

Evaluation Metrics over 10 episodes:
Average Reward: -128.0
Success Rate: 100.0%
Average Steps to Completion: 128.0
Failures: 0
