<a href="https://colab.research.google.com/github/Dewwbe/Deep-learning-Lab-07-Part-2-Deep-Q-Learning-DQN-/blob/main/Deep_Q_Learning_model_with_experience_replay_and_a_fixed_target_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# ================================================================
# ✅ Fixed dependency installation for LunarLander-v2 on Colab (Python 3.12)
# ================================================================
!apt-get install -y swig
!pip install "gymnasium[box2d]" pygame box2d


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 38 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 1s (779 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 126675 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubunt

In [21]:
import gymnasium as gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from collections import deque

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:", device)

# Quick environment check
env = gym.make("LunarLander-v3")
state, info = env.reset()
print("✅ Environment ready, state shape:", np.shape(state))
env.close()

Running on: cpu


  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


✅ Environment ready, state shape: (8,)


  return datetime.utcnow().replace(tzinfo=utc)


In [22]:
# ===============================
# 1. Neural Network Definition
# ===============================
class DQN(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(DQN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        )

    def forward(self, x):
        return self.net(x)



In [23]:
# ===============================
# 2. Replay Buffer
# ===============================
class ReplayBuffer:
    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.array, zip(*batch))
        return (
            torch.FloatTensor(state).to(device),
            torch.LongTensor(action).to(device),
            torch.FloatTensor(reward).to(device),
            torch.FloatTensor(next_state).to(device),
            torch.FloatTensor(done).to(device),
        )

    def __len__(self):
        return len(self.buffer)



In [24]:
# ===============================
# 3. DQN Agent
# ===============================
class DQNAgent:
    def __init__(self, state_dim, action_dim, replay=True, target_update=True):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = 0.99
        self.lr = 1e-3
        self.batch_size = 64
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.replay_enabled = replay
        self.target_update_enabled = target_update

        self.online_net = DQN(state_dim, action_dim).to(device)
        self.target_net = DQN(state_dim, action_dim).to(device)
        self.target_net.load_state_dict(self.online_net.state_dict())

        self.optimizer = optim.Adam(self.online_net.parameters(), lr=self.lr)
        self.loss_fn = nn.MSELoss()
        self.memory = ReplayBuffer()

        self.steps_done = 0

    def select_action(self, state):
        if random.random() < self.epsilon:
            return random.randrange(self.action_dim)
        state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
        q_values = self.online_net(state_t)
        return q_values.argmax().item()

    def train_step(self):
        if len(self.memory) < self.batch_size or not self.replay_enabled:
            return 0.0

        states, actions, rewards, next_states, dones = self.memory.sample(self.batch_size)

        q_values = self.online_net(states)
        next_q_values = self.target_net(next_states)

        q_target = q_values.clone()
        for i in range(self.batch_size):
            q_target[i, actions[i]] = rewards[i] + self.gamma * torch.max(next_q_values[i]) * (1 - dones[i])

        loss = self.loss_fn(q_values, q_target.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def update_target_network(self):
        if self.target_update_enabled:
            self.target_net.load_state_dict(self.online_net.state_dict())

    def decay_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [25]:

# ===============================
# 4. Training Function
# ===============================
def train_dqn(env_name="LunarLander-v2", episodes=500, replay=True, target_update=True):
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    agent = DQNAgent(state_dim, action_dim, replay, target_update)
    rewards_history = []
    eps_history = []
    loss_history = []

    target_update_freq = 1000
    total_steps = 0

    for ep in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = agent.select_action(state)
            next_state, reward, done, _, _ = env.step(action)
            agent.memory.push(state, action, reward, next_state, done)

            loss = agent.train_step()
            loss_history.append(loss)

            state = next_state
            total_reward += reward
            agent.decay_epsilon()
            eps_history.append(agent.epsilon)

            if total_steps % target_update_freq == 0:
                agent.update_target_network()

            total_steps += 1

        rewards_history.append(total_reward)
        if ep % 10 == 0:
            print(f"Episode {ep}/{episodes} | Reward: {total_reward:.2f} | Epsilon: {agent.epsilon:.3f}")

    env.close()
    return rewards_history, eps_history, loss_history

In [28]:
# ===============================
# 5. Train and Compare
# ===============================
print("\n=== Training DQN with Replay + Target Network ===")
rewards_replay, epsilons_replay, losses_replay = train_dqn(env_name="LunarLander-v3", replay=True, target_update=True)

print("\n=== Training DQN without Replay/Target Network ===")
rewards_simple, epsilons_simple, losses_simple = train_dqn(env_name="LunarLander-v3", replay=False, target_update=False)


=== Training DQN with Replay + Target Network ===
Episode 0/500 | Reward: -353.22 | Epsilon: 0.609


  return datetime.utcnow().replace(tzinfo=utc)


Episode 10/500 | Reward: -184.39 | Epsilon: 0.010
Episode 20/500 | Reward: -1.93 | Epsilon: 0.010
Episode 30/500 | Reward: -348.89 | Epsilon: 0.010
Episode 40/500 | Reward: -657.84 | Epsilon: 0.010
Episode 50/500 | Reward: -1685.14 | Epsilon: 0.010
Episode 60/500 | Reward: -343.44 | Epsilon: 0.010
Episode 70/500 | Reward: -239.27 | Epsilon: 0.010
Episode 80/500 | Reward: -94.65 | Epsilon: 0.010
Episode 90/500 | Reward: -104.27 | Epsilon: 0.010
Episode 100/500 | Reward: -17.90 | Epsilon: 0.010
Episode 110/500 | Reward: -247.59 | Epsilon: 0.010
Episode 120/500 | Reward: -14.59 | Epsilon: 0.010
Episode 130/500 | Reward: -39.13 | Epsilon: 0.010
Episode 140/500 | Reward: -40.29 | Epsilon: 0.010
Episode 150/500 | Reward: -136.22 | Epsilon: 0.010
Episode 160/500 | Reward: -335.39 | Epsilon: 0.010
Episode 170/500 | Reward: -188.88 | Epsilon: 0.010
Episode 180/500 | Reward: -365.61 | Epsilon: 0.010
Episode 190/500 | Reward: -148.19 | Epsilon: 0.010
Episode 200/500 | Reward: -78.17 | Epsilon: 0.

In [None]:
!pip install box2d-py

In [None]:
# ===============================
# 6. Plot Results
# ===============================
plt.figure(figsize=(14,6))
plt.plot(rewards_replay, label="With Replay & Target Network")
plt.plot(rewards_simple, label="Without Replay/Target")
plt.title("Episode Reward vs Episode Number")
plt.xlabel("Episode")
plt.ylabel("Reward")
plt.legend()
plt.show()

plt.figure(figsize=(14,6))
plt.plot(epsilons_replay, label="With Replay & Target")
plt.plot(epsilons_simple, label="Without Replay/Target")
plt.title("Epsilon Decay over Time")
plt.xlabel("Training Steps")
plt.ylabel("Epsilon")
plt.legend()
plt.show()

plt.figure(figsize=(14,6))
plt.plot(losses_replay, label="With Replay & Target")
plt.plot(losses_simple, label="Without Replay/Target")
plt.title("Training Loss vs Steps")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
# ===============================
# 7. Observations
# ===============================
print("""
Observations:
1. With experience replay and target network, training is much more stable.
2. The epsilon value decays gradually, allowing a balance between exploration and exploitation.
3. Without replay or target network, the reward curve is noisy and unstable.
4. Replay memory helps break correlation between consecutive samples, improving convergence.
""")