# DQN on LunarLander-v2

In [None]:
import numpy as np
import random
import gym
import matplotlib.pyplot as plt
from collections import deque
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers

## DQN Agent Definition

In [None]:
class DQNAgent:
    """Deep Q-Network agent with epsilon-greedy exploration and experience replay.

    A single Keras network estimates one Q-value per action. The same network
    is used both for choosing actions and for computing bootstrap targets
    (there is no separate target network).
    """

    def __init__(self, state_size, action_size):
        """Store hyperparameters, create the replay buffer and build the network.

        Args:
            state_size: Length of the observation vector fed to the network.
            action_size: Number of discrete actions (width of the output layer).
        """
        self.state_size = state_size
        self.action_size = action_size

        self.gamma = 0.99            # discount factor for future rewards
        self.epsilon = 1.0           # current exploration probability
        self.epsilon_min = 0.01      # decay stops once epsilon is at or below this
        self.epsilon_decay = 0.995   # multiplicative decay applied per training step
        self.learning_rate = 0.001
        self.batch_size = 64         # transitions sampled per replay() call
        self.memory = deque(maxlen=100_000)  # oldest transitions are dropped first

        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network (two hidden ReLU layers, linear output).

        Returns:
            A compiled Keras ``Sequential`` model mapping a state vector to
            ``action_size`` Q-value estimates, trained with MSE loss and Adam.
        """
        model = models.Sequential()
        model.add(layers.Dense(128, input_dim=self.state_size, activation='relu'))
        model.add(layers.Dense(128, activation='relu'))
        model.add(layers.Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action with an epsilon-greedy policy.

        Args:
            state: A single observation (without a batch dimension).

        Returns:
            The chosen action index as a plain ``int``: a uniformly random
            action with probability ``epsilon``, otherwise the argmax of the
            network's Q-values.
        """
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(np.expand_dims(state, axis=0), verbose=0)
        return int(np.argmax(q_values[0]))

    def replay(self):
        """Train the network on one random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        For each sampled transition, the target for the taken action is
        ``reward`` when ``done`` is set, and
        ``reward + gamma * max_a Q(next_state, a)`` otherwise; the targets for
        all other actions are the network's current predictions. After the
        fit, epsilon is multiplied by ``epsilon_decay`` while it is above
        ``epsilon_min``.
        """
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)

        # Unzip the transitions into column arrays so the network is queried
        # twice per minibatch instead of twice per transition.
        states, actions, rewards, next_states, dones = zip(*minibatch)
        states = np.array(states)
        next_states = np.array(next_states)
        actions = np.array(actions, dtype=np.intp)
        rewards = np.array(rewards, dtype=np.float32)
        dones = np.array(dones, dtype=bool)

        # np.array(...) makes a writable copy of the prediction before editing it.
        targets = np.array(self.model.predict(states, verbose=0))
        next_q = np.amax(self.model.predict(next_states, verbose=0), axis=1)

        # Only the entry of the action actually taken is overwritten.
        updated = np.where(dones, rewards, rewards + self.gamma * next_q)
        targets[np.arange(len(actions)), actions] = updated

        self.model.fit(states, targets, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


## Agent Learning On LunarLander-v2

In [None]:
# Train a DQN agent on LunarLander-v2 and record the return of every episode.
env = gym.make("LunarLander-v2")

state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

episodes = 500
scores = []  # total (undiscounted) reward per episode, in order

for e in range(episodes):
    # step() below is unpacked into five values, i.e. the gym>=0.26 API, where
    # reset() returns (observation, info) rather than the bare observation.
    state, _ = env.reset()
    total_reward = 0
    done = False

    while not done:
        action = agent.act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode is over on either a real terminal state or a truncation
        # (e.g. a time limit); ignoring `truncated` would keep stepping a
        # finished environment.
        done = terminated or truncated
        # Only `terminated` is stored as the done flag, so a truncated
        # transition still bootstraps from next_state in replay().
        agent.remember(state, action, reward, next_state, terminated)
        state = next_state
        total_reward += reward
        agent.replay()

    scores.append(total_reward)
    print(f"Episode {e+1}/{episodes} - Reward: {total_reward:.2f} - Epsilon: {agent.epsilon:.4f}")

# Save the model
agent.model.save("dqn_lunarlander_model.h5")

## Reward Plot

In [None]:
# Draw the per-episode training reward curve using the Axes interface.
_fig, axes = plt.subplots(figsize=(12, 6))
axes.plot(scores, label="Episode Reward")
axes.set_xlabel("Episode")
axes.set_ylabel("Reward")
axes.set_title("DQN Training - LunarLander")
axes.legend()
axes.grid()
plt.show()

## Testing the Model

In [None]:
# Evaluate the saved network with a purely greedy policy (no exploration).
# compile=False: only predict() is used here, so the optimizer and loss do not
# need to be restored from the file.
model = tf.keras.models.load_model("dqn_lunarlander_model.h5", compile=False)
test_episodes = 10
test_rewards = []

for e in range(test_episodes):
    # step() below is unpacked into five values, i.e. the gym>=0.26 API, where
    # reset() returns (observation, info) rather than the bare observation.
    state, _ = env.reset()
    total_reward = 0
    done = False
    while not done:
        q_values = model.predict(np.expand_dims(state, axis=0), verbose=0)
        action = int(np.argmax(q_values[0]))
        next_state, reward, terminated, truncated, _ = env.step(action)
        # Stop on truncation as well as termination so a finished environment
        # is never stepped again.
        done = terminated or truncated
        state = next_state
        total_reward += reward
    test_rewards.append(total_reward)
    print(f"Test Episode {e+1} - Reward: {total_reward:.2f}")

print(f"\nAverage Test Reward over {test_episodes} episodes: {np.mean(test_rewards):.2f}")
