In [1]:
# !pip install pandas

In [None]:
# dqn_hyperparam_tuning.ipynb

import os
import sys
import random
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import traci
import matplotlib.pyplot as plt
from itertools import product
import pandas as pd

In [3]:
# --- SUMO Configuration ---
def sumo_config(traffic_pattern="P1"):
    """Build the SUMO command line for one traffic pattern.

    Args:
        traffic_pattern: Name of the sub-directory under ``SUMO_networks/``
            whose ``junction.sumocfg`` should be loaded.

    Returns:
        list[str]: Executable name followed by its arguments, in the form
        passed to ``traci.start``.
    """
    config_file = f"SUMO_networks/{traffic_pattern}/junction.sumocfg"
    return [
        "sumo",
        "-c", config_file,
        # 0.05 s per simulation step, i.e. 20 steps per simulated second.
        "--step-length", "0.05",
        "--delay", "0",
        "--lateral-resolution", "0.1",
        "--start",
        "--no-warnings",
        "--no-step-log",
    ]

# SUMO ships its Python helpers under $SUMO_HOME/tools; refuse to continue
# when the variable is missing.
# NOTE(review): `traci` is already imported in the imports cell, i.e. before
# this path is appended, so this append cannot be what makes that import work.
if "SUMO_HOME" not in os.environ:
    sys.exit("Please declare environment variable 'SUMO_HOME'")
tools = os.path.join(os.environ["SUMO_HOME"], "tools")
sys.path.append(tools)

# Start a simulation with the default pattern unless a TraCI connection is
# already open (keeps the cell safe to re-run).
if not traci.isLoaded():
    traci.start(sumo_config())

In [4]:
# --- Global Variables ---
# ID of the controlled traffic light, as passed to traci.trafficlight.setPhase.
TRAFFIC_LIGHT_ID = "traffic_light"
# Simulated seconds `step` holds the selected phase for.
DELTA_PHASE_DURATION = 6
# Simulated seconds `step` holds phase `current_phase + 1` for when switching.
YELLOW_PHASE_DURATION = 4
# Number of discrete actions; `step` maps action a to phase index 2 * a.
action_space_size = 8
# Lane-area detector IDs q1..q8 that are polled for halting vehicles.
lane_detectors = [f"q{number}" for number in range(1, 9)]
# Phase index `step` assumes is active; `step` updates it on every switch.
current_phase = 2

def get_queue_length():
    """Read the halting-vehicle count of every detector in ``lane_detectors``.

    Returns:
        torch.Tensor: 1-D float tensor with one entry per detector, in the
        order of ``lane_detectors``.
    """
    halted_counts = []
    for detector_id in lane_detectors:
        halted_counts.append(traci.lanearea.getLastStepHaltingNumber(detector_id))
    return torch.tensor(halted_counts, dtype=torch.float)

def get_current_state():
    """Return the agent's observation, which is the queue-length vector.

    Returns:
        Whatever ``get_queue_length()`` returns, unchanged.
    """
    queue_lengths = get_queue_length()
    return queue_lengths

def simulate_time(seconds=1, steps_per_second=20):
    """Advance the simulation by ``seconds`` of simulated time.

    Args:
        seconds: Simulated seconds to advance. May be fractional; the
            resulting step count is rounded to the nearest whole step.
            Non-positive values advance nothing.
        steps_per_second: Number of ``traci.simulationStep()`` calls that make
            up one simulated second. The default of 20 corresponds to the
            ``--step-length 0.05`` used in ``sumo_config``; pass a different
            value if the step length is changed.
    """
    # round() returns an int, so fractional durations no longer break range().
    step_count = round(steps_per_second * seconds)
    for _ in range(step_count):
        traci.simulationStep()

def step(action):
    """Apply ``action`` to the traffic light and advance the simulation.

    The action selects phase index ``2 * action``. If that differs from the
    tracked ``current_phase``, phase ``current_phase + 1`` is first held for
    ``YELLOW_PHASE_DURATION`` seconds (presumably the yellow transition of the
    outgoing phase) before the selected phase is set. The selected phase is
    then held for ``DELTA_PHASE_DURATION`` seconds.

    Args:
        action: Integer action index.

    Returns:
        tuple: ``(next_state, reward, done)`` where ``next_state`` comes from
        ``get_current_state()``, ``reward`` is the negated sum of
        ``next_state`` (a 0-d tensor) and ``done`` is True once
        ``traci.simulation.getMinExpectedNumber()`` reports 0.
    """
    global current_phase

    requested_phase = 2 * action
    if requested_phase != current_phase:
        # Phase change: run the intermediate phase before switching.
        traci.trafficlight.setPhase(TRAFFIC_LIGHT_ID, current_phase + 1)
        simulate_time(YELLOW_PHASE_DURATION)
        current_phase = requested_phase

    # Re-issued even when the phase is unchanged, exactly as before.
    traci.trafficlight.setPhase(TRAFFIC_LIGHT_ID, requested_phase)
    simulate_time(DELTA_PHASE_DURATION)

    observation = get_current_state()
    penalty = -torch.sum(observation)
    finished = traci.simulation.getMinExpectedNumber() == 0
    return observation, penalty, finished

In [5]:
# --- Neural Network ---
class DQN(nn.Module):
    """Fully connected Q-network: state_dim -> 128 -> 64 -> action_dim.

    Both hidden layers are followed by a ReLU; the output layer is linear and
    yields one value per action.
    """

    def __init__(self, state_dim, action_dim):
        """Create the layer stack.

        Args:
            state_dim: Number of input features.
            action_dim: Number of outputs (one per action).
        """
        super().__init__()
        hidden_widths = (128, 64)

        layers = []
        in_features = state_dim
        for width in hidden_widths:
            layers.extend([nn.Linear(in_features, width), nn.ReLU()])
            in_features = width
        layers.append(nn.Linear(in_features, action_dim))

        # Attribute name and layer order are kept so state_dict keys stay
        # ``main.0``, ``main.2`` and ``main.4``.
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input ``x``."""
        return self.main(x)


In [6]:
# --- Environment Switching ---
import itertools

# Endless round-robin over the pattern names P1..P4; `change_env` pulls the
# next name from this iterator every time it restarts the simulation.
traffic_patterns = itertools.cycle(f"P{index}" for index in range(1, 5))

def change_env():
    """Restart SUMO on the next traffic pattern from ``traffic_patterns``.

    Any simulation that is currently loaded is closed first, then a new one
    is started with the command line from ``sumo_config``.

    NOTE(review): the module-level ``current_phase`` that ``step`` relies on
    is not reset here, so it keeps its value from the previous simulation;
    confirm that this is intended.
    """
    next_pattern = next(traffic_patterns)

    simulation_running = traci.isLoaded()
    if simulation_running:
        traci.close()

    traci.start(sumo_config(next_pattern))

In [7]:
# --- Choose Action ---
def choose_action(state, epsilon, policy_net):
    """Pick an action with an epsilon-greedy policy.

    Args:
        state: 1-D tensor holding the current observation.
        epsilon: Probability of picking a uniformly random action.
        policy_net: Network mapping a batch of states to per-action values.

    Returns:
        int: With probability ``epsilon`` a random index in
        ``[0, action_space_size - 1]``, otherwise the index of the largest
        network output for ``state``.
    """
    if random.random() < epsilon:
        return random.randint(0, action_space_size - 1)

    # Pure inference: skip autograd bookkeeping, since this runs on every
    # environment step and the result is never back-propagated through.
    with torch.no_grad():
        q_values = policy_net(state.unsqueeze(0))
    return torch.argmax(q_values).item()

In [8]:
# --- Optimizer Step ---
def optimise_model(policy_net, target_net, memory, optimizer, batch_size, gamma):
    """Run one optimisation step on a random minibatch of stored transitions.

    Does nothing until ``memory`` holds at least ``batch_size`` transitions.
    Otherwise it samples ``batch_size`` of them and minimises the mean squared
    error between ``policy_net``'s value for the taken action and the target
    ``reward + gamma * max_a target_net(next_state)[a] * (1 - done)``.

    Args:
        policy_net: Network being trained.
        target_net: Network used (without gradients) to build the targets.
        memory: Sequence of ``(state, action, reward, next_state, done)``
            tuples; states are tensors, ``action`` is an int index, ``reward``
            is a number or 0-d tensor and ``done`` is a bool.
        optimizer: Optimizer bound to ``policy_net``'s parameters.
        batch_size: Number of transitions to sample.
        gamma: Discount factor applied to the next-state value.

    Returns:
        None.
    """
    if len(memory) < batch_size:
        return

    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    states = torch.stack(states)
    actions = torch.tensor(actions).unsqueeze(1)
    # float() accepts plain numbers, bools and 0-d tensors alike.
    rewards = torch.tensor([float(r) for r in rewards], dtype=torch.float)
    next_states = torch.stack(next_states)
    dones = torch.tensor([float(d) for d in dones], dtype=torch.float)

    # squeeze(1) drops only the gathered action axis. A bare squeeze() would
    # also drop the batch axis when batch_size == 1, leaving a 0-d tensor
    # whose shape no longer matches target_vals.
    q_vals = policy_net(states).gather(1, actions).squeeze(1)
    with torch.no_grad():
        max_next_q_vals = target_net(next_states).max(1)[0]
        # (1 - dones) removes the bootstrap term for terminal transitions.
        target_vals = rewards + gamma * max_next_q_vals * (1 - dones)

    loss = nn.MSELoss()(q_vals, target_vals)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [9]:
# --- Training Loop ---
def train_algorithm(params, episodes=10):
    """Train a fresh DQN agent and return its mean episode reward.

    Each episode restarts the simulation via ``change_env()`` and runs until
    ``step`` reports ``done``. After every environment step the transition is
    stored, one ``optimise_model`` call is made, and the target network is
    synchronised whenever the global step counter is a multiple of
    ``target_update_freq``. Epsilon is decayed once per episode.

    Args:
        params: Dict with the keys ``gamma``, ``epsilon``, ``epsilon_decay``,
            ``min_epsilon``, ``learning_rate``, ``batch_size``,
            ``target_update_freq`` and ``memory_size``.
        episodes: Number of episodes to run.

    Returns:
        float: Sum of the per-episode total rewards divided by ``episodes``.

    NOTE(review): ``episodes == 0`` raises ZeroDivisionError in the final
    division; confirm whether callers can ever pass that.
    """
    discount = params["gamma"]
    explore_prob = params["epsilon"]
    explore_decay = params["epsilon_decay"]
    explore_floor = params["min_epsilon"]
    step_size = params["learning_rate"]
    minibatch_size = params["batch_size"]
    sync_interval = params["target_update_freq"]
    replay_capacity = params["memory_size"]

    # Policy network first, then a target network initialised as its copy.
    n_inputs = len(lane_detectors)
    policy_net = DQN(n_inputs, action_space_size)
    target_net = DQN(n_inputs, action_space_size)
    target_net.load_state_dict(policy_net.state_dict())
    optimizer = optim.Adam(policy_net.parameters(), lr=step_size)
    replay_buffer = deque(maxlen=replay_capacity)

    episode_totals = []
    total_steps = 0

    for _ in range(episodes):
        change_env()
        state = get_current_state()
        running_reward = 0

        # Every episode takes at least one step before `done` is checked.
        while True:
            action = choose_action(state, explore_prob, policy_net)
            next_state, reward, done = step(action)
            replay_buffer.append((state, action, reward, next_state, done))
            state = next_state
            running_reward += reward

            optimise_model(
                policy_net, target_net, replay_buffer,
                optimizer, minibatch_size, discount,
            )
            if total_steps % sync_interval == 0:
                target_net.load_state_dict(policy_net.state_dict())
            total_steps += 1

            if done:
                break

        # running_reward is a 0-d tensor by now (int 0 plus tensor rewards).
        episode_totals.append(running_reward.item())
        explore_prob = max(explore_floor, explore_prob * explore_decay)

    return sum(episode_totals) / episodes

In [None]:
# --- Grid Search ---
# Candidate values per hyper-parameter; the grid search runs every combination.
# NOTE(review): the saved output of the search cell lists different values
# (two gammas, epsilon 1.0, batch_size 64, ...) than this grid, so it comes
# from an earlier run; re-run the notebook to refresh it.
param_grid = {
    "gamma": [0.99],
    "epsilon": [0.9],
    "epsilon_decay": [0.95],
    "min_epsilon": [0.05],
    "learning_rate": [0.001],
    "batch_size": [128],
    "target_update_freq": [400],
    "memory_size": [10000],
}

# Cartesian product of the value lists, turned back into one dict per combo.
keys = tuple(param_grid)
values = tuple(param_grid.values())
param_combinations = [dict(zip(keys, choice)) for choice in product(*values)]


In [None]:
# One record per hyper-parameter combination: the combo plus its mean reward.
results = []

for combo in param_combinations:
    print(f"Running combo: {combo}")
    avg_reward = train_algorithm(combo, episodes=150)
    # Build a new dict so the entry in param_combinations stays untouched.
    combo_result = {**combo, "avg_reward": avg_reward}
    results.append(combo_result)

Running combo: {'gamma': 0.99, 'epsilon': 1.0, 'epsilon_decay': 0.99, 'min_epsilon': 0.05, 'learning_rate': 0.001, 'batch_size': 64, 'target_update_freq': 100, 'memory_size': 5000}
Running combo: {'gamma': 0.7, 'epsilon': 1.0, 'epsilon_decay': 0.99, 'min_epsilon': 0.05, 'learning_rate': 0.001, 'batch_size': 64, 'target_update_freq': 100, 'memory_size': 5000}


In [12]:
# Collect the per-combination records in a DataFrame and write them to CSV
# (without the index column).
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf="grid_search_results.csv", index=False)

In [13]:
# Print the five configurations with the highest average reward, best first.
print(results_df.sort_values(by="avg_reward", ascending=False).head(n=5))

   gamma  epsilon  epsilon_decay  min_epsilon  learning_rate  batch_size  \
0   0.99      1.0           0.99         0.05          0.001          64   
1   0.70      1.0           0.99         0.05          0.001          64   

   target_update_freq  memory_size  avg_reward  
0                 100         5000    -11777.0  
1                 100         5000    -17974.0  
