# Models

## REINFORCE

In [None]:
import gymnasium as gym
import sumo_rl
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

class PolicyNetwork(nn.Module):
    """Single-layer softmax policy.

    Applies one linear layer to the input and normalises the result with a
    softmax over the last dimension, so each output row is a probability
    distribution over ``output_size`` actions.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Attribute names are kept stable so saved state_dict keys still match.
        self.fc = nn.Linear(input_size, output_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return action probabilities for the input tensor ``x``."""
        logits = self.fc(x)
        probabilities = self.softmax(logits)
        return probabilities

# Hyperparameters
learning_rate = 0.01
gamma = 0.99  # discount factor applied to future rewards
num_episodes = 500
NUM_BINS = 10  # not used by the REINFORCE loop below; kept so the name stays defined
max_steps_per_episode = 999  # safety cap in case the env never signals the end

env = gym.make('sumo-rl-v0',
               net_file='nets/single-intersection/single-intersection.net.xml',
               route_file='nets/single-intersection/single-intersection.rou.xml',
               use_gui=False,
               num_seconds=1000)


# Dimensions for the neural net, based on the state and action spaces
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

policy_network = PolicyNetwork(state_size, action_size)
optimizer = optim.Adam(policy_network.parameters(), lr=learning_rate)

episode_rewards = []

try:
    for episode in range(num_episodes):
        state, info = env.reset()
        done = False
        total_reward = 0
        log_probs = []
        rewards = []

        while not done:
            state_tensor = torch.FloatTensor(state).unsqueeze(0)

            # Sample an action from the current policy and remember its
            # log-probability for the policy-gradient update.
            action_probs = policy_network(state_tensor)
            action = np.random.choice(action_size, p=action_probs.detach().numpy().flatten())

            log_probs.append(torch.log(action_probs[0, action]))

            # Gymnasium's step() returns (obs, reward, terminated, truncated,
            # info); the episode is over as soon as either flag is set.
            next_state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)

            state = next_state
            total_reward += reward

            done = terminated or truncated or len(rewards) >= max_steps_per_episode

        episode_rewards.append(total_reward)
        print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")

        # Discounted return G_t for every step, accumulated back-to-front.
        # Appending and reversing once avoids the quadratic insert(0, ...).
        discounted_rewards = []
        cumulative_reward = 0
        for r in reversed(rewards):
            cumulative_reward = r + gamma * cumulative_reward
            discounted_rewards.append(cumulative_reward)
        discounted_rewards.reverse()

        discounted_rewards = torch.FloatTensor(discounted_rewards)
        # The sample standard deviation of a single value is NaN, which would
        # poison the network weights; only normalise multi-step episodes.
        if discounted_rewards.numel() > 1:
            discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-8)

        # REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t
        policy_loss = -(torch.stack(log_probs) * discounted_rewards).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
finally:
    # Release the simulator connection even if training is interrupted.
    env.close()


def moving_average(data, window_size):
    """Return trailing means of ``data`` over ``window_size`` elements.

    Element ``i`` of the result is the mean of
    ``data[i + 1 : i + 1 + window_size]``, so the result has
    ``len(data) - window_size`` entries and the window made of the very first
    ``window_size`` elements is not included.
    """
    running_total = np.cumsum(data)
    window_end = running_total[window_size:]
    window_start = running_total[:-window_size]
    return (window_end - window_start) / window_size

# Smooth the per-episode totals before plotting them.
window_size = 10
smoothed_rewards = moving_average(episode_rewards, window_size)

# One x value per smoothed point (the smoothed series is window_size shorter
# than the number of episodes).
episode_axis = range(window_size, num_episodes)
plot_title = f'Smoothed Cumulative Reward per Episode (Moving Avg Window: {window_size})'

plt.plot(episode_axis, smoothed_rewards)
plt.title(plot_title)
plt.xlabel('Episodes')
plt.ylabel('Smoothed Cumulative Reward')
plt.grid(True)
plt.show()


## Round Robin (time-based trigger)

In [None]:
import gymnasium as gym
import sumo_rl
import numpy as np
import matplotlib.pyplot as plt

env = gym.make('sumo-rl-v0',
               net_file='nets/single-intersection/single-intersection.net.xml',
               route_file='nets/single-intersection/single-intersection.rou.xml',
               out_csv_name='path_to_output.csv',
               use_gui=False,
               num_seconds=1000)

# Tracking rewards over episodes
episode_rewards = []
num_episodes = 500  # Number of episodes to run the fixed policy for

try:
    for episode in range(num_episodes):
        obs, info = env.reset()
        done = False
        total_reward = 0
        prev_phase_one_hot = 0

        while not done:
            # NOTE(review): the meaning of obs[0] / obs[1] is taken from the
            # variable names only -- verify the indices against the
            # environment's observation layout.
            phase_one_hot, min_green = obs[0], obs[1]

            # NOTE(review): when obs[0] is 0 or 1, both branches send a value
            # numerically equal to obs[0], so the trigger condition does not
            # currently change which action is chosen -- confirm the intent.
            if min_green and prev_phase_one_hot != phase_one_hot:
                action = 1 if phase_one_hot == 1 else 0
            else:
                action = phase_one_hot

            next_obs, reward, terminated, truncated, info = env.step(action)

            obs = next_obs
            prev_phase_one_hot = phase_one_hot
            total_reward += reward
            done = terminated or truncated

        episode_rewards.append(total_reward)
        print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")
finally:
    # Release the simulator connection even if the run is interrupted.
    env.close()

# Function to calculate moving average
def moving_average(data, window_size):
    """Return trailing means of ``data`` over ``window_size`` elements.

    Element ``i`` of the result is the mean of
    ``data[i + 1 : i + 1 + window_size]``, so the result has
    ``len(data) - window_size`` entries and the window made of the very first
    ``window_size`` elements is not included.
    """
    running_total = np.cumsum(data)
    window_end = running_total[window_size:]
    window_start = running_total[:-window_size]
    return (window_end - window_start) / window_size

# Smooth the per-episode totals, then plot them against the episode index.
window_size = 10  # Width of the smoothing window, in episodes
smoothed_rewards = moving_average(episode_rewards, window_size)

# One x value per smoothed point (the smoothed series is window_size shorter
# than the number of episodes).
episode_axis = range(window_size, num_episodes)
plot_title = f'Smoothed Cumulative Reward per Episode (Moving Avg Window: {window_size})'

plt.plot(episode_axis, smoothed_rewards)
plt.title(plot_title)
plt.xlabel('Episodes')
plt.ylabel('Smoothed Cumulative Reward')
plt.grid(True)
plt.show()

## Round Robin (traffic density-based trigger)

In [None]:
import gymnasium as gym
import sumo_rl
import numpy as np
import matplotlib.pyplot as plt

env = gym.make('sumo-rl-v0',
               net_file='nets/single-intersection/single-intersection.net.xml',
               route_file='nets/single-intersection/single-intersection.rou.xml',
               out_csv_name='path_to_output.csv',
               use_gui=False,
               num_seconds=1000)

# Tracking rewards over episodes
episode_rewards = []
num_episodes = 500  # Number of episodes to run the fixed policy for
lane_density_threshold = 0.2  # Density above which a lane counts as busy

try:
    for episode in range(num_episodes):
        obs, info = env.reset()
        done = False
        total_reward = 0
        prev_phase_one_hot = 0

        while not done:
            # NOTE(review): the meaning of obs[0] / obs[2] / obs[3] is taken
            # from the variable names only -- verify the indices against the
            # environment's observation layout.
            phase_one_hot, lane_1_density, lane_2_density = obs[0], obs[2], obs[3]

            # NOTE(review): when obs[0] is 0 or 1, both branches send a value
            # numerically equal to obs[0], so the density trigger does not
            # currently change which action is chosen -- confirm the intent.
            if prev_phase_one_hot != phase_one_hot and (lane_1_density > lane_density_threshold or lane_2_density > lane_density_threshold):
                action = 1 if phase_one_hot == 1 else 0
            else:
                action = phase_one_hot

            next_obs, reward, terminated, truncated, info = env.step(action)

            obs = next_obs
            prev_phase_one_hot = phase_one_hot
            total_reward += reward
            done = terminated or truncated

        episode_rewards.append(total_reward)
        print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")
finally:
    # Release the simulator connection even if the run is interrupted.
    env.close()

# Function to calculate moving average
def moving_average(data, window_size):
    """Return trailing means of ``data`` over ``window_size`` elements.

    Element ``i`` of the result is the mean of
    ``data[i + 1 : i + 1 + window_size]``, so the result has
    ``len(data) - window_size`` entries and the window made of the very first
    ``window_size`` elements is not included.
    """
    running_total = np.cumsum(data)
    window_end = running_total[window_size:]
    window_start = running_total[:-window_size]
    return (window_end - window_start) / window_size

# Smooth the per-episode totals, then plot them against the episode index.
window_size = 10  # Width of the smoothing window, in episodes
smoothed_rewards = moving_average(episode_rewards, window_size)

# One x value per smoothed point (the smoothed series is window_size shorter
# than the number of episodes).
episode_axis = range(window_size, num_episodes)
plot_title = f'Smoothed Cumulative Reward per Episode (Moving Avg Window: {window_size})'

plt.plot(episode_axis, smoothed_rewards)
plt.title(plot_title)
plt.xlabel('Episodes')
plt.ylabel('Smoothed Cumulative Reward')
plt.grid(True)
plt.show()

## Q-Learning

In [None]:
import gymnasium as gym
import sumo_rl
import numpy as np
import matplotlib.pyplot as plt

# Define Q-learning parameters
Q = {}  # Q-table: discretized observation tuple -> array of action values
alpha = 0.005  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 1.0  # Initial epsilon for epsilon-greedy strategy
min_epsilon = 0.01  # Minimum epsilon value
epsilon_decay = 0.990  # Epsilon decay rate

env = gym.make('sumo-rl-v0',
               net_file='nets/single-intersection/single-intersection.net.xml',
               route_file='nets/single-intersection/single-intersection.rou.xml',
               out_csv_name='path_to_output.csv',
               use_gui=False,
               num_seconds=1000)

# Tracking rewards over episodes
episode_rewards = []
num_episodes = 500  # Define the number of episodes for training
NUM_BINS = 10  # Number of bins for discretization


def discretize_observation(observation):
    """Turn an observation into a hashable Q-table key.

    The first three entries are kept unchanged; every later entry is scaled by
    ``NUM_BINS`` and truncated to an integer bin index. The index is clamped to
    ``NUM_BINS - 1`` so a value of exactly 1.0 does not open an extra bin.
    (Assumes the later entries lie in [0, 1] -- TODO confirm against the
    environment's observation space.)
    """
    binned = [min(int(value * NUM_BINS), NUM_BINS - 1) for value in observation[3:]]
    return tuple(list(observation[:3]) + binned)


try:
    for episode in range(num_episodes):
        obs, info = env.reset()
        obs_tuple = discretize_observation(obs)
        done = False
        total_reward = 0

        while not done:
            # Epsilon-greedy action selection; states that have never been
            # updated have no estimates yet, so they are explored randomly too.
            if np.random.uniform(0, 1) < epsilon or obs_tuple not in Q:
                action = env.action_space.sample()
            else:
                action = int(np.argmax(Q[obs_tuple]))

            next_obs, reward, terminated, truncated, info = env.step(action)
            next_obs_tuple = discretize_observation(next_obs)

            # Unseen states start with all-zero action values.
            q_values = Q.setdefault(obs_tuple, np.zeros(env.action_space.n))
            next_q_values = Q.setdefault(next_obs_tuple, np.zeros(env.action_space.n))

            # A terminated episode has no future return, so do not bootstrap
            # from it; a merely truncated (time-limited) one still does.
            future_value = 0.0 if terminated else gamma * np.max(next_q_values)
            q_values[action] += alpha * (reward + future_value - q_values[action])

            obs = next_obs
            obs_tuple = next_obs_tuple
            total_reward += reward
            done = terminated or truncated

        episode_rewards.append(total_reward)
        print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")

        # Epsilon decay, floored so it never drops below min_epsilon
        epsilon = max(min_epsilon, epsilon * epsilon_decay)
finally:
    # Release the simulator connection even if training is interrupted.
    env.close()


# Function to calculate moving average
def moving_average(data, window_size):
    """Return trailing means of ``data`` over ``window_size`` elements.

    Element ``i`` of the result is the mean of
    ``data[i + 1 : i + 1 + window_size]``, so the result has
    ``len(data) - window_size`` entries and the window made of the very first
    ``window_size`` elements is not included.
    """
    running_total = np.cumsum(data)
    window_end = running_total[window_size:]
    window_start = running_total[:-window_size]
    return (window_end - window_start) / window_size

# Smooth the per-episode totals, then plot them against the episode index.
window_size = 10  # Width of the smoothing window, in episodes
smoothed_rewards = moving_average(episode_rewards, window_size)

# One x value per smoothed point (the smoothed series is window_size shorter
# than the number of episodes).
episode_axis = range(window_size, num_episodes)
plot_title = f'Smoothed Cumulative Reward per Episode (Moving Avg Window: {window_size})'

plt.plot(episode_axis, smoothed_rewards)
plt.title(plot_title)
plt.xlabel('Episodes')
plt.ylabel('Smoothed Cumulative Reward')
plt.grid(True)
plt.show()

# Benchmarks

## IntelliLight

https://github.com/wingsweihua/IntelliLight

## Fixed-time Control

## Actuated Control