# Network Traffic Signal Control using Q-Learning

In [1]:
import numpy as np
import random

### Simulate the Traffic Environment

In [10]:
class TrafficEnvironment:
    """Toy multi-intersection traffic simulator used as the learning environment.

    The state is the number of queued cars at each intersection, capped at
    ``max_cars``. A step applies one light setting per intersection
    (``1`` = green, any other value = red), accumulates a non-positive reward
    from the resulting queue lengths, and finally adds random arrivals to
    every intersection.

    Args:
        num_intersections: Number of intersections being simulated.
        max_cars: Upper bound on the queue length at each intersection.
        max_pass: Maximum number of cars that leave an intersection on green.
        max_arrivals: Largest number of cars a single random arrival draw adds
            (draws are uniform on ``0..max_arrivals``).
        seed: Optional seed for a private ``np.random.RandomState``. When
            ``None`` (default) the global ``np.random`` state is used, so
            ``np.random.seed`` keeps controlling the simulation.
    """

    def __init__(self, num_intersections, max_cars=20, max_pass=5, max_arrivals=2, seed=None):
        self.num_intersections = num_intersections
        self.max_cars = max_cars  # Maximum number of cars at each intersection
        self.max_pass = max_pass  # Cars that can clear an intersection per green step
        self.max_arrivals = max_arrivals  # Inclusive upper bound of one arrival draw
        # The np.random module and RandomState expose the same randint API.
        self._rng = np.random if seed is None else np.random.RandomState(seed)
        self.state = np.zeros(num_intersections, dtype=int)

    def _observation(self):
        """Return the current queue lengths as a hashable tuple of plain ints."""
        return tuple(self.state.tolist())

    def reset(self):
        """Draw a fresh random queue length (0..max_cars) per intersection.

        Returns:
            The new state as a tuple, suitable for use as a dictionary key.
        """
        self.state = self._rng.randint(0, self.max_cars + 1, self.num_intersections)
        return self._observation()

    def step(self, action):
        """Advance the simulation by one time step.

        Args:
            action: Sequence with one entry per intersection; ``1`` means
                green, anything else is treated as red.

        Returns:
            ``(next_state, reward, done)`` where ``next_state`` is a tuple of
            queue lengths, ``reward`` is a non-positive ``int`` and ``done``
            is always ``False`` (episodes are truncated by the caller).

        Raises:
            ValueError: If ``action`` does not have exactly one entry per
                intersection.
        """
        if len(action) != self.num_intersections:
            raise ValueError(
                f"action must have {self.num_intersections} entries, got {len(action)}"
            )

        reward = 0

        # Apply the light setting at each intersection. The loop is kept
        # scalar so the order of random draws is stable for a given seed.
        for i in range(self.num_intersections):
            if action[i] == 1:  # Green light: up to max_pass cars leave
                cars_passed = min(self.state[i], self.max_pass)
                self.state[i] -= cars_passed
                reward -= self.state[i]  # Penalty for cars still waiting
            else:  # Red light: nobody leaves and the queue may grow
                grown = self.state[i] + self._rng.randint(0, self.max_arrivals + 1)
                self.state[i] = min(grown, self.max_cars)
                reward -= self.state[i] * 2  # Waiting on red is penalised twice as much

        # Random arrivals at every intersection, applied after the reward
        # has been computed, then capped at max_cars.
        self.state += self._rng.randint(0, self.max_arrivals + 1, self.num_intersections)
        self.state = np.clip(self.state, 0, self.max_cars)

        return self._observation(), int(reward), False

### Implementing Q-Learning Agent

In [11]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Actions are integers in ``range(2 ** num_intersections)``. Q-values are
    kept in ``q_table``, a dict mapping a hashable state to a NumPy array with
    one entry per action; rows are created (all zeros) the first time a state
    is seen.

    Args:
        num_intersections: Number of intersections; fixes the action count.
        learning_rate: Step size of the Q-value update.
        discount_factor: Discount applied to the best next-state Q-value.
        epsilon: Probability of picking a uniformly random action.
    """

    def __init__(self, num_intersections, learning_rate=0.05, discount_factor=0.99, epsilon=0.1):
        self.num_intersections = num_intersections
        self.num_actions = 2 ** num_intersections
        self.q_table = {}
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = epsilon

    def _q_values(self, state):
        """Return the Q-value row for ``state``, creating a zero row on first visit."""
        row = self.q_table.get(state)
        if row is None:
            row = np.zeros(self.num_actions)
            self.q_table[state] = row
        return row

    def get_action(self, state, greedy=False):
        """Choose an action for ``state``.

        Args:
            state: Hashable state key.
            greedy: When ``True`` skip exploration entirely and return the
                highest-valued action (useful for evaluation). The default
                keeps the epsilon-greedy behaviour.

        Returns:
            The chosen action index as a plain ``int``.
        """
        q_values = self._q_values(state)

        if not greedy and random.random() < self.epsilon:
            return random.randint(0, self.num_actions - 1)  # Exploration
        # np.argmax returns the first maximum, so ties go to the lowest index.
        return int(np.argmax(q_values))  # Exploitation

    def update_q_table(self, state, action, reward, next_state):
        """Apply one Q-learning update for the transition that was observed.

        Moves ``Q[state][action]`` towards
        ``reward + gamma * max(Q[next_state])`` by a fraction ``lr``.
        """
        q_values = self._q_values(state)
        next_q_values = self._q_values(next_state)

        # Compute the target before writing: state and next_state may share a row.
        td_target = reward + self.gamma * np.max(next_q_values)
        q_values[action] += self.lr * (td_target - q_values[action])

### Training the Q-Learning Agent

In [12]:
def train(num_episodes, num_intersections, steps_per_episode=100, log_every=100):
    """Train a Q-learning agent on a freshly created traffic environment.

    Args:
        num_episodes: Number of training episodes to run.
        num_intersections: Number of intersections in the environment.
        steps_per_episode: Maximum number of time steps per episode.
        log_every: Print the episode's total reward every this many episodes;
            ``0`` or ``None`` disables logging.

    Returns:
        The trained ``QLearningAgent``.
    """
    env = TrafficEnvironment(num_intersections)
    agent = QLearningAgent(num_intersections)

    for episode in range(num_episodes):
        state = env.reset()
        total_reward = 0

        for _ in range(steps_per_episode):
            action = agent.get_action(state)
            # Decode the action index into one light bit per intersection
            # (zero-padded binary, most significant bit first).
            action_binary = [int(x) for x in format(action, f'0{num_intersections}b')]
            next_state, reward, done = env.step(action_binary)
            agent.update_q_table(state, action, reward, next_state)
            state = next_state
            total_reward += reward
            if done:  # Honour early termination if the environment signals it
                break

        if log_every and episode % log_every == 0:
            print(f"Episode {episode}, Total Reward: {total_reward}")

    return agent

# Train the agent
# Seed both RNGs first: the agent explores via `random`, the environment draws
# from `np.random`, and without seeding the run cannot be reproduced.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

num_intersections = 4
num_episodes = 10000
trained_agent = train(num_episodes, num_intersections)

### Testing the Trained Agent

In [13]:
env = TrafficEnvironment(num_intersections)
state = env.reset()
total_reward = 0

# Evaluate the learned policy without exploration: with the training epsilon
# still active, a share of the test actions would be random and the reported
# reward would not reflect what the agent actually learned.
training_epsilon = trained_agent.epsilon
trained_agent.epsilon = 0.0
try:
    for _ in range(100):
        action = trained_agent.get_action(state)
        # Decode the action index into one light bit per intersection (MSB first).
        action_binary = [int(x) for x in format(action, f'0{num_intersections}b')]
        next_state, reward, done = env.step(action_binary)
        state = next_state
        total_reward += reward
finally:
    # Restore the exploration rate so the agent can be trained further.
    trained_agent.epsilon = training_epsilon

print(f"Test Total Reward: {total_reward}")

Episode 0, Total Reward: -13428
Episode 100, Total Reward: -10090
Episode 200, Total Reward: -9556
Episode 300, Total Reward: -9064
Episode 400, Total Reward: -9029
Episode 500, Total Reward: -9109
Episode 600, Total Reward: -7364
Episode 700, Total Reward: -7283
Episode 800, Total Reward: -7777
Episode 900, Total Reward: -7634
Episode 1000, Total Reward: -8178
Episode 1100, Total Reward: -7694
Episode 1200, Total Reward: -7273
Episode 1300, Total Reward: -8466
Episode 1400, Total Reward: -7783
Episode 1500, Total Reward: -7857
Episode 1600, Total Reward: -6463
Episode 1700, Total Reward: -8023
Episode 1800, Total Reward: -7498
Episode 1900, Total Reward: -7955
Episode 2000, Total Reward: -7168
Episode 2100, Total Reward: -6354
Episode 2200, Total Reward: -6973
Episode 2300, Total Reward: -6872
Episode 2400, Total Reward: -6259
Episode 2500, Total Reward: -6723
Episode 2600, Total Reward: -6170
Episode 2700, Total Reward: -4949
Episode 2800, Total Reward: -6399
Episode 2900, Total Rewa

* Penalties (negative rewards) are decreasing over time, which means the agent is learning.