# **Study on the Ideal Behaviour for Merging into the Highway. - Accelerate. Access the influence on other vehicles.**

##### This study aims to determine the optimal strategy for the ego vehicle to safely and efficiently merge onto a highway, prioritizing the action of accelerating and merge before the other vehicle reaches the merging point. The only variables under consideration are the reward for merging before the highway vehicle reaches the merging point `merging_reward` and the influence reward `influence_penalty`, which penalizes if the actions of the ego vehicle significantly influences the highway vehicle behaviour. The goal is to find the optimal reward configuration that encourages the ego vehicle to accelerate, ensuring both safety and traffic efficiency.

### **Imports**

In [1]:
from IPython.display import Video
import cv2
import imageio
import gymnasium as gym
from matplotlib import pyplot as plt
import pprint
import highway_env
import pandas as pd
import time
import numpy as np
from stable_baselines3 import PPO
from highway_env import utils
from highway_env.envs import MergeEnv
from highway_env.vehicle.controller import ControlledVehicle
%matplotlib inline

### **Creation of the environment**

##### With the ego-vehicle on the merging lane and a single vehicle on the highway, on the right most lane and a costumized reward function

In [2]:
class RightLaneVehicle(ControlledVehicle):
    """
    A vehicle that is restricted to stay in the right lane and never changes lanes.
    """
    def act(self, action: int = None) -> None:
        # Ensures the vehicle does not change lanes (disallows actions 0 and 2 for lane change)
        if action in [0, 2]:  # Actions to change to the left or right
            action = 1  # Force to stay in the lane (action 1)
        super().act(action)


class CustomMergeEnv(MergeEnv):
    def _make_vehicles(self) -> None:
        road = self.road

        # Merge point in lane 0
        merge_position = road.network.get_lane(("b", "c", 0)).position(0, 0)  # Merge point on the highway
        
        # Initial position of the ego vehicle in the merge lane
        ego_initial_position = road.network.get_lane(("j", "k", 0)).position(30, 0)  # Ego vehicle in the merge lane

        # Initial position of the highway vehicle in the rightmost lane (lane 1)
        highway_vehicle_initial_position = road.network.get_lane(("a", "b", 1)).position(80, 0)  # In lane 1 of the highway

        # Set initial speeds
        ego_speed = 20  # Initial speed of the ego vehicle
        highway_speed = 30  # Initial speed of the highway vehicle

        # Calculate time for both vehicles to reach the merge point
        time_to_merge = (merge_position[0] - ego_initial_position[0]) / ego_speed

        # Adjust the highway vehicle's speed to ensure both reach the merge point at the same time
        highway_vehicle_speed = (merge_position[0] - highway_vehicle_initial_position[0]) / time_to_merge

        # Create the ego vehicle in the merge lane
        ego_vehicle = self.action_type.vehicle_class(
            road, ego_initial_position, speed=ego_speed
        )
        road.vehicles.append(ego_vehicle)

        # Create the highway vehicle in the rightmost lane (lane 1)
        highway_vehicle = RightLaneVehicle(
            road, highway_vehicle_initial_position, speed=highway_vehicle_speed
        )
        road.vehicles.append(highway_vehicle)

        # Set the ego vehicle as the main vehicle
        self.vehicle = ego_vehicle

        # Debug: Check positions and speeds of the vehicles
        print(f"Ego vehicle position: {ego_vehicle.position}, Speed: {ego_vehicle.speed}")
        print(f"Highway vehicle position: {highway_vehicle.position}, Speed: {highway_vehicle.speed}")



    def _reward(self, action: int) -> float:
        """
        Custom reward function that penalizes the ego vehicle if its actions influence the highway vehicle,
        while incentivizing efficient merging behavior.
        """
        # Get the original reward from the parent class (if it exists)
        reward = super()._reward(action)
        
        ego_vehicle = self.vehicle
        road = self.road

        # Find the highway vehicle (vehicle in the rightmost lane)
        highway_vehicle = None
        for vehicle in road.vehicles:
            if isinstance(vehicle, RightLaneVehicle):  # Identify the highway vehicle
                highway_vehicle = vehicle
                break

        if not highway_vehicle:
            return reward

        # Calculate relative positions and velocities
        distance_to_highway_vehicle = abs(highway_vehicle.position[0] - ego_vehicle.position[0])
        # is_ahead = distance_to_highway_vehicle > 0  # Check if the highway vehicle is ahead
        near_merge_point = abs(ego_vehicle.position[0] - road.network.get_lane(("b", "c", 0)).position(0, 0)[0]) < 100

        # Estimate highway vehicle's deceleration based on change in speed
        if not hasattr(self, "_previous_highway_speed"):
            self._previous_highway_speed = highway_vehicle.speed  # Initialize previous speed

        highway_acceleration = highway_vehicle.speed - self._previous_highway_speed
        self._previous_highway_speed = highway_vehicle.speed  # Update for the next step

        # Penalize ego vehicle for influencing the highway vehicle's behavior
        influence_penalty = 0.0
        if near_merge_point and distance_to_highway_vehicle < 20:  # Close to the highway vehicle
            if highway_acceleration < -1.0:  # Significant deceleration (tunable threshold)
                print("Highway vehicle influenced: significant deceleration detected")
                influence_penalty = self.config.get("influence_penalty", 5.0)  # Large penalty for interference

        # Reward for merging efficiently
        merging_reward = 0.0
        if near_merge_point:
            if ego_vehicle.speed > highway_vehicle.speed and distance_to_highway_vehicle < 0:
                merging_reward += self.config.get("merging_bonus", 3.0)
            else:
                merging_reward -= self.config.get("merging_penalty", 2.0)

        # Total reward includes the merging incentive and interference penalty
        reward += merging_reward - influence_penalty

        # Debug information
        print(f"Distance to highway vehicle: {distance_to_highway_vehicle}, Ego speed: {ego_vehicle.speed}, Highway speed: {highway_vehicle.speed}, Highway acceleration: {highway_acceleration}")
        print(f"Merging reward: {merging_reward}, Influence penalty: {influence_penalty}, Total reward: {reward}")

        return reward

In [3]:
# Registering the custom environment
gym.envs.registration.register(
    id='CustomMerge-v0',
    entry_point='__main__:CustomMergeEnv',  
)

### **Training the model**

#### Initial configuration with balanced values

In [4]:
env_v0 = gym.make("CustomMerge-v0", render_mode='rgb_array')
env_v0.unwrapped.config.update({
    "influence_penalty": 5.0,
    "merging_bonus": 3.0,
    "merging_penalty": 2.0
})

Ego vehicle position: [30.  14.5], Speed: 20
Highway vehicle position: [80.  4.], Speed: 15.0


In [None]:
model = PPO('MlpPolicy', env_v0,
            policy_kwargs=dict(net_arch=[256, 256]),
            learning_rate=5e-4,
            n_steps=2048, 
            batch_size=64, 
            n_epochs=10,  
            gamma=0.8,
            gae_lambda=0.95, 
            clip_range=0.2, 
            verbose=1,
            tensorboard_log="env_ego_entering_accelerate_0/")
timesteps = 1000000
model.learn(total_timesteps=timesteps)
model.save("env_ego_entering_accelerate_0/model")

#### Configurations with a severe increase on the penalties

In [5]:
env_v1 = gym.make("CustomMerge-v0", render_mode='rgb_array')
env_v1.unwrapped.config.update({
    "influence_penalty": 10.0,
    "merging_bonus": 1.5,
    "merging_penalty": 2.0
})

Ego vehicle position: [30.  14.5], Speed: 20
Highway vehicle position: [80.  4.], Speed: 15.0


In [None]:
model = PPO('MlpPolicy', env_v1,
            policy_kwargs=dict(net_arch=[256, 256]),
            learning_rate=5e-4,
            n_steps=2048, 
            batch_size=64, 
            n_epochs=10,  
            gamma=0.8,
            gae_lambda=0.95, 
            clip_range=0.2, 
            verbose=1,
            tensorboard_log="env_ego_entering_accelerate_1/")
timesteps = 1000000
model.learn(total_timesteps=timesteps)
model.save("env_ego_entering_accelerate_1/model")

#### "Agressive" configuration - reduction of the influence penalty and increase of the merging penalty

In [6]:
env_v2 = gym.make("CustomMerge-v0", render_mode='rgb_array')
env_v2.unwrapped.config.update({
    "influence_penalty": 5.0,
    "merging_bonus": 1.5,
    "merging_penalty": 4.0
})

Ego vehicle position: [30.  14.5], Speed: 20
Highway vehicle position: [80.  4.], Speed: 15.0


In [None]:
model = PPO('MlpPolicy', env_v2,
            policy_kwargs=dict(net_arch=[256, 256]),
            learning_rate=5e-4,
            n_steps=2048, 
            batch_size=64, 
            n_epochs=10,  
            gamma=0.8,
            gae_lambda=0.95, 
            clip_range=0.2, 
            verbose=1,
            tensorboard_log="env_ego_entering_accelerate_2/")
timesteps = 1000000
model.learn(total_timesteps=timesteps)
model.save("env_ego_entering_accelerate_2/model")

#### "Safe" configuration - increase of the influence penalty and increase of the merging penalty

In [7]:
env_v3 = gym.make("CustomMerge-v0", render_mode='rgb_array')
env_v3.unwrapped.config.update({
    "influence_penalty": 10.0,
    "merging_bonus": 1.5,
    "merging_penalty": 4.0
})

Ego vehicle position: [30.  14.5], Speed: 20
Highway vehicle position: [80.  4.], Speed: 15.0


In [None]:
model = PPO('MlpPolicy', env_v3,
            policy_kwargs=dict(net_arch=[256, 256]),
            learning_rate=5e-4,
            n_steps=2048, 
            batch_size=64, 
            n_epochs=10,  
            gamma=0.8,
            gae_lambda=0.95, 
            clip_range=0.2, 
            verbose=1,
            tensorboard_log="env_ego_entering_accelerate_3/")
timesteps = 1000000
model.learn(total_timesteps=timesteps)
model.save("env_ego_entering_accelerate_3/model")

#### Configuration to incentivise the merging

In [8]:
env_v4 = gym.make("CustomMerge-v0", render_mode='rgb_array')
env_v4.unwrapped.config.update({
    "influence_penalty": 3.0,
    "merging_bonus": 5.0,
    "merging_penalty": 2.0
})

Ego vehicle position: [30.  14.5], Speed: 20
Highway vehicle position: [80.  4.], Speed: 15.0


In [None]:
model = PPO('MlpPolicy', env_v4,
            policy_kwargs=dict(net_arch=[256, 256]),
            learning_rate=5e-4,
            n_steps=2048, 
            batch_size=64, 
            n_epochs=10,  
            gamma=0.8,
            gae_lambda=0.95, 
            clip_range=0.2, 
            verbose=1,
            tensorboard_log="env_ego_entering_accelerate_4/")
timesteps = 1000000
model.learn(total_timesteps=timesteps)
model.save("env_ego_entering_accelerate_4/model")

### **Evaluate and compare the models**

**For env_v0**
- Average Reward: -0.0795
- Average Steps to Merge: 9.0
- Average Episode Time: 0.07 seconds
- Number of Collisions: 0
- Successful Merges: 200
- Number of Dangerous Driving Episodes (sudden speed changes): 0

**For env_v1**
- Average Reward: -0.0231
- Average Steps to Merge: 9.0
- Average Episode Time: 0.07 seconds
- Number of Collisions: 0
- Successful Merges: 200
- Number of Dangerous Driving Episodes (sudden speed changes): 0

**For env_v2**
- Average Reward: -8.0597
- Average Steps to Merge: 9.0
- Average Episode Time: 0.07 seconds
- Number of Collisions: 0
- Successful Merges: 200
- Number of Dangerous Driving Episodes (sudden speed changes): 0

**For env_v3**
- Average Reward: -8.0231
- Average Steps to Merge: 9.0
- Average Episode Time: 0.07 seconds
- Number of Collisions: 0
- Successful Merges: 200
- Number of Dangerous Driving Episodes (sudden speed changes): 0

**For env_v4**
- Average Reward: -0.0231
- Average Steps to Merge: 9.0
- Average Episode Time: 0.07 seconds
- Number of Collisions: 0
- Successful Merges: 200
- Number of Dangerous Driving Episodes (sudden speed changes): 0

In [18]:
def evaluate_agent(model, env, num_episodes, speed_threshold_ratio=0.5):
    total_rewards = []  # List to store total rewards for each episode
    total_collisions = 0  # Counter for total collisions across all episodes
    successful_merges = 0  # Counter for successful merges
    dangerous_driving_episodes = 0  # Counter for episodes with dangerous driving behavior
    total_steps_to_merge = []  # List to store the number of steps taken to merge in each episode
    total_episode_times = []  # List to store the time taken for each episode

    # Calculate the speed threshold
    reward_speed_range = env.unwrapped.config["reward_speed_range"]
    speed_threshold = (reward_speed_range[1] - reward_speed_range[0]) * speed_threshold_ratio  # Threshold for sudden speed changes

    for episode in range(num_episodes):
        start_time = time.time()  # Record the start time of the episode
        obs, info = env.reset()  # Reset the environment and get the initial observation
        done = False  # Variable to track if the episode is finished
        episode_reward = 0  # Variable to track the reward for the current episode
        collisions = 0  # Counter for collisions in the current episode
        dangerous_driving = False  # Flag to indicate if dangerous driving occurred
        steps_to_merge = 0  # Counter for steps taken to merge
        last_speed = None  # Initialize last speed as None

        # Store the positions of the vehicles on the highway to check the merge
        highway_vehicles = []
        for vehicle in env.road.vehicles:
            # Check if the vehicle is not the ego vehicle
            if vehicle != env.vehicle:
                highway_vehicles.append(vehicle)

        while not done:  # Loop until the episode is done
            # The agent chooses an action
            action, _states = model.predict(obs, deterministic=True)
            # Execute the action in the environment
            obs, reward, terminated, truncated, info = env.step(action)

            episode_reward += reward  # Accumulate reward for the episode
            steps_to_merge += 1  # Increment the steps to merge counter

            # Check the current speed and round to 2 decimal places
            current_speed = round(info.get('speed', 0), 2)

            # Check for sudden speed changes
            if last_speed is not None and abs(current_speed - last_speed) > speed_threshold:
                dangerous_driving = True  # Mark as dangerous driving if speed change exceeds threshold

            last_speed = current_speed  # Update the last speed for the next iteration

            # Check for collisions
            if 'crashed' in info and info['crashed']:
                collisions += 1  # Increment collision counter if a crash occurred

            # Check if the episode has ended (either 'terminated' or 'truncated')
            done = terminated or truncated

            ego_position = env.road.vehicles[0].position[0]  # Get the position of the ego vehicle
            highway_vehicles_positions = [vehicle.position[0] for vehicle in env.road.vehicles[1:]]  # Get positions of highway vehicles
            for highway_position in highway_vehicles_positions:
                    if not collisions and ego_position > highway_position and env.road.vehicles[0].lane_index[2]!=0:
                        successful_merges += 1  # Increment successful merges if the ego vehicle is ahead of at least one highway vehicle
                        done = True  # End the episode if the merge is successful

        # Log episode metrics
        total_rewards.append(episode_reward)  # Add episode reward to the total rewards list
        total_collisions += collisions  # Update total collisions count
        total_steps_to_merge.append(steps_to_merge)  # Add steps to merge for this episode

        if dangerous_driving:
            dangerous_driving_episodes += 1  # Increment count of dangerous driving episodes

        # Calculate the time taken for the episode and add to the list
        episode_time = time.time() - start_time  # Calculate elapsed time
        total_episode_times.append(episode_time)  # Add episode time to the list

    # Final metric calculations
    avg_reward = np.mean(total_rewards)  # Calculate average reward
    avg_steps_to_merge = np.mean(total_steps_to_merge)  # Calculate average steps to merge
    avg_episode_time = np.mean(total_episode_times)  # Calculate average episode time

    # Display results
    print(f"Average Reward: {avg_reward}")  # Print average reward
    print(f"Average Steps to Merge: {avg_steps_to_merge}")  # Print average steps to merge
    print(f"Average Episode Time: {avg_episode_time:.2f} seconds")  # Print average episode time
    print(f"Number of Collisions: {total_collisions}")  # Print total collisions
    print(f"Successful Merges: {successful_merges}")  # Print number of successful merges
    print(f"Number of Dangerous Driving Episodes (sudden speed changes): {dangerous_driving_episodes}")  # Print count of dangerous driving episodes

    return {
        "avg_reward": avg_reward,  # Return average reward
        "avg_steps_to_merge": avg_steps_to_merge,  # Return average steps to merge
        "avg_episode_time": avg_episode_time,  # Return average episode time
        "number_collisions": total_collisions,  # Return total number of collisions
        "successful_merges": successful_merges,  # Return number of successful merges
        "number_dangerous_episodes": dangerous_driving_episodes  # Return number of dangerous driving episodes
    }

In [19]:
# Load the trained model
model = PPO.load("env_ego_entering_accelerate_0/model")  

# Evaluate the model
results = evaluate_agent(model, env_v0, 200) 

Posição do veículo ego: [30.  14.5], Velocidade: 20
Posição do veículo da autoestrada: [80.  4.], Velocidade: 15.0
Distance to highway vehicle: 44.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 39.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 35.00000000000004, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 30.000000000000114, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 25.00000000

In [20]:
# Load the trained model
model = PPO.load("env_ego_entering_accelerate_1/model")  

# Evaluate the model
results = evaluate_agent(model, env_v1, 200) 

Posição do veículo ego: [30.  14.5], Velocidade: 20
Posição do veículo da autoestrada: [80.  4.], Velocidade: 15.0
Distance to highway vehicle: 44.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 39.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 35.00000000000004, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 30.000000000000114, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 25.00000000

In [21]:
# Load the trained model
model = PPO.load("env_ego_entering_accelerate_2/model")  

# Evaluate the model
results = evaluate_agent(model, env_v2, 200) 

Posição do veículo ego: [30.  14.5], Velocidade: 20
Posição do veículo da autoestrada: [80.  4.], Velocidade: 15.0
Distance to highway vehicle: 44.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 39.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 35.00000000000004, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 30.000000000000114, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 25.00000000

In [22]:
# Load the trained model
model = PPO.load("env_ego_entering_accelerate_3/model")  

# Evaluate the model
results = evaluate_agent(model, env_v3, 200) 

Posição do veículo ego: [30.  14.5], Velocidade: 20
Posição do veículo da autoestrada: [80.  4.], Velocidade: 15.0
Distance to highway vehicle: 44.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 39.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 35.00000000000004, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 30.000000000000114, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 25.00000000

In [23]:
# Load the trained model
model = PPO.load("env_ego_entering_accelerate_4/model")  

# Evaluate the model
results = evaluate_agent(model, env_v4, 200) 

Posição do veículo ego: [30.  14.5], Velocidade: 20
Posição do veículo da autoestrada: [80.  4.], Velocidade: 15.0
Distance to highway vehicle: 44.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 39.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 35.00000000000004, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 30.000000000000114, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 25.00000000

### **Analysis of Driving Agent Evaluation Results**

The performance of five different driving agent models, each configured with distinct parameter adjustments, is compared in terms of their ability to handle merges, avoid collisions, and maintain overall safety. The models—**env_v0**, **env_v1**, **env_v2**, **env_v3**, and **env_v4**—demonstrate various trade-offs between reward, merging behavior, and stability.

The **env_v0** model, with a balanced configuration, provides solid performance with **200 successful merges** and **0 collisions**. The **average reward** of **-0.0795** reflects a near-neutral outcome, which is typical for a system with balanced penalties and bonuses. The model consistently avoids **dangerous driving episodes** and maintains an efficient merging process, taking **9 steps to merge** on average with an **average episode time of 0.07 seconds**.

In **env_v1**, where penalties for influence and merging are increased, the agent's **average reward** improves slightly to **-0.0231**, indicating a marginal increase in performance. Despite the harsher penalties, the agent still successfully completes **200 merges** with **0 collisions** and **0 dangerous driving episodes**. The **9 steps to merge** and **0.07 seconds episode time** remain consistent with **env_v0**, suggesting that the agent adapts well to the increased penalties without significantly compromising its behavior.

The **env_v2** model, with higher penalties for merging, results in a significant drop in **average reward** to **-8.0597**, indicating a more cautious approach to merging. However, the model still completes **200 successful merges** and avoids collisions or dangerous behavior. The **average steps to merge** and **episode time** remain unchanged, indicating that despite the aggressive penalties, the agent's decision-making process stays consistent in terms of efficiency, even though it is more conservative in its approach.

The **env_v3** model, designed with a more "safe" configuration (higher penalties and merging bonus), shows a similar performance to **env_v2**. The **average reward** of **-8.0231** reflects the cautious approach that prioritizes safety over aggressive merging. Like **env_v2**, it avoids **collisions** and **dangerous episodes**, maintaining the same level of performance in terms of **successful merges** and **efficiency** (9 steps to merge, 0.07 seconds per episode). The agent appears to have a similarly conservative approach to merging, focusing on minimizing risk.

The **env_v4** model, with a configuration designed to incentivize merging, achieves a **reward** similar to **env_v1** at **-0.0231**. The agent still completes **200 successful merges**, with no **collisions** or **dangerous episodes**, showcasing the effectiveness of the merging bonus. The **9 steps to merge** and **0.07 seconds episode time** are consistent with the other models, indicating efficient behavior. The model strikes a balance between incentivizing merging and maintaining safety, ensuring effective performance without sacrificing risk management.


All five models perform well in terms of **successful merges**, **no collisions**, and **no dangerous driving episodes**. The primary differences lie in the **average reward**, which reflects the trade-offs between penalties, bonuses, and the agent's merging behavior:

- **env_v0** provides the most neutral and balanced performance, resulting in a modest but stable **reward**.
- **env_v1** demonstrates slight improvement in **average reward** due to the increased penalties, without compromising on merge success.
- **env_v2** and **env_v3**, with more aggressive or safety-focused configurations, result in significant drops in **average reward**, but the agents maintain safe and successful merging, with no collisions or dangerous episodes.
- **env_v4** incentivizes merging through a higher merging bonus, achieving a reward similar to **env_v1** while maintaining safety and efficiency.

Overall, the **env_v0**, **env_v1**, and **env_v4** models appear to strike the best balance between **successful merges**, **low collision rates**, and **stable performance**, making them the most effective configurations.

In [29]:
# Load the trained model
model = PPO.load("env_ego_entering_accelerate_0/model")

# Initialize the environment and variables for recording
frames = []
obs, info = env_v0.reset()
done = False
step_count = 0
max_steps = 1000

# Resize frame to be divisible by 16 (macro block size for video codecs)
def resize_frame_to_macro_block_size(frame, block_size=16):
    h, w, _ = frame.shape
    new_w = (w // block_size) * block_size
    new_h = (h // block_size) * block_size
    return cv2.resize(frame, (new_w, new_h))

# Run the agent in the environment
while step_count < max_steps and not done:
    action, _ = model.predict(obs)
    obs, reward, done, truncated, info = env_v0.step(action)
    frame = env_v0.render()

    # Resize the frame to avoid the macro_block_size warning
    resized_frame = resize_frame_to_macro_block_size(frame)
    frames.append(resized_frame)
    
    step_count += 1

# Close the environment
env_v0.close()

# Save the frames as a video
video_filename = "entering_and_accelerate.mp4"
imageio.mimsave(video_filename, frames, fps=30)
print(f"Video saved as {video_filename}")

Posição do veículo ego: [30.  14.5], Velocidade: 20
Posição do veículo da autoestrada: [80.  4.], Velocidade: 15.0
Distance to highway vehicle: 44.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 39.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 35.00000000000004, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 30.000000000000114, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 25.00000000

In [31]:
# Display the video
video_filename = "entering_and_accelerate.mp4"
Video(video_filename, embed=True)

In [11]:
# Load the trained model
model = PPO.load("env_ego_entering_accelerate_1/model")

# Initialize the environment and variables for recording
frames = []
obs, info = env_v1.reset()
done = False
step_count = 0
max_steps = 1000

# Resize frame to be divisible by 16 (macro block size for video codecs)
def resize_frame_to_macro_block_size(frame, block_size=16):
    h, w, _ = frame.shape
    new_w = (w // block_size) * block_size
    new_h = (h // block_size) * block_size
    return cv2.resize(frame, (new_w, new_h))

# Run the agent in the environment
while step_count < max_steps and not done:
    action, _ = model.predict(obs)
    obs, reward, done, truncated, info = env_v1.step(action)
    frame = env_v1.render()

    # Resize the frame to avoid the macro_block_size warning
    resized_frame = resize_frame_to_macro_block_size(frame)
    frames.append(resized_frame)
    
    step_count += 1

# Close the environment
env_v1.close()

# Save the frames as a video
video_filename = "entering_and_accelerate1.mp4"
imageio.mimsave(video_filename, frames, fps=30)
print(f"Video saved as {video_filename}")

Exception: code() argument 13 must be str, not int
Exception: code() argument 13 must be str, not int


Ego vehicle position: [30.  14.5], Speed: 20
Highway vehicle position: [80.  4.], Speed: 15.0
Distance to highway vehicle: 44.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 39.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 35.00000000000004, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 30.000000000000114, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 25.00000000000017, Ego speed: 20

In [12]:
# Display the video
video_filename = "entering_and_accelerate1.mp4"
Video(video_filename, embed=True)

In [13]:
# Load the trained model
model = PPO.load("env_ego_entering_accelerate_0/model")

# Initialize the environment and variables for recording
frames = []
obs, info = env_v2.reset()
done = False
step_count = 0
max_steps = 1000

# Resize frame to be divisible by 16 (macro block size for video codecs)
def resize_frame_to_macro_block_size(frame, block_size=16):
    h, w, _ = frame.shape
    new_w = (w // block_size) * block_size
    new_h = (h // block_size) * block_size
    return cv2.resize(frame, (new_w, new_h))

# Run the agent in the environment
while step_count < max_steps and not done:
    action, _ = model.predict(obs)
    obs, reward, done, truncated, info = env_v2.step(action)
    frame = env_v2.render()

    # Resize the frame to avoid the macro_block_size warning
    resized_frame = resize_frame_to_macro_block_size(frame)
    frames.append(resized_frame)
    
    step_count += 1

# Close the environment
env_v2.close()

# Save the frames as a video
video_filename = "entering_and_accelerate2.mp4"
imageio.mimsave(video_filename, frames, fps=30)
print(f"Video saved as {video_filename}")

Exception: code() argument 13 must be str, not int
Exception: code() argument 13 must be str, not int


Ego vehicle position: [30.  14.5], Speed: 20
Highway vehicle position: [80.  4.], Speed: 15.0
Distance to highway vehicle: 44.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 39.99999999999997, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 35.00000000000004, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 30.000000000000114, Ego speed: 20.0, Highway speed: 15.0, Highway acceleration: 0.0
Merging reward: 0.0, Influence penalty: 0.0, Total reward: 0.8333333333333333
crashFalse
overFalse
Distance to highway vehicle: 25.00000000000017, Ego speed: 20

In [14]:
# Display the video
video_filename = "entering_and_accelerate2.mp4"
Video(video_filename, embed=True)