In [24]:
import os
import json
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [25]:
# Root folder of the simulation output, resolved relative to the current working directory.
# Later cells list its sub-folders and read the CSV / route_highway.txt files inside them.
data_dir = os.path.join(os.getcwd(), 'simulation_results')

##### Task 1.1

In [26]:
# Scenario names; the loading cell matches them as substrings of each result-folder name.
scenarios = ['clear-night', 'clear-noon', 'clear-sunset', 'haze-noon', 'haze-sunset', 'rain-noon']
# CSV kinds; matched as substrings of each CSV file name and used to build "<scenario>_<kind>" keys.
csvs = ['ctl', 'cvip', 'traj']

In [None]:
# Load every CSV found under data_dir.
#   all_data: list of (scenario label, csv kind, DataFrame) in discovery order
#   dfs:      the same frames keyed by "<scenario label>_<csv kind>"
all_data = []
dfs = {}
for entry in os.listdir(data_dir):
    folder_path = os.path.join(data_dir, entry)
    if not os.path.isdir(folder_path):
        continue

    # Use the first scenario name contained in the folder name; fall back to the raw name.
    folder = next((name for name in scenarios if name in entry), entry)

    for filename in os.listdir(folder_path):
        if not filename.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(folder_path, filename))
        # Same idea for the file: reduce it to the first CSV kind its name contains.
        file = next((kind for kind in csvs if kind in filename), filename)
        all_data.append((folder, file, df))
        dfs[f"{folder}_{file}"] = df

# Preview the head of every loaded frame.
for folder, file, df in all_data:
    print(f"Data from folder '{folder}', file '{file}':\n", df.head(), "\n")
    print("="*50)

In [None]:
# Show the first 5 rows of ctl.csv, cvip.csv, and traj.csv for the clear-night weather condition.
scenario = 'clear-night'

for file_type in csvs:
    key = f"{scenario}_{file_type}"
    frame = dfs.get(key)
    if frame is None:
        # Nothing was loaded under this key; skip silently.
        continue
    print(f"Data from '{key}':\n", frame.head())

##### Task 1.2

In [None]:
# a. The duration of the scene
# For each scenario, report the span of the `ts` column (max - min).
# When several CSV kinds are loaded for a scenario, the last one found in `csvs` order wins.
for scenario in scenarios:
    # Reset per scenario so a scenario without data never reports the previous one's value.
    duration = None
    for file_type in csvs:
        df = dfs.get(f"{scenario}_{file_type}")
        if df is not None:
            duration = df['ts'].max() - df['ts'].min()

    print(f"Scene '{scenario}'")
    if duration is None:
        print("The duration of the scene: n/a (no data loaded)")
    else:
        # The value is a difference of `ts` values, which the plotting cells label as frames,
        # so it is reported in frames rather than "fps" (a rate).
        print(f"The duration of the scene: {duration} frames")
    print("="*50)

b. Mean and standard deviation of the values of the features

In [None]:
# Table of "mean (std)" strings: one row per feature, one column per scenario.
features = ['throttle', 'steer', 'brake', 'cvip', 'x', 'y', 'v']
results_df = pd.DataFrame(index=features)

for scenario in scenarios:
    summary = {}
    for file_type in csvs:
        frame = dfs.get(f"{scenario}_{file_type}")
        if frame is None:
            continue
        # Each CSV kind contributes whichever of the features it contains.
        for feature in features:
            if feature not in frame.columns:
                continue
            column = frame[feature]
            summary[feature] = f"{column.mean():.3f} ({column.std():.3f})"

    # Only add a column for scenarios that produced at least one statistic.
    if summary:
        results_df[scenario] = pd.Series(summary)

results_df

In [31]:
# Export the mean/std summary table to an Excel file in the working directory, keeping the feature index.
results_df.to_excel('df.xlsx', index=True)

##### Task 1.3

In [None]:
# One figure per feature: the feature plotted against `ts`, one line per weather scenario.
# colors, matching with scenarios
colors = ['tomato', 'crimson', 'darkorange', 'cornflowerblue', 'lightseagreen', 'lightslategrey']

for feature in features:
    plt.figure(figsize=(10, 6))
    plt.title(f"{feature.capitalize()} vs Frames for different weather conditions")
    plt.xlabel("Frames (ts)")
    plt.ylabel(f"{feature.capitalize()} value")

    for scenario, color in zip(scenarios, colors):
        for file_type in csvs:
            df = dfs.get(f"{scenario}_{file_type}")
            if df is not None and feature in df.columns:
                # Cast to float on the fly: the frames cached in `dfs` are shared with other
                # cells, so a plotting cell must not rewrite their columns.
                plt.plot(df['ts'].astype(float), df[feature], label=scenario, color=color, alpha=0.7)

    # Keep tick positions on whole frame numbers.
    plt.gca().xaxis.set_major_locator(plt.MaxNLocator(integer=True))
    plt.xticks(rotation=45)
    plt.legend(title="Weather Conditions", loc='lower right')
    plt.tight_layout()
    plt.show()

##### Task 1.4

##### Based on your intuition and life experience, which of the features do you think will change during an accident? How will the feature(s) change? 
1. `Throttle`: The throttle may decrease sharply just before the accident, as drivers usually slow down or stop when sensing a collision is imminent.
2. `Brake`: The brake input may increase dramatically during an accident, indicating the driver’s attempt to stop the vehicle.
3. `Steering`: The steering may show rapid changes during an accident, as the driver might try to swerve to avoid an object or another vehicle.
4. `Speed(v)`: Speed will typically decrease and may even reach zero as the vehicle comes to a stop due to the accident.
5. `Distance to other objects(cvip)`: The distance between the vehicle and other vehicles or objects will decrease rapidly as the accident approaches.
6. `x(horizontal direction)`: If the driver or the autonomous system attempts to swerve before the accident, x may undergo rapid changes.
7. `y (driving direction)`: If the vehicle decelerates or stops before the accident, y will change more and more slowly and eventually level off at a constant value.

##### By looking at the plots you generated in Task 1.3, which weather condition(s) has an accident? 
Based on the observations: 
1. The brake value remains at 1 (brake = 1), indicating that the vehicle applied full braking.
2. The speed(v) drops and does not increase again, suggesting the vehicle may have come to a stop.
3. The cvip value equals 0, meaning the distance between the vehicle and another object or vehicle is zero, implying a collision.

From these points, it can be inferred that an accident occurred under the **`rain-noon`** weather condition.

##### Task 1.3: plot time as an ordinal variable (additional)

In [None]:
# One figure per feature with the scenarios laid end to end along an ordinal index axis.
for feature in features:
    # Restart the running x offset for every figure; otherwise each later feature would
    # begin where the previous figure ended and the index axis would drift upwards.
    offset = 0

    plt.figure(figsize=(10, 6))
    plt.title(f"{feature.capitalize()} vs Time for different weather conditions")
    plt.xlabel("Index")
    plt.ylabel(f"{feature.capitalize()} value")

    for scenario, color in zip(scenarios, colors):
        for file_type in csvs:
            df = dfs.get(f"{scenario}_{file_type}")
            if df is not None and feature in df.columns:
                # Shift this scenario's rows so they follow the previously plotted scenario.
                plt.plot(df.index + offset, df[feature], label=scenario, color=color, alpha=0.7)
                offset += len(df)

    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

##### Task1.3 subplot(additional)

In [None]:
# One figure per feature, with a separate panel for each scenario.
ncols = 3
nrows = -(-len(scenarios) // ncols)  # ceiling division: enough rows to hold every scenario

for feature in features:
    fig, grid = plt.subplots(nrows, ncols, figsize=(12, 6 * nrows))
    ax = grid.flatten()

    panels = list(zip(scenarios, colors))
    for panel, (scenario, color) in zip(ax, panels):
        for file_type in csvs:
            df = dfs.get(f"{scenario}_{file_type}")
            if df is None or feature not in df.columns:
                continue
            panel.plot(df['ts'], df[feature], label=scenario, color=color, alpha=0.7)
            panel.set_title(f"{feature.capitalize()} vs Frames for {scenario}")
            panel.set_xlabel("Frames (ts)")
            panel.set_ylabel(f"{feature.capitalize()} value")

    # Remove the grid cells that have no scenario assigned.
    for spare in ax[len(panels):]:
        fig.delaxes(spare)

    plt.tight_layout()
    plt.show()

##### Turn txt files into a dataframe first

In [35]:
# Function to read the route_highway.txt file
def read_route_highway_txt(file_path):
    """Parse a route_highway.txt JSON file and return a flat summary of its first record.

    Parameters
    ----------
    file_path : path to a JSON document containing ``_checkpoint -> records``.

    Returns
    -------
    dict with the keys ``duration_game``, ``duration_system``, ``route_length``
    (from the record's ``meta``), ``score_composed``, ``score_penalty``,
    ``score_route`` (from its ``scores``) and ``status``.

    Raises
    ------
    KeyError / IndexError if the expected structure is missing.
    """
    with open(file_path, 'r') as handle:
        payload = json.load(handle)

    # Only the first record of the checkpoint is summarised.
    record = payload['_checkpoint']['records'][0]

    summary = {}
    for field in ('duration_game', 'duration_system', 'route_length'):
        summary[field] = record['meta'][field]
    for field in ('score_composed', 'score_penalty', 'score_route'):
        summary[field] = record['scores'][field]
    summary['status'] = record['status']
    return summary

# Function to read CSV files
def read_csv_file(file_path):
    """Load the CSV at ``file_path`` with pandas' default settings and return the DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

In [36]:
# Accumulator for per-scenario DataFrames. The following cell re-initialises it before use,
# so this assignment only guarantees that the name exists.
df_list = []

In [None]:
# Build `final_df`: one summary row per scenario, taken from that run's route_highway.txt.
df_list = []  # one single-row DataFrame per scenario that could be read

for scenario in scenarios:
    folder_path = os.path.join(data_dir, f'route_highway_epoch24_{scenario}_fi_ghost_cutin')
    # Skip a missing scenario folder instead of letting os.listdir raise.
    if not os.path.isdir(folder_path):
        print(f"Folder not found: {folder_path}")
        continue
    files_in_folder = os.listdir(folder_path)
    print(f"Files in folder {folder_path}: {files_in_folder}")

    # Read the route_highway.txt file
    route_highway_txt_path = os.path.join(folder_path, 'route_highway.txt')
    if not os.path.exists(route_highway_txt_path):
        print(f"route_highway.txt not found in {folder_path}")
        continue  # Skip to the next scenario if the file is not found

    route_highway_data = read_route_highway_txt(route_highway_txt_path)
    full_df = pd.DataFrame([route_highway_data])
    full_df['scenario'] = scenario
    df_list.append(full_df)

# Combine all scenario rows into one dataframe. pd.concat raises on an empty list,
# so it is only called when at least one scenario was read.
if df_list:
    final_df = pd.concat(df_list, ignore_index=True)
else:
    print("No valid dataframes were created.")

In [None]:
# Display the combined per-scenario summary built in the previous cell.
final_df

##### Task 2.1

In [None]:
# Suppose each simulation run has a result of accident/non-accident, calculate the probability of accident (counts, marginal probability).
# Each row of final_df is one run; a run whose status is exactly 'Failed' is counted as an accident.
accident_count = int((final_df['status'] == 'Failed').sum())
total_runs = len(final_df)

# Marginal probability of an accident; NaN when there are no runs, to avoid dividing by zero.
probability_of_accident = accident_count / total_runs if total_runs else float('nan')

# Display the results
print(f"Total runs: {total_runs}")
print(f"Accidents: {accident_count}")
print(f"Probability of accident: {probability_of_accident:.4f}")

##### Task 2.2

##### Task 2.4

In [None]:
# Build `data_df`: for every scenario, join the ctl / cvip / traj CSVs row by row and
# attach that run's route_highway.txt summary (plus the scenario name) to every row.
df_list = []
for scenario in scenarios:
    folder_path = os.path.join(data_dir, f'route_highway_epoch24_{scenario}_fi_ghost_cutin')
    # Skip a missing scenario folder instead of letting os.listdir raise.
    if not os.path.isdir(folder_path):
        print(f"Folder not found: {folder_path}")
        continue
    files_in_folder = os.listdir(folder_path)
    print(f"Files in folder {folder_path}: {files_in_folder}")

    # Read the route_highway.txt file
    route_highway_txt_path = os.path.join(folder_path, 'route_highway.txt')
    if not os.path.exists(route_highway_txt_path):
        print(f"route_highway.txt not found in {folder_path}")
        continue
    route_highway_data = read_route_highway_txt(route_highway_txt_path)

    # Read the CSV files (ctl, cvip, traj); the first file matching each suffix is used.
    csv_data_dict = {}
    for csv_type in csvs:
        matching_files = [f for f in files_in_folder if f.endswith(f'single_{csv_type}.csv')]
        if matching_files:
            csv_file_path = os.path.join(folder_path, matching_files[0])
            csv_data_dict[csv_type] = read_csv_file(csv_file_path)
        else:
            print(f"No file ending with 'single_{csv_type}.csv' found in {folder_path}")

    # The ctl frame is the base of the join; without it the scenario contributes nothing.
    if 'ctl' not in csv_data_dict:
        continue

    # Left-join the other CSV kinds onto ctl using the ('ts', 'agent_id') key.
    merged_csv_df = csv_data_dict['ctl']
    for csv_type in ['cvip', 'traj']:
        if csv_type in csv_data_dict:
            merged_csv_df = pd.merge(merged_csv_df, csv_data_dict[csv_type], on=['ts','agent_id'], how='left')

    # Broadcast the run-level summary to every row. This replaces concatenating
    # len(merged_csv_df) copies of a one-row frame, which is slow and raises when
    # the merged frame has no rows.
    scenario_df = merged_csv_df.assign(**route_highway_data, scenario=scenario)
    df_list.append(scenario_df)

# Combine all scenario data into one dataframe
if df_list:
    data_df = pd.concat(df_list, ignore_index=True)
else:
    print("No valid dataframes were created.")

In [None]:
# Display the combined per-row dataset built in the previous cell.
data_df

In [None]:
# a. Density plots of “brake”, “steer”, “v”, “y”, “x”, “cvip”, “throttle” for abnormal runs
# (including the accident runs) vs normal runs. Every row (time point) is treated as an
# independent sample.

# A run is 'Normal' when its status is 'Completed'; any other status is 'Abnormal'.
data_df['run_type'] = data_df['status'].eq('Completed').map({True: 'Normal', False: 'Abnormal'})

# List of features to analyze
features = ['brake', 'steer', 'v', 'y', 'x', 'cvip', 'throttle']

# One figure per feature with both groups overlaid.
for feature in features:
    plt.figure(figsize=(10, 6))

    for run_type in ('Normal', 'Abnormal'):
        subset = data_df.loc[data_df['run_type'] == run_type]
        sns.kdeplot(data=subset, x=feature, label=run_type, fill=True, alpha=0.5)

    plt.title(f'Distribution of {feature} for Normal vs Abnormal Runs', fontsize=14)
    plt.xlabel(f'{feature}', fontsize=12)
    plt.ylabel('Density', fontsize=12)
    plt.legend()

    plt.show()

##### i. 
Null Hypothesis (H₀): The mean of the *`steer`* values for abnormal runs is equal to the mean of the *`steer`* values for normal runs.\
$H_0: \mu_{\text{abnormal}} = \mu_{\text{normal}}$

Alternative Hypothesis (H₁): The mean of the *`steer`* values for abnormal runs is different from the mean of the *`steer`* values for normal runs.\
$H_1: \mu_{\text{abnormal}} \neq \mu_{\text{normal}}$
##### ii. 
We use Levene's test to check whether the variances between two groups of data (normal and abnormal steer values) are equal. Checking the equality of variances is important before performing a t-test, as it determines which version of the t-test should be used. It is a robust test for equality of variances and works well even when the data is not normally distributed.
* `Levene's test result`: Statistic: 38.9101, p-value < 0.0001. Because the p-value is below 0.05, we reject the assumption of equal variances and therefore use Welch's t-test for further analysis.
* `T-Test (Welch's T-test) result`: T-test statistic: 3.9279, p-value: 0.0001
##### iii. 
Levene’s test indicates that the two groups have unequal variances, and Welch’s t-test gives a p-value (0.0001) below the 0.05 significance level. We therefore reject H₀ and conclude that the mean steer value during abnormal runs is significantly different from that during normal runs.

In [None]:
# b.
# Two-sample comparison of the mean `steer` value between normal and abnormal runs.
# A run is 'Normal' when its status is 'Completed'; any other status is 'Abnormal'.
data_df['run_type'] = data_df['status'].eq('Completed').map({True: 'Normal', False: 'Abnormal'})

# Collect the steer samples of each group, discarding NaN and +/-inf entries.
steer_by_type = {}
for run_type in ('Normal', 'Abnormal'):
    values = data_df.loc[data_df['run_type'] == run_type, 'steer'].dropna()
    steer_by_type[run_type] = values[~values.isin([float('inf'), float('-inf')])]

normal_steer = steer_by_type['Normal']
abnormal_steer = steer_by_type['Abnormal']

# Step 1: Levene's test for equality of variances.
stat, p_value_var = stats.levene(normal_steer, abnormal_steer)

print(f'Levene’s test statistic: {stat:.4f}, p-value: {p_value_var:.4f}')

# Step 2: pooled-variance t-test when the variances look equal (p > 0.05),
# Welch's unequal-variance t-test otherwise.
variances_equal = bool(p_value_var > 0.05)
t_stat, p_value_t = stats.ttest_ind(normal_steer, abnormal_steer, equal_var=variances_equal)

print(f'T-test statistic: {t_stat:.4f}, p-value: {p_value_t:.4f}')

##### Task2.5

In [None]:
# b.
# Pairwise correlation (pandas' default method) between the three candidate indicator features.
indicator_columns = ['steer', 'cvip', 'v']
selected = data_df[indicator_columns]
correlation_matrix = selected.corr()
correlation_matrix

##### Task2.6

In [None]:
# b. Perform the KS two-sample test and calculate its statistics.
# For each feature, compare the rows of 'Completed' runs with the rows of 'Failed' runs.
# NOTE(review): the earlier run_type classification treats every status other than
# 'Completed' as abnormal, while this cell only uses status == 'Failed' — confirm the
# intended definition.
features = ['steer', 'cvip', 'v']

for feature in features:
    # Drop missing values first: the left merges that built data_df can leave NaNs, and
    # NaNs passed to ks_2samp make the result NaN or unreliable, depending on the SciPy version.
    normal_values = data_df.loc[data_df['status'] == 'Completed', feature].dropna()
    abnormal_values = data_df.loc[data_df['status'] == 'Failed', feature].dropna()

    print(f"Feature: {feature}")
    if normal_values.empty or abnormal_values.empty:
        # ks_2samp cannot run on an empty sample; report and move on.
        print("KS test skipped: one of the groups has no valid samples.")
        print("-" * 30)
        continue

    ks_statistic, p_value = stats.ks_2samp(normal_values, abnormal_values)

    print(f"KS Statistic: {ks_statistic}")
    print(f"P-value: {p_value}")
    print("-" * 30)

In [None]:
# d. Repeat the same test on a feature that you did not select as an indicator of abnormal behavior in Task 2.5. What is your conclusion?
# For each feature, compare the rows of 'Completed' runs with the rows of 'Failed' runs.
features_nselected = ['brake', 'y', 'x', 'throttle']

for feature in features_nselected:
    # Drop missing values first: the left merges that built data_df can leave NaNs, and
    # NaNs passed to ks_2samp make the result NaN or unreliable, depending on the SciPy version.
    normal_values = data_df.loc[data_df['status'] == 'Completed', feature].dropna()
    abnormal_values = data_df.loc[data_df['status'] == 'Failed', feature].dropna()

    print(f"Feature: {feature}")
    if normal_values.empty or abnormal_values.empty:
        # ks_2samp cannot run on an empty sample; report and move on.
        print("KS test skipped: one of the groups has no valid samples.")
        print("-" * 30)
        continue

    ks_statistic, p_value = stats.ks_2samp(normal_values, abnormal_values)

    print(f"KS Statistic: {ks_statistic}")
    print(f"P-value: {p_value}")
    print("-" * 30)

##### Task2.8