##### Group 17
##### Hsin Chun Cheng(hccheng3), Po Wei Hsu(powei2)

In [19]:
import os
import json
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from dtaidistance import dtw
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [20]:
# Path to the Carla simulation results CSV, relative to the notebook's working directory.
file_path = 'mp1_part3_data_2024.csv'

# Load every row into a DataFrame; the (rows, columns) tuple is shown as a quick sanity check.
df = pd.read_csv(filepath_or_buffer=file_path)
df.shape

(2000, 4)

In [21]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2000 non-null   int64 
 1   weather     2000 non-null   object
 2   scenario    2000 non-null   object
 3   status      2000 non-null   object
dtypes: int64(1), object(3)
memory usage: 62.6+ KB


In [22]:
# Preview the first rows to see what kind of values each column holds.
df.head()

Unnamed: 0.1,Unnamed: 0,weather,scenario,status
0,0,clear,ghost_cutin,Completed
1,1,cloudy,ghost_cutin,Completed
2,2,rain,ghost_cutin,Completed
3,3,snow,ghost_cutin,Completed
4,4,clear,ghost_cutin,Completed


##### Task4

##### 1. Parse the provided Carla simulation dataset and calculate the following probabilities for the cut-in scenario (you need to filter out invalid data points before doing the analysis):

In [28]:
# a. The probability of accident P(acc=1) across all weather conditions.

# Boolean mask: True for every run whose status is anything other than 'Completed'.
# NOTE(review): this counts every non-'Completed' status as an accident and applies no
# scenario or invalid-row filter, although the task statement asks for both -- confirm
# that the file only contains valid cut-in runs.
not_completed = df['status'].ne('Completed')

# Despite its name, `valid_data` holds the runs counted as accidents (the masked rows).
valid_data = df.loc[not_completed]

# Denominator: every row in the dataset.
total_data_points = df.shape[0]

# P(acc=1) = (number of accident rows) / (number of all rows)
P_acc_all = valid_data.shape[0] / total_data_points

# Display the overall accident probability
P_acc_all

0.099

In [29]:
# b. The probability of an accident conditioned on the weather, P(acc=1 | weather=?), for each weather condition.

# Indicator column: 1 when the run's status differs from 'Completed', 0 otherwise.
df['accident'] = df['status'].ne('Completed').astype('int64')

# Group once by weather and reuse the grouping for both tallies.
runs_by_weather = df.groupby('weather')

weather_counts = runs_by_weather.size()                 # number of runs per weather condition
weather_accidents = runs_by_weather['accident'].sum()   # number of accidents per weather condition

# P(acc=1 | weather) = accidents / runs, aligned on the shared weather index.
P_acc_weather = weather_accidents / weather_counts

# Display the conditional accident probability for each weather condition
P_acc_weather

weather
clear     0.024
cloudy    0.054
rain      0.130
snow      0.188
dtype: float64

##### 2. The baseline simulated dataset contains accident information for snowy conditions and rainy conditions. In California, it is sunny 80% of the time, rainy 5% of the time, snowy 2% of the time, and cloudy the rest of the time. In Chicago, it is sunny 60% of the time, rainy 15% of the time, snowy 20% of the time, and cloudy the rest of the time. Calculate the probability of an accident in the cut-in scenario for California and Chicago, respectively.

In [30]:
# Share of time spent in each weather condition, taken from the problem statement;
# "cloudy" is whatever remains after sunny (clear), rain and snow.
california_weather_mix = {'clear': 0.80, 'rain': 0.05, 'snow': 0.02, 'cloudy': 0.13}
chicago_weather_mix = {'clear': 0.60, 'rain': 0.15, 'snow': 0.20, 'cloudy': 0.05}


def _accident_probability_for(weather_mix):
    """Weight each P(acc=1 | weather) by that weather's share and add the terms up.

    This is the law of total probability: P(acc) = sum over w of P(w) * P(acc=1 | w).
    The terms are accumulated in the dict's insertion order.
    """
    total = 0.0
    for condition, share in weather_mix.items():
        total += share * P_acc_weather[condition]
    return total


P_acc_california = _accident_probability_for(california_weather_mix)
P_acc_chicago = _accident_probability_for(chicago_weather_mix)

# Display both city-level accident probabilities as a pair
P_acc_california, P_acc_chicago

(np.float64(0.03648), np.float64(0.0742))

##### 3. In Part 2, Task 3.4, you calculated the AV’s probability of an accident per mile for the California DMV dataset. Suppose you want to compare the simulated accident rate with the real dataset accident rate.

In [31]:
# a. Unfortunately, the real DMV data only includes sunny and cloudy weather conditions.
# i. To compare like with like, restrict the simulated weather mix to the two conditions
#    the DMV data covers and renormalize their shares so that they sum to 1.

sunny_share, cloudy_share = 0.8, 0.13   # California shares of sunny and cloudy time
covered_share = sunny_share + cloudy_share

a = sunny_share / covered_share    # renormalized weight for sunny (clear) weather
b = cloudy_share / covered_share   # renormalized weight for cloudy weather

# Each term is one condition's weighted contribution to the combined probability,
# i.e. weight * P(acc=1 | weather), not the conditional probability itself.
P_sim_sunny = a * P_acc_weather['clear']
P_sim_cloudy = b * P_acc_weather['cloudy']

# Simulated accident probability when only sunny and cloudy weather can occur.
P_sim_sunny_cloudy = P_sim_sunny + P_sim_cloudy

# Display the sunny term, the cloudy term, and their sum
P_sim_sunny, P_sim_cloudy, P_sim_sunny_cloudy

(np.float64(0.02064516129032258),
 np.float64(0.007548387096774193),
 np.float64(0.028193548387096774))

In [32]:
# ii. The probability calculated in Part 2, Task 3.4 is the AV's marginal (unconditional)
#     accident rate per mile for the real CA DMV dataset.
# The three figures below are hard-coded; presumably they were copied from the Part 2
# results -- verify against that notebook if the upstream analysis changes.
accident_prob_per_mile_clear = 0.00010171405221966454
accident_prob_per_mile_cloudy = 0.0017264475242685948
total_accident_prob_per_mile = 0.0018281615764882593