# Accident Propensity Index Calculation v04
--- Ready for Deployment ---

Including efficiency features. Assumes that splines are not longer than around 5.5km since for each start point, only accidents within a radius of, in the most sub-optimal case, 5.5km are checked. This takes .08 seconds. When using 10km radius, it takes .17 seconds. With 100km radius it thakes .36 seconds.

## Import libraries

In [1]:
# Import relevant libraries
import pandas as pd
import math
import folium
import numpy as np

## Efficiency variables

In [2]:
# Define the bucket size: 1 would be every longitude/latitude (ca. 110km); 10 is 1/10th (ca. 11km); 100 is 1/100th (ca. 1.1km); 1000 is 1/1000th (ca. 0.11km)
bucket_length = 1000

# Maximal distance of accidents from route in kilometers
max_distance = 0.05

## Split accident data into buckets - run once at start

In [3]:
# Read the dataframe with the accident locations -> has only columns ID, Start_Lat, and Start_Lng
raw_data = pd.read_csv('final_accident_data.csv')

# Functions to generate latitude and longitude IDs for the bucketing
def get_lat_id(lat):
    return int(lat*bucket_length)
def get_lng_id(lng):
    return int(lng*bucket_length)

# Assign bucketing IDs to each accident
raw_data["lat_id"] = raw_data["Start_Lat"].apply(get_lat_id)
raw_data["lng_id"] = raw_data["Start_Lng"].apply(get_lng_id)

# Group by bucketing IDs
groups = raw_data.groupby(['lat_id', 'lng_id'])

# Iterate over the groups and create individual dataframes
for name, group in groups:
    # Create the dataframe name
    df_name = f"accidents_{name[0]}_{name[1]}"
    
    # Create the dataframe
    vars()[df_name] = group.copy()

# Get a copy of all global variables
global_vars = globals().copy()

## Load severity data

In [4]:
# Load csv
severity_data = pd.read_csv('severity_data.csv')

## Functions to calculate distances and find accidents on route - run once at start

In [5]:
# Define a function to calculate the distance between two points
def distance(point1, point2):
    lat1, lon1 = point1
    lat2, lon2 = point2
    km_per_lat = 110.574 # km per degree latitude
    km_per_lon = 111.320 # km per degree longitude at the equator
    dx = (lon2 - lon1) * km_per_lon * math.cos((lat1 + lat2) / 2)
    dy = (lat2 - lat1) * km_per_lat
    return math.sqrt(dx**2 + dy**2)

# Define a function to calculate the distance between a point and a line segment
def distance_to_segment(point, segment_start, segment_end):
    px, py = point
    x1, y1 = segment_start
    x2, y2 = segment_end
    dx, dy = x2 - x1, y2 - y1
    segment_length_squared = dx*dx + dy*dy
    if segment_length_squared == 0:
        return distance(point, segment_start)
    t = max(0, min(1, ((px - x1) * dx + (py - y1) * dy) / segment_length_squared))
    x = x1 + t * dx
    y = y1 + t * dy
    return distance(point, (x, y))

# Generate dataframe of accidents close to the segment
def generate_data(start_point):
    # Extract the integer values of the start point lat and lng
    start_lat = int(start_point[0]*bucket_length)
    start_lng = int(start_point[1]*bucket_length)

    # Get the dataframes that match the criteria
    dfs_to_use = []
    for lat_offset in [-1, 0, 1]:
        for lng_offset in [-1, 0, 1]:
            lat_id = start_lat + lat_offset
            lng_id = start_lng + lng_offset
            df_name = f"accidents_{lat_id}_{lng_id}"
            if df_name in global_vars and isinstance(global_vars[df_name], pd.DataFrame):
                dfs_to_use.append(global_vars[df_name])

    # Concatenate the dataframes and reset the index
    combined_df = pd.concat(dfs_to_use)
    data = combined_df.reset_index(drop=True)
    return data

# Define a function to find accidents on a given route within a maximum distance
def find_accidents_on_route(start_point, end_point):
    # Create a mask for accidents that are within the maximum distance from the route
    data = generate_data(start_point)
    mask = data.apply(lambda row: distance_to_segment((row['Start_Lat'], row['Start_Lng']), start_point, end_point) <= max_distance, axis=1)

    # Return the accidents that match the mask
    accidents = data.loc[mask]
    return accidents[['ID', 'Start_Lat', 'Start_Lng']]

## Find accidents on route - run every time

In [6]:
route_data = {'route_lat': [33.78252, 33.78343, 33.78438, 33.78526, 33.78612, 33.78701, 33.78794, 33.78884, 33.78974, 33.79064],
              'route_long': [-84.39096, -84.39099, -84.39091, -84.39087, -84.39082, -84.39077, -84.39078, -84.39077, -84.39079, -84.39077]}

route_df = pd.DataFrame(route_data)

In [7]:
route_df

Unnamed: 0,route_lat,route_long
0,33.78252,-84.39096
1,33.78343,-84.39099
2,33.78438,-84.39091
3,33.78526,-84.39087
4,33.78612,-84.39082
5,33.78701,-84.39077
6,33.78794,-84.39078
7,33.78884,-84.39077
8,33.78974,-84.39079
9,33.79064,-84.39077


In [27]:
# Split the DataFrame into three equally sized parts
df_list = np.array_split(route_df, 5)

# Loop through the DataFrames
for i, df in enumerate(df_list):
    # Create a new DataFrame to store the results
    accidents_df = pd.DataFrame()
    # Loop through the pairs of subsequent coordinates
    for j in range(len(df) - 1):
        start_point = j
        end_point = j + 1
        # Call the function and append the results to the accidents DataFrame
        accidents_df = accidents_df.append(find_accidents_on_route(df.iloc[start_point], df.iloc[end_point]), ignore_index=True)
    # Drop duplicate rows from the accidents DataFrame
    accidents_df.drop_duplicates(inplace=True)
    
    # Assign a name to the accidents DataFrame based on the index of the original DataFrame
    df_index = i+1
    globals()[f'accidents_df_{df_index}'] = accidents_df
    
    # Print the resulting accidents DataFrame
    print(f'accidents_df_{df_index}:')
    print(accidents_df)

accidents_df_1:
           ID  Start_Lat  Start_Lng
0    A-152680  33.782870 -84.391229
1   A-1061831  33.782424 -84.391208
2   A-2505844  33.782878 -84.391229
3   A-2616607  33.782698 -84.391220
4   A-2738627  33.782366 -84.391216
5    A-140524  33.782290 -84.390880
6    A-493905  33.782457 -84.390849
7    A-503690  33.782457 -84.390849
8   A-1221326  33.782457 -84.390849
9   A-1409445  33.782457 -84.390849
10  A-1465177  33.782457 -84.390849
11  A-1780717  33.782457 -84.390849
12  A-1801859  33.782457 -84.390849
13  A-2485907  33.782900 -84.390850
14  A-2512851  33.782290 -84.390880
15    A-43876  33.783160 -84.390840
16   A-177208  33.783495 -84.390835
17  A-2150991  33.783230 -84.390840
18  A-2184775  33.783230 -84.390840
accidents_df_2:
           ID  Start_Lat  Start_Lng
0    A-504119  33.784686 -84.391389
1   A-1053987  33.784686 -84.391389
2   A-1266886  33.784483 -84.391373
3   A-1415465  33.784686 -84.391389
4     A-63151  33.784120 -84.390830
5     A-80758  33.784110 -84.390

  accidents_df = accidents_df.append(find_accidents_on_route(df.iloc[start_point], df.iloc[end_point]), ignore_index=True)
  accidents_df = accidents_df.append(find_accidents_on_route(df.iloc[start_point], df.iloc[end_point]), ignore_index=True)
  accidents_df = accidents_df.append(find_accidents_on_route(df.iloc[start_point], df.iloc[end_point]), ignore_index=True)
  accidents_df = accidents_df.append(find_accidents_on_route(df.iloc[start_point], df.iloc[end_point]), ignore_index=True)
  accidents_df = accidents_df.append(find_accidents_on_route(df.iloc[start_point], df.iloc[end_point]), ignore_index=True)


## Calculate Accident Propensity Index

In [28]:
# create a list of dataframes
accidents_dfs = [accidents_df_1, accidents_df_2, accidents_df_3, accidents_df_4, accidents_df_5]
api_dict = {}

# loop through the list of dataframes and join each one with the all_accidents dataframe
for i, df in enumerate(accidents_dfs, start=1):
    exec(f'accidents_df_{i} = df.join(severity_data.set_index("ID"), on="ID", how="left")')
    exec(f'total_severity = accidents_df_{i}["Severity"].sum()')
    api = total_severity / 6035011
    api_dict[f'api_{i}'] = round(api,8)

In [29]:
accidents_df_1

Unnamed: 0,ID,Start_Lat,Start_Lng,Severity
0,A-152680,33.78287,-84.391229,2
1,A-1061831,33.782424,-84.391208,2
2,A-2505844,33.782878,-84.391229,3
3,A-2616607,33.782698,-84.39122,3
4,A-2738627,33.782366,-84.391216,3
5,A-140524,33.78229,-84.39088,3
6,A-493905,33.782457,-84.390849,2
7,A-503690,33.782457,-84.390849,2
8,A-1221326,33.782457,-84.390849,2
9,A-1409445,33.782457,-84.390849,2


In [30]:
api_dict

{'api_1': 7.62e-06,
 'api_2': 1.243e-05,
 'api_3': 1.375e-05,
 'api_4': 3.31e-06,
 'api_5': 1.74e-05}