In [1]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, box, Polygon
import numpy as np
import matplotlib.pyplot as plt
import ast
import datetime
from sklearn.model_selection import TimeSeriesSplit

In [2]:
def setupApiFile():
    apiFile = pd.read_csv("../data/processed/crash_street_api_features.csv")

    # remove columns we basically already have``
    apiFile.drop(columns={"maxspeed", "road_type", "AADT"}, inplace=True)

    # just print out before filterin
    print(f"With and without lane count, we have: {apiFile.shape[0]} crashes")

    apiFile = apiFile.dropna(subset=["lane_count"])

    apiFile.fillna(0, inplace=True)

    print(f"With only lane count, we have: {apiFile.shape[0]} crashes")

    for index, crash in apiFile.iterrows():
        try:
            apiFile.at[index, "lane_count"] = int(crash["lane_count"])
        except:
            laneCount = ast.literal_eval(crash["lane_count"])
            laneCount = int(laneCount[0]) + int(laneCount[1])
            apiFile.at[index, "lane_count"] = laneCount
    return apiFile

In [None]:
apiFile = setupApiFile()
crash_frame = pd.read_csv("../data/processed/encoded_data_binary_encoding.csv")

# API already has this
crash_frame.drop(columns="Adjusted Average Daily Traffic Amount",inplace=True)

gdf = gpd.GeoDataFrame(
    crash_frame, 
    geometry=gpd.points_from_xy(crash_frame["Longitude"], crash_frame["Latitude"]), 
    crs="EPSG:4326"
)

# left upper bound - 33°15'08.7"N 97°21'46.3"W
# right lower bound - 32°22'36.5"N 96°07'37.0"W

#using our bounds, create the corners we'll use in our polygon
upperLeft = (-1 * (97 + 21/60 + 46.3/3600), 33 + 15/60 + 8.7/3600)
bottomRight = (-1 * (96 + 7/60 + 37/3600), 32 + 22/60 + 36.5/3600)

upperRight = (bottomRight[0], upperLeft[1])
bottomLeft = (upperLeft[0], bottomRight[1])

# create the polygon
dallasBounds = Polygon([upperLeft, upperRight, bottomRight, bottomLeft, upperLeft])

# create teh dataframe
dallasFrame = gpd.GeoDataFrame({"geometry": [dallasBounds]}, crs="EPSG:4326")

# cahnge coordinate system of both
dallasFrame = dallasFrame.to_crs(epsg=32614)
gdf = gdf.to_crs(epsg=32614)

cell_size = 500

minx, miny, maxx, maxy = dallasFrame.total_bounds

grids = []
index = 0
# create grids based on the bounds
for x in np.arange(minx, maxx, cell_size):
    for y in np.arange(miny, maxy, cell_size):
        grids.append(box(x, y, x+cell_size, y+cell_size))

# 2. Optional: intersect with Dallas boundary to crop
grid = gpd.GeoDataFrame({"geometry": grids}, crs=dallasFrame.crs)

grid = gpd.overlay(grid, dallasFrame, how="intersection")

# combine it based on which cells are matching. Now this is an array of cells
joined_data = gpd.sjoin(grid, gdf, how="left", predicate="intersects")

noCrashIndices = 0

# Replace NaN values in 'index_right' with -1
joined_data["index_right"] = joined_data["index_right"].fillna(0)

print(joined_data[joined_data["index_right"] == 0])



# then, join by api data
joined_data = pd.merge(joined_data, apiFile, on=["Latitude", "Longitude"], how="left")

# drops ones that either aren't an empty cell or don't have api values
joined_data = joined_data.drop(
    joined_data[
        (joined_data["index_right"] != 0) &
        (joined_data["lane_count"].isna())
    ].index
)


joined_data.drop(columns=["Crash ID", "geometry", "Latitude", "Longitude"], inplace=True)

final_cells = []

cellId = 0

hoursPerPeriod = 4

for i in range(0, 24, hoursPerPeriod):
    joined_data[f"time_bin_{i}_{i + 3}"] = 0

joined_data["crash_count_7d"] = 0
joined_data["crash_count_30d"] = 0
joined_data["cell_id"] = 0
joined_data["label"] = 0

joined_data["Crash Date"] = pd.to_datetime(joined_data["Crash Date"], format="%Y-%m-%d")

numCrashGroups = 0
numCrashesTotal = 0

# group by when they go to similar cells
for _, crashes in joined_data.groupby("index_right"):
    numCrashGroups+= 1
    numCrashesTotal += len(crashes)
    
    cellIndex = crashes.iloc[0]["index_right"]
    crashes["cell_id"] = cellId
    crashes.drop(columns=["index_right"], inplace=True)
    print(f'In cell {cellIndex} we have {len(crashes)} crashes!')

    crashes = crashes.sort_values(["Crash Date", "Hour of Day"], ascending=True).reset_index(drop=True)

    # loop through each crash to add it to each timeslot
    for i, crash in crashes.iterrows():        
        crashIndex = crash["Hour of Day"]
        # loop through each possible timeslot represneted by j, and put into the cells the value with the correct label based on if the time matches.
        for currentLowerHourBound in range(0, 24, hoursPerPeriod):
            currentUpperHourBound = currentLowerHourBound + (hoursPerPeriod - 1)

            crashToAppend = crash.copy()

            # check if the timeslot matches. If so, put crash with label of 1, otherwise keep it as the 0. Also, keep current hour of day
            if(crashIndex >= currentLowerHourBound and crashIndex <= currentUpperHourBound):
                crashToAppend["label"] = 1
            # otherwise, keep label as 0, and make the hour of day the halfway point between the 2 bounds
            else:
                crashToAppend["Hour of Day"] = int((currentLowerHourBound + currentUpperHourBound) / 2)

            rolling_7_count = 0
            rolling_30_count = 0
            currentCrashDate = crash["Crash Date"]
            # loop through previous crashes
            for z in range(i - 1, -1, -1):
                prevCrashDate = crashes.iloc[z]["Crash Date"]
                daysPassed = (currentCrashDate - prevCrashDate).days
                # use this if sattement to add it
                if(daysPassed == 0):
                    prevHour = crashes.iloc[z]["Hour of Day"]
                    if(prevHour < crashToAppend["Hour of Day"]):
                        rolling_7_count += 1
                        rolling_30_count += 1
                if(daysPassed > 30):
                    break
                # note that we don't use an else statement. This way if both are true, it'll be added to both
                if(daysPassed <= 7):
                    rolling_7_count += 1
                if(daysPassed <= 30):
                    rolling_30_count += 1
            # now, add it to the count
            crashToAppend["crash_count_7d"] = rolling_7_count
            crashToAppend["crash_count_30d"] = rolling_30_count

            # don't need it anymore, so remove it
            # make the right one hot encoded value as true
            crashToAppend[f"time_bin_{currentLowerHourBound}_{currentLowerHourBound+3}"] = 1

            final_cells.append(crashToAppend)
    cellId += 1

mean = numCrashesTotal / numCrashGroups

With and without lane count, we have: 41632 crashes
With only lane count, we have: 33649 crashes
                                                geometry  index_right  \
0      POLYGON ((653006.073 3648381.413, 653006.073 3...          0.0   
1      POLYGON ((653006.073 3648881.413, 653006.073 3...          0.0   
2      POLYGON ((653006.073 3649381.413, 653006.073 3...          0.0   
3      POLYGON ((653006.073 3649881.413, 653006.073 3...          0.0   
4      POLYGON ((653006.073 3650381.413, 653006.073 3...          0.0   
...                                                  ...          ...   
45398  POLYGON ((770006.073 3594881.413, 770006.073 3...          0.0   
45399  POLYGON ((770006.073 3595381.413, 770006.073 3...          0.0   
45400  POLYGON ((770006.073 3595881.413, 770006.073 3...          0.0   
45401  POLYGON ((770006.073 3596381.413, 770006.073 3...          0.0   
45402  POLYGON ((770006.073 3596881.413, 770006.073 3...          0.0   

       Crash ID At Interse

In [None]:
exportedDf = pd.DataFrame(final_cells)

# Removed b/c of time series
# exportedDf.drop(columns=["Hour of Day", "Crash Date"], inplace=True)

exportedDf.to_csv("../data/final/true_preprocessed_data.csv", index=False)

<bound method NDFrame.describe of     At Intersection Flag  Construction Zone Flag Crash Date  Crash Month  \
0                  False                   False 2023-02-11            2   
1                  False                   False 2023-02-19            2   
2                  False                   False 2023-03-16            3   
3                  False                   False 2023-03-17            3   
4                  False                   False 2023-06-24            6   
..                   ...                     ...        ...          ...   
33                  True                    True 2025-02-19            2   
34                  True                   False 2025-07-08            7   
35                  True                   False 2025-07-11            7   
36                 False                   False 2025-08-24            8   
37                 False                   False 2025-09-09            9   

    Day of Week_0  Day of Week_1  Day of Week_2  Hour

In [None]:
numCrashGroups = 0
numCrashesTotal = 0

# group by when they go to similar cells
for _, crashes in joined_data.groupby("index_right"):
    numCrashGroups+= 1
    numCrashesTotal += len(crashes)
mean = numCrashesTotal / numCrashGroups

standardDev = 0

# group by when they go to similar cells
for _, crashes in joined_data.groupby("index_right"):
    standardDev += (len(crashes) - mean)**2
standardDev /= (numCrashGroups - 1)

standardDev = standardDev ** (1/2)

print(f"Mean crashes was {mean}, standard deviation was {standardDev}")

Mean crashes was 42.045099739809196, standard deviation was 57.124815028371316
