In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import time

In [None]:
# Load the raw crash records and turn each (Longitude, Latitude) pair into a
# point geometry, declared as EPSG:4326 and then reprojected to EPSG:32614.
crash_frame = pd.read_csv("../PotentialCSV.csv")
gdf = gpd.GeoDataFrame(
    crash_frame,
    geometry=gpd.points_from_xy(crash_frame["Longitude"], crash_frame["Latitude"]),
    crs="EPSG:4326",
).to_crs(epsg=32614)

grids = gpd.read_parquet("../../data/processing/cells.parquet")

# Attach to every crash the index of the grid cell it intersects
# ("index_right" is the column sjoin adds for the matched grid row).
joined_data = gpd.sjoin(gdf, grids, how="left", predicate="intersects")

# Crashes that fall outside every cell come back with a missing "index_right";
# remove them.
unmatched_labels = joined_data.index[joined_data["index_right"].isna().to_numpy()]
joined_data = joined_data.drop(index=unmatched_labels)

# Cells that received no crash at all still need one row each so they show up
# when the data is later grouped per cell.
matching_cells_indices = joined_data["index_right"].unique()
non_matching_cells = gpd.GeoDataFrame(
    grids.drop(index=matching_cells_indices),
    geometry="geometry",
    crs=grids.crs,
)
# Give the crash-free cells the same cell-id column the joined crashes carry.
non_matching_cells["index_right"] = non_matching_cells.index

# Stack crashes and crash-free cells, then go back to longitude/latitude.
joined_data = gpd.GeoDataFrame(
    pd.concat([joined_data, non_matching_cells], ignore_index=True),
    geometry="geometry",
    crs=grids.crs,
).to_crs("EPSG:4326")

# Accumulators filled by the per-cell processing further down the notebook.
finalCrashCells = []
finalFalseCells = []

# Rolling-window crash counts start at zero and are computed per cell later.
for window_column in ("crash_count_7d", "crash_count_20d", "crash_count_30d"):
    joined_data[window_column] = 0

# "index_right" is the grid cell's index, so expose it as "cell_id"; the other
# renames switch the underscore column names to their spaced spelling.
joined_data = joined_data.rename(
    columns={
        "index_right": "cell_id",
        "Crash_Date": "Crash Date",
        "Crash_Time": "Crash Time",
        "Day_of_Week": "Day of Week",
    }
)
joined_data["label"] = 0

joined_data["Crash Date"] = pd.to_datetime(joined_data["Crash Date"], format="%Y-%m-%d")

With and without lane count, we have: 41632 crashes
With only lane count, we have: 33649 crashes
Finished concatenating


In [None]:
def getRollingCounts(theCrash, crashTime, allCrashes, startIndex):
    """Count earlier crashes in the 7-, 20- and 30-day windows before a crash.

    Walks backwards through ``allCrashes`` from position ``startIndex - 1`` to
    position 0 and counts the rows that happened strictly before the reference
    moment given by ``theCrash["Crash Date"]`` and ``crashTime``.

    Parameters:
        theCrash: row-like object whose ``"Crash Date"`` supports subtraction
            with the dates in ``allCrashes`` (yielding an object with ``.days``).
        crashTime: ``datetime.time`` of the reference row; only used to order
            rows that share the reference date.
        allCrashes: DataFrame with a ``"Crash Date"`` column and a
            ``"Crash Time"`` column of ``"%I:%M %p"`` strings. The early exit
            below relies on it being sorted by ascending date.
        startIndex: number of leading rows of ``allCrashes`` to inspect
            (rows at positions ``0 .. startIndex - 1``).

    Returns:
        Tuple ``(count_7d, count_20d, count_30d)``.
    """
    rolling_7_count = 0
    rolling_20_count = 0
    rolling_30_count = 0

    referenceDate = theCrash["Crash Date"]

    # loop through previous crashes, newest first
    for z in range(startIndex - 1, -1, -1):
        prevCrash = allCrashes.iloc[z]
        daysPassed = (referenceDate - prevCrash["Crash Date"]).days

        # Rows dated after the reference are not history. This happens for
        # synthetic rows, which are compared against the cell's full crash list.
        if daysPassed < 0:
            continue

        # Everything further back is older still, so stop scanning.
        if daysPassed > 30:
            break

        # On the reference date itself, only crashes earlier in the day count.
        if daysPassed == 0:
            prevTime = datetime.strptime(prevCrash["Crash Time"], "%I:%M %p").time()
            if prevTime >= crashTime:
                continue

        # The windows are nested, so these are independent checks rather than
        # an if/elif chain; each qualifying crash is counted once per window.
        if daysPassed <= 7:
            rolling_7_count += 1
        if daysPassed <= 20:
            rolling_20_count += 1
        rolling_30_count += 1

    return (rolling_7_count, rolling_20_count, rolling_30_count)

def processCell(currentCrashes, coordinates, columns):
    """Label one cell's real crashes and synthesize non-crash samples for it.

    Real crashes get ``label = 1`` plus derived features (numeric day of week,
    month, hour of day, rolling crash counts). Negative samples get
    ``label = 0``; their hour-of-week is drawn from the cell's own crash
    distribution and their week is drawn uniformly from the Sunday-started
    weeks between 2022-01-01 and 2025-12-31.

    Parameters:
        currentCrashes: DataFrame of this cell's crashes. Reads the columns
            ``"Crash Date"``, ``"Crash Time"`` (``"%I:%M %p"`` strings) and
            ``"Day of Week"`` (one of ``SUN``..``SAT``). May be empty.
        coordinates: indexable pair written to the negatives'
            ``"Longitude"`` (item 0) and ``"Latitude"`` (item 1).
        columns: index used to create each negative row, so negatives carry
            the same base columns as the input data.

    Returns:
        Tuple ``(trueCrashes, falseCrashes)`` of DataFrames.

    Raises:
        KeyError: if a ``"Day of Week"`` value is not a key of the day mapping.
        ValueError: if a ``"Crash Time"`` string does not match ``"%I:%M %p"``.
    """
    timeFormat = "%I:%M %p"
    dayDict = {"SUN": 0, "MON": 1, "TUE": 2, "WED": 3, "THU": 4, "FRI": 5, "SAT": 6}

    # At least 10 negatives per cell, otherwise 2.5 negatives per real crash.
    numFalseCrashes = max(10, int(2.5 * len(currentCrashes)))

    # One bucket per hour of the week. Starting every bucket at 1 keeps each
    # hour possible even for cells with few (or no) crashes.
    weekHourDistribution = np.ones(24 * 7)
    weekHourArr = np.arange(0, len(weekHourDistribution), 1)

    trueCrashArr = []
    # enumerate() supplies the row's position, which is what getRollingCounts
    # needs; the iterrows() label only matches it for a fresh 0..n-1 index.
    for position, (_, sourceRow) in enumerate(currentCrashes.iterrows()):
        realCrash = sourceRow.copy()
        crashTime = datetime.strptime(realCrash["Crash Time"], timeFormat).time()

        dayOfWeek = dayDict[realCrash["Day of Week"]]
        realCrash["Day of Week"] = dayOfWeek

        realCrash["Crash Month"] = realCrash["Crash Date"].month

        weekHourDistribution[24 * dayOfWeek + crashTime.hour] += 1

        rollingCounts = getRollingCounts(realCrash, crashTime, currentCrashes, position)
        realCrash["crash_count_7d"] = rollingCounts[0]
        realCrash["crash_count_20d"] = rollingCounts[1]
        realCrash["crash_count_30d"] = rollingCounts[2]

        # Same column name the negative samples use, so both outputs line up.
        realCrash["Hour of Day"] = crashTime.hour

        realCrash["label"] = 1
        trueCrashArr.append(realCrash)

    # normalized to sum to 1
    weekHourDistribution = weekHourDistribution / weekHourDistribution.sum()

    start_date = pd.Timestamp("2022-01-01")
    end_date = pd.Timestamp("2025-12-31")

    # Generate all Sundays between start and end dates
    week_starts = pd.date_range(start=start_date, end=end_date, freq="W-SUN")

    minuteArr = np.arange(0, 60, 1)
    weekHour = np.random.choice(weekHourArr, size=numFalseCrashes, p=weekHourDistribution)
    minute = np.random.choice(minuteArr, size=numFalseCrashes)
    selectedWeek = np.random.choice(week_starts, size=numFalseCrashes)

    falseCrashArr = []
    for i in range(numFalseCrashes):
        # Offset from the week's Sunday 00:00 by THIS sample's hour-of-week.
        hourOfWeek = int(weekHour[i])
        dayOfWeek, hour = divmod(hourOfWeek, 24)
        sampleMoment = pd.Timestamp(selectedWeek[i]) + pd.Timedelta(hours=hourOfWeek)

        sampleTime = time(hour=hour, minute=int(minute[i]))

        newRow = pd.Series(index=columns, dtype=object)
        newRow["Crash Date"] = sampleMoment.normalize()
        newRow["Crash Time"] = sampleTime.strftime(timeFormat)
        newRow["Day of Week"] = dayOfWeek
        newRow["Crash Month"] = sampleMoment.month
        newRow["Hour of Day"] = hour
        newRow["Longitude"] = coordinates[0]
        newRow["Latitude"] = coordinates[1]

        # Compare against the cell's whole crash history.
        rollingCounts = getRollingCounts(newRow, sampleTime, currentCrashes, currentCrashes.shape[0])
        newRow["crash_count_7d"] = rollingCounts[0]
        newRow["crash_count_20d"] = rollingCounts[1]
        newRow["crash_count_30d"] = rollingCounts[2]
        newRow["label"] = 0

        falseCrashArr.append(newRow)

    return (pd.DataFrame(trueCrashArr), pd.DataFrame(falseCrashArr))
                        
columns = joined_data.columns

# Start from empty accumulators so re-running this cell does not append a
# second copy of every row.
finalCrashCells.clear()
finalFalseCells.clear()

# Process the rows of each grid cell together.
for cellIndex, crashes in joined_data.groupby("cell_id"):
    # NOTE(review): for cells that contain crashes, "geometry" looks like the
    # crash point kept by the left spatial join rather than the cell polygon,
    # so this centroid would be the first crash's location - confirm intended.
    center = crashes.iloc[0]["geometry"].centroid
    centerCoords = (center.x, center.y)

    # A crash-free cell is represented by one placeholder row without a
    # Latitude; filtering it out leaves an empty frame for that cell.
    crashes = crashes.loc[crashes["Latitude"].notna()].drop(columns=["geometry"])

    if len(crashes) > 0:
        print(f'In cell {cellIndex} we have {len(crashes)} crashes!')

    # Order chronologically. "Crash Time" holds 12-hour strings such as
    # "01:00 PM", which do not sort in time order as text, so sort on a parsed
    # helper column and discard it afterwards.
    sortTime = pd.to_datetime(crashes["Crash Time"], format="%I:%M %p", errors="coerce")
    crashes = (
        crashes.assign(_sortTime=sortTime)
        .sort_values(["Crash Date", "_sortTime"], ascending=True)
        .drop(columns=["_sortTime"])
        .reset_index(drop=True)
    )

    addedCrashes, addedNonCrashes = processCell(crashes, centerCoords, columns)

    # Extending a list with a DataFrame would add its column NAMES; add one
    # record per row instead so pd.DataFrame(finalCrashCells) rebuilds the rows.
    finalCrashCells.extend(addedCrashes.to_dict("records"))
    finalFalseCells.extend(addedNonCrashes.to_dict("records"))


In [None]:
# Write the labelled crashes collected across all cells.
exportedCrashes = pd.DataFrame(finalCrashCells)
# Removed b/c of time series
# exportedDf.drop(columns=["Hour of Day", "Crash Date"], inplace=True)

exportedCrashes.to_csv("../data/final/preprocessed_crashes.csv", index=False)

# Write the negatives collected across all cells. `addedNonCrashes` only holds
# the result of the last cell processed, so it must not be used here.
exportedNegatives = pd.DataFrame(finalFalseCells)
# Removed b/c of time series
# exportedDf.drop(columns=["Hour of Day", "Crash Date"], inplace=True)

exportedNegatives.to_csv("../data/final/preprocessed_negatives.csv", index=False)

In [None]:
# numCrashGroups = 0  
# numCrashesTotal = 0

# # group by when they go to similar cells
# for _, crashes in joined_data.groupby("index_left"):
    
#     numCrashGroups+= 1
#     numCrashesTotal += len(crashes)
# mean = numCrashesTotal / numCrashGroups

# standardDev = 0

# # group by when they go to similar cells
# for _, crashes in joined_data.groupby("index_left"):
#     standardDev += (len(crashes) - mean)**2
# standardDev /= (numCrashGroups - 1)

# standardDev = standardDev ** (1/2)

# print(f"Mean crashes was {mean}, standard deviation was {standardDev}")

Mean crashes was 1.9060398531292546, standard deviation was 199.48931918548627
