In [1]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, box, Polygon
import numpy as np
import matplotlib.pyplot as plt
import ast
import datetime
from sklearn.model_selection import TimeSeriesSplit

In [2]:
def setupApiFile():
    apiFile = pd.read_csv("../data/processed/crash_street_api_features.csv")

    # remove columns we basically already have``
    apiFile.drop(columns={"maxspeed", "road_type", "AADT"}, inplace=True)

    # just print out before filterin
    print(f"With and without lane count, we have: {apiFile.shape[0]} crashes")

    apiFile = apiFile.dropna(subset=["lane_count"])

    apiFile.fillna(0, inplace=True)

    print(f"With only lane count, we have: {apiFile.shape[0]} crashes")

    for index, crash in apiFile.iterrows():
        try:
            apiFile.at[index, "lane_count"] = int(crash["lane_count"])
        except:
            laneCount = ast.literal_eval(crash["lane_count"])
            laneCount = int(laneCount[0]) + int(laneCount[1])
            apiFile.at[index, "lane_count"] = laneCount
    return apiFile

In [3]:
apiFile = setupApiFile()
crash_frame = pd.read_csv("../data/processed/encoded_data_binary_encoding.csv")

# API already has this
crash_frame.drop(columns="Adjusted Average Daily Traffic Amount",inplace=True)

gdf = gpd.GeoDataFrame(
    crash_frame, 
    geometry=gpd.points_from_xy(crash_frame["Longitude"], crash_frame["Latitude"]), 
    crs="EPSG:4326"
)

# left upper bound - 33°15'08.7"N 97°21'46.3"W
# right lower bound - 32°22'36.5"N 96°07'37.0"W

#using our bounds, create the corners we'll use in our polygon
upperLeft = (-1 * (97 + 21/60 + 46.3/3600), 33 + 15/60 + 8.7/3600)
bottomRight = (-1 * (96 + 7/60 + 37/3600), 32 + 22/60 + 36.5/3600)

upperRight = (bottomRight[0], upperLeft[1])
bottomLeft = (upperLeft[0], bottomRight[1])

# create the polygon
dallasBounds = Polygon([upperLeft, upperRight, bottomRight, bottomLeft, upperLeft])

# create teh dataframe
dallasFrame = gpd.GeoDataFrame({"geometry": [dallasBounds]}, crs="EPSG:4326")

# cahnge coordinate system of both
dallasFrame = dallasFrame.to_crs(epsg=32614)
gdf = gdf.to_crs(epsg=32614)

cell_size = 500

minx, miny, maxx, maxy = dallasFrame.total_bounds

grids = []
index = 0
# create grids based on the bounds
for x in np.arange(minx, maxx, cell_size):
    for y in np.arange(miny, maxy, cell_size):
        grids.append(box(x, y, x+cell_size, y+cell_size))

# 2. Optional: intersect with Dallas boundary to crop
grid = gpd.GeoDataFrame({"geometry": grids}, crs=dallasFrame.crs)

grid = gpd.overlay(grid, dallasFrame, how="intersection")

# combine it based on which cells are matching. Now this is an array of cells
joined_data = gpd.sjoin(gdf, grid, how="left", predicate="intersects")

# drops crashes that don't correspond to a cell
joined_data = joined_data.drop(joined_data[joined_data["index_right"].isna()].index)

missingCells = []

for index, theGrid in grid.iterrows():
    if index not in joined_data["index_right"].values:
        new_row = gpd.GeoDataFrame([theGrid], crs=grid.crs)
        new_row["index_right"] = index
        missingCells.append(new_row)

if missingCells:
    missingCells = pd.concat(missingCells, ignore_index = True)
    joined_data = pd.concat([joined_data, missingCells], ignore_index=True)

print("Finished concatenating")


noCrashIndices = 0


# then, join by api data
joined_data = pd.merge(joined_data, apiFile, on=["Latitude", "Longitude"], how="left")


joined_data.drop(columns=["Crash ID"], inplace=True)

final_cells = []

hoursPerPeriod = 4

for i in range(0, 24, hoursPerPeriod):
    joined_data[f"time_bin_{i}_{i + 3}"] = 0

joined_data["crash_count_7d"] = 0
joined_data["crash_count_30d"] = 0

# they both are functionally the same, so rename for easier mapping
joined_data.rename(columns = {"index_right": "cell_id"}, inplace=True)
joined_data["label"] = 0

joined_data["Crash Date"] = pd.to_datetime(joined_data["Crash Date"], format="%Y-%m-%d")

joined_data = joined_data.to_crs("EPSG:4326")

With and without lane count, we have: 41632 crashes
With only lane count, we have: 33649 crashes
Finished concatenating


In [None]:
def processCell():
    pass

# group by when they go to similar cells
for _, crashes in joined_data.groupby("cell_id"):
    cellGeometry = crashes.iloc[0]["geometry"]
    # cellCenterCoord = cellGeometry.coords
    # latitude = cellCenterCoord
    cellIndex = crashes.iloc[0]["cell_id"]
    
    if(not pd.isna(crashes.iloc[0]["Latitude"])):
        print(f'In cell {cellIndex} we have {len(crashes)} crashes!')

    if(cellIndex == 0):
        pass
    crashes.drop(columns=["geometry"], inplace=True)

    crashes = crashes.sort_values(["Crash Date", "Hour of Day"], ascending=True).reset_index(drop=True)

In cell 12031 we have 1 crashes!
In cell 12040 we have 1 crashes!
In cell 12170 we have 27 crashes!
In cell 12178 we have 5 crashes!
In cell 12184 we have 13 crashes!
In cell 12188 we have 61 crashes!
In cell 12206 we have 82 crashes!
In cell 12225 we have 1 crashes!
In cell 12226 we have 4 crashes!
In cell 12230 we have 9 crashes!
In cell 12235 we have 2 crashes!
In cell 12236 we have 13 crashes!
In cell 12365 we have 46 crashes!
In cell 12372 we have 1 crashes!
In cell 12373 we have 4 crashes!
In cell 12379 we have 9 crashes!
In cell 12383 we have 167 crashes!
In cell 12384 we have 29 crashes!
In cell 12401 we have 62 crashes!
In cell 12419 we have 2 crashes!
In cell 12420 we have 14 crashes!
In cell 12425 we have 16 crashes!
In cell 12431 we have 6 crashes!
In cell 12432 we have 26 crashes!
In cell 12433 we have 15 crashes!
In cell 12560 we have 129 crashes!
In cell 12567 we have 35 crashes!
In cell 12574 we have 22 crashes!
In cell 12575 we have 5 crashes!
In cell 12578 we have 64 

In [None]:
exportedDf = pd.DataFrame(final_cells)

# Removed b/c of time series
# exportedDf.drop(columns=["Hour of Day", "Crash Date"], inplace=True)

exportedDf.to_csv("../data/final/true_preprocessed_data.csv", index=False)

In [None]:
numCrashGroups = 0
numCrashesTotal = 0

# group by when they go to similar cells
for _, crashes in joined_data.groupby("index_left"):
    
    numCrashGroups+= 1
    numCrashesTotal += len(crashes)
mean = numCrashesTotal / numCrashGroups

standardDev = 0

# group by when they go to similar cells
for _, crashes in joined_data.groupby("index_left"):
    standardDev += (len(crashes) - mean)**2
standardDev /= (numCrashGroups - 1)

standardDev = standardDev ** (1/2)

print(f"Mean crashes was {mean}, standard deviation was {standardDev}")

Mean crashes was 1.9060398531292546, standard deviation was 199.48931918548627
