In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely import Polygon, box
from concurrent.futures import ProcessPoolExecutor, as_completed
from mapToCells import processCell

In [2]:
crash_frame = pd.read_csv("../data/processed/encoded_data_binary_encoding.csv")

gdf = gpd.GeoDataFrame(
    crash_frame, 
    geometry=gpd.points_from_xy(crash_frame["Longitude"], crash_frame["Latitude"]), 
    crs="EPSG:4326"
)

# left upper bound - 33°15'08.7"N 97°21'46.3"W
# right lower bound - 32°22'36.5"N 96°07'37.0"W

#using our bounds, create the corners we'll use in our polygon
upperLeft = (-1 * (97 + 21/60 + 46.3/3600), 33 + 15/60 + 8.7/3600)
bottomRight = (-1 * (96 + 7/60 + 37/3600), 32 + 22/60 + 36.5/3600)

upperRight = (bottomRight[0], upperLeft[1])
bottomLeft = (upperLeft[0], bottomRight[1])

# create the polygon
dallasBounds = Polygon([upperLeft, upperRight, bottomRight, bottomLeft, upperLeft])

# create teh dataframe
dallasFrame = gpd.GeoDataFrame({"geometry": [dallasBounds]}, crs="EPSG:4326")

# cahnge coordinate system of both
dallasFrame = dallasFrame.to_crs(epsg=32614)
gdf = gdf.to_crs(epsg=32614)

cell_size = 500

minx, miny, maxx, maxy = dallasFrame.total_bounds

grids = []
index = 0
# create grids based on the bounds
for x in np.arange(minx, maxx, cell_size):
    for y in np.arange(miny, maxy, cell_size):
        grids.append(box(x, y, x+cell_size, y+cell_size))

# 2. Optional: intersect with Dallas boundary to crop
grid = gpd.GeoDataFrame({"geometry": grids}, crs=dallasFrame.crs)

grid = gpd.overlay(grid, dallasFrame, how="intersection")

# combine it based on which cells are matching
joined_data = gpd.sjoin(gdf, grid, how="left", predicate="intersects")

In [None]:
final_cells = []
numWithoutData = 0

if __name__ == "__main__":    
    with ProcessPoolExecutor(max_workers=10) as executor:
        # group by when they go to similar cells
        futures = [executor.submit(processCell, crashes) for _, crashes in joined_data.groupby("index_right")]

        for future in as_completed(futures):
            try:
                result = future.result()  # will raise exception if the process failed
                print("Result:", result[0])
                final_cells.extend(result[1])
                numWithoutData += result[2]
            except Exception as e:
                print("Error:", e)

Result: In cell 12031 we have 1 crashes! New count also is 1
Result: In cell 12040 we have 1 crashes! New count also is 1
Result: In cell 12225 we have 1 crashes! New count also is 0
Result: In cell 12226 we have 4 crashes! New count also is 4
Result: In cell 12178 we have 5 crashes! New count also is 1
Result: In cell 12235 we have 2 crashes! New count also is 0
Result: In cell 12372 we have 1 crashes! New count also is 1
Result: In cell 12184 we have 13 crashes! New count also is 0
Result: In cell 12230 we have 9 crashes! New count also is 1
Result: In cell 12373 we have 4 crashes! New count also is 1
Result: In cell 12419 we have 2 crashes! New count also is 0
Result: In cell 12236 we have 13 crashes! New count also is 4
Result: In cell 12379 we have 9 crashes! New count also is 0
Result: In cell 12431 we have 6 crashes! New count also is 2
Result: In cell 12170 we have 27 crashes! New count also is 12
Result: In cell 12420 we have 14 crashes! New count also is 0
Result: In cell 124

In [None]:
exportedDf = pd.DataFrame(final_cells)

exportedDf.drop(columns="Crash Date", inplace=True)

exportedDf.to_csv("../data/final/preprocessed_data.csv", index=False)

print(f"Percent of crashes without data was {numWithoutData / (len(final_cells) / 24)}")

KeyError: "['Crash Date'] not found in axis"