In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely import Polygon, box
from concurrent.futures import ProcessPoolExecutor, as_completed
from mapToCells import processCell

In [2]:
crash_frame = pd.read_csv("../data/processed/encoded_data_binary_encoding.csv")

gdf = gpd.GeoDataFrame(
    crash_frame, 
    geometry=gpd.points_from_xy(crash_frame["Longitude"], crash_frame["Latitude"]), 
    crs="EPSG:4326"
)

# left upper bound - 33°15'08.7"N 97°21'46.3"W
# right lower bound - 32°22'36.5"N 96°07'37.0"W

#using our bounds, create the corners we'll use in our polygon
upperLeft = (-1 * (97 + 21/60 + 46.3/3600), 33 + 15/60 + 8.7/3600)
bottomRight = (-1 * (96 + 7/60 + 37/3600), 32 + 22/60 + 36.5/3600)

upperRight = (bottomRight[0], upperLeft[1])
bottomLeft = (upperLeft[0], bottomRight[1])

# create the polygon
dallasBounds = Polygon([upperLeft, upperRight, bottomRight, bottomLeft, upperLeft])

# create teh dataframe
dallasFrame = gpd.GeoDataFrame({"geometry": [dallasBounds]}, crs="EPSG:4326")

# cahnge coordinate system of both
dallasFrame = dallasFrame.to_crs(epsg=32614)
gdf = gdf.to_crs(epsg=32614)

cell_size = 500

minx, miny, maxx, maxy = dallasFrame.total_bounds

grids = []
index = 0
# create grids based on the bounds
for x in np.arange(minx, maxx, cell_size):
    for y in np.arange(miny, maxy, cell_size):
        grids.append(box(x, y, x+cell_size, y+cell_size))

# 2. Optional: intersect with Dallas boundary to crop
grid = gpd.GeoDataFrame({"geometry": grids}, crs=dallasFrame.crs)

grid = gpd.overlay(grid, dallasFrame, how="intersection")

# combine it based on which cells are matching
joined_data = gpd.sjoin(gdf, grid, how="left", predicate="intersects")

In [3]:
lowerBound = 18000
upperBound = 19000
newCellFrame = None

for _, crashes in joined_data.groupby("index_right"):
    cellIndex = crashes.iloc[0]["index_right"]
    if lowerBound <= cellIndex < upperBound:
        if newCellFrame is None:
            newCellFrame = crashes.copy()
        else:
            newCellFrame = pd.concat([newCellFrame, crashes], ignore_index=True)


In [4]:
final_cells = []
numWithoutData = 0

if __name__ == "__main__":    
    with ProcessPoolExecutor(max_workers=10) as executor:
        # group by when they go to similar cells
        futures = []
        for _, crashes in newCellFrame.groupby("index_right"):
            future = executor.submit(processCell, crashes)
            futures.append(future)

        for future in as_completed(futures):
            try:
                result = future.result()  # will raise exception if the process failed
                if(result[0] != "NOT APPLICABLE"):
                    print ("Result:", result[0])
                    final_cells.extend(result[1])
                    numWithoutData += result[2]
            except Exception as e:
                print("Error:", e)

Result: In cell 18024 we have 30 crashes! New count also is 0
Result: In cell 18033 we have 17 crashes! New count also is 1
Result: In cell 18026 we have 4 crashes! New count also is 0
Result: In cell 18070 we have 5 crashes! New count also is 1
Result: In cell 18010 we have 13 crashes! New count also is 7
Result: In cell 18206 we have 19 crashes! New count also is 5
Result: In cell 18064 we have 10 crashes! New count also is 0
Result: In cell 18050 we have 152 crashes! New count also is 15
Result: In cell 18062 we have 28 crashes! New count also is 12
Result: In cell 18025 we have 98 crashes! New count also is 1
Result: In cell 18229 we have 1 crashes! New count also is 0
Result: In cell 18069 we have 55 crashes! New count also is 23
Result: In cell 18223 we have 32 crashes! New count also is 1
Result: In cell 18082 we have 63 crashes! New count also is 1
Result: In cell 18278 we have 2 crashes! New count also is 0
Result: In cell 18266 we have 5 crashes! New count also is 0
Result: I

In [5]:
fileNameCells = f"CellDataFrom{lowerBound}To{upperBound}"
fileNameFailures = f"FailuresFrom{lowerBound}To{upperBound}.txt"

exportedDf = pd.DataFrame(final_cells)
exportedDf.drop(columns="Crash Date", inplace=True)

exportedDf.to_csv(f"{fileNameCells}.csv", index=False)

print(f"Percent of crashes without data was {numWithoutData / (len(final_cells) / 24)}")



with open(fileNameFailures, "w") as f2:
    f2.write(f"Num failures was: {numWithoutData} when total was {len(final_cells) / 24}")
        


Percent of crashes without data was 0.10999371464487744


In [6]:
# exportedDf = pd.DataFrame(final_cells)

# exportedDf.drop(columns="Crash Date", inplace=True)

# exportedDf.to_csv("../data/final/preprocessed_data.csv", index=False)

# print(f"Percent of crashes without data was {numWithoutData / (len(final_cells) / 24)}")