# Traffic Collisions Dataset
This notebook groups traffic collisions by neighbourhood, and by neighbourhood together with month/year.

In [1]:
import geopandas as gpd
import pandas as pd
import shapely
import ast
import os

In [2]:
# Locate the training data relative to the notebook's working directory:
# <parent of the working directory>/train-data
curr_dir = os.path.abspath(os.curdir)
main_dir = os.path.split(curr_dir)[0]
data_dir = os.path.join(main_dir, 'train-data')

## Traffic Collisions.csv

In [3]:
# Load the raw traffic-collision records from the training-data folder
collisions_csv_path = os.path.join(data_dir, "Traffic Collisions - 4326.csv")
collision_df = pd.read_csv(collisions_csv_path)

In [4]:
# Preview the first five raw rows (the 'geometry' column is still a string here)
collision_df.head(n=5)

Unnamed: 0,_id,EventUniqueId,OccurrenceDate,Month,Day_of_Week,Year,Hour,Division,Atom,Neighbourhood,Fatalities,Injury_Collisions,FTR_Collisions,PD_Collisions,geometry
0,1,GO-20148000204,2014-01-03T05:00:00,January,Friday,2014,8.0,NSA,NSA,NSA,,NO,NO,YES,"{'type': 'Point', 'coordinates': (0.0, 0.0)}"
1,2,GO-20148000205,2014-01-03T05:00:00,January,Friday,2014,12.0,D54/D55,68,North Riverdale (68),,NO,NO,YES,"{'type': 'Point', 'coordinates': (-79.35389302..."
2,3,GO-20148000206,2014-01-03T05:00:00,January,Friday,2014,8.0,NSA,NSA,NSA,,NO,NO,YES,"{'type': 'Point', 'coordinates': (0.0, 0.0)}"
3,4,GO-20148000208,2014-01-03T05:00:00,January,Friday,2014,11.0,D52,79,University (79),,NO,NO,YES,"{'type': 'Point', 'coordinates': (-79.40207036..."
4,5,GO-20148000209,2014-01-03T05:00:00,January,Friday,2014,13.0,D32,50,Newtonbrook East (50),,NO,NO,YES,"{'type': 'Point', 'coordinates': (-79.42004278..."


### Transform to GeoPandas.GeoDataFrame

In [5]:
## Build one shapely point per collision.
## Each 'geometry' cell is the text form of a dict such as
## {'type': 'Point', 'coordinates': (x, y)}; parse it safely and keep the coordinates.
coordinate_pairs = (
    ast.literal_eval(raw_geom)['coordinates']
    for raw_geom in collision_df['geometry']
)
geometry = list(map(shapely.Point, coordinate_pairs))

In [6]:
# Build the GeoDataFrame from every original column except the raw 'geometry'
# strings, using the parsed points as the active geometry. The CRS is declared
# in the constructor rather than assigned afterwards.
collisions_gdf = gpd.GeoDataFrame(
    collision_df.drop(columns='geometry'),
    geometry=geometry,
    crs="urn:ogc:def:crs:OGC:1.3:CRS84",
)

In [7]:
# Preview the first five rows; 'geometry' now holds point objects instead of strings
collisions_gdf.head(n=5)

Unnamed: 0,_id,EventUniqueId,OccurrenceDate,Month,Day_of_Week,Year,Hour,Division,Atom,Neighbourhood,Fatalities,Injury_Collisions,FTR_Collisions,PD_Collisions,geometry
0,1,GO-20148000204,2014-01-03T05:00:00,January,Friday,2014,8.0,NSA,NSA,NSA,,NO,NO,YES,POINT (0.00000 0.00000)
1,2,GO-20148000205,2014-01-03T05:00:00,January,Friday,2014,12.0,D54/D55,68,North Riverdale (68),,NO,NO,YES,POINT (-79.35389 43.67049)
2,3,GO-20148000206,2014-01-03T05:00:00,January,Friday,2014,8.0,NSA,NSA,NSA,,NO,NO,YES,POINT (0.00000 0.00000)
3,4,GO-20148000208,2014-01-03T05:00:00,January,Friday,2014,11.0,D52,79,University (79),,NO,NO,YES,POINT (-79.40207 43.66305)
4,5,GO-20148000209,2014-01-03T05:00:00,January,Friday,2014,13.0,D32,50,Newtonbrook East (50),,NO,NO,YES,POINT (-79.42004 43.79800)


### Clean data

#### Change 'Fatalities' to a binary value

In [8]:
# A collision is flagged as fatal when 'Fatalities' holds any value;
# a missing value becomes False.
collisions_gdf['Fatalities'] = collisions_gdf['Fatalities'].notna()

#### Change 'Injury_Collisions' to a binary value, and rename as 'Injuries'

In [9]:
# Flag a collision as injury-related only when the source value is exactly 'YES'.
# The previous replace({'NO': 0, 'YES': 1}).astype(bool) left any unmapped value
# (a missing value or an unexpected string) untouched, and astype(bool) then
# coerced it to True. Comparing directly yields False for those rows and also
# avoids relying on pandas' deprecated implicit downcasting in `replace`.
collisions_gdf['Injuries'] = collisions_gdf['Injury_Collisions'].eq('YES')
# The original text column is no longer needed once the boolean flag exists
collisions_gdf = collisions_gdf.drop('Injury_Collisions', axis = 1)

#### Add 'Date_in_Month'

In [10]:
# 'OccurrenceDate' looks like '2014-01-03T05:00:00': the day of the month is the
# text after the second '-' and before the 'T'.
day_and_time = collisions_gdf['OccurrenceDate'].str.split("-").str[2]
collisions_gdf['Date_in_Month'] = day_and_time.str.split("T").str[0].astype(int)

#### Change 'Month' to an integer value

In [11]:
# Map English month names to their calendar numbers (January -> 1 ... December -> 12)
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
months_mapping = {name: number for number, name in enumerate(month_names, start=1)}
collisions_gdf['Month'] = collisions_gdf['Month'].replace(months_mapping).astype(int)

#### Keep only used columns

In [15]:
# Columns carried forward into the rest of the notebook; everything else is dropped.
# NOTE(review): the recorded preview further down still lists an 'Atom' column,
# which this selection does not keep - confirm which version is intended.
used_columns = [
    "Year",
    "Month",
    "Date_in_Month",
    "Day_of_Week",
    "Hour",
    "Fatalities",
    "Injuries",
    "geometry",
]
collisions_gdf = collisions_gdf[used_columns]

#### Drop rows where coordinates are (0, 0)

In [13]:
# Some records carry the point (0, 0) instead of a real location (in the raw
# preview these are the rows marked 'NSA'); keep only the remaining rows.
placeholder_point = shapely.Point(0, 0)
has_placeholder_location = collisions_gdf['geometry'] == placeholder_point
collisions_gdf = collisions_gdf[~has_placeholder_location]

#### Let's see what we have done

In [14]:
# Preview the first five rows of the cleaned data
collisions_gdf.head(n=5)

Unnamed: 0,Year,Month,Date_in_Month,Day_of_Week,Hour,Atom,Fatalities,Injuries,geometry
1,2014,1,3,Friday,12.0,68,False,False,POINT (-79.35389 43.67049)
3,2014,1,3,Friday,11.0,79,False,False,POINT (-79.40207 43.66305)
4,2014,1,3,Friday,13.0,50,False,False,POINT (-79.42004 43.79800)
5,2014,1,3,Friday,10.0,31,False,False,POINT (-79.44586 43.71415)
6,2014,1,3,Friday,10.0,4,False,False,POINT (-79.56666 43.71171)


## Neighbourhood