# Preprocessing Playground

This Jupyter notebook is a playground for experimenting with the preprocessing of the data.
The final preprocessing code belongs in `preprocessing.py`.

In [1]:
import pandas as pd

### Downloading the data manually

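Download the CSV export of the *Motor Vehicle Collisions - Crashes* dataset from the link below and save it next to this notebook as `Motor_Vehicle_Collisions_-_Crashes.csv`:

<https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95>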

In [2]:
# Read the complete CSV export into memory. low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk,
# which avoids mixed-type columns.
df = pd.read_csv(
    'Motor_Vehicle_Collisions_-_Crashes.csv',
    low_memory=False,
)

In [3]:
# Remove the LOCATION column and every column whose name starts with
# 'VEHICLE TYPE'.
# Reassignment plus errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second execution no longer raises KeyError because
# LOCATION has already been dropped.
df = df.drop(columns=['LOCATION'], errors='ignore')
df = df.loc[:, ~df.columns.str.startswith('VEHICLE TYPE')]
# Disabled: would also drop the 'CONTRIBUTING FACTOR ...' columns.
# df = df.loc[:, ~df.columns.str.startswith('CONTRIBUTING FACTOR')]

In [4]:
# Join the separate date and time text columns with a space and parse the
# result into a datetime column named 'timestamp'.
crash_datetime_text = df['CRASH DATE'] + ' ' + df['CRASH TIME']
df['timestamp'] = pd.to_datetime(crash_datetime_text, format='%m/%d/%Y %H:%M')

In [5]:
# Boolean mask over the column index: True for every column whose name
# contains the literal text 'NUMBER OF' (the injured/killed count columns).
integer_columns = df.columns.str.contains('NUMBER OF', regex=False)

In [6]:
# Cast the count columns selected by `integer_columns` to the nullable Int16
# dtype.
# astype + reassignment is used instead of `df.loc[:, mask] = ...`: since
# pandas 2.0 a `.loc[:, ...]` assignment writes the new values into the
# existing columns and keeps their old dtype, so the cast would silently be
# lost.
df = df.astype({column: pd.Int16Dtype() for column in df.columns[integer_columns]})

In [7]:
# Preview the first five rows to sanity-check the columns built so far.
df.head(n=5)

Unnamed: 0,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,NUMBER OF PERSONS INJURED,...,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,COLLISION_ID,timestamp
0,04/14/2021,5:32,,,,,BRONX WHITESTONE BRIDGE,,,0,...,0,0,0,Following Too Closely,Unspecified,,,,4407480,2021-04-14 05:32:00
1,04/13/2021,21:35,BROOKLYN,11217.0,40.68358,-73.97617,,,620 ATLANTIC AVENUE,1,...,0,0,0,Unspecified,,,,,4407147,2021-04-13 21:35:00
2,04/15/2021,16:15,,,,,HUTCHINSON RIVER PARKWAY,,,0,...,0,0,0,Pavement Slippery,,,,,4407665,2021-04-15 16:15:00
3,04/13/2021,16:00,BROOKLYN,11222.0,,,VANDERVORT AVENUE,ANTHONY STREET,,0,...,0,0,0,Following Too Closely,Unspecified,,,,4407811,2021-04-13 16:00:00
4,04/12/2021,8:25,,,0.0,0.0,EDSON AVENUE,,,0,...,0,0,0,Unspecified,Unspecified,,,,4406885,2021-04-12 08:25:00


In [None]:
# Print a concise summary of the frame: columns, non-null counts, dtypes and
# memory usage.
df.info()

In [8]:
# Let pandas pick the best nullable dtype for every column, then normalise
# the column names: lower-case, with spaces replaced by underscores.
df = df.convert_dtypes()
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [9]:
# Order the rows from the newest to the oldest timestamp and promote
# 'timestamp' from a regular column to the index.
df = (
    df
    .sort_values(by=['timestamp'], ascending=False)
    .set_index('timestamp')
)

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [10]:
# Show the dtype of every remaining column.
df.dtypes

crash_date                        string
crash_time                        string
borough                           string
zip_code                          string
latitude                         Float64
longitude                        Float64
on_street_name                    string
cross_street_name                 string
off_street_name                   string
number_of_persons_injured          Int16
number_of_persons_killed           Int16
number_of_pedestrians_injured      Int16
number_of_pedestrians_killed       Int16
number_of_cyclist_injured          Int16
number_of_cyclist_killed           Int16
number_of_motorist_injured         Int16
number_of_motorist_killed          Int16
contributing_factor_vehicle_1     string
contributing_factor_vehicle_2     string
contributing_factor_vehicle_3     string
contributing_factor_vehicle_4     string
contributing_factor_vehicle_5     string
collision_id                       Int64
dtype: object

In [None]:
# Disabled alternative export to Feather. The index is reset first because
# DataFrame.to_feather requires a default index.
# df.reset_index().to_feather('crashes.feather', compression='lz4', compression_level=10)

In [None]:
# Persist the frame, including its index, as a brotli-compressed Parquet file
# written with the pyarrow engine.
df.to_parquet(
    'crashes.parquet',
    engine='pyarrow',
    compression='brotli',
    index=True,
)

### Read the Parquet file back

In [None]:
# Load the Parquet file into a separate frame using the pyarrow engine.
df1 = pd.read_parquet(
    'crashes.parquet',
    engine='pyarrow',
)

In [None]:
# Print columns, non-null counts, dtypes and memory usage of the frame read
# from the Parquet file.
df1.info()