In [3]:
# Data-handling libraries used throughout the notebook.
import pandas as pd
import numpy as np
# Raise pandas' display limits so long/wide frames are shown with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import datetime as dt
import warnings
# NOTE(review): this suppresses every warning category for the whole session,
# including pandas deprecation and chained-assignment warnings.
warnings.filterwarnings("ignore")

#### 2019 + 2020

In [4]:
# Read the two yearly crash extracts; the first path is bound to data19 and
# the second to data20.
data19, data20 = (
    pd.read_csv(csv_path)
    for csv_path in ("NYC_Crash_Data_2019.csv", "NYC_Crash_Data_2020.csv")
)

In [5]:
# Stack the 2019 and 2020 rows into one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. The original row labels of each input are
# kept (no ignore_index) and columns are not sorted, matching the previous
# append(..., sort=False) call.
data = pd.concat([data19, data20], sort=False)

In [6]:
# Flag a crash as fatal (1) when at least one person was killed, else 0.
# A missing death count compares False against 0 and is therefore flagged 0,
# the same outcome as the element-wise `x > 0` test this replaces.
data['Crash_Fatal_Fl'] = data['NUMBER OF PERSONS KILLED'].gt(0).astype('int64')

# Keep only the columns used downstream. The explicit .copy() makes dataSub
# an independent frame, so the date conversion below is not an assignment on
# a slice of `data` (the pattern pandas reports as SettingWithCopyWarning).
dataSub = data[['COLLISION_ID', 'Crash_Fatal_Fl','CRASH DATE', 'NUMBER OF PERSONS KILLED']].copy()

# Parse the crash date text into datetimes so the .dt accessor can be used later.
dataSub['CRASH DATE'] = pd.to_datetime(dataSub['CRASH DATE'])

In [7]:
# Preview the first rows of the trimmed crash-level frame.
dataSub.head()

Unnamed: 0,COLLISION_ID,Crash_Fatal_Fl,CRASH DATE,NUMBER OF PERSONS KILLED
0,4070993,0,2019-01-29,0.0
1,4083608,0,2019-02-18,0.0
2,4077792,0,2019-02-07,0.0
3,4084942,0,2019-02-21,0.0
4,4071210,0,2019-01-29,0.0


In [8]:
# Work on an independent (deep) copy so later column edits leave dataSub untouched.
dataSubAgg = dataSub.copy(deep=True)

In [9]:
# Derive string date keys from the crash date ('YYYY-MM', 'YYYY', 'MM'),
# then drop the raw date column, which is no longer needed.
dataSubAgg = (
    dataSubAgg
    .assign(
        yearMonth=lambda frame: frame['CRASH DATE'].dt.strftime('%Y-%m'),
        Year=lambda frame: frame['CRASH DATE'].dt.strftime('%Y'),
        Month=lambda frame: frame['CRASH DATE'].dt.strftime('%m'),
    )
    .drop(columns=['CRASH DATE'])
)

In [10]:
# Give the id and death-count columns the shorter names used by the rest of
# the notebook.
dataSubAgg = dataSubAgg.rename(
    columns={
        "COLLISION_ID": "Crash_ID",
        "NUMBER OF PERSONS KILLED": "Death_Cnt",
    }
)

In [50]:
# Keep every row that is not in June 2020.
# NOTE(review): presumably June 2020 was an incomplete month in the extract - confirm.
dataSubAgg = dataSubAgg[(dataSubAgg['Month'] != '06') | (dataSubAgg['Year'] != '2020')]

In [51]:
# Preview the renamed, date-keyed frame after the June 2020 rows were removed.
dataSubAgg.head()

Unnamed: 0,Crash_ID,Crash_Fatal_Fl,Death_Cnt,yearMonth,Year,Month
0,4070993,0,0.0,2019-01,2019,1
1,4083608,0,0.0,2019-02,2019,2
2,4077792,0,0.0,2019-02,2019,2
3,4084942,0,0.0,2019-02,2019,2
4,4071210,0,0.0,2019-01,2019,1


## YearMonth

In [52]:
# One row per (Year, Month): number of crashes, number of fatal crashes,
# total deaths, and the 'YYYY-MM' label of the group.
dataSubYearMonth = (
    dataSubAgg
    .groupby(by=['Year', 'Month'])
    .agg({
        'Crash_ID': 'count',
        'Crash_Fatal_Fl': 'sum',
        'Death_Cnt': 'sum',
        'yearMonth': 'max',
    })
    .reset_index()
)

In [53]:
# Row-over-row relative change of each count (the first row has no
# predecessor and is NaN), plus constant location labels.
dataSubYearMonth = dataSubYearMonth.assign(
    crashPctChange=lambda frame: frame['Crash_ID'].pct_change(),
    deathCntPctChange=lambda frame: frame['Death_Cnt'].pct_change(),
    fatalPctChange=lambda frame: frame['Crash_Fatal_Fl'].pct_change(),
    State='New York City',
    Country='United States',
)

In [54]:
# Deaths per crash and fatal crashes per crash for each year-month row.
dataSubYearMonth['deathRate'] = dataSubYearMonth['Death_Cnt'].div(dataSubYearMonth['Crash_ID'])
dataSubYearMonth['fatRate'] = dataSubYearMonth['Crash_Fatal_Fl'].div(dataSubYearMonth['Crash_ID'])

In [55]:
# Row-over-row relative change of the two per-crash rates.
dataSubYearMonth = dataSubYearMonth.assign(
    deathRatePctChange=lambda frame: frame['deathRate'].pct_change(),
    fatRatePctChange=lambda frame: frame['fatRate'].pct_change(),
)

In [56]:
# Write the year-month table to CSV without the row index.
# Passing the path lets pandas open and close the file itself; the explicit
# UTF-8 encoding replaces the platform-default encoding a bare open() uses.
dataSubYearMonth.to_csv("NYC_dataSubYearMonth.csv", header=True, index=False, encoding="utf-8")

In [57]:
# Read the exported file back from disk so the export can be inspected.
datacheck = pd.read_csv('NYC_dataSubYearMonth.csv')

In [58]:
# (rows, columns) of the file as read back from disk.
datacheck.shape

(17, 15)

## MonthYear

In [59]:
# Same aggregation as the year-month table, but grouped Month-first so the
# rows for one calendar month in different years sit next to each other.
dataSubMonthYear = (
    dataSubAgg
    .groupby(by=['Month', 'Year'])
    .agg({
        'Crash_ID': 'count',
        'Crash_Fatal_Fl': 'sum',
        'Death_Cnt': 'sum',
        'yearMonth': 'max',
    })
    .reset_index()
)

In [60]:
# Relative change of each count versus the first row of the same calendar
# month. Rows come from groupby(['Month', 'Year']), so within a month the
# first row is the earliest year present.
#
# transform() is used instead of apply(): it always returns a result indexed
# like the input frame, so the column assignment aligns row-for-row. With
# apply(), pandas >= 2.0 prepends the group key to the result index and the
# assignment no longer lines up with the frame.
dataSubMonthYear['crashPctChange'] = dataSubMonthYear.groupby('Month')['Crash_ID'].transform(lambda x: x.div(x.iloc[0]).subtract(1))
dataSubMonthYear['deathCntPctChange'] = dataSubMonthYear.groupby('Month')['Death_Cnt'].transform(lambda x: x.div(x.iloc[0]).subtract(1))
dataSubMonthYear['fatalPctChange'] = dataSubMonthYear.groupby('Month')['Crash_Fatal_Fl'].transform(lambda x: x.div(x.iloc[0]).subtract(1))

# Constant location labels.
dataSubMonthYear['State']='New York City'
dataSubMonthYear['Country']='United States'

In [61]:
# Deaths per crash and fatal crashes per crash for each month-year row.
dataSubMonthYear['deathRate'] = dataSubMonthYear['Death_Cnt'].div(dataSubMonthYear['Crash_ID'])
dataSubMonthYear['fatRate'] = dataSubMonthYear['Crash_Fatal_Fl'].div(dataSubMonthYear['Crash_ID'])

In [62]:
# Relative change of each per-crash rate versus the first row of the same
# calendar month (the earliest year, given the Month/Year grouping order).
# transform() keeps the result indexed like the frame so the assignment
# aligns; apply() prepends the group key to the index in pandas >= 2.0.
dataSubMonthYear['deathRatePctChange'] = dataSubMonthYear.groupby('Month')['deathRate'].transform(lambda x: x.div(x.iloc[0]).subtract(1))
dataSubMonthYear['fatRatePctChange'] = dataSubMonthYear.groupby('Month')['fatRate'].transform(lambda x: x.div(x.iloc[0]).subtract(1))

In [63]:
# Write the month-year table to CSV without the row index.
# Passing the path lets pandas open and close the file itself; the explicit
# UTF-8 encoding replaces the platform-default encoding a bare open() uses.
dataSubMonthYear.to_csv("NYC_dataSubMonthYear.csv", header=True, index=False, encoding="utf-8")

In [64]:
# Read the exported month-year file back from disk and show its
# (rows, columns) as a check on the export.
datacheck = pd.read_csv('NYC_dataSubMonthYear.csv')
datacheck.shape

(17, 15)

#### The December-tied data
In this section we compare all the months we have in 2020 against the most recent "normal" month, December 2019.

In [65]:
# Rows from December 2019 onward. 'YYYY-MM' strings sort chronologically, so
# a plain string comparison selects them.
from_december = dataSubYearMonth['yearMonth'] >= '2019-12'

# Select rows and columns in a single .loc call and take an explicit copy, so
# the new columns below are added to an independent frame instead of to a
# chained slice of dataSubYearMonth (SettingWithCopyWarning).
compare = dataSubYearMonth.loc[from_december, ['yearMonth', 'Crash_ID', 'Crash_Fatal_Fl']].copy()

# Relative change versus the first selected row, which is December 2019
# because dataSubYearMonth is ordered by Year then Month.
compare['crashPctChangeCompareDec'] = compare['Crash_ID'] / compare['Crash_ID'].iloc[0] - 1
compare['fatalPctChangeCompareDec'] = compare['Crash_Fatal_Fl'] / compare['Crash_Fatal_Fl'].iloc[0] - 1

# Constant location labels.
compare['State']='New York City'
compare['Country']='United States'

In [66]:
# Display the December-baseline comparison table.
compare

Unnamed: 0,yearMonth,Crash_ID,Crash_Fatal_Fl,crashPctChangeCompareDec,fatalPctChangeCompareDec,State,Country
11,2019-12,17056,25,0.0,0.0,New York City,United States
12,2020-01,14278,17,-0.162875,-0.32,New York City,United States
13,2020-02,13674,20,-0.198288,-0.2,New York City,United States
14,2020-03,11034,8,-0.353072,-0.68,New York City,United States
15,2020-04,4103,13,-0.759439,-0.48,New York City,United States
16,2020-05,5954,11,-0.650915,-0.56,New York City,United States


In [67]:
# Write the December-baseline comparison to CSV without the row index.
# Passing the path lets pandas open and close the file itself; the explicit
# UTF-8 encoding replaces the platform-default encoding a bare open() uses.
compare.to_csv("NYC_PctChangeCompareDec.csv", header=True, index=False, encoding="utf-8")