In [36]:
import pandas as pd
import numpy as np
import datetime
import time
# Load the raw collisions export from the working directory into df_orig.
df_orig = pd.read_csv('Collisions.csv')
# Show (rows, columns) of the raw frame as the cell output.
df_orig.shape

(221525, 40)

Now let's take a closer look at the data. I will use the `pandas_profiling` package to profile it, and use that profile to decide which columns to drop (identifier columns) and which need cleaning up.

In [3]:
# Optional (slow): uncomment to regenerate the profiling report from the raw frame.
# from pandas_profiling import ProfileReport
# profile = ProfileReport(df_orig,title='Collisions initial profile')
# profile.to_file('profile.html')

In [4]:
# columns to drop: OBJECTID, INCKEY, COLDETKEY, REPORTNO, LOCATION, STATUS, INJURIES, SERIOUSINJURIES, FATALITIES, SDOTCOLNUM, SEGLANEKEY, CROSSWALKKEY
# columns to keep separate: SEVERITYCODE, SEVERITYDESC, COLLISIONTYPE, SDOT_COLDESC, ST_COLDESC

In [17]:
# Build the working frame from the raw data, leaving df_orig untouched.
# DataFrame.drop returns a new frame, so no explicit copy is required.
df = df_orig.drop(
    columns=[
        'OBJECTID', 'INCKEY', 'COLDETKEY', 'REPORTNO', 'LOCATION', 'STATUS',
        'INJURIES', 'SERIOUSINJURIES', 'FATALITIES', 'SDOTCOLNUM',
        'SEGLANEKEY', 'CROSSWALKKEY', 'SEVERITYDESC', 'ST_COLDESC',
    ]
)

# clean ST_COLCODE
def st_colcode(val):
    """Coerce a raw ST_COLCODE value to an int, or NaN when that is impossible.

    Parameters
    ----------
    val : any
        A single cell value; anything accepted by ``int()`` is converted.

    Returns
    -------
    int or float
        ``int(val)`` on success, otherwise ``np.nan``.
    """
    try:
        return int(val)
    # Catch only conversion failures (blank/non-numeric strings, None, NaN,
    # infinity) so KeyboardInterrupt / SystemExit are no longer swallowed.
    except (TypeError, ValueError, OverflowError):
        return np.nan

# Run every ST_COLCODE value through st_colcode: values int() accepts become
# ints, everything else becomes NaN.
df['ST_COLCODE'] = df['ST_COLCODE'].apply(st_colcode)

# Display the column labels that remain in df as the cell output.
df.columns

Index(['X', 'Y', 'ADDRTYPE', 'INTKEY', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC',
       'SEVERITYCODE', 'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT',
       'PEDCYLCOUNT', 'VEHCOUNT', 'INCDATE', 'INCDTTM', 'JUNCTIONTYPE',
       'SDOT_COLCODE', 'SDOT_COLDESC', 'INATTENTIONIND', 'UNDERINFL',
       'WEATHER', 'ROADCOND', 'LIGHTCOND', 'PEDROWNOTGRNT', 'SPEEDING',
       'ST_COLCODE', 'HITPARKEDCAR'],
      dtype='object')

In [18]:
# Preview the first 20 raw INCDATE values to see their string format before parsing.
df['INCDATE'].head(20)

0     2013/03/14 00:00:00+00
1     2006/01/15 00:00:00+00
2     2019/09/09 00:00:00+00
3     2019/12/19 00:00:00+00
4     2013/03/27 00:00:00+00
5     2005/07/07 00:00:00+00
6     2020/07/31 00:00:00+00
7     2013/04/01 00:00:00+00
8     2006/04/11 00:00:00+00
9     2013/04/03 00:00:00+00
10    2013/03/30 00:00:00+00
11    2013/03/31 00:00:00+00
12    2006/06/13 00:00:00+00
13    2019/12/23 00:00:00+00
14    2007/04/17 00:00:00+00
15    2004/09/17 00:00:00+00
16    2019/12/20 00:00:00+00
17    2013/03/27 00:00:00+00
18    2020/05/03 00:00:00+00
19    2019/12/22 00:00:00+00
Name: INCDATE, dtype: object

#### Feature engineering

In [28]:
# Parse the raw INCDATE strings (e.g. '2013/03/14 00:00:00+00') into datetimes.
df['DATE'] = pd.to_datetime(df['INCDATE'],format=r'%Y/%m/%d %H:%M:%S+00')
# Derive calendar features with the vectorized .dt accessor rather than a
# Python-level lambda per row.
df['YEAR'] = df['DATE'].dt.year
df['DAYOFYEAR'] = df['DATE'].dt.dayofyear


In [75]:
def parse_datetime(val:str):
    """Extract the time of day from a 'date time' string as seconds since midnight.

    Everything after the first space is parsed with the 12-hour format
    ``%I:%M:%S %p`` (e.g. ``'2:54:00 PM'``). Seconds are ignored, so the
    result has minute resolution.

    Parameters
    ----------
    val : str
        Raw value such as ``'3/27/2013 2:54:00 PM'``. Non-string values
        (e.g. float NaN for missing cells) are treated as having no time.

    Returns
    -------
    int or float
        ``hour * 3600 + minute * 60`` when a time part is present, otherwise
        ``np.nan``.

    Raises
    ------
    ValueError
        If the text after the first space does not match ``%I:%M:%S %p``.
    """
    # Missing cells are not strings; a string without a space carries no time part.
    if not isinstance(val, str) or ' ' not in val:
        return np.nan
    time_text = val[val.find(' ') + 1:]
    parsed = time.strptime(time_text, r'%I:%M:%S %p')
    # Hours contribute 3600 seconds each (not 60): the value is later decoded
    # by dividing by 3600 and 60, so it must be expressed in seconds.
    return parsed.tm_hour * 3600 + parsed.tm_min * 60

# Time of day parsed from INCDTTM; rows without a time part come back as NaN.
df['TIME'] = df['INCDTTM'].apply(parse_datetime)
# Impute missing times with the (integer-truncated) mean of the known times.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form warns in recent pandas and
# does not update df once Copy-on-Write is enabled.
df['TIME'] = df['TIME'].fillna(int(df['TIME'].mean()))

In [86]:
def combine_date_time(val):
    """Merge a row's calendar date and time-of-day offset into one datetime.

    Parameters
    ----------
    val : mapping
        Must provide ``val['DATE']`` (a date-like object accepted by
        ``datetime.datetime.combine``) and ``val['TIME']`` (seconds since
        midnight).

    Returns
    -------
    datetime.datetime
        The date combined with the hour and minute from ``TIME``; seconds
        are always set to 0.
    """
    seconds_since_midnight = val['TIME']
    whole_hours = int(seconds_since_midnight / 3600)
    # Whatever is left after removing whole hours, truncated to whole minutes.
    whole_minutes = int((seconds_since_midnight - 3600 * whole_hours) / 60)
    clock = datetime.time(whole_hours, whole_minutes, 0)
    return datetime.datetime.combine(val['DATE'], clock)

# Combine DATE and TIME row by row, then turn each resulting datetime into
# the number returned by its .timestamp() method.
df['DATETIME'] = (
    df[['DATE', 'TIME']]
    .apply(combine_date_time, axis=1)
    .apply(lambda moment: moment.timestamp())
)

In [89]:
# Preview the column set left after removing the raw/intermediate date columns
# and SDOT_COLDESC. NOTE: the result is not assigned, so df itself still
# contains these columns.
df.drop(columns=['INCDATE','INCDTTM','DATE','YEAR','TIME','SDOT_COLDESC']).columns

Index(['X', 'Y', 'ADDRTYPE', 'INTKEY', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC',
       'SEVERITYCODE', 'COLLISIONTYPE', 'PERSONCOUNT', 'PEDCOUNT',
       'PEDCYLCOUNT', 'VEHCOUNT', 'JUNCTIONTYPE', 'SDOT_COLCODE',
       'INATTENTIONIND', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND',
       'PEDROWNOTGRNT', 'SPEEDING', 'ST_COLCODE', 'HITPARKEDCAR', 'DATETIME'],
      dtype='object')