In [1]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.inspection import permutation_importance
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.preprocessing import MinMaxScaler
import copy
import time
from geopy.geocoders import Nominatim
# Geocoding client used by this notebook; the user agent string is passed
# through to Nominatim unchanged.
# NOTE(review): "123" looks like a placeholder identifier — confirm it is intended.
nominatim_user_agent = "123"
geolocator = Nominatim(user_agent=nominatim_user_agent)

In [4]:
# Read the raw accidents file, report its column names and row count,
# then preview the first rows as the cell output.
raw_csv_path = "data/US_Accidents_June20.csv"
df_original = pd.read_csv(raw_csv_path)
print(df_original.columns)
print(df_original.shape[0])
df_original.head()

Index(['ID', 'Source', 'TMC', 'Severity', 'Start_Time', 'End_Time',
       'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)',
       'Description', 'Number', 'Street', 'Side', 'City', 'County', 'State',
       'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
       'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')
3513617


Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day


In [5]:
# Work on an independent deep copy so the raw frame stays untouched.
df_full = df_original.copy(deep=True)

In [6]:
# Collapse the severity levels into a binary label: 1/2 -> 0, 3/4 -> 1.
# The masks are computed from the untouched raw column rather than from
# df_full itself, so re-running this cell cannot re-map already converted
# labels (a second pass over df_full would turn every 1 back into 0).
# Boolean Series masks passed to .loc are aligned on the index labels.
severity_raw = df_original['Severity']
df_full.loc[severity_raw.isin([1, 2]), 'Severity'] = 0
df_full.loc[severity_raw.isin([3, 4]), 'Severity'] = 1

In [7]:
# Convert the timestamp strings to datetime values, then order rows by start time.
# pd.to_datetime parses the whole column at once instead of calling
# datetime.strptime per row, and it accepts an already converted column,
# so the cell can be re-run safely.
timestamp_format = '%Y-%m-%d %H:%M:%S'
for time_col in ('Start_Time', 'End_Time'):
    df_full[time_col] = pd.to_datetime(df_full[time_col], format=timestamp_format)
df_full = df_full.sort_values('Start_Time')

In [8]:
# Boolean row mask: True for rows whose start time is on or after the cutoff string.
start_times_all = df_full['Start_Time']
mask_full = start_times_all >= '01-01-2018'

In [40]:
# Columns kept for modelling, in the order they are selected from the frame.
rel = [
    'Stop',
    'Station',
    'Visibility(mi)',
    'Junction',
    'Crossing',
    'Wind_Speed(mph)',
    'Pressure(in)',
    'Humidity(%)',
    'Traffic_Signal',
    'Temperature(F)',
    'Distance(mi)',
    'Start_Lat',
    'Start_Lng',
    'Severity',
    'Airport_Code',
    'City',
    'County',
    'Street',
    'Start_Time',
    'End_Time',
    'Civil_Twilight',
]

In [43]:
# Keep only the selected columns for the masked rows, then discard every
# row that has a missing value in any of those columns.
full = df_full.loc[mask_full, rel].dropna(subset=rel)

## Add custom features

In [49]:
# Weekday name of each accident's start timestamp.
accident_starts = full['Start_Time']
full['Day_Of_Week'] = accident_starts.dt.day_name()

In [51]:
# Weekend flag: 1.0 for Saturday/Sunday, 0.0 for Monday-Friday.
# 'Day_Of_Week' is filled with day *names*, so testing it against the
# integers 0-6 never matches and would leave this column entirely NaN.
# Use the numeric weekday of the start time instead (Monday=0 ... Sunday=6).
# Kept as float so the column dtype matches what the .loc assignments produced.
weekday_number = full['Start_Time'].dt.dayofweek
full['Weekend'] = (weekday_number >= 5).astype(float)

In [53]:
# Hour of day (0-23) of the start time, taken through the vectorized .dt
# accessor instead of a Python-level call per row.
# The cast keeps the int64 dtype that the row-wise apply produced.
full['Hour'] = full['Start_Time'].dt.hour.astype('int64')

In [54]:
def add_duration(x):
    """Return the duration of one record in seconds.

    Parameters
    ----------
    x : mapping-like (for example a DataFrame row)
        Provides 'Start_Time' and 'End_Time' values whose difference
        exposes ``total_seconds()``.

    Returns
    -------
    float
        ``End_Time - Start_Time`` in seconds, or ``np.nan`` when the two
        values cannot be subtracted (for example a missing or non-datetime
        value).
    """
    try:
        # End minus start, so a record that ends after it starts yields a
        # positive duration.
        return (x['End_Time'] - x['Start_Time']).total_seconds()
    except (TypeError, AttributeError):
        # Only unusable values are mapped to NaN; other errors propagate
        # instead of being hidden.
        return np.nan

# Evaluate add_duration once per row and store the results as 'Duration'.
row_durations = full.apply(add_duration, axis='columns')
full['Duration'] = row_durations

## Train test split

In [55]:
# Positional 70/30 split: the first 70% of rows (in the frame's current
# order) form the training set, the remaining rows the test set.
n_train_rows = int(len(full) * (70 / 100))
train = full.iloc[:n_train_rows]
test = full.iloc[n_train_rows:]

In [56]:
# Row counts of the complete frame followed by each split.
print(*(len(frame) for frame in (full, train, test)))

2126987 1488890 638097


## Normalization

In [57]:
# DataFrame.info() writes its summary itself and returns None, so wrapping
# it in print() only appends a stray "None" line to the output.
full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2126987 entries, 3362224 to 560472
Data columns (total 27 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Stop             bool          
 1   Station          bool          
 2   Visibility(mi)   float64       
 3   Junction         bool          
 4   Crossing         bool          
 5   Wind_Speed(mph)  float64       
 6   Pressure(in)     float64       
 7   Humidity(%)      float64       
 8   Traffic_Signal   bool          
 9   Temperature(F)   float64       
 10  Distance(mi)     float64       
 11  Start_Lat        float64       
 12  Start_Lng        float64       
 13  Severity         int64         
 14  Airport_Code     object        
 15  City             object        
 16  County           object        
 17  Street           object        
 18  Start_Time       datetime64[ns]
 19  End_Time         datetime64[ns]
 20  Civil_Twilight   object        
 21  duration         float64  

### Boolean columns - convert to 0/1

In [58]:
# Collect the names of all columns whose dtype is exactly numpy bool.
numpy_bool = np.dtype('bool')
is_bool_column = [column_dtype == numpy_bool for column_dtype in full.dtypes]
bool_col = full.columns[is_bool_column]
print(bool_col)

Index(['Stop', 'Station', 'Junction', 'Crossing', 'Traffic_Signal'], dtype='object')


In [59]:
# Preview the first rows of the boolean columns before converting them.
bool_preview = full[bool_col].head(5)
bool_preview

Unnamed: 0,Stop,Station,Junction,Crossing,Traffic_Signal
3362224,False,False,False,False,False
3362225,False,False,False,False,False
3362226,False,False,False,False,False
3362258,False,False,False,False,False
3362469,False,False,True,False,False


In [60]:
# Re-encode the boolean columns as integers (False -> 0, True -> 1).
bool_as_int = full[bool_col].astype(int)
full[bool_col] = bool_as_int

### Continuous - minmax

In [61]:
# Collect the names of all float64 columns (the continuous features).
# The np.float alias was deprecated in NumPy 1.20 and removed in 1.24;
# it was just the builtin float, i.e. float64, so name that dtype directly.
con_col = full.columns[full.dtypes.values == np.dtype('float64')]
print(con_col)

Index(['Visibility(mi)', 'Wind_Speed(mph)', 'Pressure(in)', 'Humidity(%)',
       'Temperature(F)', 'Distance(mi)', 'Start_Lat', 'Start_Lng', 'duration',
       'Duration', 'Weekend'],
      dtype='object')


In [62]:
def _min_max(column):
    """Rescale one column linearly using its own minimum and maximum."""
    lowest, highest = column.min(), column.max()
    return (column - lowest) / (highest - lowest)


# Min-max scale every continuous column of `full`, column by column.
# NOTE(review): `train` and `test` were sliced from `full` before this cell;
# confirm whether they are expected to pick up this scaling.
full[con_col] = full[con_col].apply(_min_max)

## Save full train test split to csv

In [63]:
# Write the two splits from the transformed `full` frame so the saved files
# always include the 0/1 conversion and min-max scaling applied to `full`
# after the split. `train`/`test` were sliced off before those steps, and
# whether they reflect the later column assignments depends on pandas
# copy/view behaviour, so the same rows are selected from `full` by label.
# Assumes the index labels are unique (default index from read_csv) — TODO confirm.
# NOTE(review): the scaling statistics come from the whole frame, test rows included.
full.loc[train.index].to_csv('train.csv')
full.loc[test.index].to_csv('test.csv')