# Working Notebook

__Phase 3 Project - Chicago Traffic Crash Classification__

### Business Understanding

### Data Understanding and Preparation

Load dependencies

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats as stats
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_val_score, RandomizedSearchCV


Import data files

In [2]:
crashes = pd.read_csv('Chicago-Crashes/data/Traffic_Crashes_-_Crashes.csv', low_memory=False)
people = pd.read_csv('Chicago-Crashes/data/Traffic_Crashes_-_People.csv', low_memory=False)
vehicles = pd.read_csv('Chicago-Crashes/data/Traffic_Crashes_-_Vehicles.csv', low_memory=False)

In [3]:
crashes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541142 entries, 0 to 541141
Data columns (total 49 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH_RECORD_ID                541142 non-null  object 
 1   RD_NO                          536550 non-null  object 
 2   CRASH_DATE_EST_I               41029 non-null   object 
 3   CRASH_DATE                     541142 non-null  object 
 4   POSTED_SPEED_LIMIT             541142 non-null  int64  
 5   TRAFFIC_CONTROL_DEVICE         541142 non-null  object 
 6   DEVICE_CONDITION               541142 non-null  object 
 7   WEATHER_CONDITION              541142 non-null  object 
 8   LIGHTING_CONDITION             541142 non-null  object 
 9   FIRST_CRASH_TYPE               541142 non-null  object 
 10  TRAFFICWAY_TYPE                541142 non-null  object 
 11  LANE_CNT                       198968 non-null  float64
 12  ALIGNMENT                     

In [4]:
people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1195747 entries, 0 to 1195746
Data columns (total 30 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   PERSON_ID              1195747 non-null  object 
 1   PERSON_TYPE            1195747 non-null  object 
 2   CRASH_RECORD_ID        1195747 non-null  object 
 3   RD_NO                  1185613 non-null  object 
 4   VEHICLE_ID             1172146 non-null  float64
 5   CRASH_DATE             1195747 non-null  object 
 6   SEAT_NO                244512 non-null   float64
 7   CITY                   879728 non-null   object 
 8   STATE                  890189 non-null   object 
 9   ZIPCODE                803192 non-null   object 
 10  SEX                    1177665 non-null  object 
 11  AGE                    852450 non-null   float64
 12  DRIVERS_LICENSE_STATE  705542 non-null   object 
 13  DRIVERS_LICENSE_CLASS  608027 non-null   object 
 14  SAFETY_EQUIPMENT  

Remove columns with 80% or more of null values.

In [5]:
df_list=[crashes, people, vehicles]
for df in df_list:
    min_count =  int((20/100)*df.shape[0] + 1)
    df = df.dropna(axis=1, thresh=min_count, inplace=True)
    

In [6]:
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107267 entries, 0 to 1107266
Data columns (total 19 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   CRASH_UNIT_ID        1107267 non-null  int64  
 1   CRASH_RECORD_ID      1107267 non-null  object 
 2   RD_NO                1097786 non-null  object 
 3   CRASH_DATE           1107267 non-null  object 
 4   UNIT_NO              1107267 non-null  int64  
 5   UNIT_TYPE            1105670 non-null  object 
 6   VEHICLE_ID           1081934 non-null  float64
 7   MAKE                 1081929 non-null  object 
 8   MODEL                1081786 non-null  object 
 9   LIC_PLATE_STATE      987487 non-null   object 
 10  VEHICLE_YEAR         905696 non-null   float64
 11  VEHICLE_DEFECT       1081934 non-null  object 
 12  VEHICLE_TYPE         1081934 non-null  object 
 13  VEHICLE_USE          1081934 non-null  object 
 14  TRAVEL_DIRECTION     1081934 non-null  object 
 15

Drop columns that will not be used

In [7]:
vehicles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107267 entries, 0 to 1107266
Data columns (total 19 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   CRASH_UNIT_ID        1107267 non-null  int64  
 1   CRASH_RECORD_ID      1107267 non-null  object 
 2   RD_NO                1097786 non-null  object 
 3   CRASH_DATE           1107267 non-null  object 
 4   UNIT_NO              1107267 non-null  int64  
 5   UNIT_TYPE            1105670 non-null  object 
 6   VEHICLE_ID           1081934 non-null  float64
 7   MAKE                 1081929 non-null  object 
 8   MODEL                1081786 non-null  object 
 9   LIC_PLATE_STATE      987487 non-null   object 
 10  VEHICLE_YEAR         905696 non-null   float64
 11  VEHICLE_DEFECT       1081934 non-null  object 
 12  VEHICLE_TYPE         1081934 non-null  object 
 13  VEHICLE_USE          1081934 non-null  object 
 14  TRAVEL_DIRECTION     1081934 non-null  object 
 15

In [8]:
crashes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541142 entries, 0 to 541141
Data columns (total 41 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH_RECORD_ID                541142 non-null  object 
 1   RD_NO                          536550 non-null  object 
 2   CRASH_DATE                     541142 non-null  object 
 3   POSTED_SPEED_LIMIT             541142 non-null  int64  
 4   TRAFFIC_CONTROL_DEVICE         541142 non-null  object 
 5   DEVICE_CONDITION               541142 non-null  object 
 6   WEATHER_CONDITION              541142 non-null  object 
 7   LIGHTING_CONDITION             541142 non-null  object 
 8   FIRST_CRASH_TYPE               541142 non-null  object 
 9   TRAFFICWAY_TYPE                541142 non-null  object 
 10  LANE_CNT                       198968 non-null  float64
 11  ALIGNMENT                      541142 non-null  object 
 12  ROADWAY_SURFACE_COND          

In [9]:
crashes_mod = crashes.drop(['CRASH_DATE','RD_NO','REPORT_TYPE', 'DATE_POLICE_NOTIFIED', 'STREET_NO', 
              'STREET_DIRECTION', 'STREET_NAME', 'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
             'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT', 
             'INJURIES_NO_INDICATION', 'INJURIES_UNKNOWN'], axis =1)

In [10]:
people_mod = people.drop(['CITY', 'ZIPCODE','RD_NO'], axis =1)

In [13]:
vehicles_mod = vehicles.drop(['RD_NO', 'CRASH_DATE', 'UNIT_NO', 'AREA_01_I'], axis =1)

In [14]:
people_mod.shape

(1195747, 19)

In [15]:
vehicles_mod.shape

(1107267, 15)

In [16]:
crashes_mod.shape

(541142, 26)

## First Simple Model

In [13]:
crashes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541142 entries, 0 to 541141
Data columns (total 35 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH_RECORD_ID                541142 non-null  object 
 1   CRASH_DATE                     541142 non-null  object 
 2   POSTED_SPEED_LIMIT             541142 non-null  int64  
 3   TRAFFIC_CONTROL_DEVICE         541142 non-null  object 
 4   DEVICE_CONDITION               541142 non-null  object 
 5   WEATHER_CONDITION              541142 non-null  object 
 6   LIGHTING_CONDITION             541142 non-null  object 
 7   FIRST_CRASH_TYPE               541142 non-null  object 
 8   TRAFFICWAY_TYPE                541142 non-null  object 
 9   LANE_CNT                       198968 non-null  float64
 10  ALIGNMENT                      541142 non-null  object 
 11  ROADWAY_SURFACE_COND           541142 non-null  object 
 12  ROAD_DEFECT                   

In [14]:
crashes.MOST_SEVERE_INJURY.value_counts()

NO INDICATION OF INJURY     468994
NONINCAPACITATING INJURY     39675
REPORTED, NOT EVIDENT        21763
INCAPACITATING INJURY         9040
FATAL                          547
Name: MOST_SEVERE_INJURY, dtype: int64

Set up target variable:
 - 0: NO INDICATION OF INJURY, NONINCAPACITATING INJURY, REPORTED, NOT EVIDENT
 - 1: INCAPACITATING INJURY, FATAL
 

In [15]:
crashes['TARGET']= crashes['MOST_SEVERE_INJURY'].map({'NO INDICATION OF INJURY': 0,
                                                      'NONINCAPACITATING INJURY': 0,
                                                      'REPORTED, NOT EVIDENT': 0,
                                                      'INCAPACITATING INJURY': 1,
                                                      'FATAL': 1})

In [16]:
crashes.TARGET.fillna(0, inplace=True)

In [17]:
crashes.TARGET.value_counts()

0.0    531555
1.0      9587
Name: TARGET, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split
X = crashes.drop(['MOST_SEVERE_INJURY','INJURIES_TOTAL', 'INJURIES_FATAL','INJURIES_INCAPACITATING',
                      'INJURIES_NON_INCAPACITATING', 'INJURIES_REPORTED_NOT_EVIDENT',
                      'INJURIES_NO_INDICATION', 'TARGET', 'CRASH_RECORD_ID', 'CRASH_DATE', 'LATITUDE', 
                      'LONGITUDE','LOCATION'], axis=1)
y = crashes['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [19]:
X_train_nums = X_train.select_dtypes(include=['float64', 'int64'])
X_train_cat = X_train.select_dtypes('object')

In [20]:
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('ss', StandardScaler())
])
                
categorical_pipeline = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='first',
                         sparse=False))
])

trans = ColumnTransformer(transformers=[
    ('numerical', numerical_pipeline, X_train_nums.columns),
    ('categorical', categorical_pipeline, X_train_cat.columns)
])

In [21]:
model_pipe = Pipeline(steps=[
    ('trans', trans),
    ('dc', DummyClassifier(strategy='most_frequent'))
])

In [22]:
model_pipe.fit(X_train, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('ss',
                                                                   StandardScaler())]),
                                                  Index(['POSTED_SPEED_LIMIT', 'LANE_CNT', 'BEAT_OF_OCCURRENCE', 'NUM_UNITS',
       'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH'],
      dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('ohe'...
                                                  Index(['TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION',
       'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE',
       'A

In [23]:
model_pipe.score(X_train, y_train)

0.9824026255617756

In [24]:
model_pipe.score(X_test, y_test)

0.9819271765001553

### Merging Dataframes

From the people data, the target will be the injury classification so that the model will predict the severity of injuries.

In [25]:
people_mod.INJURY_CLASSIFICATION.value_counts()

NO INDICATION OF INJURY     1096283
NONINCAPACITATING INJURY      55296
REPORTED, NOT EVIDENT         32092
INCAPACITATING INJURY         10827
FATAL                           656
Name: INJURY_CLASSIFICATION, dtype: int64

To avoid too many repeated rows from crashes_dropped_df, we'll use the `CRASH_RECORD_ID` to only merge in rows from people_dropped_df where `DRIVER_TYPE == DRIVER`, then use the `VEHICLE_ID` to only merge in that driver's vehicle from vehicles_dropped_df.

This will allow us to still retain over 75% of the rows from people.

In [26]:
people_mod.PERSON_TYPE.value_counts(normalize=True)

DRIVER                 0.776163
PASSENGER              0.204485
PEDESTRIAN             0.011393
BICYCLE                0.006911
NON-MOTOR VEHICLE      0.000866
NON-CONTACT VEHICLE    0.000181
Name: PERSON_TYPE, dtype: float64

In [18]:
people_crashes_df = people_mod.merge(crashes_mod, on='CRASH_RECORD_ID', how='left')

In [20]:
people_crashes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1195747 entries, 0 to 1195746
Data columns (total 44 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   PERSON_ID                1195747 non-null  object 
 1   PERSON_TYPE              1195747 non-null  object 
 2   CRASH_RECORD_ID          1195747 non-null  object 
 3   VEHICLE_ID               1172146 non-null  float64
 4   CRASH_DATE               1195747 non-null  object 
 5   SEAT_NO                  244512 non-null   float64
 6   STATE                    890189 non-null   object 
 7   SEX                      1177665 non-null  object 
 8   AGE                      852450 non-null   float64
 9   DRIVERS_LICENSE_STATE    705542 non-null   object 
 10  DRIVERS_LICENSE_CLASS    608027 non-null   object 
 11  SAFETY_EQUIPMENT         1192258 non-null  object 
 12  AIRBAG_DEPLOYED          1173141 non-null  object 
 13  EJECTION                 1181206 non-null 

In [23]:
people_crashes_df.INJURY_CLASSIFICATION.value_counts()

NO INDICATION OF INJURY     1096283
NONINCAPACITATING INJURY      55296
REPORTED, NOT EVIDENT         32092
INCAPACITATING INJURY         10827
FATAL                           656
Name: INJURY_CLASSIFICATION, dtype: int64

In [27]:
people_crashes_df['TARGET'] = people_crashes_df['INJURY_CLASSIFICATION'].map({np.nan: 0,
                                                      'NO INDICATION OF INJURY': 0,
                                                      'NONINCAPACITATING INJURY': 1,
                                                      'REPORTED, NOT EVIDENT': 1,
                                                      'INCAPACITATING INJURY': 2,
                                                      'FATAL': 3})

In [28]:
people_crashes_df.TARGET.isna().sum()

0

In [31]:
people_crashes_df['CRASH_YEAR']=pd.to_datetime(people_crashes_df['CRASH_DATE']).dt.year

In [32]:
people_crashes_df['CRASH_YEAR'].value_counts()

2018    265694
2019    263972
2020    202084
2017    185328
2021    161704
2016     96020
2015     20931
2014        11
2013         3
Name: CRASH_YEAR, dtype: int64

In [29]:
people_crashes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1195747 entries, 0 to 1195746
Data columns (total 45 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   PERSON_ID                1195747 non-null  object 
 1   PERSON_TYPE              1195747 non-null  object 
 2   CRASH_RECORD_ID          1195747 non-null  object 
 3   VEHICLE_ID               1172146 non-null  float64
 4   CRASH_DATE               1195747 non-null  object 
 5   SEAT_NO                  1172607 non-null  float64
 6   STATE                    890189 non-null   object 
 7   SEX                      1177665 non-null  object 
 8   AGE                      852450 non-null   float64
 9   DRIVERS_LICENSE_STATE    705542 non-null   object 
 10  DRIVERS_LICENSE_CLASS    608027 non-null   object 
 11  SAFETY_EQUIPMENT         1192258 non-null  object 
 12  AIRBAG_DEPLOYED          1173141 non-null  object 
 13  EJECTION                 1181206 non-null 

In [34]:
from sklearn.model_selection import train_test_split
X = people_crashes_df.drop(['PERSON_ID', 'CRASH_RECORD_ID','VEHICLE_ID','CRASH_DATE', 'LATITUDE', 
                      'LONGITUDE','LOCATION', 'TARGET'], axis=1)
y = people_crashes_df['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [37]:
X_train.head()

Unnamed: 0,PERSON_TYPE,SEAT_NO,STATE,SEX,AGE,DRIVERS_LICENSE_STATE,DRIVERS_LICENSE_CLASS,SAFETY_EQUIPMENT,AIRBAG_DEPLOYED,EJECTION,...,HIT_AND_RUN_I,DAMAGE,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,BEAT_OF_OCCURRENCE,NUM_UNITS,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,CRASH_YEAR
635474,DRIVER,1.0,IL,F,34.0,IL,D,SAFETY BELT USED,DID NOT DEPLOY,NONE,...,,"OVER $1,500",IMPROPER TURNING/NO SIGNAL,FAILING TO YIELD RIGHT-OF-WAY,1922.0,2.0,11,7,2,2019
425369,DRIVER,1.0,IL,M,29.0,IL,D,SAFETY BELT USED,DID NOT DEPLOY,NONE,...,,"$501 - $1,500",DISREGARDING TRAFFIC SIGNALS,NOT APPLICABLE,1232.0,2.0,2,2,3,2018
1152397,DRIVER,1.0,,M,39.0,IL,,SAFETY BELT USED,NOT APPLICABLE,NONE,...,,"$501 - $1,500",NOT APPLICABLE,NOT APPLICABLE,823.0,2.0,17,3,5,2021
75101,DRIVER,1.0,IL,M,26.0,WA,9,SAFETY BELT USED,DID NOT DEPLOY,NONE,...,,"OVER $1,500",DISREGARDING ROAD MARKINGS,NOT APPLICABLE,1732.0,2.0,23,1,4,2018
265075,DRIVER,1.0,IL,M,24.0,IL,D,USAGE UNKNOWN,DID NOT DEPLOY,NONE,...,,"$501 - $1,500",FAILING TO REDUCE SPEED TO AVOID CRASH,FOLLOWING TOO CLOSELY,831.0,2.0,6,1,6,2017


In [21]:
people_crashes_df['SEAT_NO'][people_crashes_df.PERSON_TYPE == 'DRIVER'] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  people_crashes_df['SEAT_NO'][people_crashes_df.PERSON_TYPE == 'DRIVER'] = 1


In [22]:
people_crashes_df.head()

Unnamed: 0,PERSON_ID,PERSON_TYPE,CRASH_RECORD_ID,VEHICLE_ID,CRASH_DATE,SEAT_NO,STATE,SEX,AGE,DRIVERS_LICENSE_STATE,...,PRIM_CONTRIBUTORY_CAUSE,SEC_CONTRIBUTORY_CAUSE,BEAT_OF_OCCURRENCE,NUM_UNITS,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION
0,O749947,DRIVER,81dc0de2ed92aa62baccab641fa377be7feb1cc47e6554...,834816.0,09/28/2019 03:30:00 AM,1.0,IL,M,25.0,IL,...,UNABLE TO DETERMINE,NOT APPLICABLE,1531.0,3.0,3,7,9,41.900043,-87.755577,POINT (-87.755576950444 41.900042872883)
1,O871921,DRIVER,af84fb5c8d996fcd3aefd36593c3a02e6e7509eeb27568...,827212.0,04/13/2020 10:50:00 PM,1.0,IL,M,37.0,IL,...,IMPROPER OVERTAKING/PASSING,FAILING TO REDUCE SPEED TO AVOID CRASH,613.0,2.0,22,2,4,41.736044,-87.653404,POINT (-87.653404241798 41.736044089544)
2,O10018,DRIVER,71162af7bf22799b776547132ebf134b5b438dcf3dac6b...,9579.0,11/01/2015 05:00:00 AM,1.0,,X,,,...,DRIVING SKILLS/KNOWLEDGE/EXPERIENCE,NOT APPLICABLE,821.0,2.0,5,1,11,41.808173,-87.708484,POINT (-87.708483889951 41.808172614433)
3,O10038,DRIVER,c21c476e2ccc41af550b5d858d22aaac4ffc88745a1700...,9598.0,11/01/2015 08:00:00 AM,1.0,,X,,,...,UNABLE TO DETERMINE,UNABLE TO DETERMINE,2023.0,2.0,8,1,11,41.981425,-87.659861,POINT (-87.65986141844 41.981425369089)
4,O10039,DRIVER,eb390a4c8e114c69488f5fb8a097fe629f5a92fd528cf4...,9600.0,11/01/2015 10:15:00 AM,1.0,,X,,,...,UNABLE TO DETERMINE,NOT APPLICABLE,223.0,2.0,10,1,11,41.809489,-87.606417,POINT (-87.606417429394 41.809489246531)
