In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the people-level crash records. low_memory=False makes pandas parse the
# file in a single pass rather than in chunks, so each column gets one
# consistently inferred dtype.
df = pd.read_csv(
    'data/Traffic_Crashes_People.csv',
    low_memory=False,
)

In [3]:
# Discard every column that is not used later in this notebook; the remaining
# columns are displayed below as a sanity check.
df = df.drop(
    columns=[
        # identifiers and location of the person / vehicle
        'PERSON_ID', 'PERSON_TYPE', 'RD_NO', 'VEHICLE_ID', 'SEAT_NO',
        'CITY', 'STATE', 'ZIPCODE', 'SEX',
        # licence details
        'DRIVERS_LICENSE_STATE', 'DRIVERS_LICENSE_CLASS',
        # injury / emergency-response details
        'SAFETY_EQUIPMENT', 'AIRBAG_DEPLOYED', 'EJECTION',
        'INJURY_CLASSIFICATION', 'HOSPITAL', 'EMS_AGENCY', 'EMS_RUN_NO',
        # pedestrian / cyclist details
        'PEDPEDAL_ACTION', 'PEDPEDAL_VISIBILITY', 'PEDPEDAL_LOCATION',
        # impairment / distraction details
        'BAC_RESULT', 'BAC_RESULT VALUE', 'CELL_PHONE_USE',
    ]
)
df.columns

Index(['CRASH_RECORD_ID', 'CRASH_DATE', 'AGE', 'DRIVER_ACTION',
       'DRIVER_VISION', 'PHYSICAL_CONDITION'],
      dtype='object')

In [4]:
# Remove every row that has a missing value in any of the four analysis
# columns (dropna's default how='any' applies across the whole subset).
df = df.dropna(
    subset=['AGE', 'DRIVER_ACTION', 'DRIVER_VISION', 'PHYSICAL_CONDITION']
)


In [5]:
# Keep only rows where none of the three driver-descriptor columns holds the
# placeholder value 'UNKNOWN'.
df = df[
    df['DRIVER_VISION'].ne('UNKNOWN')
    & df['DRIVER_ACTION'].ne('UNKNOWN')
    & df['PHYSICAL_CONDITION'].ne('UNKNOWN')
]

In [6]:
# Clean and bin the AGE column.
# Rows with AGE below 10 are discarded. Ages 10-14 are kept in their own
# 'Underage 10-14' bin because 15 is the youngest age to legally drive in the
# state of Illinois (with a learner's permit).
#
# .copy() detaches the filtered rows from the frame they were sliced from, so
# adding the AGE_RANGES column below is an ordinary assignment on an
# independent frame rather than a write onto a slice (which pandas flags with
# SettingWithCopyWarning).
df = df[df['AGE'] >= 10].copy()

# pd.cut uses right-closed intervals by default, so these edges produce
# (9, 14], (14, 19], (19, 29], ... and finally (69, inf).
bins = [9, 14, 19, 29, 39, 49, 59, 69, np.inf]
names = ['Underage 10-14', '15-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70+']
df['AGE_RANGES'] = pd.cut(df['AGE'], bins=bins, labels=names)
print(df['AGE_RANGES'].value_counts())
# The raw AGE column is intentionally kept: the next cell breaks the underage
# bin down by exact age.

20-29             116624
30-39             105008
40-49              82265
50-59              69003
60-69              41753
70+                18967
15-19              16790
Underage 10-14       645
Name: AGE_RANGES, dtype: int64


In [7]:
# Exact-age breakdown of the 'Underage 10-14' bin (rows with AGE below 10 were
# removed earlier).
df.loc[df['AGE_RANGES'] == 'Underage 10-14', 'AGE'].value_counts()

14.0    242
13.0    132
12.0     93
11.0     92
10.0     86
Name: AGE, dtype: int64

In [10]:
# Print the frequency table of each cleaned categorical column, one after the
# other.
print(
    df['DRIVER_ACTION'].value_counts(),
    df['DRIVER_VISION'].value_counts(),
    df['PHYSICAL_CONDITION'].value_counts(),
    sep='\n',
)

NONE                                 271746
OTHER                                 47478
FAILED TO YIELD                       44505
FOLLOWED TOO CLOSELY                  28518
IMPROPER TURN                         11857
IMPROPER BACKING                      10877
IMPROPER LANE CHANGE                  10599
IMPROPER PASSING                       7038
TOO FAST FOR CONDITIONS                6993
DISREGARDED CONTROL DEVICES            6196
IMPROPER PARKING                       1394
WRONG WAY/SIDE                         1200
CELL PHONE USE OTHER THAN TEXTING       672
OVERCORRECTED                           638
EMERGENCY VEHICLE ON CALL               636
EVADING POLICE VEHICLE                  430
TEXTING                                 178
STOPPED SCHOOL BUS                       80
LICENSE RESTRICTIONS                     20
Name: DRIVER_ACTION, dtype: int64
NOT OBSCURED              431116
OTHER                       7782
MOVING VEHICLES             4965
PARKED VEHICLES             300

In [9]:
# Print a summary of the cleaned frame: index range, per-column non-null
# counts and dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 451055 entries, 1 to 1224557
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   CRASH_RECORD_ID     451055 non-null  object  
 1   CRASH_DATE          451055 non-null  object  
 2   AGE                 451055 non-null  float64 
 3   DRIVER_ACTION       451055 non-null  object  
 4   DRIVER_VISION       451055 non-null  object  
 5   PHYSICAL_CONDITION  451055 non-null  object  
 6   AGE_RANGES          451055 non-null  category
dtypes: category(1), float64(1), object(5)
memory usage: 24.5+ MB
