In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw crash "people" records. low_memory=False asks pandas to parse the
# file in one pass instead of in chunks, so column dtypes are inferred consistently.
df = pd.read_csv(
    'data/Traffic_Crashes_People.csv',
    low_memory=False,
)

In [3]:
# Discard every column this analysis does not use, then display the columns
# that remain so the result can be checked at a glance.
df.drop(
    columns=[
        'PERSON_ID',
        'PERSON_TYPE',
        'RD_NO',
        'VEHICLE_ID',
        'SEAT_NO',
        'CITY',
        'STATE',
        'ZIPCODE',
        'SEX',
        'DRIVERS_LICENSE_STATE',
        'DRIVERS_LICENSE_CLASS',
        'SAFETY_EQUIPMENT',
        'AIRBAG_DEPLOYED',
        'EJECTION',
        'INJURY_CLASSIFICATION',
        'HOSPITAL',
        'EMS_AGENCY',
        'EMS_RUN_NO',
        'PEDPEDAL_ACTION',
        'PEDPEDAL_VISIBILITY',
        'PEDPEDAL_LOCATION',
        'BAC_RESULT',
        'BAC_RESULT VALUE',
        'CELL_PHONE_USE',
    ],
    inplace=True,
)
df.columns

Index(['CRASH_RECORD_ID', 'CRASH_DATE', 'AGE', 'DRIVER_ACTION',
       'DRIVER_VISION', 'PHYSICAL_CONDITION'],
      dtype='object')

In [4]:
# Remove every row that is missing a value in any of the four columns the
# analysis relies on (dropna's default how='any' applies across the subset).
df.dropna(
    subset=['AGE', 'DRIVER_ACTION', 'DRIVER_VISION', 'PHYSICAL_CONDITION'],
    inplace=True,
)


In [5]:
# Keep only rows where none of the three descriptive columns is recorded as 'UNKNOWN'.
df = df[
    (df['DRIVER_VISION'] != 'UNKNOWN')
    & (df['DRIVER_ACTION'] != 'UNKNOWN')
    & (df['PHYSICAL_CONDITION'] != 'UNKNOWN')
]

In [6]:
# Restrict the data to people aged 15 or older: 15 is the youngest age at which
# someone can legally drive in Illinois (with a learner's permit).
df = df[df['AGE'].ge(15)]

# Bucket AGE into three labelled ranges. pd.cut uses right-closed intervals by
# default, so the edges below produce (14, 23], (23, 64] and (64, inf).
bins = [14, 23, 64, np.inf]
names = ['15-23', '24-64', '65+']
df['AGE_RANGES'] = pd.cut(df['AGE'], bins=bins, labels=names)
print(df['AGE_RANGES'].value_counts())
# The raw AGE column is intentionally left in place at this point.

24-64    359431
15-23     55573
65+       35406
Name: AGE_RANGES, dtype: int64


In [7]:
# #value counts of underage drivers aged 10-14. Ages 9 and under deleted.
# df[df['AGE_RANGES']=='Underage 10-14']['AGE'].value_counts()

In [8]:
# print(df['DRIVER_ACTION'].value_counts())
# print(df['DRIVER_VISION'].value_counts())
# print(df['PHYSICAL_CONDITION'].value_counts())

In [9]:
# PHYSICAL_CONDITION labels that are treated as dangerous behaviour.
dangerous_beh = [
    'IMPAIRED - ALCOHOL',
    'FATIGUED/ASLEEP',
    'EMOTIONAL',
    'ILLNESS/FAINTED',
    'HAD BEEN DRINKING',
    'IMPAIRED - DRUGS',
    'IMPAIRED - ALCOHOL AND DRUGS',
    'MEDICATED',
]


def physical_condition(val):
    """Return 1 when ``val`` is one of the ``dangerous_beh`` labels, otherwise 0."""
    return 1 if val in dangerous_beh else 0
    
# Add a 0/1 DANGEROUS_BEH column derived from each row's PHYSICAL_CONDITION.
df = df.assign(DANGEROUS_BEH=df['PHYSICAL_CONDITION'].map(physical_condition))

In [10]:
# DRIVER_VISION labels that are treated as an obscured view.
# NOTE(review): any label not listed here is scored 0 (not obscured) — confirm
# against df['DRIVER_VISION'].value_counts() that no obstruction label is missing.
bad_vision = [
    'OTHER',
    'MOVING VEHICLES',
    'PARKED VEHICLES',
    'WINDSHIELD (WATER/ICE)',
    'BLINDED - SUNLIGHT',
    'HILLCREST',
    'EMBANKMENT',
    'BLOWING MATERIALS',
    'SIGNBOARD',
]


def obscured_vision(val):
    """Return 1 when ``val`` is one of the ``bad_vision`` labels, otherwise 0."""
    return 1 if val in bad_vision else 0
    
# Add a 0/1 OBSCURED_VIZ column derived from each row's DRIVER_VISION.
df = df.assign(OBSCURED_VIZ=df['DRIVER_VISION'].map(obscured_vision))

In [11]:
# DRIVER_ACTION labels that are treated as a driver error.
bad_action = [
    'OTHER',
    'FAILED TO YIELD',
    'FOLLOWED TOO CLOSELY',
    'IMPROPER TURN',
    'IMPROPER BACKING',
    'IMPROPER LANE CHANGE',
    'IMPROPER PASSING',
    'TOO FAST FOR CONDITIONS',
    'DISREGARDED CONTROL DEVICES',
    'IMPROPER PARKING',
    'WRONG WAY/SIDE',
    'CELL PHONE USE OTHER THAN TEXTING',
    'OVERCORRECTED',
    'EMERGENCY VEHICLE ON CALL',
    'EVADING POLICE VEHICLE',
    'TEXTING',
    'STOPPED SCHOOL BUS',
    'LICENSE RESTRICTIONS',
]


def driver_error(val):
    """Return 1 when ``val`` is one of the ``bad_action`` labels, otherwise 0."""
    return 1 if val in bad_action else 0
    
# Add a 0/1 DRIVER_ERROR column derived from each row's DRIVER_ACTION.
df = df.assign(DRIVER_ERROR=df['DRIVER_ACTION'].map(driver_error))

In [12]:
# AGE_RANGES labels that are treated as high-risk age groups.
highrisk_ages = [
    '15-23',
    '65+',
]


def ages(val):
    """Return 1 when ``val`` is one of the ``highrisk_ages`` labels, otherwise 0."""
    return 1 if val in highrisk_ages else 0
    
# Add a 0/1 HIGHRISK_AGERANGES column derived from each row's AGE_RANGES label.
df = df.assign(HIGHRISK_AGERANGES=df['AGE_RANGES'].map(ages))

In [13]:
# Remove the source columns (and the intermediate age buckets) now that the
# 0/1 flag columns have been derived from them.
unnecessary_columns = [
    'AGE',
    'DRIVER_ACTION',
    'DRIVER_VISION',
    'PHYSICAL_CONDITION',
    'AGE_RANGES',
]
df.drop(columns=unnecessary_columns, inplace=True)

In [14]:
# Summarize the resulting frame (column dtypes and non-null counts) as a final sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 450410 entries, 1 to 1224557
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   CRASH_RECORD_ID     450410 non-null  object
 1   CRASH_DATE          450410 non-null  object
 2   DANGEROUS_BEH       450410 non-null  int64 
 3   OBSCURED_VIZ        450410 non-null  int64 
 4   DRIVER_ERROR        450410 non-null  int64 
 5   HIGHRISK_AGERANGES  450410 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 24.1+ MB
