# Task for Today  

***

## Automobile Accident Severity Prediction  

Given *data about accidents in the US*, let's try to predict the **severity** of a given accident.  
  
We will use a TensorFlow ANN to make our predictions.

# Getting Started

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [2]:
data = pd.read_csv('../input/us-accidents/US_Accidents_June20.csv', nrows=400000)

In [3]:
data

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,A-400001,MapQuest,241.0,3,2017-04-25 11:53:42,2017-04-25 12:23:16,37.717747,-121.532150,,,...,False,False,False,False,False,False,Day,Day,Day,Day
399996,A-400002,MapQuest,201.0,3,2017-04-25 12:08:17,2017-04-25 12:37:47,37.932465,-122.403290,,,...,False,False,False,False,False,False,Day,Day,Day,Day
399997,A-400003,MapQuest,201.0,3,2017-04-25 12:06:21,2017-04-25 12:35:52,37.799576,-122.222092,,,...,False,False,False,False,False,False,Day,Day,Day,Day
399998,A-400004,MapQuest,201.0,2,2017-04-25 12:00:56,2017-04-25 12:29:00,37.009869,-121.515793,,,...,False,False,False,False,True,False,Day,Day,Day,Day


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 49 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   ID                     400000 non-null  object 
 1   Source                 400000 non-null  object 
 2   TMC                    400000 non-null  float64
 3   Severity               400000 non-null  int64  
 4   Start_Time             400000 non-null  object 
 5   End_Time               400000 non-null  object 
 6   Start_Lat              400000 non-null  float64
 7   Start_Lng              400000 non-null  float64
 8   End_Lat                0 non-null       float64
 9   End_Lng                0 non-null       float64
 10  Distance(mi)           400000 non-null  float64
 11  Description            400000 non-null  object 
 12  Number                 142925 non-null  float64
 13  Street                 400000 non-null  object 
 14  Side                   400000 non-nu

# Missing Values

In [5]:
data.isna().mean()

ID                       0.000000
Source                   0.000000
TMC                      0.000000
Severity                 0.000000
Start_Time               0.000000
End_Time                 0.000000
Start_Lat                0.000000
Start_Lng                0.000000
End_Lat                  1.000000
End_Lng                  1.000000
Distance(mi)             0.000000
Description              0.000000
Number                   0.642687
Street                   0.000000
Side                     0.000000
City                     0.000048
County                   0.000000
State                    0.000000
Zipcode                  0.000115
Country                  0.000000
Timezone                 0.000115
Airport_Code             0.000115
Weather_Timestamp        0.008027
Temperature(F)           0.014793
Wind_Chill(F)            0.852263
Humidity(%)              0.016278
Pressure(in)             0.011622
Visibility(mi)           0.021952
Wind_Direction           0.008080
Wind_Speed(mph

In [6]:
null_columns = ['End_Lat', 'End_Lng', 'Number', 'Wind_Chill(F)', 'Precipitation(in)']

data = data.drop(null_columns, axis=1)

In [7]:
data.isna().sum()

ID                           0
Source                       0
TMC                          0
Severity                     0
Start_Time                   0
End_Time                     0
Start_Lat                    0
Start_Lng                    0
Distance(mi)                 0
Description                  0
Street                       0
Side                         0
City                        19
County                       0
State                        0
Zipcode                     46
Country                      0
Timezone                    46
Airport_Code                46
Weather_Timestamp         3211
Temperature(F)            5917
Humidity(%)               6511
Pressure(in)              4649
Visibility(mi)            8781
Wind_Direction            3232
Wind_Speed(mph)          74175
Weather_Condition         8210
Amenity                      0
Bump                         0
Crossing                     0
Give_Way                     0
Junction                     0
No_Exit 

In [8]:
data = data.dropna(axis=0).reset_index(drop=True)

In [9]:
print("Total missing values:", data.isna().sum().sum())

Total missing values: 0


In [10]:
data

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,Accident on OH-32 State Route 32 Westbound at ...,...,False,False,False,False,True,False,Night,Night,Day,Day
1,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,Accident on I-75 Southbound at Exits 52 52B US...,...,False,False,False,False,False,False,Night,Day,Day,Day
2,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,Accident on McEwen Rd at OH-725 Miamisburg Cen...,...,False,False,False,False,True,False,Day,Day,Day,Day
3,A-6,MapQuest,201.0,3,2016-02-08 07:44:26,2016-02-08 08:14:26,40.100590,-82.925194,0.01,Accident on I-270 Outerbelt Northbound near Ex...,...,False,False,False,False,False,False,Day,Day,Day,Day
4,A-7,MapQuest,201.0,2,2016-02-08 07:59:35,2016-02-08 08:29:35,39.758274,-84.230507,0.00,Accident on Oakridge Dr at Woodward Ave. Expec...,...,False,False,False,False,False,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320976,A-400001,MapQuest,241.0,3,2017-04-25 11:53:42,2017-04-25 12:23:16,37.717747,-121.532150,0.01,One lane blocked due to accident on I-580 West...,...,False,False,False,False,False,False,Day,Day,Day,Day
320977,A-400002,MapQuest,201.0,3,2017-04-25 12:08:17,2017-04-25 12:37:47,37.932465,-122.403290,0.01,Right hand shoulder blocked due to accident on...,...,False,False,False,False,False,False,Day,Day,Day,Day
320978,A-400003,MapQuest,201.0,3,2017-04-25 12:06:21,2017-04-25 12:35:52,37.799576,-122.222092,0.01,Slow lane blocked due to accident on I-580 Wes...,...,False,False,False,False,False,False,Day,Day,Day,Day
320979,A-400004,MapQuest,201.0,2,2017-04-25 12:00:56,2017-04-25 12:29:00,37.009869,-121.515793,0.01,Turning lane blocked due to accident on CA-152...,...,False,False,False,False,True,False,Day,Day,Day,Day


# Unnecessary Columns

In [11]:
{column: len(data[column].unique()) for column in data.columns if data.dtypes[column] == 'object'}

{'ID': 320981,
 'Source': 2,
 'Start_Time': 316629,
 'End_Time': 314439,
 'Description': 236513,
 'Street': 36206,
 'Side': 3,
 'City': 4023,
 'County': 548,
 'State': 28,
 'Zipcode': 57076,
 'Country': 1,
 'Timezone': 4,
 'Airport_Code': 638,
 'Weather_Timestamp': 78674,
 'Wind_Direction': 23,
 'Weather_Condition': 67,
 'Sunrise_Sunset': 2,
 'Civil_Twilight': 2,
 'Nautical_Twilight': 2,
 'Astronomical_Twilight': 2}

In [12]:
unneeded_columns = ['ID', 'Description', 'Street', 'City', 'Zipcode', 'Country']

data = data.drop(unneeded_columns, axis=1)

In [13]:
data

Unnamed: 0,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Side,County,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,R,Clermont,...,False,False,False,False,True,False,Night,Night,Day,Day
1,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,R,Montgomery,...,False,False,False,False,False,False,Night,Day,Day,Day
2,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,R,Montgomery,...,False,False,False,False,True,False,Day,Day,Day,Day
3,MapQuest,201.0,3,2016-02-08 07:44:26,2016-02-08 08:14:26,40.100590,-82.925194,0.01,R,Franklin,...,False,False,False,False,False,False,Day,Day,Day,Day
4,MapQuest,201.0,2,2016-02-08 07:59:35,2016-02-08 08:29:35,39.758274,-84.230507,0.00,R,Montgomery,...,False,False,False,False,False,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320976,MapQuest,241.0,3,2017-04-25 11:53:42,2017-04-25 12:23:16,37.717747,-121.532150,0.01,R,San Joaquin,...,False,False,False,False,False,False,Day,Day,Day,Day
320977,MapQuest,201.0,3,2017-04-25 12:08:17,2017-04-25 12:37:47,37.932465,-122.403290,0.01,R,Contra Costa,...,False,False,False,False,False,False,Day,Day,Day,Day
320978,MapQuest,201.0,3,2017-04-25 12:06:21,2017-04-25 12:35:52,37.799576,-122.222092,0.01,R,Alameda,...,False,False,False,False,False,False,Day,Day,Day,Day
320979,MapQuest,201.0,2,2017-04-25 12:00:56,2017-04-25 12:29:00,37.009869,-121.515793,0.01,R,Santa Clara,...,False,False,False,False,True,False,Day,Day,Day,Day


In [14]:
def get_years(df, column):
    return df[column].apply(lambda date: date[0:4])

def get_months(df, column):
    return df[column].apply(lambda date: date[5:7])

In [15]:
data['Start_Time_Month'] = get_months(data, 'Start_Time')
data['Start_Time_Year'] = get_years(data, 'Start_Time')

data['End_Time_Month'] = get_months(data, 'End_Time')
data['End_Time_Year'] = get_years(data, 'End_Time')

data['Weather_Timestamp_Month'] = get_months(data, 'Weather_Timestamp')
data['Weather_Timestamp_Year'] = get_years(data, 'Weather_Timestamp')


data = data.drop(['Start_Time', 'End_Time', 'Weather_Timestamp'], axis=1)

In [16]:
data

Unnamed: 0,Source,TMC,Severity,Start_Lat,Start_Lng,Distance(mi),Side,County,State,Timezone,...,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Start_Time_Month,Start_Time_Year,End_Time_Month,End_Time_Year,Weather_Timestamp_Month,Weather_Timestamp_Year
0,MapQuest,201.0,2,39.063148,-84.032608,0.01,R,Clermont,OH,US/Eastern,...,Night,Night,Day,Day,02,2016,02,2016,02,2016
1,MapQuest,201.0,3,39.747753,-84.205582,0.01,R,Montgomery,OH,US/Eastern,...,Night,Day,Day,Day,02,2016,02,2016,02,2016
2,MapQuest,201.0,2,39.627781,-84.188354,0.01,R,Montgomery,OH,US/Eastern,...,Day,Day,Day,Day,02,2016,02,2016,02,2016
3,MapQuest,201.0,3,40.100590,-82.925194,0.01,R,Franklin,OH,US/Eastern,...,Day,Day,Day,Day,02,2016,02,2016,02,2016
4,MapQuest,201.0,2,39.758274,-84.230507,0.00,R,Montgomery,OH,US/Eastern,...,Day,Day,Day,Day,02,2016,02,2016,02,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320976,MapQuest,241.0,3,37.717747,-121.532150,0.01,R,San Joaquin,CA,US/Pacific,...,Day,Day,Day,Day,04,2017,04,2017,04,2017
320977,MapQuest,201.0,3,37.932465,-122.403290,0.01,R,Contra Costa,CA,US/Pacific,...,Day,Day,Day,Day,04,2017,04,2017,04,2017
320978,MapQuest,201.0,3,37.799576,-122.222092,0.01,R,Alameda,CA,US/Pacific,...,Day,Day,Day,Day,04,2017,04,2017,04,2017
320979,MapQuest,201.0,2,37.009869,-121.515793,0.01,R,Santa Clara,CA,US/Pacific,...,Day,Day,Day,Day,04,2017,04,2017,04,2017


# Encoding

In [17]:
def onehot_encode(df, columns, prefixes):
    df = df.copy()
    for column, prefix in zip(columns, prefixes):
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [18]:
{column: len(data[column].unique()) for column in data.columns if data.dtypes[column] == 'object'}

{'Source': 2,
 'Side': 3,
 'County': 548,
 'State': 28,
 'Timezone': 4,
 'Airport_Code': 638,
 'Wind_Direction': 23,
 'Weather_Condition': 67,
 'Sunrise_Sunset': 2,
 'Civil_Twilight': 2,
 'Nautical_Twilight': 2,
 'Astronomical_Twilight': 2,
 'Start_Time_Month': 12,
 'Start_Time_Year': 2,
 'End_Time_Month': 12,
 'End_Time_Year': 2,
 'Weather_Timestamp_Month': 12,
 'Weather_Timestamp_Year': 2}

In [19]:
data = onehot_encode(
    data,
    columns=['Side', 'County', 'State', 'Timezone', 'Airport_Code', 'Wind_Direction', 'Weather_Condition'],
    prefixes=['SI', 'CO', 'ST', 'TZ', 'AC', 'WD', 'WC']
)

In [20]:
data

Unnamed: 0,Source,TMC,Severity,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),...,WC_Snow,WC_Snow Grains,WC_Snow Showers,WC_Squalls,WC_T-Storm,WC_Thunder,WC_Thunder in the Vicinity,WC_Thunderstorm,WC_Thunderstorms and Rain,WC_Widespread Dust
0,MapQuest,201.0,2,39.063148,-84.032608,0.01,36.0,100.0,29.67,10.0,...,0,0,0,0,0,0,0,0,0,0
1,MapQuest,201.0,3,39.747753,-84.205582,0.01,35.1,96.0,29.64,9.0,...,0,0,0,0,0,0,0,0,0,0
2,MapQuest,201.0,2,39.627781,-84.188354,0.01,36.0,89.0,29.65,6.0,...,0,0,0,0,0,0,0,0,0,0
3,MapQuest,201.0,3,40.100590,-82.925194,0.01,37.9,97.0,29.63,7.0,...,0,0,0,0,0,0,0,0,0,0
4,MapQuest,201.0,2,39.758274,-84.230507,0.00,34.0,100.0,29.66,7.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320976,MapQuest,241.0,3,37.717747,-121.532150,0.01,60.1,55.0,30.09,10.0,...,0,0,0,0,0,0,0,0,0,0
320977,MapQuest,201.0,3,37.932465,-122.403290,0.01,63.0,52.0,30.05,10.0,...,0,0,0,0,0,0,0,0,0,0
320978,MapQuest,201.0,3,37.799576,-122.222092,0.01,63.0,54.0,30.11,10.0,...,0,0,0,0,0,0,0,0,0,0
320979,MapQuest,201.0,2,37.009869,-121.515793,0.01,62.6,48.0,30.11,10.0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
def get_binary_column(df, column):
    if column == 'Source':
        return df[column].apply(lambda x: 1 if x == 'MapQuest' else 0)
    else:
        return df[column].apply(lambda x: 1 if x == 'Day' else 0)

In [22]:
data['Source'] = get_binary_column(data, 'Source')

data['Sunrise_Sunset'] = get_binary_column(data, 'Sunrise_Sunset')
data['Civil_Twilight'] = get_binary_column(data, 'Civil_Twilight')
data['Nautical_Twilight'] = get_binary_column(data, 'Nautical_Twilight')
data['Astronomical_Twilight'] = get_binary_column(data, 'Astronomical_Twilight')

In [23]:
data

Unnamed: 0,Source,TMC,Severity,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),...,WC_Snow,WC_Snow Grains,WC_Snow Showers,WC_Squalls,WC_T-Storm,WC_Thunder,WC_Thunder in the Vicinity,WC_Thunderstorm,WC_Thunderstorms and Rain,WC_Widespread Dust
0,1,201.0,2,39.063148,-84.032608,0.01,36.0,100.0,29.67,10.0,...,0,0,0,0,0,0,0,0,0,0
1,1,201.0,3,39.747753,-84.205582,0.01,35.1,96.0,29.64,9.0,...,0,0,0,0,0,0,0,0,0,0
2,1,201.0,2,39.627781,-84.188354,0.01,36.0,89.0,29.65,6.0,...,0,0,0,0,0,0,0,0,0,0
3,1,201.0,3,40.100590,-82.925194,0.01,37.9,97.0,29.63,7.0,...,0,0,0,0,0,0,0,0,0,0
4,1,201.0,2,39.758274,-84.230507,0.00,34.0,100.0,29.66,7.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320976,1,241.0,3,37.717747,-121.532150,0.01,60.1,55.0,30.09,10.0,...,0,0,0,0,0,0,0,0,0,0
320977,1,201.0,3,37.932465,-122.403290,0.01,63.0,52.0,30.05,10.0,...,0,0,0,0,0,0,0,0,0,0
320978,1,201.0,3,37.799576,-122.222092,0.01,63.0,54.0,30.11,10.0,...,0,0,0,0,0,0,0,0,0,0
320979,1,201.0,2,37.009869,-121.515793,0.01,62.6,48.0,30.11,10.0,...,0,0,0,0,0,0,0,0,0,0


# Splitting/Scaling

In [24]:
y = data['Severity'].copy()
X = data.drop('Severity', axis=1).copy()

In [25]:
y.unique()

array([2, 3, 1, 4])

In [26]:
y = y - 1

In [27]:
X = X.astype(np.float)

In [28]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

# Training

In [30]:
X.shape

(320981, 1344)

In [31]:
inputs = tf.keras.Input(shape=(X.shape[1],))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(4, activation='softmax')(x)

model = tf.keras.Model(inputs, outputs)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

batch_size = 32
epochs = 20

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


# Results

In [32]:
print("Test Accuracy:", model.evaluate(X_test, y_test, verbose=0)[1])

Test Accuracy: 0.8110182285308838


# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/hB6Wx7HX0c4