# Library Import

In [None]:
#Importing libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.tree import plot_tree

# Reading data and cleaning

In [None]:
#Reading in the data
#NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists
accidents_csv_path = '/Users/pedropereira/Downloads/US_Accidents_June20.csv'
accidents = pd.read_csv(accidents_csv_path)

In [None]:
#Working on an independent copy, so a fresh copy of the original data can be taken
#from `accidents` later without re-reading the CSV
df = accidents.copy(deep=True)

In [None]:
#Displaying a sample of the data (first five rows)
df.head(n=5)

In [None]:
#Viewing the columns and their data types
df.info()

In [None]:
#Selecting columns to drop
delcols = ['ID','TMC','Source', 'End_Lat', 'End_Lng', 'Number', 'Street', 'Airport_Code', 'Weather_Timestamp', 'Civil_Twilight', 
           'Nautical_Twilight', 'Astronomical_Twilight']
#Durations (in hours) outside [DURATION_MIN_HOURS, DURATION_MAX_HOURS] are treated as outliers
#and overwritten with DURATION_FILL_HOURS
DURATION_MIN_HOURS = 0
DURATION_MAX_HOURS = 5
DURATION_FILL_HOURS = 0.74

#Dropping columns
df.drop(delcols, axis=1, inplace=True)
#Replacing boolean values with 1/0 (only the bool-typed columns are touched)
bool_cols = df.select_dtypes(include='bool').columns
if len(bool_cols) > 0:
    df[bool_cols] = df[bool_cols].astype(int)
#Replacing null values in the numeric columns with the column median.
#numeric_only=True is required: the frame still holds text columns, and pandas >= 2.0
#raises TypeError when asked for the median of those
df.fillna(df.median(numeric_only=True), inplace=True)
#Converting start and end time to datetime data type
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
df['End_Time'] = pd.to_datetime(df['End_Time'])
#Converting severity to factor
#NOTE(review): object dtype is kept as in the original; confirm downstream numeric use
#(e.g. the mean severity per hour) and the classifiers cope with an object column
df['Severity'] = df['Severity'].astype(object)
#Calculating duration in hours
df['Duration'] = (df['End_Time'] - df['Start_Time']) / np.timedelta64(1, 'h')
#Replacing outliers
is_outlier = (df['Duration'] > DURATION_MAX_HOURS) | (df['Duration'] < DURATION_MIN_HOURS)
df.loc[is_outlier, 'Duration'] = DURATION_FILL_HOURS
#The raw timestamps and the free-text description are not needed once Duration exists
df.drop(columns=['Start_Time', 'End_Time', 'Description'], inplace = True)
#Second pass so nulls in the numeric columns (now including Duration) are filled as well
df.fillna(df.median(numeric_only=True), inplace=True)

# EDA

In [None]:
#Share of accidents in each severity level
df['Severity'].value_counts(normalize=True)

In [None]:
#Plotting how often each road structure is flagged, as a percentage of all rows
structure_cols = ['Turning_Loop', 'Traffic_Signal', 'Traffic_Calming', 'Stop', 'Station', 'Roundabout', 'Railway',
                  'No_Exit', 'Junction', 'Give_Way', 'Crossing', 'Bump', 'Amenity']
structural_conditions = df[structure_cols]
total = len(structural_conditions)
#Column sums give the number of rows where each flag is set
z = structural_conditions.sum(axis=0)
x = z.rename('Frequency').rename_axis('Structure').reset_index()
x['Proportion'] = x['Frequency'] / total * 100
x = x.sort_values(by=['Proportion'], ascending=False)
plt.figure(figsize=(10,10))
sns.barplot(data=x, x='Structure', y='Proportion')
plt.xticks(rotation=90)

In [None]:
#Counting the distinct values in each of the remaining text columns
#(no figure is opened here: the cell shows a table, not a plot)
categorical_cols = ["Side", "City", "County", "State", "Country", "Timezone", "Weather_Condition", "Zipcode"]
un = df[categorical_cols].nunique().reset_index()
un.columns = ['feature','nunique']
un = un.sort_values(by=["nunique"])
un

In [None]:
#Columns kept out of the feature matrix (the target itself plus the listed location,
#weather-text and structure columns)
severity_excluded_cols = ['Severity', 'City', 'County', 'Zipcode', 'Country', 'Weather_Condition', 'Wind_Direction',
                          'Roundabout', 'Bump', 'Turning_Loop', 'Traffic_Calming', 'State', 'Sunrise_Sunset', 'Timezone',
                          'Stop', 'Amenity', 'Give_Way', 'No_Exit', 'Station', 'Railway']
#Dependent column
y = df['Severity']
#Independent columns, with the remaining categorical variables one hot encoded
X = pd.get_dummies(df.drop(columns=severity_excluded_cols))


In [None]:
from datetime import datetime
#Creating hour column
#Start_Time is dropped from df during cleaning, so it is read from the untouched `accidents`
#frame; df is a copy of it, so the assignment lines up on the shared row index
df['hour'] = pd.to_datetime(accidents['Start_Time']).dt.hour

In [None]:
#Creating the day (of month) column
#Start_Time is dropped from df during cleaning, so it is read from the untouched `accidents`
#frame; df is a copy of it, so the assignment lines up on the shared row index
df['day'] = pd.to_datetime(accidents['Start_Time']).dt.day

In [None]:
#Creating the month column
#Start_Time is dropped from df during cleaning, so it is read from the untouched `accidents`
#frame; df is a copy of it, so the assignment lines up on the shared row index
df['month'] = pd.to_datetime(accidents['Start_Time']).dt.month

In [None]:
#Plot by hour
import seaborn as sns
plt.figure(figsize=(7,7))
plt.title('Accident Frequency by Hour')
#x must be given by keyword: seaborn >= 0.12 treats a positional Series as `data`, not as x
sns.countplot(x='hour', data=df)

In [None]:
#Plot by month (bars appear in descending order of accident count)
plt.figure(figsize=(7,7))
plt.title('Accidents by Month')
month_counts = df['month'].value_counts()
month_counts.plot.bar()

In [None]:
#Plot by day
plt.figure(figsize=(7,7))
plt.title('Accident Frequency by Day')
#x must be given by keyword: seaborn >= 0.12 treats a positional Series as `data`, not as x
sns.countplot(x='day', data=df)

In [None]:
#Plot severity by hour
plt.figure(figsize=(7,7))
#Severity is stored with object dtype after cleaning; converting it to numeric first keeps
#the per-hour mean a plain numeric aggregation
severity_numeric = pd.to_numeric(df['Severity'])
x = (
    severity_numeric
    .groupby(df['hour'])
    .mean()
    .reset_index()
    .sort_values(by = 'Severity', ascending = False)
)
plt.title('Accident Severity by Hour')
#x is already sorted by mean severity, so its hour column is the bar order
sns.barplot(data = x, x = 'hour', y = 'Severity', order = x['hour'])

In [None]:
#Full name -> two-letter abbreviation for the US states, DC and the listed territories
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'American Samoa': 'AS', 'Arizona': 'AZ',
    'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE', 'District of Columbia': 'DC', 'Florida': 'FL', 'Georgia': 'GA',
    'Guam': 'GU', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL',
    'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY',
    'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA',
    'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
    'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH',
    'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
    'North Dakota': 'ND', 'Northern Mariana Islands': 'MP', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Puerto Rico': 'PR', 'Rhode Island': 'RI',
    'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX',
    'Utah': 'UT', 'Vermont': 'VT', 'Virgin Islands': 'VI', 'Virginia': 'VA',
    'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
}


In [None]:
#Reading population file
pop = pd.read_csv('Desktop/Population.csv')
#Replacing the full state names in the pop file with their abbreviations.
#The result is assigned back to the column: an inplace replace on `pop['State']` works on
#a temporary object and leaves `pop` unchanged under pandas Copy-on-Write
pop['State'] = pop['State'].replace(us_state_abbrev)

In [None]:
#Joining data by the abbreviations and aggregating count by population proportion
#NOTE(review): the `x` bound by the cells above is the hour/Severity frame, which has no
#'prop', 'State' or 'Start_Lat' columns; presumably a cell that built a per-state count
#frame is missing from the notebook - confirm before relying on this cell
x = pd.DataFrame(x)
x.reset_index(inplace = True)
#Scales the existing 'prop' column down by 10 and rounds it
x.prop = round(x.prop / 10)
#The sorted frame returned here is not assigned, so `x` keeps its order
x.sort_values(by='prop', ascending = False)
x.rename(columns={'Start_Lat':'Count'}, inplace = True)
#Same as above: the result is discarded
x.sort_values(by='prop', ascending = False)
#Default (inner) merge on the shared 'State' column of `x` and `pop`
x = x.merge(pop, left_on = 'State', right_on = 'State')
# Random Forest

In [None]:
#Drawing a 70% sample of the rows without replacement (seeded, so the draw is repeatable)
#NOTE(review): this rebinds df to rows of the raw `accidents` frame, i.e. without the
#cleaning applied earlier; confirm that is intended
SAMPLE_FRACTION = 0.70
df = accidents.sample(frac=SAMPLE_FRACTION, random_state=1)

In [None]:
#Holding out 20% of the rows for testing; the remaining 80% train the model
from sklearn.model_selection import train_test_split
rf_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = rf_split

In [None]:
from sklearn.ensemble import RandomForestClassifier
#Building a forest of 20 trees, each at most 20 levels deep, and fitting it on the training set
rf = RandomForestClassifier(n_estimators=20, max_depth=20, random_state=0)
rf.fit(X_train, y_train)
#Predicting the held-out rows
y_pred_test = rf.predict(X_test)
#Accuracy on the test set, rounded to four decimals
rf_accuracy = rf.score(X_test, y_test)
round(rf_accuracy, 4)

In [None]:
#Displaying more detailed metrics and visual confusion matrix
from sklearn import metrics
print("Classification report for classifier %s:\n%s\n"
      % (rf, metrics.classification_report(y_test, y_pred_test)))
#metrics.plot_confusion_matrix was removed in scikit-learn 1.2. The matrix is built from the
#predictions already made, with rows/columns fixed to the model's class order
cm = metrics.confusion_matrix(y_test, y_pred_test, labels=rf.classes_)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf.classes_).plot()
disp.figure_.suptitle("Confusion Matrix")
print("Confusion matrix:\n%s" % disp.confusion_matrix)

plt.show()

In [None]:
#Horizontal bar chart of the forest's feature importances, smallest first
importances = rf.feature_importances_
sorted_idx = importances.argsort()
plt.figure(figsize=(10,10))
plt.barh(X.columns[sorted_idx], importances[sorted_idx])
plt.xlabel("Random Forest Feature Importance")

# Decision Tree

In [None]:
#Columns kept out of the feature matrix (the target, Duration and Severity, plus the same
#location, weather-text and structure columns excluded earlier)
#NOTE(review): no visible cell creates 'Severe_Delay'; confirm where it is derived
delay_excluded_cols = ['Severity', 'City', 'County', 'Zipcode', 'Country', 'Weather_Condition', 'Wind_Direction',
                       'Roundabout', 'Bump', 'Turning_Loop', 'Traffic_Calming', 'State', 'Sunrise_Sunset', 'Timezone',
                       'Stop', 'Amenity', 'Give_Way', 'No_Exit', 'Station', 'Railway', 'Duration', 'Severe_Delay']
#Dependent column
y = df['Severe_Delay']
#Independent columns, with the remaining categorical variables one hot encoded
X = pd.get_dummies(df.drop(columns=delay_excluded_cols))

In [None]:
#Holding out 20% of the rows for testing; the remaining 80% train the model
from sklearn.model_selection import train_test_split
dt_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = dt_split

In [None]:
#Training model and making predictions
#The class is imported directly: `tree.DecisionTreeClassifier(...)` looked the class up on the
#very name this cell rebinds, so running the cell a second time raised AttributeError.
#The name `tree` is kept for the fitted model because the plotting cell below uses it
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth = 3)
y_pred = tree.fit(X_train, y_train).predict(X_test)

#Displaying algorithm metrics and confusion matrix plot
print("Classification report for classifier %s:\n%s\n"
      % (tree, metrics.classification_report(y_test, y_pred)))
#metrics.plot_confusion_matrix was removed in scikit-learn 1.2. The matrix is built from the
#predictions already made, with rows/columns fixed to the model's class order
cm = metrics.confusion_matrix(y_test, y_pred, labels=tree.classes_)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree.classes_).plot()
disp.figure_.suptitle("Confusion Matrix")
print("Confusion matrix:\n%s" % disp.confusion_matrix)

plt.show()

In [None]:
#Displaying decision tree
plt.figure(figsize=(15,15))
from sklearn.tree import plot_tree
#class_names has to be a sequence with one label per class: a single string such as "[0 1]"
#is indexed character by character, which labelled the nodes '[' and '0'.
#feature_names is passed as a plain list, which every scikit-learn version accepts
a = plot_tree(tree, 
              feature_names=list(X.columns), 
              class_names=[str(label) for label in tree.classes_], 
              filled=True)
              