In [1]:
# import dependencies

import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np


# may delete or move these these once model is more defined
#  moved down to split section-> from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

In [2]:
# import data and create dataframe
# current path uses preliminary accident file and not real data output including vehicles
data = Path('Resources/accident.csv')
df = pd.read_csv(data)
df.head(5)

Unnamed: 0,CASENUM,STRATUM,STRATUMNAME,REGION,REGIONNAME,PSU,PJ,PSU_VAR,URBANICITY,URBANICITYNAME,...,WEATHR_IM,WEATHR_IMNAME,MAXSEV_IM,MAXSEV_IMNAME,NO_INJ_IM,NO_INJ_IMNAME,ALCHL_IM,ALCHL_IMNAME,PSUSTRAT,WEIGHT
0,202002121240,9,Stratum 9 - LMY PV No Injuries in Crash,4,"West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, ...",20,4140,20,2,Rural Area,...,10,Cloudy,0,No Apparent Injury (O),0,No Person Injured/Property Damage Only Crash,2,No Alcohol Involved,25,161.35828
1,202002121829,8,Stratum 8 - NLMY PV Minor Injury,3,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",28,4139,28,1,Urban Area,...,1,Clear,2,Suspected Minor Injury (B),1,1,2,No Alcohol Involved,11,131.962215
2,202002121849,9,Stratum 9 - LMY PV No Injuries in Crash,3,"South (MD, DE, DC, WV, VA, KY, TN, NC, SC, GA,...",12,4142,12,1,Urban Area,...,1,Clear,0,No Apparent Injury (O),0,No Person Injured/Property Damage Only Crash,2,No Alcohol Involved,12,165.957768
3,202002123484,10,Stratum 10 - Other,4,"West (MT, ID, WA, OR, CA, NV, NM, AZ, UT, CO, ...",20,4140,20,2,Rural Area,...,4,Snow,0,No Apparent Injury (O),0,No Person Injured/Property Damage Only Crash,2,No Alcohol Involved,25,214.206572
4,202002123576,10,Stratum 10 - Other,1,"Northeast (PA, NJ, NY, NH, VT, RI, MA, ME, CT)",22,4149,22,2,Rural Area,...,4,Snow,0,No Apparent Injury (O),0,No Person Injured/Property Damage Only Crash,2,No Alcohol Involved,5,220.296289


In [3]:
# separate targets

y = df["MAXSEV_IM"]
y.tail(5)


54740    2
54741    1
54742    2
54743    0
54744    0
Name: MAXSEV_IM, dtype: int64

In [4]:
#create resulting features dataframe
X_df = df[["STRATUM","REGION","URBANICITY","MONTH","YEAR","DAY_WEEK","HARM_EV","ALCOHOL","MAX_SEV","MAN_COLL","RELJCT1","RELJCT2","TYP_INT","WRK_ZONE","REL_ROAD","LGT_COND","INT_HWY","WEATHER","WKDY_IM","EVENT1_IM","MANCOL_IM","RELJCT1_IM","RELJCT2_IM","LGTCON_IM","WEATHR_IM","ALCHL_IM",
]]
X_df.tail(5)

Unnamed: 0,STRATUM,REGION,URBANICITY,MONTH,YEAR,DAY_WEEK,HARM_EV,ALCOHOL,MAX_SEV,MAN_COLL,...,INT_HWY,WEATHER,WKDY_IM,EVENT1_IM,MANCOL_IM,RELJCT1_IM,RELJCT2_IM,LGTCON_IM,WEATHR_IM,ALCHL_IM
54740,6,3,2,12,2020,4,52,2,2,0,...,0,10,4,52,0,0,1,1,10,2
54741,8,3,1,11,2020,5,11,2,1,0,...,0,1,5,11,0,0,1,2,1,2
54742,8,3,1,11,2020,3,34,9,2,0,...,0,1,3,34,0,0,1,1,1,2
54743,10,3,1,11,2020,2,12,2,0,6,...,0,1,2,12,6,1,2,2,1,2
54744,7,3,1,12,2020,2,12,2,0,6,...,0,1,2,12,6,0,8,3,1,2


In [5]:
# partition data into training and test data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, 
                                                    y, random_state =1,test_size = .2)
X_train.shape

(43796, 26)

In [6]:
# create the model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=400,
                                random_state=1)

In [7]:
# train the model
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=400, random_state=1)

In [8]:
# make preditions with the model
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,2,3
1,3,3
2,2,3
3,2,3
4,1,2
5,0,0
6,0,0
7,0,0
8,2,0
9,0,0


In [11]:

df_results=pd.DataFrame(results)
df_results.to_csv('results.csv')

In [12]:
# validate the model

In [13]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.6165859895880902


In [14]:
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[4313  489   58   38    0    0    0]
 [ 742 1334  331   47    0    0    0]
 [ 252  926  662  106    0    0    0]
 [  20  118  724  442    0    0    0]
 [   4    1  122  176    0    0    0]
 [   4    0   20   14    0    0    0]
 [   0    0    2    4    0    0    0]]


In [15]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.88      0.84      4898
           1       0.47      0.54      0.50      2454
           2       0.34      0.34      0.34      1946
           3       0.53      0.34      0.41      1304
           4       0.00      0.00      0.00       303
           5       0.00      0.00      0.00        38
           8       0.00      0.00      0.00         6

    accuracy                           0.62     10949
   macro avg       0.31      0.30      0.30     10949
weighted avg       0.59      0.62      0.60     10949



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
