In [1]:
import statsmodels.api as sm
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from category_encoders import TargetEncoder

In [2]:
#read in data
accidents = pd.read_csv('data/MA3_finished.csv', index_col='ID')
accidents = accidents.sample(frac=0.1)

In [3]:
#look at data
print(accidents.head)

<bound method NDFrame.head of          Unnamed: 0  Severity           Start_Time             End_Time  \
ID                                                                        
2486886     2486886         2  2018-06-14 16:09:59  2018-06-14 17:09:37   
772867       772867         2  2021-09-20 06:52:39  2021-09-20 07:22:13   
3522166     3522166         2  2022-03-10 16:10:06  2022-03-10 16:38:59   
4839643     4839643         2  2022-08-27 12:31:41  2022-08-27 13:34:28   
5365303     5365303         2  2021-07-29 08:10:35  2021-07-29 08:36:00   
...             ...       ...                  ...                  ...   
6473153     6473153         4  2020-03-20 18:02:35  2020-03-20 18:37:33   
1315603     1315603         2  2020-05-30 10:55:23  2020-05-30 12:29:35   
4169049     4169049         4  2022-11-05 21:17:00  2022-11-06 01:00:49   
3340293     3340293         2  2022-10-07 05:53:00  2022-10-07 08:02:32   
4394540     4394540         2  2022-09-23 16:58:05  2022-09-23 20:30:0

In [4]:
#create copy for this file and drop unnamed column name
regression_df = accidents.copy()
regression_df.drop('Unnamed: 0', axis=1, inplace=True)
#check new df
regression_df.head()

Unnamed: 0_level_0,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Street,City,County,...,Nautical_Twilight,Astronomical_Twilight,Year,Month,Day,Hour,Duration,State Name,Region,Division
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2486886,2,2018-06-14 16:09:59,2018-06-14 17:09:37,30.676197,-88.127373,0.0,Accident on County Hwy-56 Airport Blvd Westbou...,I-65 S,Mobile,Mobile,...,Day,Day,2018,6,3,16,3578.0,Alabama,South,East South Central
772867,2,2021-09-20 06:52:39,2021-09-20 07:22:13,33.64711,-84.397804,0.0,Right hand shoulder blocked due to accident on...,GA-401 N,Atlanta,Clayton,...,Day,Day,2021,9,0,6,1774.0,Georgia,South,South Atlantic
3522166,2,2022-03-10 16:10:06,2022-03-10 16:38:59,35.269857,-80.79435,0.464,Stationary traffic from US-29/NC-49/N Tryon St...,Munsee St,Charlotte,Mecklenburg,...,Day,Day,2022,3,3,16,1733.0,North Carolina,South,South Atlantic
4839643,2,2022-08-27 12:31:41,2022-08-27 13:34:28,30.378275,-97.688442,0.278,Slow traffic on TX-275 Loop from W Grady Dr (N...,N Lamar Blvd,Austin,Travis,...,Day,Day,2022,8,5,12,3767.0,Texas,South,West South Central
5365303,2,2021-07-29 08:10:35,2021-07-29 08:36:00,33.70543,-117.855026,1.939,Slow traffic on Costa Mesa Fwy S - CA-55 S fro...,CA-55 S,Santa Ana,Orange,...,Day,Day,2021,7,3,8,1525.0,California,West,Pacific


In [5]:
#describe the data
regression_df.describe()

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),Temperature(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Year,Month,Day,Hour,Duration
count,705156.0,705156.0,705156.0,705156.0,705156.0,705156.0,705156.0,705156.0,705156.0,705156.0,705156.0,705156.0,705156.0,705156.0
mean,2.202698,36.193256,-94.567841,0.569845,61.902302,64.376226,29.51416,9.101926,7.69102,2020.028401,6.683036,2.599813,12.439467,26096.51
std,0.480788,5.117869,17.306861,1.730488,19.04247,22.777913,1.014116,2.643746,5.563616,1.865549,3.635368,1.806146,5.428459,795616.8
min,1.0,24.5548,-124.535893,0.0,-35.0,1.0,2.99,0.0,0.0,2016.0,1.0,0.0,0.0,180.0
25%,2.0,33.315374,-117.159039,0.0,49.5,48.0,29.34,10.0,4.6,2019.0,3.0,1.0,8.0,2059.0
50%,2.0,35.819729,-87.664725,0.037,64.0,66.0,29.84,10.0,7.0,2020.0,7.0,3.0,13.0,4502.0
75%,2.0,40.121761,-80.369779,0.479,76.0,84.0,30.02,10.0,10.4,2022.0,10.0,4.0,17.0,7546.0
max,4.0,48.999569,-68.13219,175.679993,196.0,100.0,58.39,100.0,822.8,2023.0,12.0,6.0,23.0,134181300.0


In [6]:
#Wind_Speed 822 mph --> outliner
#print(regression_df.index[accidents ['Wind_Speed(mph)'] >822].tolist())
#print(regression_df.loc[[86481]])
#regression_df.drop(labels=[86481],axis=0, inplace=True)

In [7]:
#Wind_Speed 243 mph --> outliner
#print(regression_df.index[regression_df ['Wind_Speed(mph)'] >242].tolist())
#print (regression_df.loc[[776748]])
#regression_df.drop(labels=[776748],axis=0, inplace=True)

In [8]:
regression_df.dtypes

Severity                   int64
Start_Time                object
End_Time                  object
Start_Lat                float64
Start_Lng                float64
Distance(mi)             float64
Description               object
Street                    object
City                      object
County                    object
State                     object
Zipcode                   object
Country                   object
Timezone                  object
Airport_Code              object
Temperature(F)           float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi)           float64
Wind_Direction            object
Wind_Speed(mph)          float64
Weather_Condition         object
Amenity                     bool
Bump                        bool
Crossing                    bool
Give_Way                    bool
Junction                    bool
No_Exit                     bool
Railway                     bool
Roundabout                  bool
Station   

In [9]:
#Rename Columns with () or ' ' for the equation --> otherwise it leads to PatsyError
regression_df = regression_df.rename(columns={
    'Temperature(F)': 'Temperature',
    'Humidity(%)': 'Humidity',
    'Visibility(mi)': 'Visibility',
    'Pressure(in)': 'Pressure',
    'Wind_Speed(mph)': 'Wind_Speed',
    'Distance(mi)': 'Distance',
    'State Name': 'StateName',
})

In [10]:
#basic linearRegressionModel
severity_basic = ols("Severity ~ Weather_Condition", regression_df).fit()
print(severity_basic.summary())

                            OLS Regression Results                            
Dep. Variable:               Severity   R-squared:                       0.032
Model:                            OLS   Adj. R-squared:                  0.032
Method:                 Least Squares   F-statistic:                     205.7
Date:                Fri, 05 Jan 2024   Prob (F-statistic):               0.00
Time:                        13:44:57   Log-Likelihood:            -4.7273e+05
No. Observations:              705156   AIC:                         9.457e+05
Df Residuals:                  705042   BIC:                         9.470e+05
Df Model:                         113                                         
Covariance Type:            nonrobust                                         
                                                          coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------

In [15]:
severity_full = ols("Severity ~ Start_Lat + Start_Lng + Distance + Temperature + Humidity + Pressure + Visibility + Wind_Speed + Amenity + Bump + Crossing + Give_Way + Junction + No_Exit + Railway + Roundabout + Station + Stop + Traffic_Calming + Traffic_Signal + Turning_Loop + Year + Month + Day + Hour + Duration", regression_df).fit()
print(severity_full.summary())

                            OLS Regression Results                            
Dep. Variable:               Severity   R-squared:                       0.102
Model:                            OLS   Adj. R-squared:                  0.102
Method:                 Least Squares   F-statistic:                     3214.
Date:                Fri, 05 Jan 2024   Prob (F-statistic):               0.00
Time:                        13:49:38   Log-Likelihood:            -4.4611e+05
No. Observations:              705156   AIC:                         8.923e+05
Df Residuals:                  705130   BIC:                         8.926e+05
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 