In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict

import pandas as pd

In [2]:
df_accidents = pd.read_csv('data/MA3_finished.csv', index_col='ID')

In [3]:
df_accidents

Unnamed: 0_level_0,Unnamed: 0,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Street,City,...,Nautical_Twilight,Astronomical_Twilight,Year,Month,Day,Hour,Duration,State Name,Region,Division
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.010,Accident on OH-32 State Route 32 Westbound at ...,State Route 32,Williamsburg,...,Day,Day,2016,2,0,6,1800.0,Ohio,Midwest,East North Central
1,1,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.010,Accident on I-75 Southbound at Exits 52 52B US...,I-75 S,Dayton,...,Day,Day,2016,2,0,7,1800.0,Ohio,Midwest,East North Central
2,2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.010,Accident on McEwen Rd at OH-725 Miamisburg Cen...,Miamisburg Centerville Rd,Dayton,...,Day,Day,2016,2,0,7,1800.0,Ohio,Midwest,East North Central
3,3,3,2016-02-08 07:44:26,2016-02-08 08:14:26,40.100590,-82.925194,0.010,Accident on I-270 Outerbelt Northbound near Ex...,Westerville Rd,Westerville,...,Day,Day,2016,2,0,7,1800.0,Ohio,Midwest,East North Central
4,4,2,2016-02-08 07:59:35,2016-02-08 08:29:35,39.758274,-84.230507,0.000,Accident on Oakridge Dr at Woodward Ave. Expec...,N Woodward Ave,Dayton,...,Day,Day,2016,2,0,7,1800.0,Ohio,Midwest,East North Central
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7051551,7051551,2,2019-08-23 18:03:25,2019-08-23 18:32:01,34.002480,-117.379360,0.543,At Market St - Accident.,Pomona Fwy E,Riverside,...,Day,Day,2019,8,4,18,1716.0,California,West,Pacific
7051552,7051552,2,2019-08-23 19:11:30,2019-08-23 19:38:23,32.766960,-117.148060,0.338,At Camino Del Rio/Mission Center Rd - Accident.,I-8 W,San Diego,...,Day,Day,2019,8,4,19,1613.0,California,West,Pacific
7051553,7051553,2,2019-08-23 19:00:21,2019-08-23 19:28:49,33.775450,-117.847790,0.561,At Glassell St/Grand Ave - Accident. in the ri...,Garden Grove Fwy,Orange,...,Day,Day,2019,8,4,19,1708.0,California,West,Pacific
7051554,7051554,2,2019-08-23 19:00:21,2019-08-23 19:29:42,33.992460,-118.403020,0.772,At CA-90/Marina Fwy/Jefferson Blvd - Accident.,San Diego Fwy S,Culver City,...,Day,Day,2019,8,4,19,1761.0,California,West,Pacific


In [4]:
df_accidents = df_accidents.sample(n=35000, random_state=1)

In [5]:
#featureselection
#features = df_accidents[['Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
#               'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
#               'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop']]

features = df_accidents[['Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
                         'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop']]

In [6]:
#Encoding text/categorical variables
le = LabelEncoder()
features['Weather_Condition']= le.fit_transform(features['Weather_Condition'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['Weather_Condition']= le.fit_transform(features['Weather_Condition'])


In [7]:
#target
target = df_accidents['Severity']

In [8]:
#(80/20 split)
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [9]:
#Create a pipeline
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False), threshold='mean')),
    ('clf', SVC(max_iter=5000))  
])

In [10]:
#parameters for gridSearch
parameters = {
    'feature_selection__threshold': (None, 'mean'), 
    'clf__kernel': ('linear', 'rbf')
}

#Perform GridSearch
grid_search = GridSearchCV(pipe, param_grid=parameters, verbose=10)
grid_search.fit(features_train, target_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START clf__kernel=linear, feature_selection__threshold=None.......




[CV 1/5; 1/4] END clf__kernel=linear, feature_selection__threshold=None;, score=0.438 total time=   4.5s
[CV 2/5; 1/4] START clf__kernel=linear, feature_selection__threshold=None.......




[CV 2/5; 1/4] END clf__kernel=linear, feature_selection__threshold=None;, score=0.345 total time=   4.4s
[CV 3/5; 1/4] START clf__kernel=linear, feature_selection__threshold=None.......




[CV 3/5; 1/4] END clf__kernel=linear, feature_selection__threshold=None;, score=0.613 total time=   4.4s
[CV 4/5; 1/4] START clf__kernel=linear, feature_selection__threshold=None.......




[CV 4/5; 1/4] END clf__kernel=linear, feature_selection__threshold=None;, score=0.377 total time=   4.4s
[CV 5/5; 1/4] START clf__kernel=linear, feature_selection__threshold=None.......




[CV 5/5; 1/4] END clf__kernel=linear, feature_selection__threshold=None;, score=0.337 total time=   4.3s
[CV 1/5; 2/4] START clf__kernel=linear, feature_selection__threshold=mean.......




[CV 1/5; 2/4] END clf__kernel=linear, feature_selection__threshold=mean;, score=0.288 total time=   4.5s
[CV 2/5; 2/4] START clf__kernel=linear, feature_selection__threshold=mean.......




[CV 2/5; 2/4] END clf__kernel=linear, feature_selection__threshold=mean;, score=0.406 total time=   4.0s
[CV 3/5; 2/4] START clf__kernel=linear, feature_selection__threshold=mean.......




[CV 3/5; 2/4] END clf__kernel=linear, feature_selection__threshold=mean;, score=0.651 total time=   4.2s
[CV 4/5; 2/4] START clf__kernel=linear, feature_selection__threshold=mean.......




[CV 4/5; 2/4] END clf__kernel=linear, feature_selection__threshold=mean;, score=0.533 total time=   4.7s
[CV 5/5; 2/4] START clf__kernel=linear, feature_selection__threshold=mean.......
[CV 5/5; 2/4] END clf__kernel=linear, feature_selection__threshold=mean;, score=0.806 total time=   4.5s
[CV 1/5; 3/4] START clf__kernel=rbf, feature_selection__threshold=None..........




[CV 1/5; 3/4] END clf__kernel=rbf, feature_selection__threshold=None;, score=0.806 total time=  14.2s
[CV 2/5; 3/4] START clf__kernel=rbf, feature_selection__threshold=None..........




[CV 2/5; 3/4] END clf__kernel=rbf, feature_selection__threshold=None;, score=0.806 total time=  13.9s
[CV 3/5; 3/4] START clf__kernel=rbf, feature_selection__threshold=None..........




[CV 3/5; 3/4] END clf__kernel=rbf, feature_selection__threshold=None;, score=0.806 total time=  14.4s
[CV 4/5; 3/4] START clf__kernel=rbf, feature_selection__threshold=None..........




[CV 4/5; 3/4] END clf__kernel=rbf, feature_selection__threshold=None;, score=0.806 total time=  14.8s
[CV 5/5; 3/4] START clf__kernel=rbf, feature_selection__threshold=None..........




[CV 5/5; 3/4] END clf__kernel=rbf, feature_selection__threshold=None;, score=0.806 total time=  13.9s
[CV 1/5; 4/4] START clf__kernel=rbf, feature_selection__threshold=mean..........




[CV 1/5; 4/4] END clf__kernel=rbf, feature_selection__threshold=mean;, score=0.806 total time=  11.3s
[CV 2/5; 4/4] START clf__kernel=rbf, feature_selection__threshold=mean..........




[CV 2/5; 4/4] END clf__kernel=rbf, feature_selection__threshold=mean;, score=0.806 total time=  10.8s
[CV 3/5; 4/4] START clf__kernel=rbf, feature_selection__threshold=mean..........




[CV 3/5; 4/4] END clf__kernel=rbf, feature_selection__threshold=mean;, score=0.806 total time=  12.3s
[CV 4/5; 4/4] START clf__kernel=rbf, feature_selection__threshold=mean..........




[CV 4/5; 4/4] END clf__kernel=rbf, feature_selection__threshold=mean;, score=0.806 total time=  11.9s
[CV 5/5; 4/4] START clf__kernel=rbf, feature_selection__threshold=mean..........




[CV 5/5; 4/4] END clf__kernel=rbf, feature_selection__threshold=mean;, score=0.806 total time=  11.8s




In [11]:
target_predicted = grid_search.predict(features_test)

In [12]:
#Evaluate the model
print("Default-Score des Klassifizierers: Accuracy=", grid_search.score(features_test, target_test), "\n")
print(classification_report(target_test, target_predicted))

Default-Score des Klassifizierers: Accuracy= 0.8011428571428572 

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        64
           2       0.80      1.00      0.89      5608
           3       0.00      0.00      0.00      1163
           4       0.00      0.00      0.00       165

    accuracy                           0.80      7000
   macro avg       0.20      0.25      0.22      7000
weighted avg       0.64      0.80      0.71      7000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
