In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict

import pandas as pd

In [2]:
df_accidents = pd.read_csv('data/MA4_finished.csv', index_col='ID')

In [3]:
df_accidents = df_accidents.dropna()

In [4]:
df_accidents = df_accidents.sample(n=35000, random_state=1)

In [5]:
#featureselection
features = df_accidents[['Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
               'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
               'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop']]

In [6]:
#Encoding text/categorical variables
le = LabelEncoder()
features['Weather_Condition']= le.fit_transform(features['Weather_Condition'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['Weather_Condition']= le.fit_transform(features['Weather_Condition'])


In [7]:
#target
target = df_accidents['Severity']

In [8]:
#(80/20 split)
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [9]:
#Create a pipeline
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False), threshold='mean')),
    ('clf', SVC(max_iter=5000))  
])

In [10]:
#parameters for gridSearch
parameters = {
    'feature_selection__threshold': (None, 'mean'), 
    'clf__kernel': ('linear', 'poly', 'rbf')
}

#Perform GridSearch
grid_search = GridSearchCV(pipe, param_grid=parameters, verbose=10)
grid_search.fit(features_train, target_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5; 1/6] START clf__kernel=linear, feature_selection__threshold=None.......




[CV 1/5; 1/6] END clf__kernel=linear, feature_selection__threshold=None;, score=0.942 total time=   6.6s
[CV 2/5; 1/6] START clf__kernel=linear, feature_selection__threshold=None.......




[CV 2/5; 1/6] END clf__kernel=linear, feature_selection__threshold=None;, score=0.942 total time=   5.5s
[CV 3/5; 1/6] START clf__kernel=linear, feature_selection__threshold=None.......




[CV 3/5; 1/6] END clf__kernel=linear, feature_selection__threshold=None;, score=0.942 total time=   5.8s
[CV 4/5; 1/6] START clf__kernel=linear, feature_selection__threshold=None.......




[CV 4/5; 1/6] END clf__kernel=linear, feature_selection__threshold=None;, score=0.941 total time=   6.1s
[CV 5/5; 1/6] START clf__kernel=linear, feature_selection__threshold=None.......




[CV 5/5; 1/6] END clf__kernel=linear, feature_selection__threshold=None;, score=0.941 total time=   6.0s
[CV 1/5; 2/6] START clf__kernel=linear, feature_selection__threshold=mean.......




[CV 1/5; 2/6] END clf__kernel=linear, feature_selection__threshold=mean;, score=0.942 total time=   5.3s
[CV 2/5; 2/6] START clf__kernel=linear, feature_selection__threshold=mean.......




[CV 2/5; 2/6] END clf__kernel=linear, feature_selection__threshold=mean;, score=0.942 total time=   4.7s
[CV 3/5; 2/6] START clf__kernel=linear, feature_selection__threshold=mean.......




[CV 3/5; 2/6] END clf__kernel=linear, feature_selection__threshold=mean;, score=0.942 total time=   3.8s
[CV 4/5; 2/6] START clf__kernel=linear, feature_selection__threshold=mean.......




[CV 4/5; 2/6] END clf__kernel=linear, feature_selection__threshold=mean;, score=0.941 total time=   4.8s
[CV 5/5; 2/6] START clf__kernel=linear, feature_selection__threshold=mean.......




[CV 5/5; 2/6] END clf__kernel=linear, feature_selection__threshold=mean;, score=0.941 total time=   5.0s
[CV 1/5; 3/6] START clf__kernel=poly, feature_selection__threshold=None.........




[CV 1/5; 3/6] END clf__kernel=poly, feature_selection__threshold=None;, score=0.940 total time=   7.6s
[CV 2/5; 3/6] START clf__kernel=poly, feature_selection__threshold=None.........




[CV 2/5; 3/6] END clf__kernel=poly, feature_selection__threshold=None;, score=0.939 total time=   7.4s
[CV 3/5; 3/6] START clf__kernel=poly, feature_selection__threshold=None.........




[CV 3/5; 3/6] END clf__kernel=poly, feature_selection__threshold=None;, score=0.940 total time=   7.0s
[CV 4/5; 3/6] START clf__kernel=poly, feature_selection__threshold=None.........




[CV 4/5; 3/6] END clf__kernel=poly, feature_selection__threshold=None;, score=0.940 total time=   7.4s
[CV 5/5; 3/6] START clf__kernel=poly, feature_selection__threshold=None.........




[CV 5/5; 3/6] END clf__kernel=poly, feature_selection__threshold=None;, score=0.940 total time=   7.5s
[CV 1/5; 4/6] START clf__kernel=poly, feature_selection__threshold=mean.........




[CV 1/5; 4/6] END clf__kernel=poly, feature_selection__threshold=mean;, score=0.942 total time=   6.5s
[CV 2/5; 4/6] START clf__kernel=poly, feature_selection__threshold=mean.........




[CV 2/5; 4/6] END clf__kernel=poly, feature_selection__threshold=mean;, score=0.942 total time=   6.2s
[CV 3/5; 4/6] START clf__kernel=poly, feature_selection__threshold=mean.........




[CV 3/5; 4/6] END clf__kernel=poly, feature_selection__threshold=mean;, score=0.942 total time=   6.0s
[CV 4/5; 4/6] START clf__kernel=poly, feature_selection__threshold=mean.........




[CV 4/5; 4/6] END clf__kernel=poly, feature_selection__threshold=mean;, score=0.941 total time=   6.1s
[CV 5/5; 4/6] START clf__kernel=poly, feature_selection__threshold=mean.........




[CV 5/5; 4/6] END clf__kernel=poly, feature_selection__threshold=mean;, score=0.939 total time=   6.0s
[CV 1/5; 5/6] START clf__kernel=rbf, feature_selection__threshold=None..........




[CV 1/5; 5/6] END clf__kernel=rbf, feature_selection__threshold=None;, score=0.942 total time=  11.6s
[CV 2/5; 5/6] START clf__kernel=rbf, feature_selection__threshold=None..........




[CV 2/5; 5/6] END clf__kernel=rbf, feature_selection__threshold=None;, score=0.942 total time=  11.3s
[CV 3/5; 5/6] START clf__kernel=rbf, feature_selection__threshold=None..........




[CV 3/5; 5/6] END clf__kernel=rbf, feature_selection__threshold=None;, score=0.942 total time=  10.5s
[CV 4/5; 5/6] START clf__kernel=rbf, feature_selection__threshold=None..........




[CV 4/5; 5/6] END clf__kernel=rbf, feature_selection__threshold=None;, score=0.941 total time=  11.1s
[CV 5/5; 5/6] START clf__kernel=rbf, feature_selection__threshold=None..........




[CV 5/5; 5/6] END clf__kernel=rbf, feature_selection__threshold=None;, score=0.942 total time=  11.1s
[CV 1/5; 6/6] START clf__kernel=rbf, feature_selection__threshold=mean..........




[CV 1/5; 6/6] END clf__kernel=rbf, feature_selection__threshold=mean;, score=0.942 total time=   8.8s
[CV 2/5; 6/6] START clf__kernel=rbf, feature_selection__threshold=mean..........




[CV 2/5; 6/6] END clf__kernel=rbf, feature_selection__threshold=mean;, score=0.942 total time=  10.2s
[CV 3/5; 6/6] START clf__kernel=rbf, feature_selection__threshold=mean..........




[CV 3/5; 6/6] END clf__kernel=rbf, feature_selection__threshold=mean;, score=0.942 total time=   9.5s
[CV 4/5; 6/6] START clf__kernel=rbf, feature_selection__threshold=mean..........




[CV 4/5; 6/6] END clf__kernel=rbf, feature_selection__threshold=mean;, score=0.941 total time=  10.1s
[CV 5/5; 6/6] START clf__kernel=rbf, feature_selection__threshold=mean..........




[CV 5/5; 6/6] END clf__kernel=rbf, feature_selection__threshold=mean;, score=0.941 total time=  10.5s




In [11]:
target_predicted = grid_search.predict(features_test)

In [12]:
#Evaluate the model
print("Default-Score des Klassifizierers: Accuracy=", grid_search.score(features_test, target_test), "\n")
print(classification_report(target_test, target_predicted))

Default-Score des Klassifizierers: Accuracy= 0.9388571428571428 

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        55
           2       0.94      1.00      0.97      6572
           3       0.00      0.00      0.00       142
           4       0.00      0.00      0.00       231

    accuracy                           0.94      7000
   macro avg       0.23      0.25      0.24      7000
weighted avg       0.88      0.94      0.91      7000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
