In [1]:
#try out other model --> LogisticRegression

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE

In [2]:
df_accidents = pd.read_csv('data/MA3_finished.csv', index_col='ID')

In [3]:
df_accidents = df_accidents.sample(n=1000000, random_state=1)

In [4]:
features = df_accidents[['Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
                         'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop']]

In [5]:
#Encoding text/categorical variables
le = LabelEncoder()
features['Weather_Condition']= le.fit_transform(features['Weather_Condition'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['Weather_Condition']= le.fit_transform(features['Weather_Condition'])


In [6]:
#target
target = df_accidents['Severity']

In [7]:
#(80/20 split)
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [8]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
features_train, target_train = smote.fit_resample(features_train, target_train)

In [9]:
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False), threshold='mean')),
    ('clf', LogisticRegression())  
])

In [None]:
#parameters for gridSearch
parameters = {
    'feature_selection__threshold': (None, 'mean'), 
    'clf__fit_intercept': [True, False],
    'clf__n_jobs': [-1]
}

#Perform GridSearch
grid_search = GridSearchCV(pipe, param_grid=parameters, verbose=10)
grid_search.fit(features_train, target_train)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START clf__fit_intercept=True, clf__n_jobs=-1, feature_selection__threshold=None
[CV 1/5; 1/4] END clf__fit_intercept=True, clf__n_jobs=-1, feature_selection__threshold=None;, score=0.365 total time=  21.3s
[CV 2/5; 1/4] START clf__fit_intercept=True, clf__n_jobs=-1, feature_selection__threshold=None
[CV 2/5; 1/4] END clf__fit_intercept=True, clf__n_jobs=-1, feature_selection__threshold=None;, score=0.364 total time=  15.5s
[CV 3/5; 1/4] START clf__fit_intercept=True, clf__n_jobs=-1, feature_selection__threshold=None
[CV 3/5; 1/4] END clf__fit_intercept=True, clf__n_jobs=-1, feature_selection__threshold=None;, score=0.366 total time=  17.0s
[CV 4/5; 1/4] START clf__fit_intercept=True, clf__n_jobs=-1, feature_selection__threshold=None
[CV 4/5; 1/4] END clf__fit_intercept=True, clf__n_jobs=-1, feature_selection__threshold=None;, score=0.365 total time=  15.1s
[CV 5/5; 1/4] START clf__fit_intercept=True, clf__n_jobs

In [None]:
target_predicted = grid_search.predict(features_test)

In [None]:
#Evaluate the model
print("Default-Score des Klassifizierers: Accuracy=", grid_search.score(features_test, target_test), "\n")
print(classification_report(target_test, target_predicted))
print(confusion_matrix(target_test,target_predicted))