In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from os.path import dirname
import datetime as dt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import VarianceThreshold


notebook_path = os.path.abspath("DataAnalyticsKickstarterNotebook_Cedrik.ipynb")
csv_path_features = os.path.join(os.path.dirname(dirname(notebook_path)), "data/ks-project-edited-klassifikation-features.csv")
csv_path_trueLabels = os.path.join(os.path.dirname(dirname(notebook_path)), "data/ks-project-edited-klassifikation-target.csv")

In [36]:
df_features = pd.read_csv (csv_path_features, low_memory=False)
df_target = pd.read_csv (csv_path_trueLabels, low_memory=False)

In [37]:
df_features.head()

Unnamed: 0,usd_goal_real,duration,name_length,creator_type,Art,Comics,Crafts,Dance,Design,Fashion,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Technology,Theater
0,1.981553e-05,0.208791,0.221053,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,3.302593e-05,0.153846,0.336842,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3.302534e-06,0.142857,0.368421,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,3.301939e-07,0.076923,0.157895,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,1.321033e-05,0.879121,0.473684,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [38]:
df_target.head()

Unnamed: 0,ID,successful
0,558744194,1
1,134175247,1
2,1577455391,1
3,171893227,1
4,499552311,1


Da unser Datensatz zu groß ist, um ihn für verschiedenen Klassifizierer durchlaufen zu lassen erstellen wir ein Subset von unserem Dataframe mit 10000 Einträgen.

In [39]:
df_features["target"] = df_target["successful"]
df_features.head()

Unnamed: 0,usd_goal_real,duration,name_length,creator_type,Art,Comics,Crafts,Dance,Design,Fashion,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Technology,Theater,target
0,1.981553e-05,0.208791,0.221053,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1,3.302593e-05,0.153846,0.336842,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,3.302534e-06,0.142857,0.368421,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3,3.301939e-07,0.076923,0.157895,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
4,1.321033e-05,0.879121,0.473684,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [40]:
df_sample = df_features.sample(n=10000, random_state=42)
df_sample

Unnamed: 0,usd_goal_real,duration,name_length,creator_type,Art,Comics,Crafts,Dance,Design,Fashion,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Technology,Theater,target
107677,0.000165,0.318681,0.452632,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
53484,0.000099,0.373626,0.200000,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
14825,0.000040,0.318681,0.189474,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
68417,0.000396,0.417582,0.621053,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
98824,0.000033,0.318681,0.347368,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73005,0.000178,0.373626,0.568421,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
40221,0.000159,0.230769,0.547368,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
97450,0.000076,0.318681,0.315789,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
16107,0.000005,0.318681,0.084211,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [41]:
X_train, X_dev, y_train, y_dev = train_test_split(df_sample.iloc[: ,0:19], df_sample.iloc[: , -1], random_state=42)

In [42]:
feature_selection_1=SelectFromModel(LinearSVC(penalty="l1", dual=False,max_iter=2000))
#feature_selection_2=VarianceThreshold()
#'feature_selection__threshold': [0.01, 0.02, 0.03]

clf_list = []
# Initialze the estimators

clf1 = Pipeline([('feature_selection', feature_selection_1),
                     ('classifier', SVC())])
clf_list.append(clf1)

clf2 = Pipeline([('feature_selection', feature_selection_1),
                     ('classifier', RandomForestClassifier())])
clf_list.append(clf2)

clf3 = Pipeline([('feature_selection', feature_selection_1),
                     ('classifier', KNeighborsClassifier())])
clf_list.append(clf3)

clf4 = Pipeline([('feature_selection', feature_selection_1),
                     ('classifier', GaussianNB())])
clf_list.append(clf4)

clf5 = Pipeline([('feature_selection', feature_selection_1),
                     ('classifier', XGBClassifier(use_label_encoder =False))])
clf_list.append(clf5)


Überlegung, wie der threshhold für VarianceThreshold gewählt werden sollte

In [43]:
var_thr = VarianceThreshold(threshold = 0.03) #Removing both constant and quasi-constant
var_thr.fit(X_train)

concol = [column for column in X_train.columns 
          if column not in X_train.columns[var_thr.get_support()]]


for features in concol:
    print(features)
   


usd_goal_real
duration
name_length
Comics
Crafts
Dance
Design


In [44]:
search_spaces =[]
                
search_space1 = [#{'feature_selection__threshold': [None, 'mean'], 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100], 'classifier__kernel': ['linear']},
                {'feature_selection__threshold': [None, 'mean'], 'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100], 'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100], 'classifier__kernel': ['rbf']}
                ]
search_spaces.append(search_space1)

search_space2 = {'feature_selection__threshold': [None, 'mean'],
                 'classifier__n_estimators': [10, 100],
                 'classifier__min_samples_split': [2,4],
                 'classifier__max_features': ['auto','log2']
                }
search_spaces.append(search_space2)

search_space3 = {'feature_selection__threshold': [None, 'mean'],
                 'classifier__n_neighbors': [3, 7, 11],
                 'classifier__weights': ['uniform', 'distance'],
                 'classifier__metric': ['minkowski','euclidean', 'manhatten']
                }
search_spaces.append(search_space3)

search_space4 = {'feature_selection__threshold': [None, 'mean']
                }
search_spaces.append(search_space4)

search_space5 = {'feature_selection__threshold': [None, 'mean'],
                 'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
                 'classifier__max_depth': [5,6,7],
                }
search_spaces.append(search_space5)


In [45]:
best_params = []
best_scores = []
for i in range(0,5):
#f"search_space{i}",
    print("---------------------------------------------------------------------------")
    grid_search = GridSearchCV(clf_list[i],param_grid=search_spaces[i], verbose=3)
    grid_search.fit(X_train, y_train)
    best_params.append(grid_search.best_estimator_)
    best_scores.append(grid_search.best_score_)


---------------------------------------------------------------------------
Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END classifier__C=0.001, classifier__gamma=0.001, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.525 total time=   6.1s
[CV 2/5] END classifier__C=0.001, classifier__gamma=0.001, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.525 total time=   6.0s
[CV 3/5] END classifier__C=0.001, classifier__gamma=0.001, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.525 total time=   6.0s
[CV 4/5] END classifier__C=0.001, classifier__gamma=0.001, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.525 total time=   5.6s
[CV 5/5] END classifier__C=0.001, classifier__gamma=0.001, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.525 total time=   6.0s
[CV 1/5] END classifier__C=0.001, classifier__gamma=0.001, classifier__kernel=rbf, feature_selection__threshold=

[CV 1/5] END classifier__C=0.001, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.525 total time=   4.8s
[CV 2/5] END classifier__C=0.001, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.525 total time=   5.0s
[CV 3/5] END classifier__C=0.001, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.525 total time=   4.7s
[CV 4/5] END classifier__C=0.001, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.525 total time=   4.8s
[CV 5/5] END classifier__C=0.001, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.525 total time=   4.6s
[CV 1/5] END classifier__C=0.01, classifier__gamma=0.001, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.525 total time=   5.3s
[CV 2/5] END classifier__C=0.01, classifier__gamma=0.001, classifier__kernel=rbf, feature_selection__threshold=

[CV 2/5] END classifier__C=0.01, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.525 total time=   5.9s
[CV 3/5] END classifier__C=0.01, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.525 total time=   7.1s
[CV 4/5] END classifier__C=0.01, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.525 total time=   5.8s
[CV 5/5] END classifier__C=0.01, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.525 total time=   6.3s
[CV 1/5] END classifier__C=0.01, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.561 total time=   5.6s
[CV 2/5] END classifier__C=0.01, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.589 total time=   5.9s
[CV 3/5] END classifier__C=0.01, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=mean;, sc

[CV 4/5] END classifier__C=0.1, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.553 total time=   4.9s
[CV 5/5] END classifier__C=0.1, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.565 total time=   4.8s
[CV 1/5] END classifier__C=0.1, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.673 total time=   5.0s
[CV 2/5] END classifier__C=0.1, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.689 total time=   5.3s
[CV 3/5] END classifier__C=0.1, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.673 total time=   5.3s
[CV 4/5] END classifier__C=0.1, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.677 total time=   5.2s
[CV 5/5] END classifier__C=0.1, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.682

[CV 1/5] END classifier__C=1, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.561 total time=   5.1s
[CV 2/5] END classifier__C=1, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.592 total time=   5.4s
[CV 3/5] END classifier__C=1, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.566 total time=   5.1s
[CV 4/5] END classifier__C=1, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.555 total time=   5.0s
[CV 5/5] END classifier__C=1, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.564 total time=   5.2s
[CV 1/5] END classifier__C=1, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.687 total time=   4.9s
[CV 2/5] END classifier__C=1, classifier__gamma=100, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.705 total time=   5.

[CV 3/5] END classifier__C=10, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.696 total time=   5.4s
[CV 4/5] END classifier__C=10, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.705 total time=   5.6s
[CV 5/5] END classifier__C=10, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.707 total time=   5.6s
[CV 1/5] END classifier__C=10, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.561 total time=   5.7s
[CV 2/5] END classifier__C=10, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.597 total time=   5.8s
[CV 3/5] END classifier__C=10, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.567 total time=   5.7s
[CV 4/5] END classifier__C=10, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.554 total time=

[CV 5/5] END classifier__C=100, classifier__gamma=1, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.567 total time=   5.2s
[CV 1/5] END classifier__C=100, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.699 total time=  12.9s
[CV 2/5] END classifier__C=100, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.705 total time=  10.3s
[CV 3/5] END classifier__C=100, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.693 total time=  11.7s
[CV 4/5] END classifier__C=100, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.700 total time=  10.8s
[CV 5/5] END classifier__C=100, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=None;, score=0.697 total time=  11.3s
[CV 1/5] END classifier__C=100, classifier__gamma=10, classifier__kernel=rbf, feature_selection__threshold=mean;, score=0.567 total

[CV 5/5] END classifier__max_features=auto, classifier__min_samples_split=4, classifier__n_estimators=10, feature_selection__threshold=mean;, score=0.590 total time=   0.0s
[CV 1/5] END classifier__max_features=auto, classifier__min_samples_split=4, classifier__n_estimators=100, feature_selection__threshold=None;, score=0.707 total time=   0.9s
[CV 2/5] END classifier__max_features=auto, classifier__min_samples_split=4, classifier__n_estimators=100, feature_selection__threshold=None;, score=0.707 total time=   1.0s
[CV 3/5] END classifier__max_features=auto, classifier__min_samples_split=4, classifier__n_estimators=100, feature_selection__threshold=None;, score=0.717 total time=   0.9s
[CV 4/5] END classifier__max_features=auto, classifier__min_samples_split=4, classifier__n_estimators=100, feature_selection__threshold=None;, score=0.703 total time=   0.9s
[CV 5/5] END classifier__max_features=auto, classifier__min_samples_split=4, classifier__n_estimators=100, feature_selection__thres

[CV 3/5] END classifier__max_features=log2, classifier__min_samples_split=4, classifier__n_estimators=100, feature_selection__threshold=mean;, score=0.598 total time=   0.7s
[CV 4/5] END classifier__max_features=log2, classifier__min_samples_split=4, classifier__n_estimators=100, feature_selection__threshold=mean;, score=0.587 total time=   0.8s
[CV 5/5] END classifier__max_features=log2, classifier__min_samples_split=4, classifier__n_estimators=100, feature_selection__threshold=mean;, score=0.589 total time=   0.8s
---------------------------------------------------------------------------
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END classifier__metric=minkowski, classifier__n_neighbors=3, classifier__weights=uniform, feature_selection__threshold=None;, score=0.660 total time=   0.4s
[CV 2/5] END classifier__metric=minkowski, classifier__n_neighbors=3, classifier__weights=uniform, feature_selection__threshold=None;, score=0.687 total time=   0.3s
[CV 3/5]

[CV 2/5] END classifier__metric=minkowski, classifier__n_neighbors=11, classifier__weights=uniform, feature_selection__threshold=mean;, score=0.597 total time=   0.1s
[CV 3/5] END classifier__metric=minkowski, classifier__n_neighbors=11, classifier__weights=uniform, feature_selection__threshold=mean;, score=0.598 total time=   0.1s
[CV 4/5] END classifier__metric=minkowski, classifier__n_neighbors=11, classifier__weights=uniform, feature_selection__threshold=mean;, score=0.602 total time=   0.1s
[CV 5/5] END classifier__metric=minkowski, classifier__n_neighbors=11, classifier__weights=uniform, feature_selection__threshold=mean;, score=0.601 total time=   0.0s
[CV 1/5] END classifier__metric=minkowski, classifier__n_neighbors=11, classifier__weights=distance, feature_selection__threshold=None;, score=0.665 total time=   0.3s
[CV 2/5] END classifier__metric=minkowski, classifier__n_neighbors=11, classifier__weights=distance, feature_selection__threshold=None;, score=0.691 total time=   0

[CV 4/5] END classifier__metric=euclidean, classifier__n_neighbors=7, classifier__weights=distance, feature_selection__threshold=mean;, score=0.571 total time=   0.0s
[CV 5/5] END classifier__metric=euclidean, classifier__n_neighbors=7, classifier__weights=distance, feature_selection__threshold=mean;, score=0.575 total time=   0.0s
[CV 1/5] END classifier__metric=euclidean, classifier__n_neighbors=11, classifier__weights=uniform, feature_selection__threshold=None;, score=0.688 total time=   0.4s
[CV 2/5] END classifier__metric=euclidean, classifier__n_neighbors=11, classifier__weights=uniform, feature_selection__threshold=None;, score=0.695 total time=   0.4s
[CV 3/5] END classifier__metric=euclidean, classifier__n_neighbors=11, classifier__weights=uniform, feature_selection__threshold=None;, score=0.688 total time=   0.4s
[CV 4/5] END classifier__metric=euclidean, classifier__n_neighbors=11, classifier__weights=uniform, feature_selection__threshold=None;, score=0.683 total time=   0.4

[CV 5/5] END classifier__metric=manhatten, classifier__n_neighbors=7, classifier__weights=uniform, feature_selection__threshold=mean;, score=nan total time=   0.0s
[CV 1/5] END classifier__metric=manhatten, classifier__n_neighbors=7, classifier__weights=distance, feature_selection__threshold=None;, score=nan total time=   0.0s
[CV 2/5] END classifier__metric=manhatten, classifier__n_neighbors=7, classifier__weights=distance, feature_selection__threshold=None;, score=nan total time=   0.0s
[CV 3/5] END classifier__metric=manhatten, classifier__n_neighbors=7, classifier__weights=distance, feature_selection__threshold=None;, score=nan total time=   0.0s
[CV 4/5] END classifier__metric=manhatten, classifier__n_neighbors=7, classifier__weights=distance, feature_selection__threshold=None;, score=nan total time=   0.0s
[CV 5/5] END classifier__metric=manhatten, classifier__n_neighbors=7, classifier__weights=distance, feature_selection__threshold=None;, score=nan total time=   0.0s
[CV 1/5] EN

60 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\anaconda3\envs\DataAnalyticsProjecktClone\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\anaconda3\envs\DataAnalyticsProjecktClone\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\anaconda3\envs\DataAnalyticsProjecktClone\lib\site-packages\sklearn\neighbors\_classification.py", line 198, in fit
    return self._fit(X, y)
  File "C:\anaconda3\envs\DataAnalyticsProjecktClone\lib\site-packages

[CV 2/5] END .feature_selection__threshold=None;, score=0.629 total time=   0.0s
[CV 3/5] END .feature_selection__threshold=None;, score=0.605 total time=   0.0s
[CV 4/5] END .feature_selection__threshold=None;, score=0.636 total time=   0.0s
[CV 5/5] END .feature_selection__threshold=None;, score=0.629 total time=   0.0s
[CV 1/5] END .feature_selection__threshold=mean;, score=0.541 total time=   0.0s
[CV 2/5] END .feature_selection__threshold=mean;, score=0.554 total time=   0.0s
[CV 3/5] END .feature_selection__threshold=mean;, score=0.550 total time=   0.0s
[CV 4/5] END .feature_selection__threshold=mean;, score=0.555 total time=   0.0s
[CV 5/5] END .feature_selection__threshold=mean;, score=0.553 total time=   0.0s
---------------------------------------------------------------------------
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END classifier__gamma=0.001, classifier__max_depth=5, feature_selection__threshold=None;, score=0.713 total time=   0.4s
[CV

[CV 3/5] END classifier__gamma=0.001, classifier__max_depth=6, feature_selection__threshold=mean;, score=0.626 total time=   0.3s
[CV 4/5] END classifier__gamma=0.001, classifier__max_depth=6, feature_selection__threshold=mean;, score=0.616 total time=   0.3s
[CV 5/5] END classifier__gamma=0.001, classifier__max_depth=6, feature_selection__threshold=mean;, score=0.605 total time=   0.3s
[CV 1/5] END classifier__gamma=0.001, classifier__max_depth=7, feature_selection__threshold=None;, score=0.702 total time=   0.5s
[CV 2/5] END classifier__gamma=0.001, classifier__max_depth=7, feature_selection__threshold=None;, score=0.725 total time=   0.5s
[CV 3/5] END classifier__gamma=0.001, classifier__max_depth=7, feature_selection__threshold=None;, score=0.719 total time=   0.4s
[CV 4/5] END classifier__gamma=0.001, classifier__max_depth=7, feature_selection__threshold=None;, score=0.715 total time=   0.4s
[CV 5/5] END classifier__gamma=0.001, classifier__max_depth=7, feature_selection__threshol

[CV 3/5] END classifier__gamma=0.01, classifier__max_depth=5, feature_selection__threshold=mean;, score=0.632 total time=   0.3s
[CV 4/5] END classifier__gamma=0.01, classifier__max_depth=5, feature_selection__threshold=mean;, score=0.619 total time=   0.2s
[CV 5/5] END classifier__gamma=0.01, classifier__max_depth=5, feature_selection__threshold=mean;, score=0.605 total time=   0.2s
[CV 1/5] END classifier__gamma=0.01, classifier__max_depth=6, feature_selection__threshold=None;, score=0.707 total time=   0.3s
[CV 2/5] END classifier__gamma=0.01, classifier__max_depth=6, feature_selection__threshold=None;, score=0.729 total time=   0.5s
[CV 3/5] END classifier__gamma=0.01, classifier__max_depth=6, feature_selection__threshold=None;, score=0.711 total time=   0.4s
[CV 4/5] END classifier__gamma=0.01, classifier__max_depth=6, feature_selection__threshold=None;, score=0.713 total time=   0.3s
[CV 5/5] END classifier__gamma=0.01, classifier__max_depth=6, feature_selection__threshold=None;,

[CV 3/5] END classifier__gamma=0.01, classifier__max_depth=7, feature_selection__threshold=mean;, score=0.623 total time=   0.3s
[CV 4/5] END classifier__gamma=0.01, classifier__max_depth=7, feature_selection__threshold=mean;, score=0.610 total time=   0.3s
[CV 5/5] END classifier__gamma=0.01, classifier__max_depth=7, feature_selection__threshold=mean;, score=0.608 total time=   0.4s
[CV 1/5] END classifier__gamma=0.1, classifier__max_depth=5, feature_selection__threshold=None;, score=0.716 total time=   0.5s
[CV 2/5] END classifier__gamma=0.1, classifier__max_depth=5, feature_selection__threshold=None;, score=0.722 total time=   0.3s
[CV 3/5] END classifier__gamma=0.1, classifier__max_depth=5, feature_selection__threshold=None;, score=0.713 total time=   0.3s
[CV 4/5] END classifier__gamma=0.1, classifier__max_depth=5, feature_selection__threshold=None;, score=0.724 total time=   0.3s
[CV 5/5] END classifier__gamma=0.1, classifier__max_depth=5, feature_selection__threshold=None;, scor

[CV 3/5] END classifier__gamma=0.1, classifier__max_depth=6, feature_selection__threshold=mean;, score=0.624 total time=   0.3s
[CV 4/5] END classifier__gamma=0.1, classifier__max_depth=6, feature_selection__threshold=mean;, score=0.608 total time=   0.3s
[CV 5/5] END classifier__gamma=0.1, classifier__max_depth=6, feature_selection__threshold=mean;, score=0.607 total time=   0.2s
[CV 1/5] END classifier__gamma=0.1, classifier__max_depth=7, feature_selection__threshold=None;, score=0.697 total time=   0.4s
[CV 2/5] END classifier__gamma=0.1, classifier__max_depth=7, feature_selection__threshold=None;, score=0.720 total time=   0.4s
[CV 3/5] END classifier__gamma=0.1, classifier__max_depth=7, feature_selection__threshold=None;, score=0.707 total time=   0.4s
[CV 4/5] END classifier__gamma=0.1, classifier__max_depth=7, feature_selection__threshold=None;, score=0.715 total time=   0.3s
[CV 5/5] END classifier__gamma=0.1, classifier__max_depth=7, feature_selection__threshold=None;, score=0

[CV 3/5] END classifier__gamma=1, classifier__max_depth=5, feature_selection__threshold=mean;, score=0.629 total time=   0.2s
[CV 4/5] END classifier__gamma=1, classifier__max_depth=5, feature_selection__threshold=mean;, score=0.630 total time=   0.2s
[CV 5/5] END classifier__gamma=1, classifier__max_depth=5, feature_selection__threshold=mean;, score=0.631 total time=   0.1s
[CV 1/5] END classifier__gamma=1, classifier__max_depth=6, feature_selection__threshold=None;, score=0.710 total time=   0.3s
[CV 2/5] END classifier__gamma=1, classifier__max_depth=6, feature_selection__threshold=None;, score=0.729 total time=   0.2s
[CV 3/5] END classifier__gamma=1, classifier__max_depth=6, feature_selection__threshold=None;, score=0.708 total time=   0.3s
[CV 4/5] END classifier__gamma=1, classifier__max_depth=6, feature_selection__threshold=None;, score=0.737 total time=   0.2s
[CV 5/5] END classifier__gamma=1, classifier__max_depth=6, feature_selection__threshold=None;, score=0.726 total time=

[CV 3/5] END classifier__gamma=1, classifier__max_depth=7, feature_selection__threshold=mean;, score=0.618 total time=   0.3s
[CV 4/5] END classifier__gamma=1, classifier__max_depth=7, feature_selection__threshold=mean;, score=0.629 total time=   0.3s
[CV 5/5] END classifier__gamma=1, classifier__max_depth=7, feature_selection__threshold=mean;, score=0.623 total time=   0.3s
[CV 1/5] END classifier__gamma=10, classifier__max_depth=5, feature_selection__threshold=None;, score=0.722 total time=   0.3s
[CV 2/5] END classifier__gamma=10, classifier__max_depth=5, feature_selection__threshold=None;, score=0.738 total time=   0.3s
[CV 3/5] END classifier__gamma=10, classifier__max_depth=5, feature_selection__threshold=None;, score=0.705 total time=   0.3s
[CV 4/5] END classifier__gamma=10, classifier__max_depth=5, feature_selection__threshold=None;, score=0.730 total time=   0.3s
[CV 5/5] END classifier__gamma=10, classifier__max_depth=5, feature_selection__threshold=None;, score=0.722 total 

[CV 3/5] END classifier__gamma=10, classifier__max_depth=6, feature_selection__threshold=mean;, score=0.617 total time=   0.2s
[CV 4/5] END classifier__gamma=10, classifier__max_depth=6, feature_selection__threshold=mean;, score=0.617 total time=   0.3s
[CV 5/5] END classifier__gamma=10, classifier__max_depth=6, feature_selection__threshold=mean;, score=0.615 total time=   0.3s
[CV 1/5] END classifier__gamma=10, classifier__max_depth=7, feature_selection__threshold=None;, score=0.720 total time=   0.4s
[CV 2/5] END classifier__gamma=10, classifier__max_depth=7, feature_selection__threshold=None;, score=0.735 total time=   0.5s
[CV 3/5] END classifier__gamma=10, classifier__max_depth=7, feature_selection__threshold=None;, score=0.713 total time=   0.6s
[CV 4/5] END classifier__gamma=10, classifier__max_depth=7, feature_selection__threshold=None;, score=0.733 total time=   0.5s
[CV 5/5] END classifier__gamma=10, classifier__max_depth=7, feature_selection__threshold=None;, score=0.729 tot

[CV 3/5] END classifier__gamma=100, classifier__max_depth=5, feature_selection__threshold=mean;, score=0.585 total time=   0.3s
[CV 4/5] END classifier__gamma=100, classifier__max_depth=5, feature_selection__threshold=mean;, score=0.611 total time=   0.3s
[CV 5/5] END classifier__gamma=100, classifier__max_depth=5, feature_selection__threshold=mean;, score=0.607 total time=   0.3s
[CV 1/5] END classifier__gamma=100, classifier__max_depth=6, feature_selection__threshold=None;, score=0.662 total time=   0.5s
[CV 2/5] END classifier__gamma=100, classifier__max_depth=6, feature_selection__threshold=None;, score=0.691 total time=   0.5s
[CV 3/5] END classifier__gamma=100, classifier__max_depth=6, feature_selection__threshold=None;, score=0.671 total time=   0.6s
[CV 4/5] END classifier__gamma=100, classifier__max_depth=6, feature_selection__threshold=None;, score=0.668 total time=   0.5s
[CV 5/5] END classifier__gamma=100, classifier__max_depth=6, feature_selection__threshold=None;, score=0

[CV 3/5] END classifier__gamma=100, classifier__max_depth=7, feature_selection__threshold=mean;, score=0.585 total time=   0.4s
[CV 4/5] END classifier__gamma=100, classifier__max_depth=7, feature_selection__threshold=mean;, score=0.611 total time=   0.5s
[CV 5/5] END classifier__gamma=100, classifier__max_depth=7, feature_selection__threshold=mean;, score=0.607 total time=   0.5s


In [46]:
print(best_params)
print(best_scores)

[Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False, max_iter=2000,
                                                     penalty='l1'))),
                ('classifier', SVC(C=100, gamma=1))]), Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False, max_iter=2000,
                                                     penalty='l1'))),
                ('classifier', RandomForestClassifier(min_samples_split=4))]), Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False, max_iter=2000,
                                                     penalty='l1'))),
                ('classifier', KNeighborsClassifier(n_neighbors=11))]), Pipeline(steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(dual=False, max_iter=2000,
                                                     penalty='l1'))),
                ('classifier', GaussianNB

In [47]:
final_pipelines = []
for piepline in best_params:
    # Pipeline für die beste Feature-Kombination definieren
    train_labels = cross_val_predict(piepline, X_train, y_train, cv=10)
    # Precision/Recall/F-Wert berechnen
    final_pipelines.append(piepline)
    print(classification_report(y_train, train_labels))

              precision    recall  f1-score   support

           0       0.71      0.63      0.67      3563
           1       0.70      0.77      0.73      3937

    accuracy                           0.70      7500
   macro avg       0.71      0.70      0.70      7500
weighted avg       0.71      0.70      0.70      7500

              precision    recall  f1-score   support

           0       0.69      0.69      0.69      3563
           1       0.72      0.72      0.72      3937

    accuracy                           0.70      7500
   macro avg       0.70      0.70      0.70      7500
weighted avg       0.70      0.70      0.70      7500

              precision    recall  f1-score   support

           0       0.67      0.68      0.68      3563
           1       0.71      0.70      0.70      3937

    accuracy                           0.69      7500
   macro avg       0.69      0.69      0.69      7500
weighted avg       0.69      0.69      0.69      7500

              preci

In [48]:
# Jetzt den Lerner ein letztes Mal auf allen Trainingsdaten trainieren und dann auf den Testdaten evaluieren

# Lerner auf den gesamten Trainingsdaten trainieren
for final_pipeline in final_pipelines:
    final_pipeline.fit(X_train, y_train)
    print("Default-Score des Klassifizierers: Accuracy=",final_pipeline.score(X_dev, y_dev), "\n")
    # Labels vorhersagen lassen und dann Precision/Recall/F-Wert berechnen
    test_labels = final_pipeline.predict(X_dev)
    print(classification_report(y_dev, test_labels))

Default-Score des Klassifizierers: Accuracy= 0.7064 

              precision    recall  f1-score   support

           0       0.74      0.63      0.68      1251
           1       0.68      0.78      0.73      1249

    accuracy                           0.71      2500
   macro avg       0.71      0.71      0.70      2500
weighted avg       0.71      0.71      0.70      2500

Default-Score des Klassifizierers: Accuracy= 0.7068 

              precision    recall  f1-score   support

           0       0.72      0.69      0.70      1251
           1       0.70      0.73      0.71      1249

    accuracy                           0.71      2500
   macro avg       0.71      0.71      0.71      2500
weighted avg       0.71      0.71      0.71      2500

Default-Score des Klassifizierers: Accuracy= 0.6936 

              precision    recall  f1-score   support

           0       0.70      0.69      0.69      1251
           1       0.69      0.70      0.70      1249

    accuracy        