**1. Optimale Klassifizierungsmodelle vorbereiten**

In [None]:
import pandas as pd
import pickle

# Load the pickled preprocessing pipeline and the matching output column names.
# Context managers make sure the file handles are closed again.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open("pipeline.p", 'rb') as pipeline_file:
    pipeline = pickle.load(pipeline_file)
with open("col_names.p", 'rb') as col_names_file:
    col_names = pickle.load(col_names_file)

# gather data
df_train = pd.read_csv('attrition_train.csv')
df_test = pd.read_csv('attrition_test.csv')

# extract features and target ('attrition' is the target column)
features_train = df_train.drop('attrition', axis=1)
features_test = df_test.drop('attrition', axis=1)

target_train = df_train.loc[:, 'attrition']
target_test = df_test.loc[:, 'attrition']

# transform data
# Only transform() is called here, so the loaded pipeline is presumably already fitted - TODO confirm.
features_train = pd.DataFrame(pipeline.transform(features_train), columns=col_names)
features_test = pd.DataFrame(pipeline.transform(features_test), columns=col_names)

# look at the transformed training features
features_train.head()

**2. KNN**

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Two-step pipeline: standardize the features, then classify with k-nearest neighbors.
pipeline_knn = Pipeline(steps=[
    ('std', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# 15 geometrically spaced integers between 1 and 500 (gaps grow with the value);
# np.unique removes duplicates that arise from the integer conversion.
k = np.unique(np.geomspace(start=1, stop=500, num=15, dtype='int'))

# Hyperparameter grid: number of neighbors and how the neighbors are weighted.
search_space_knn = dict(
    knn__n_neighbors=k,
    knn__weights=['uniform', 'distance'],
)
search_space_knn

# Exhaustive grid search, scored by F1 with 5-fold cross-validation.
model_knn = GridSearchCV(pipeline_knn, search_space_knn, scoring='f1', cv=5)
model_knn.fit(features_train, target_train)

# Report the best pipeline found and its cross-validated F1 score.
print(model_knn.best_estimator_)
print(model_knn.best_score_)

**3. Log Reg**

In [None]:
from sklearn.linear_model import LogisticRegression

# Pipeline: standardize the features, then fit a class-weighted logistic regression.
# The 'saga' solver is used so that both 'l1' and 'l2' penalties can be searched.
# max_iter must be an integer: newer scikit-learn versions validate the parameter
# type and reject a float such as 1e4.
pipeline_log = Pipeline([('std', StandardScaler()),
                         ('log', LogisticRegression(solver='saga',
                                                    class_weight='balanced',
                                                    max_iter=10_000,
                                                    random_state=42))])

# 14 geometrically spaced values for C between 0.001 and 1000
C_values = np.geomspace(start=0.001, stop=1000, num=14)

# Hyperparameter grid: penalty type and the C values created above.
search_space_log = {'log__penalty': ['l1', 'l2'],
                    'log__C': C_values
                   }
search_space_log

# Exhaustive grid search, scored by F1 with 5-fold cross-validation.
model_log = GridSearchCV(estimator=pipeline_log,
                         param_grid=search_space_log,
                         scoring='f1',
                         cv=5)

model_log.fit(features_train, target_train)

# Report the best pipeline found and its cross-validated F1 score.
print(model_log.best_estimator_)
print(model_log.best_score_)

**4. Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Geometrically spaced integer candidates for tree depth and minimum leaf size.
# Converting to int can produce repeated values (min_samples_leaf=1 occurs twice),
# so np.unique removes them; otherwise the grid search fits identical candidates
# more than once.
search_space_rf = {'max_depth': np.unique(np.geomspace(start=3, stop=50, num=10, dtype='int')),
                   'min_samples_leaf': np.unique(np.geomspace(start=1, stop=500, num=10, dtype='int'))}

# Exhaustive grid search over a class-weighted forest with 50 trees,
# scored by F1 with 5-fold cross-validation.
model_rf = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced',
                                                         n_estimators=50,
                                                         random_state=42),
                        param_grid=search_space_rf,
                        scoring='f1',
                        cv=5)

model_rf.fit(features_train, target_train)

# Report the best forest found and its cross-validated F1 score.
print(model_rf.best_estimator_)
print(model_rf.best_score_)

**5. Ensembling**

In [None]:
from sklearn.ensemble import VotingClassifier

# Ensemble grid: soft vs. hard voting, and either equal weights (None) or
# each model's best cross-validated F1 score as its voting weight.
search_space_ens = {'voting': ['soft', 'hard'],
                    'weights': [None,
                                [model_knn.best_score_, model_log.best_score_, model_rf.best_score_]
                               ]
                   }

# Use the already tuned best estimators as ensemble members. Passing the
# GridSearchCV objects themselves would re-run all three inner grid searches
# for every ensemble candidate and every cross-validation fold.
voting_knn_log_rf = VotingClassifier(estimators=[('knn', model_knn.best_estimator_),
                                                 ('log', model_log.best_estimator_),
                                                 ('rf', model_rf.best_estimator_)])

# Grid search over the voting options, scored by F1 with 3-fold
# cross-validation, using all available CPU cores.
model_ens = GridSearchCV(estimator=voting_knn_log_rf,
                         param_grid=search_space_ens,
                         scoring='f1',
                         cv=3,
                         n_jobs=-1)

model_ens.fit(features_train, target_train)

# Report the best ensemble, its cross-validated F1 score and the chosen settings.
print(model_ens.best_estimator_)
print(model_ens.best_score_)
print(model_ens.best_params_)

In [None]:
# The metric functions are not imported in any earlier cell, so import them here;
# without this the loop raises a NameError on a fresh kernel.
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluate the tuned models on the held-out test set.
# NOTE(review): model_knn is not part of this list - confirm this is intentional.
for clf in [model_log, model_rf, model_ens]:

    target_test_pred = clf.predict(features_test)

    print('\nPrecision: ', precision_score(target_test, target_test_pred))
    print('Recall: ', recall_score(target_test, target_test_pred))
    print('F1: ', f1_score(target_test, target_test_pred))