In [None]:
import pandas as pd
import numpy as np

# import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import json 

#plot confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# import stratified k fold
from sklearn.model_selection import StratifiedKFold


import statistics





# Load the Dataset

In [None]:
df = pandas = pd.read_csv('../German_Credit_data/Data/german.data', sep=' ')
df

# Da die Attributsnamen kodiert sind werden diese zu erst mithilfe der zugehörigen Dokumentation ersetzt. (Siehe Categorical.json)


In [None]:
file_path = "../German_Credit_data/Data/german.data"

names = ['Status Checking', 'Duration in Month', 'Credit History', 'Purpose', 'Credit Amount', 
         'Savings Account', 'Employement since', 'Installmentrate %', 'StatusSex', 'Otherdebtos', 
         'PresentResidence', 'Property', 'Age in years', 'Otherinstallment Plans', 'Housing', 
         'Number existing Credits', 'Job', 'Number people liable', 'Telephone', 'Foreign Worker', 'Target']
german_data = pd.read_csv(file_path,names=names,delim_whitespace=True, header=None)
catgories = open('../German_Credit_data/Data/categorical.json')
json_data = json.load(catgories)
json_data
for h in names:
    if h in json_data:
        german_data[h] = german_data[h].map(json_data[h])

In [None]:
german_data.head(10)

In [None]:
# Statistische Daten zur Datenanalyse
german_data.describe()

In [None]:
german_data.info()

In [None]:
#import dataframe_image as dfi
#german_data.head(5).dfi.export('df.png')
#df_styled = german_data.describe().style.background_gradient()
# describe the data
#dfi.export(df_styled, 'df_stats.png')

In [None]:
# check for missing values
german_data.isnull().sum()


# Prüfung der Daten auf Plausibilität und Allgemeine Datenanalyse

In [None]:
#check credit amount for outliers
german_data['Credit Amount'].hist(bins=50)




In [None]:
plt.subplot(1, 2, 1)
german_data['Credit Amount'].plot(kind='hist', logx=False, bins=50)
# plot histogram credit amout where target is 2
german_data.loc[german_data['Target'] == 2, 'Credit Amount'].plot(kind='hist', logx=False, bins=50, color='red', figsize=(15, 5))


In [None]:
# plot target variable as pie chart with target 1 and target 2 as labels colored red and blue

german_data['Target'].value_counts().plot(kind='pie', colors=['blue', 'red'], autopct='%1.1f%%', figsize=(15, 5))

# Zielvariable ist Ungleichverteilt

# Untersuchung auf Outlier

In [None]:
#box plot credit amount outliers
german_data.boxplot(column='Credit Amount', by='Target', figsize=(15, 5))

In [None]:
# scatter plot credit amount
german_data.plot(kind='scatter', x='Credit Amount', y='Target', figsize=(15, 5))

In [None]:
# plot distribution of target variable with color 2 as red and color 1 as blue

german_data['Target'].value_counts().plot(kind='bar', figsize=(15, 5), color=['blue', 'red'])

## Outlier Analysis

In [None]:
# calculate ZScore for credit amount --> for outlier detection
german_data['ZScore'] = (german_data['Credit Amount'] - german_data['Credit Amount'].mean()) / german_data['Credit Amount'].std()
#plot zscore for credit amount
german_data['ZScore'].plot(kind='hist', bins=50, figsize=(15, 5))

In [None]:
# calculate IQR for credit amount
def outlier_detection(df):
    Q1 = np.percentile(german_data['Credit Amount'], 25,
                    interpolation = 'midpoint')
    
    Q3 = np.percentile(german_data['Credit Amount'], 75,
                    interpolation = 'midpoint')
    IQR = Q3 - Q1

    upper = Q3 +1.5*IQR
    lower = Q1 - 1.5*IQR
    upper = german_data['Credit Amount'] >= (Q3+1.5*IQR)
    
    print("Upper bound:",upper)
    print(np.where(upper))
    
    # Below Lower bound
    lower = german_data['Credit Amount'] <= (Q1-1.5*IQR)
    print("Lower bound:", lower)
    print(np.where(lower))
outlier_detection(german_data)

In [None]:
# function to determine outlier values
def outlier_iqr(df):
    quartile_1, quartile_3 = np.percentile(df, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where((df > upper_bound) | (df < lower_bound))
# calculate the outlier values
outlier_iqr(german_data['Credit Amount'])

# Datenanalyse

In [None]:
# transform target value 2 to 1 and 1 to 0 to be binary
# 0 = good credit ; 1 = bad credit
german_data['Target'] = german_data['Target'].map({2:1, 1:0})





In [None]:
german_data['Target'].value_counts()

In [None]:
# function to calculate correlation between target and feautures 
# only plot the 5 most correlated features with target variable

def correlation_matrix(df):
    corr_matrix = df.corr()
    plt.figure(figsize=(25,25), dpi = 480)
    sns.heatmap(corr_matrix, annot=True)
    plt.show()

    correlations = df.corr()['Target'].sort_values()

    # Display correlations
    print(' Positive Correlations:\n', correlations)
    print(' Negative Correlations:\n', correlations)

correlation_matrix(german_data)

In [None]:
#iterate through feautures and get datatype and check if they are categorical or numerical and store them into list for one hot encoding

categorical_features = []
numerical_features = []
for i in german_data.columns:
    print(i, german_data[i].dtype)
    if german_data[i].dtype == 'object':
        print(i, german_data[i].dtype)
        categorical_features.append(i)
    else:
        print(i, german_data[i].dtype)
        numerical_features.append(i)


print(categorical_features, "Numerical:" ,numerical_features)

### Stark Korrelation zwischen Credit Amount und Duration in Month --> Macht Sinn, da größere Kredite in der Regel längere Laufzeiten haben

In [None]:
# pairplot um einen überblick über die Verteilung zu bekommen
sns.pairplot(german_data, hue='Target', size=3)


In [None]:
fig, axs = plt.subplots(3, 3, figsize=(15, 15))

i = 0
j = 0

for category in numerical_features:
    sns.boxplot(y=german_data[category], x=german_data["Target"].replace({0:"good", 1:"bad"}), ax=axs[i, j], orient="v", showmeans=True)
    j += 1
    if j%3 == 0:
        j = 0
        i += 1
        
axs[2, 1].set_visible(True)
fig.delaxes(axs[2, 1])
axs[2, 2].set_visible(True)
fig.delaxes(axs[2, 2])

In [None]:
german_data

In [None]:
# functio to plot plotbar for categorical features 
def plot_bar(df, feature):
    sns.countplot(x=feature, data=df, hue='Target')
    plt.show()

plot_bar(german_data, 'Purpose')

In [None]:
plt.figure(figsize=(40, 15))
for i in range(0, len(categorical_features)):
    plt.subplot(5, 3, i+1)
    sns.countplot(x = german_data[categorical_features[i]], orient='v', hue=german_data['Target'])
    plt.tight_layout()

In [None]:
from matplotlib.pyplot import figure
figure(figsize=(10,6))
ax = sns.distplot(german_data['Duration in Month'], hist=True, kde=False, 
             bins=72, color = '#bd7a51', 
             hist_kws={'edgecolor':'black'},
             kde_kws={'linewidth': 4})
ax.set_ylabel('# of Customers')
ax.set_xlabel('Duration in month')
ax.set_title('# of Customers by their duration')

In [None]:
data_Housing = german_data.groupby('Housing')['Housing'].count()
data_Savingaccounts = german_data.groupby('Savings Account')['Savings Account'].count()
data_Checkingaccount = german_data.groupby('Status Checking')['Status Checking'].count()
data_Purpose = german_data.groupby('Purpose')['Purpose'].count()

data_Housing = pd.DataFrame({'Housing':data_Housing.index, 'Count':data_Housing.values})
data_Savingaccounts = pd.DataFrame({'Savings Account':data_Savingaccounts.index, 'Count':data_Savingaccounts.values})
data_Checkingaccount = pd.DataFrame({'Status Checking':data_Checkingaccount.index, 'Count':data_Checkingaccount.values})
data_Purpose = pd.DataFrame({'Purpose':data_Purpose.index, 'Count':data_Purpose.values})

fig = plt.figure(figsize=(15,15))

ax1 = plt.subplot2grid((2,2),(0,0))
plt.pie(data_Housing['Count'],labels = data_Housing['Housing'],autopct='%1.1f%%');
plt.title('Housing split in data');

ax1 = plt.subplot2grid((2,2),(0,1))
plt.pie(data_Savingaccounts['Count'],labels = data_Savingaccounts['Savings Account'],autopct='%1.1f%%');
plt.title('Saving accounts Split in data');

ax1 = plt.subplot2grid((2,2),(1,0))
plt.pie(data_Checkingaccount['Count'],labels = data_Checkingaccount['Status Checking'],autopct='%1.1f%%');
plt.title('Status Checking Split in data');

ax1 = plt.subplot2grid((2,2),(1,1))
plt.pie(data_Purpose['Count'],labels = data_Purpose['Purpose'],autopct='%1.1f%%');
plt.title('Purpose Split in data');

In [None]:
german_data


# Feauture Engineering

In [None]:
# since there is some correlation between credit amount and  duration in months we create a new feature
# credit amount divided by duration in months 
german_data['Credit Amount per Month'] = german_data['Credit Amount'] / german_data['Duration in Month']

# drop the duration in month feature and credit amount feature

german_data.drop(['Duration in Month', 'Credit Amount'], axis=1, inplace=True)

In [None]:
# split column StatusSex by delimiter ":"
#german_data['StatusSex'] = german_data['StatusSex'].str.split(":", n=-1, expand=False)
# create new column for Status
#german_data['Sex'] = german_data['StatusSex'].str.get(0)
#german_data['Status'] = german_data['StatusSex'].str.get(1)

# drop column StatusSex

#german_data = german_data.drop(columns="StatusSex", axis = 1)

# One Hot Encoding

In [None]:
#iterate through feautures and get datatype and check if they are categorical or numerical and store them into list for one hot encoding

categorical_features = []
numerical_features = []
for i in german_data.columns:
    print(i, german_data[i].dtype)
    if german_data[i].dtype == 'object':
        print(i, german_data[i].dtype)
        categorical_features.append(i)
    else:
        print(i, german_data[i].dtype)
        numerical_features.append(i)


print(categorical_features, "Numerical:" ,numerical_features)
    

In [None]:
# one hot encode categorical features
german_data = pd.get_dummies(german_data, columns=categorical_features)

german_data

In [None]:
# still 1000 rows but 61 columns after one hot encoding
german_data.shape

In [None]:
# correlation after one hot encoding

corr = correlation_matrix(german_data)

corr

In [None]:
#drop zscore
german_data = german_data.drop(['ZScore'], axis=1)


# Helper Functions


In [None]:
#plot roc curve
from sklearn.metrics import roc_curve, auc
#plot roc curve
def plot_roc_curve(y_test, y_pred,label):
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic: '+label)
    plt.legend(loc="lower right")
    plt.show()



In [None]:
# plot confusion matrix function
def plot_confusion_matrix(y_test, y_pred, label):
    cm = confusion_matrix(y_test.tolist(), y_pred.tolist())
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues")
    plt.title('Confusion Matrix: '+label)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()


In [26]:
# function for metrics
def metrics(y_test, y_pred):
    print("Accuracy score:", accuracy_score(y_test, y_pred))
    print("Recall score:", recall_score(y_test, y_pred))
    print("Precision score:", precision_score(y_test, y_pred))
    print("fbeta score:", fbeta_score(y_test, y_pred, beta=2))

In [38]:
# append metrics to dictionary
def metrics_dict_non_opt(y_test, y_pred, label):
    Non_opt_results["Algo_name"].append(label)
    Non_opt_results["Recall"].append(recall_score(y_test, y_pred))
    Non_opt_results["Precision"].append(precision_score(y_test, y_pred))
    Non_opt_results["Accuracy"].append(accuracy_score(y_test, y_pred))
    Non_opt_results["F2"].append(fbeta_score(y_test, y_pred,beta=2))

def metrics_dict_opt(y_test, y_pred, label):
    Opt_results["Algo_name"].append(label)
    Opt_results["Recall"].append(recall_score(y_test, y_pred))
    Opt_results["Precision"].append(precision_score(y_test, y_pred))
    Opt_results["Accuracy"].append(accuracy_score(y_test, y_pred))
    Opt_results["F2"].append(fbeta_score(y_test, y_pred,beta=2))

In [None]:
# function for adding results to dataframe


# Split Feauture und Target

In [None]:
# split features and target
X = german_data.drop(['Target'], axis=1)
y = german_data['Target']

# Basic Modelling with Logistic Regression as Baseline, xGboost and a neuronal Network

In [31]:
# create dictionary with algo names and score metrics for later analysis

Non_opt_results = {"Algo_name": [], "Recall": [],"Precision": [],"Accuracy": [], "F2": []}

In [34]:
# use logistic regression to predict target variable
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, fbeta_score

# split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(german_data.drop('Target', axis=1), german_data['Target'], test_size=0.2, random_state=42)

# create logistic regression model with standard parameters

logreg = LogisticRegression(random_state=42)
# fit the model
logreg.fit(X_train, y_train)
# predict the target variable
y_pred = logreg.predict(X_test)

# print accuracy score and recall score
metrics(y_test, y_pred)

print("Crossval score:", statistics.mean(cross_val_score(logreg ,X_test, y_test, scoring='recall',cv = 3 )))
print("Crossval score:", cross_val_score(logreg ,X_test, y_test, scoring='recall',cv = 3 ))

# add algo name and recall score to dictionary
metrics_dict_non_opt(y_test, y_pred, "Logistic Regression")





Accuracy score: 0.765
Recall score: 0.5423728813559322
Precision score: 0.6153846153846154
fbeta score: 0.5555555555555556
Crossval score: 0.47368421052631576
Crossval score: [0.6        0.4        0.42105263]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
plot_roc_curve(y_test, y_pred,"Logistic Regression")

In [None]:
plot_confusion_matrix(y_test,y_pred,"Logistic Regression")


In [36]:
# use random forest to predict target variable
from sklearn.ensemble import RandomForestClassifier
# fit the model
rf = RandomForestClassifier( random_state=42)
rf.fit(X_train, y_train)
# predict the target variable
y_pred = rf.predict(X_test)
# print accuracy score and recall score
metrics(y_test, y_pred)

metrics_dict_non_opt(y_test, y_pred, "Random forest")



Accuracy score: 0.78
Recall score: 0.4067796610169492
Precision score: 0.7272727272727273
fbeta score: 0.446096654275093


In [None]:
plot_roc_curve(y_test, y_pred,"Random Forest")

In [None]:
plot_confusion_matrix(y_test,y_pred,"Random Forest")


In [None]:
# use neuro network to predict target variable
# als baseline werden die Standardparameter verwendet
# normalisieren
from sklearn.neural_network import MLPClassifier
# fit the model
nn = MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=42, activation="logistic", alpha=0.0115)
nn.fit(X_train, y_train)
# predict the target variable
y_pred = nn.predict(X_test)
# print accuracy score and recall score
metrics(y_test, y_pred)

metrics_dict_non_opt(y_test, y_pred, "Neural Network")


In [None]:
plot_roc_curve(y_test, y_pred,"Neural Network")

In [None]:
plot_confusion_matrix(y_test,y_pred,"Neural Network")


In [None]:
# use xgboost to predict target variable
from xgboost import XGBClassifier
import re

regex = re.compile(r"\[|\]|<", re.IGNORECASE)

X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns.values]

# fit the model
xgb = XGBClassifier(random_state=42)


xgb.fit(X_train, y_train)
# predict the target variable
y_pred = xgb.predict(X_test)
# print accuracy score and recall score
metrics(y_test, y_pred)

metrics_dict_non_opt(y_test, y_pred, "xGboost")



In [None]:
plot_roc_curve(y_test, y_pred,"xGboost")

In [None]:
confusion_matrix = confusion_matrix(y_test, y_pred)
plot_confusion_matrix("xGboost")


# Ergebnisse nicht Optimierte Modelle


In [None]:
Non_opt_results

# Modells Optimized with Hyperparametertuning
## Logistic Regression

In [None]:
# dictionary for hypertuned modells
Opt_results = {"Algo_name": [], "Recall": [],"Precision": [],"Accuracy": [], "F2": []}

In [None]:
#hyperparameter tuning for logistic regression
from sklearn.model_selection import GridSearchCV
#tune hyperparameters
parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'penalty': ['l1', 'l2'],'intercept_scaling': [100, 500, 1000, 5000, 10000]}
# create grid search object with cross validation
grid_search = GridSearchCV(logreg, parameters, cv=5, scoring='recall')
# fit the model
grid_search.fit(X_train, y_train)
# print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [None]:
# create the model with the best hyperparameters
'''
solver to liblinear --> better score; standard lbfgs
'''
logreg = LogisticRegression(C=grid_search.best_params_['C'], penalty=grid_search.best_params_['penalty'], intercept_scaling=grid_search.best_params_['intercept_scaling'], solver = 'liblinear' )
# fit the model
logreg.fit(X_train, y_train)
# predict the target variable
y_pred = logreg.predict(X_test)
# print accuracy score and recall score
metrics(y_test, y_pred)


In [None]:
metrics_dict_opt(y_test, y_pred, "Logistic regression")

# Random Forest

In [None]:
# grid search parameters for random forest
parameters = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['auto', 'sqrt', 'log2'],'max_depth': [None, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]}
# create grid search object with cross validation
grid_search = GridSearchCV(rf, parameters, cv=5, scoring='recall')
# fit the model
grid_search.fit(X_train, y_train)
# print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


In [None]:
# create the model with the best hyperparameters
rf = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'], max_features=grid_search.best_params_['max_features'], max_depth=grid_search.best_params_['max_depth'])
# fit the model
rf.fit(X_train, y_train)
# predict the target variable
y_pred = rf.predict(X_test)
# print accuracy score and recall score
metrics(y_test, y_pred)


In [None]:
# append the results to the dictionary
metrics_dict_opt(y_test, y_pred, "Random Forest")


# xGboost

In [None]:
# tune hyperparameters for xgboost
parameters = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
# create grid search object with cross validation
grid_search = GridSearchCV(xgb, parameters, cv=5, scoring='recall')
# fit the model
grid_search.fit(X_train, y_train)
# print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


In [None]:
# create the model with the best hyperparameters
xgb = XGBClassifier(n_estimators=grid_search.best_params_['n_estimators'], max_depth=grid_search.best_params_['max_depth'], learning_rate=grid_search.best_params_['learning_rate'])
# fit the model
xgb.fit(X_train, y_train)
# predict the target variable
y_pred = xgb.predict(X_test)
# print accuracy score and recall score

metrics(y_test, y_pred)


In [None]:
plot_roc_curve(y_test, y_pred, "xGboost Opt")

In [None]:
# append the results to the dictionary
metrics_dict_opt(y_test, y_pred, "xGboost")


# Neuronal Network

In [None]:
# tune hyperparameters for neural network
parameters = {'hidden_layer_sizes': [(15,)],'activation': ['logistic', 'relu', 'tanh'],'solver': ['lbfgs', 'sgd', 'adam'], 'max_iter': [100], 'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0], 'learning_rate': ['constant', 'invscaling', 'adaptive'], 'learning_rate_init': [0.0001, 0.001, 0.01, 0.1, 1.0], 'power_t': [0.5, 0.75, 0.9], 'shuffle': [True, False]}
# create grid search object with cross validation
grid_search = GridSearchCV(nn, parameters, cv=5, scoring='recall')
# fit the model
grid_search.fit(X_train, y_train)
# print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


In [None]:
# create the model with the best hyperparameters
nn = MLPClassifier(hidden_layer_sizes=grid_search.best_params_['hidden_layer_sizes'], activation=grid_search.best_params_['activation'], solver=grid_search.best_params_['solver'], max_iter=grid_search.best_params_['max_iter'], alpha=grid_search.best_params_['alpha'], learning_rate=grid_search.best_params_['learning_rate'], learning_rate_init=grid_search.best_params_['learning_rate_init'], power_t=grid_search.best_params_['power_t'], shuffle=grid_search.best_params_['shuffle'])
# fit the model
nn.fit(X_train, y_train)
# predict the target variable
y_pred = nn.predict(X_test)
# print accuracy score and recall score
metrics(y_test, y_pred)


In [None]:

# plot roc curve

plot_roc_curve(y_test, y_pred, "NN Opt")

In [None]:
metrics_dict_opt(y_test, y_pred, "Neuronal Network")

In [None]:
# Opt_results to excel file
Opt_results = pd.DataFrame(Opt_results)
Opt_results

In [None]:
# Non_opt_results to excel file
Non_opt_results = pd.DataFrame(Non_opt_results)
Non_opt_results

In [None]:
# export nonopt to csv
Non_opt_results.to_csv("Non_opt_results.csv", index=False)
Opt_results.to_csv("Non_opt_results.csv", index=False)


# Hyperopt Hyperparametertuning

# Kreuzvalidierung

In [None]:
# create stratified kfold object
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# create empty list to store recall scores
recall_scores = []
# perform stratified kfold cross validation
for train, test in kfold.split(X_train, y_train):
    # create model
    logreg = LogisticRegression(C=grid_search.best_params_['C'], penalty=grid_search.best_params_['penalty'], intercept_scaling=grid_search.best_params_['intercept_scaling'], solver = 'liblinear' )
    # fit model
    logreg.fit(X_train.iloc[train], y_train.iloc[train])
    # predict on test set
    y_pred = logreg.predict(X_train.iloc[test])
    # append recall score to list
    recall_scores.append(recall_score(y_train.iloc[test], y_pred))
    # print recall score
    print("Recall score:", recall_score(y_train.iloc[test], y_pred))
# print mean recall score
print("Mean recall score:", statistics.mean(recall_scores))

In [None]:
from sklearn.model_selection import cross_val_score

# Running 10-Fold Cross validation on a given algorithmd
# Passing full data X and y because the K-fold will split the data and automatically choose train/test
Accuracy_Values=cross_val_score(logreg, X , y, cv=10, scoring='recall')
print('\nAccuracy values for 10-fold Cross Validation:\n',Accuracy_Values)
print('\nFinal Average Accuracy of the model:', round(Accuracy_Values.mean(),2))

# Oversampling mit Smote

In [None]:
# oversampling with smote
from imblearn.over_sampling import SMOTE


# Ideas
- check label encoding vs one hor encoding
- hyperparamter tuning
- use shape for neuronal network
- feauture importance

- Feauture engineering?
- oversampling?


https://optuna.org/