## Logistic Regression
Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from numpy import mean

import pandas as pd

def encode_df(df):
  columnsToEncode = list(df.select_dtypes(include=['object']))
  le = LabelEncoder()
  for feature in columnsToEncode:
      try:
          df[feature] = le.fit_transform(df[feature])
      except:
          print('Error encoding ' + feature)
  return df

X_train_full = pd.read_csv("./preprocessed/mergedTrainData.csv")
X_test_full = pd.read_csv("./preprocessed/mergedTestData.csv")

X_train_full = encode_df(X_train_full)
X_test_full = encode_df(X_test_full)

# Obtain target and predictors
#features = ["years_since_loan","amount","duration","payments","balance","frequency","years_since_acc_open","region","no. of inhabitants","no. of municipalities with inhabitants < 499 ","no. of municipalities with inhabitants 500-1999","no. of municipalities with inhabitants > 2000","no. of cities ","ratio of urban inhabitants ","average salary ","unemployment_rate","no. of enterpreneurs per 1000 inhabitants ","no. of commited crimes","type","gender","age_group"]

features = ["duration","payments","last_balance","itr_balance_per_account", "frequency", "region","ratio entrepeneurs","average salary ","unemploymant_growth","criminality_growth","age_group"]

y_train = X_train_full.status
X_train = X_train_full[features]
X_test = X_test_full[features]

numeric_features = list(X_train.select_dtypes(include=['int64', 'float64']))
autoscaler = StandardScaler()
X_train[numeric_features] = autoscaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = autoscaler.fit_transform(X_test[numeric_features])

# Define the model
model = LogisticRegression(solver='liblinear', random_state=0, class_weight='balanced')

# Define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)

param_grid = {'C': [0.001, 0.01, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
                  'solver': ['liblinear', 'sag', 'saga'] 
             }
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv,
scoring='roc_auc')

# summarize performance
# print('Mean ROC AUC: %.3f' % mean(scores))

grid_result = grid.fit(X_train, y_train)
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))

# Evaluate the model
p_pred = grid_result.predict_proba(X_test)
print(p_pred)

resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/logisticRegression.csv", index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


Best: 0.639206 using {'C': 0.1, 'solver': 'liblinear'}
[[0.26754818 0.73245182]
 [0.84809554 0.15190446]
 [0.54425753 0.45574247]
 [0.31560747 0.68439253]
 [0.54670479 0.45329521]
 [0.32100132 0.67899868]
 [0.27787484 0.72212516]
 [0.50383266 0.49616734]
 [0.68029411 0.31970589]
 [0.37371215 0.62628785]
 [0.46308915 0.53691085]
 [0.59692139 0.40307861]
 [0.74182327 0.25817673]
 [0.52870199 0.47129801]
 [0.46480796 0.53519204]
 [0.78061098 0.21938902]
 [0.4177478  0.5822522 ]
 [0.41311506 0.58688494]
 [0.5764962  0.4235038 ]
 [0.24063616 0.75936384]
 [0.61285794 0.38714206]
 [0.39400459 0.60599541]
 [0.19001257 0.80998743]
 [0.48775469 0.51224531]
 [0.56526559 0.43473441]
 [0.30851485 0.69148515]
 [0.37789079 0.62210921]
 [0.51117912 0.48882088]
 [0.39571505 0.60428495]
 [0.362288   0.637712  ]
 [0.25735701 0.74264299]
 [0.31454858 0.68545142]
 [0.23531922 0.76468078]
 [0.42286824 0.57713176]
 [0.23852857 0.76147143]
 [0.29693436 0.70306564]
 [0.67365927 0.32634073]
 [0.32854707 0.67145

# Random Forest

In [4]:
# https://stackoverflow.com/questions/30814231/using-the-predict-proba-function-of-randomforestclassifier-in-the-safe-and-rig
# https://rpmcruz.github.io/machine%20learning/2018/02/09/probabilities-trees.html

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(bootstrap=False,
                                class_weight='balanced',
                                criterion='gini',
                                max_depth=6,
                                max_features='auto',
                                min_samples_leaf=6,
                                min_samples_split=2,
                                n_estimators=1000,
                                random_state=0)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
param_grid = {
    'max_depth': [6],
    'min_samples_leaf': [6],
    'min_samples_split': [2],
}
grid = GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=cv,
scoring='roc_auc')

grid_result = grid.fit(X_train, y_train)
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))

p_pred = grid_result.predict_proba(X_test)

resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/randomForest.csv", index=None)

Best: 0.644603 using {'max_depth': 6, 'min_samples_leaf': 6, 'min_samples_split': 2}


# SVM
Applying Min-Max Scaling so that values are normalized (0-1).

In [5]:
def min_max_scaling(df, numeric_columns):
  # copy the dataframe
  df_norm = df.copy()
  # apply min-max scaling
  for column in numeric_columns:
      df_norm[column] = (df_norm[column] - df_norm[column].min()) / (df_norm[column].max() - df_norm[column].min())
      
  return df_norm


In [7]:
# https://towardsdatascience.com/support-vector-machine-introduction-to-machine-learning-algorithms-934a444fca47

from sklearn.svm import SVC

X_train_full = pd.read_csv("./preprocessed/mergedTrainData.csv")
X_test_full = pd.read_csv("./preprocessed/mergedTestData.csv")

features = ["duration","payments","last_balance","itr_balance_per_account", "frequency","years_since_acc_open","region","ratio entrepeneurs","average salary ","unemploymant_growth","criminality_growth","age_group"]

y_train = X_train_full.status
X_train = X_train_full[features]
X_test = X_test_full[features]

numeric_features = list(X_train.select_dtypes(include=['int64', 'float64']))
# print(X_train.dtypes)
print(numeric_features)

X_train = encode_df(X_train)
X_test = encode_df(X_test)

X_train = min_max_scaling(X_train, numeric_features)
X_test = min_max_scaling(X_test, numeric_features)

clf = SVC(probability=True, class_weight='balanced')
clf.fit(X_train, y_train)
p_pred = clf.predict_proba(X_test)

resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/SVM.csv", index=None)

['duration', 'payments', 'last_balance', 'itr_balance_per_account', 'years_since_acc_open', 'ratio entrepeneurs', 'average salary ', 'unemploymant_growth', 'criminality_growth', 'age_group']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[feature] = le.fit_transform(df[feature])


# Logistic Regression with SMOTE

In [9]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
import numpy as np

# best for logistic regression: {'C': 0.001, 'solver': 'saga'}

pipe = make_pipeline(
    SMOTE(sampling_strategy=1.0),
    LogisticRegression(C=0.001, solver='saga')
)

weights = np.linspace(0.005, 0.25, 10)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)

gsc = GridSearchCV(
    estimator=pipe,
    param_grid={
        'smote__sampling_strategy': weights
    },
    scoring='f1',
    cv=cv
)
grid_result = gsc.fit(X_train, y_train)

print("Best parameters : %s" % grid_result.best_params_)

pipe = make_pipeline(
    SMOTE(sampling_strategy=grid_result.best_params_['smote__sampling_strategy']),
    LogisticRegression(C=0.001, solver='saga')
)

pipe.fit(X_train, y_train)

p_pred = pipe.predict_proba(X_test)


resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/logisticRegressionWithSMOTE.csv", index=None)

Best parameters : {'smote__sampling_strategy': 0.16833333333333333}


180 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/home/dukes/.local/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/dukes/.local/lib/python3.9/site-packages/imblearn/pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/home/dukes/.local/lib/python3.9/site-packages/imblearn/pipeline.py", line 220, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/home/dukes/.local/lib/python3.9/site-packages/joblib/memory.py", line 349, in __call__
    return self.f

# Random Forest With SMOTE

In [10]:
# https://stackoverflow.com/questions/30814231/using-the-predict-proba-function-of-randomforestclassifier-in-the-safe-and-rig
# https://rpmcruz.github.io/machine%20learning/2018/02/09/probabilities-trees.html

from sklearn.ensemble import RandomForestClassifier

pipe = make_pipeline(
    SMOTE(sampling_strategy=1.0),
    RandomForestClassifier(bootstrap=False,
                                criterion='gini',
                                max_depth=5,
                                max_features='auto',
                                min_samples_leaf=6,
                                min_samples_split=2,
                                n_estimators=200,
                                random_state=0)
)

weights = np.linspace(0.005, 0.25, 10)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)

gsc = GridSearchCV(
    estimator=pipe,
    param_grid={
        'smote__sampling_strategy': weights
    },
    scoring='f1',
    cv=cv
)
grid_result = gsc.fit(X_train, y_train)

print("Best parameters : %s" % grid_result.best_params_)

pipe = make_pipeline(
    SMOTE(sampling_strategy=grid_result.best_params_['smote__sampling_strategy']),
    RandomForestClassifier(bootstrap=False,
                                criterion='gini',
                                max_depth=5,
                                max_features='auto',
                                min_samples_leaf=6,
                                min_samples_split=2,
                                n_estimators=200,
                                random_state=0)
)

pipe.fit(X_train, y_train)

p_pred = pipe.predict_proba(X_test)


resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/RandomForestWithSMOTE.csv", index=None)

180 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/home/dukes/.local/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/dukes/.local/lib/python3.9/site-packages/imblearn/pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/home/dukes/.local/lib/python3.9/site-packages/imblearn/pipeline.py", line 220, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/home/dukes/.local/lib/python3.9/site-packages/joblib/memory.py", line 349, in __call__
    return self.f

Best parameters : {'smote__sampling_strategy': 0.19555555555555554}


# SVM With SMOTE


In [11]:
pipe = make_pipeline(
    SMOTE(sampling_strategy=1.0),
    SVC(probability=True, class_weight='balanced')
)

weights = np.linspace(0.005, 0.25, 10)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)

gsc = GridSearchCV(
    estimator=pipe,
    param_grid={
        'smote__sampling_strategy': weights
    },
    scoring='f1',
    cv=cv
)
grid_result = gsc.fit(X_train, y_train)

print("Best parameters : %s" % grid_result.best_params_)

pipe = make_pipeline(
    SMOTE(sampling_strategy=grid_result.best_params_['smote__sampling_strategy']),
    SVC(probability=True, class_weight='balanced')
)

pipe.fit(X_train, y_train)

p_pred = pipe.predict_proba(X_test)


resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/SVMWithSMOTE.csv", index=None)

Best parameters : {'smote__sampling_strategy': 0.25}


180 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/home/dukes/.local/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/dukes/.local/lib/python3.9/site-packages/imblearn/pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/home/dukes/.local/lib/python3.9/site-packages/imblearn/pipeline.py", line 220, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/home/dukes/.local/lib/python3.9/site-packages/joblib/memory.py", line 349, in __call__
    return self.f