## Logistic Regression
Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.

In [31]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
#from sklearn.metrics import confusion_matrix, classification_report
from numpy import mean

import pandas as pd

def encode_df(df):
  columnsToEncode = list(df.select_dtypes(include=['object']))
  le = LabelEncoder()
  for feature in columnsToEncode:
      try:
          df[feature] = le.fit_transform(df[feature])
      except:
          print('Error encoding ' + feature)
  return df

X_train_full = pd.read_csv("./preprocessed/mergedTrainData.csv")
X_test_full = pd.read_csv("./preprocessed/mergedTestData.csv")

X_train_full = encode_df(X_train_full)
X_test_full = encode_df(X_test_full)

# Obtain target and predictors
#features = ["years_since_loan","amount","duration","payments","balance","frequency","years_since_acc_open","region","no. of inhabitants","no. of municipalities with inhabitants < 499 ","no. of municipalities with inhabitants 500-1999","no. of municipalities with inhabitants > 2000","no. of cities ","ratio of urban inhabitants ","average salary ","unemployment_rate","no. of enterpreneurs per 1000 inhabitants ","no. of commited crimes","type","gender","age_group"]

features = ["duration","payments","balance","itr_balance_per_account", "frequency","region","ratio entrepeneurs","average salary ","unemploymant_growth","criminality_growth","age_group"]

y_train = X_train_full.status
X_train = X_train_full[features]
X_test = X_test_full[features]

# Define the model
model = LogisticRegression(solver='liblinear', random_state=0, class_weight='balanced')

# Define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)

param_grid = {'C': [0.001, 0.01, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
                  'solver': ['liblinear', 'sag', 'saga'] 
             }
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv,
scoring='roc_auc')

# summarize performance
# print('Mean ROC AUC: %.3f' % mean(scores))

grid_result = grid.fit(X_train, y_train)
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))

# Evaluate the model
p_pred = grid_result.predict_proba(X_test)


# from imblearn.over_sampling import SMOTE
# sm = SMOTE(random_state=0)
# X_res, y_res = sm.fit_resample(X_train, y_train)

# # Best: {'C': 0.001, 'solver': 'saga'}
# clf1 = LogisticRegression(solver='liblinear', random_state=0, class_weight='balanced')


# cv1 = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0)

# param_grid1 = {'C': [0.001, 0.01, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30, 40, 50, 100],
#                'solver': ['liblinear', 'sag', 'saga', 'newton-cg', 'lbfgs'],
#                'class_weight': ['balanced', None],
#                'penalty': ['none', 'l2', 'l1', 'elasticnet'],
#                'dual': [True, False],
#                'fit_intercept': [True, False],
#                'max_iter': [100, 200, 300, 400, 500, 1000]
#               }
# grid1 = GridSearchCV(estimator=clf1, param_grid=param_grid1, n_jobs=-1, cv=cv1,
# scoring='roc_auc')

# grid_result1 = grid1.fit(X_res, y_res)
# print('Best: %f using %s' % (grid_result1.best_score_, grid_result1.best_params_))

# p_pred1 = grid_result1.predict_proba(X_test)


# Metrics

#print(y_test)

#confusion_matrix(y_test, p_pred)
#classification_report(y_test, p_pred)

resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/logisticRegression.csv", index=None)



Best: 0.703450 using {'C': 0.001, 'solver': 'saga'}




# Random Forest

In [32]:
# https://stackoverflow.com/questions/30814231/using-the-predict-proba-function-of-randomforestclassifier-in-the-safe-and-rig
# https://rpmcruz.github.io/machine%20learning/2018/02/09/probabilities-trees.html

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(bootstrap=False,
                                class_weight='balanced',
                                criterion='gini',
                                max_depth=6,
                                max_features='auto',
                                min_samples_leaf=6,
                                min_samples_split=2,
                                n_estimators=1000,
                                random_state=0)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
param_grid = {
    'max_depth': [6],
    'min_samples_leaf': [6],
    'min_samples_split': [2],
}
grid = GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=cv,
scoring='roc_auc')

grid_result = grid.fit(X_train, y_train)
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))

p_pred = grid_result.predict_proba(X_test)

resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/randomForest.csv", index=None)

Best: 0.645022 using {'max_depth': 6, 'min_samples_leaf': 6, 'min_samples_split': 2}


# SVM

In [33]:
# https://towardsdatascience.com/support-vector-machine-introduction-to-machine-learning-algorithms-934a444fca47

from sklearn.svm import SVC

clf = SVC(probability=True, class_weight='balanced')
clf.fit(X_train, y_train)
p_pred = clf.predict_proba(X_test)

resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/SVM.csv", index=None)

# Logistic Regression with SMOTE

In [34]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
import numpy as np

# best for logistic regression: {'C': 0.001, 'solver': 'saga'}

pipe = make_pipeline(
    SMOTE(sampling_strategy=1.0),
    LogisticRegression(C=0.001, solver='saga')
)

weights = np.linspace(0.005, 0.25, 10)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)

gsc = GridSearchCV(
    estimator=pipe,
    param_grid={
        'smote__sampling_strategy': weights
    },
    scoring='f1',
    cv=cv
)
grid_result = gsc.fit(X_train, y_train)

print("Best parameters : %s" % grid_result.best_params_)

pipe = make_pipeline(
    SMOTE(sampling_strategy=grid_result.best_params_['smote__sampling_strategy']),
    LogisticRegression(C=0.001, solver='saga')
)

pipe.fit(X_train, y_train)

p_pred = pipe.predict_proba(X_test)


resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/logisticRegressionWithSMOTE.csv", index=None)

180 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/home/tiago/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/imblearn/pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/usr/local/lib/python3.8/dist-packages/imblearn/pipeline.py", line 220, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/home/tiago/.local/lib/python3.8/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwa

Best parameters : {'smote__sampling_strategy': 0.25}




# Random Forest With SMOTE

In [38]:
# https://stackoverflow.com/questions/30814231/using-the-predict-proba-function-of-randomforestclassifier-in-the-safe-and-rig
# https://rpmcruz.github.io/machine%20learning/2018/02/09/probabilities-trees.html

from sklearn.ensemble import RandomForestClassifier

pipe = make_pipeline(
    SMOTE(sampling_strategy=1.0),
    RandomForestClassifier(bootstrap=False,
                                criterion='gini',
                                max_depth=5,
                                max_features='auto',
                                min_samples_leaf=6,
                                min_samples_split=2,
                                n_estimators=200,
                                random_state=0)
)

weights = np.linspace(0.005, 0.25, 10)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)

gsc = GridSearchCV(
    estimator=pipe,
    param_grid={
        'smote__sampling_strategy': weights
    },
    scoring='f1',
    cv=cv
)
grid_result = gsc.fit(X_train, y_train)

print("Best parameters : %s" % grid_result.best_params_)

pipe = make_pipeline(
    SMOTE(sampling_strategy=grid_result.best_params_['smote__sampling_strategy']),
    RandomForestClassifier(bootstrap=False,
                                criterion='gini',
                                max_depth=5,
                                max_features='auto',
                                min_samples_leaf=6,
                                min_samples_split=2,
                                n_estimators=200,
                                random_state=0)
)

pipe.fit(X_train, y_train)

p_pred = pipe.predict_proba(X_test)


resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/RandomForestWithSMOTE.csv", index=None)

180 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/home/tiago/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/imblearn/pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/usr/local/lib/python3.8/dist-packages/imblearn/pipeline.py", line 220, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/home/tiago/.local/lib/python3.8/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwa

Best parameters : {'smote__sampling_strategy': 0.25}


# SVM With SMOTE

In [39]:
pipe = make_pipeline(
    SMOTE(sampling_strategy=1.0),
    SVC(probability=True, class_weight='balanced')
)

weights = np.linspace(0.005, 0.25, 10)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)

gsc = GridSearchCV(
    estimator=pipe,
    param_grid={
        'smote__sampling_strategy': weights
    },
    scoring='f1',
    cv=cv
)
grid_result = gsc.fit(X_train, y_train)

print("Best parameters : %s" % grid_result.best_params_)

pipe = make_pipeline(
    SMOTE(sampling_strategy=grid_result.best_params_['smote__sampling_strategy']),
    SVC(probability=True, class_weight='balanced')
)

pipe.fit(X_train, y_train)

p_pred = pipe.predict_proba(X_test)


resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/SVMWithSMOTE.csv", index=None)

Best parameters : {'smote__sampling_strategy': 0.16833333333333333}


180 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/home/tiago/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/imblearn/pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/usr/local/lib/python3.8/dist-packages/imblearn/pipeline.py", line 220, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/home/tiago/.local/lib/python3.8/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwa