In [10]:
def run_model(pipeline, param_grid):
  # Define evaluation procedure
  cv = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

  grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, 
    scoring='roc_auc', n_jobs=-1, cv=cv)

  grid_result = grid.fit(X_train, y_train)
  print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))

  # Evaluate the model
  p_pred = grid_result.predict_proba(X_test)

  resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:,1]}
  result = pd.DataFrame(data=resultData)
  return result

def encode_df(df):
  columnsToEncode = list(df.select_dtypes(include=['object']))
  le = LabelEncoder()
  for feature in columnsToEncode:
      try:
          df[feature] = le.fit_transform(df[feature])
      except:
          print('Error encoding ' + feature)
  return df

def load_data():
  X_train_full = pd.read_csv("./preprocessed/mergedTrainData.csv")
  X_test_full = pd.read_csv("./preprocessed/mergedTestData.csv")

  # Obtain target and predictors
  # features = ["duration","payments","last_balance","itr_balance_per_account", "frequency", "region","ratio entrepeneurs","average salary ","unemploymant_growth","criminality_growth","age_group"]
  # features = ['reached_negative_balance', 'ratio_RAB', 'credit_ratio', 'withdrawal_ratio', 'IC_mean', 'balance_min', 'mean_trans_profit', 'balance_mean', 'ratio_CC', 'CC_std', 'ratio_IC', 'withdrawal_max', 'WC_max']
  features = ['reached_negative_balance', 'RAB_mean', 'ratio_RAB', 'credit_ratio', 'balance_min', 'RAB_sum', 'mean_trans_profit', 'balance_mean', 'ratio_CC', 'CC_std', 'IC_min', 'CC_max', 'ratio_IC', 'withdrawal_max', 'WC_max', 'last_balance', 'CAB_mean']

  X_train = X_train_full[features]
  X_test = X_test_full[features]
  y_train = X_train_full.status

  return [X_train, X_test, y_train]

## Logistic Regression
Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

# Load data
[X_train, X_test, y_train] = load_data()

X_train = encode_df(X_train)
X_test = encode_df(X_test)

# Without SMOTE
print("------- WITHOUT SMOTE -------")
pipeline = Pipeline([('classification', LogisticRegression(random_state=0, max_iter=5000))])

param_grid = {'classification__penalty': ['l1', 'l2', 'none'],
          'classification__C': [0.01, 0.05, 0.1, 0.2, 0.5, 1.0],
          'classification__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
          'classification__class_weight': ["balanced", None]}

results_df = run_model(pipeline, param_grid)
results_df.to_csv("./results/logisticRegression.csv", index=None)

print("-----------------------\n\n")
# With SMOTE (Add more stuff to param_grid)
print("------- WITH SMOTE -------")
pipeline = Pipeline([('smote', SMOTE()), ('classification', LogisticRegression(random_state=0, max_iter=5000))])

weights = list(np.linspace(0.005, 0.25, 5)) + ['minority', 'auto']
param_grid = {
        'smote__sampling_strategy': weights,
        'classification__C': [0.001, 0.01, 0.05],
        'classification__solver': ['liblinear', 'newton-cg', 'saga'],
        'classification__class_weight': ["balanced", None]}

results_df = run_model(pipeline, param_grid)
results_df.to_csv("./results/logisticRegressionWithSMOTE.csv", index=None)



KeyboardInterrupt: 

# Random Forest

In [5]:
# https://stackoverflow.com/questions/30814231/using-the-predict-proba-function-of-randomforestclassifier-in-the-safe-and-rig
# https://rpmcruz.github.io/machine%20learning/2018/02/09/probabilities-trees.html

from sklearn.ensemble import RandomForestClassifier

# Load data
[X_train, X_test, y_train] = load_data()

# Without SMOTE
print("------- WITHOUT SMOTE -------")
param_grid = {'classification__n_estimators': [int(x) for x in range(2, 14, 2)],
            'classification__max_features': ['auto', 'sqrt'],
            'classification__max_depth': [2, 6, 10, 14],
            'classification__criterion': ['gini', 'entropy'],
            'classification__min_samples_split':  [2, 4, 6, 8],
            'classification__min_samples_leaf':  [1, 2, 4, 6],
            'classification__class_weight': ["balanced", "balanced_subsample", None]}


pipeline = Pipeline([('classification', RandomForestClassifier(random_state=0))])

result_df = run_model(pipeline, param_grid)
result_df.to_csv("./results/randomForest.csv", index=None)

print("-----------------------\n\n")
# With SMOTE
print("------- WITH SMOTE -------")
pipeline = Pipeline([('smote', SMOTE()), ('classification', RandomForestClassifier(random_state=0))])

weights = list(np.linspace(0.005, 0.25, 10)) + ['minority', 'auto']
param_grid['smote__sampling_strategy'] = weights
result_df = run_model(pipeline, param_grid)
result_df.to_csv("./results/randomForestWithSMOTE.csv", index=None)


Process LokyProcess-12:
Traceback (most recent call last):
  File "/usr/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/dukes/.local/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 483, in _process_worker
    gc.collect()
KeyboardInterrupt


KeyboardInterrupt: 

# SVM
Applying Min-Max Scaling so that values are normalized (0-1).

In [6]:
def min_max_scaling(df, numeric_columns):
  # copy the dataframe
  df_norm = df.copy()
  # apply min-max scaling
  for column in numeric_columns:
      df_norm[column] = (df_norm[column] - df_norm[column].min()) / (df_norm[column].max() - df_norm[column].min())
      
  return df_norm


In [13]:
# https://towardsdatascience.com/support-vector-machine-introduction-to-machine-learning-algorithms-934a444fca47

from sklearn.svm import SVC

# Load data
[X_train, X_test, y_train] = load_data()

numeric_features = list(X_train.select_dtypes(include=['int64', 'float64']))
X_train = min_max_scaling(X_train, numeric_features)
X_test = min_max_scaling(X_test, numeric_features)

# Without SMOTE
print("------- WITHOUT SMOTE -------")

clf = Pipeline([('classification', SVC(probability=True))])

param_grid = {'classification__C': [0.1, 1, 10, 100, 1000],
              'classification__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'classification__kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

results_df = run_model(clf, param_grid)
results_df.to_csv("./results/SVM.csv", index=None)
print("-----------------------\n\n")

# With SMOTE
print("------- WITH SMOTE -------")
clf = Pipeline([('smote', SMOTE()), ('classification', SVC(probability=True))])

weights = list(np.linspace(0.005, 0.25, 10)) + ['minority', 'auto']
param_grid['smote__sampling_strategy'] = weights

result_df = run_model(clf, param_grid)
result_df.to_csv("./results/SVMWithSMOTE.csv", index=None)


------- WITH SMOTE -------
Best: 0.886839 using {'classification__C': 0.1, 'classification__gamma': 1, 'classification__kernel': 'rbf', 'smote__sampling_strategy': 0.19555555555555554}


6000 fits failed out of a total of 12000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6000 fits failed with the following error:
Traceback (most recent call last):
  File "/home/dukes/.local/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/dukes/.local/lib/python3.9/site-packages/imblearn/pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "/home/dukes/.local/lib/python3.9/site-packages/imblearn/pipeline.py", line 220, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "/home/dukes/.local/lib/python3.9/site-packages/joblib/memory.py", line 349, in __call__
    return se