## Logistic Regression
Logistic Regression is a Machine Learning classification algorithm that is used to predict the probability of a categorical dependent variable. In logistic regression, the dependent variable is a binary variable that contains data coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.

In [61]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import pandas as pd

def encode_df(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each object column is replaced by the integer codes produced by a
    freshly fitted ``LabelEncoder``. Encoding is best-effort: a column that
    cannot be encoded is reported on stdout and left unchanged, and the
    remaining columns are still processed.

    Args:
        df: DataFrame to encode. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for feature in df.select_dtypes(include=['object']).columns:
        try:
            # A new encoder per column keeps the fitted classes of one
            # column from leaking into the next.
            df[feature] = LabelEncoder().fit_transform(df[feature])
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # propagate; the f-string also copes with non-string labels.
            print(f'Error encoding {feature}: {exc}')
    return df

# Load the pre-merged train and test sets.
X_train_full = pd.read_csv("./preprocessed/mergedTrainData.csv")
X_test_full = pd.read_csv("./preprocessed/mergedTestData.csv")

# Predictor columns fed to the models.
features = [
    "years_since_loan",
    "amount",
    "duration",
    "payments",
    "balance",
    "frequency",
    "years_since_acc_open",
    "region",
    "no. of inhabitants",
    "no. of municipalities with inhabitants < 499 ",
    "no. of municipalities with inhabitants 500-1999",
    "no. of municipalities with inhabitants > 2000",
    "no. of cities ",
    "ratio of urban inhabitants ",
    "average salary ",
    "unemployment_rate",
    "no. of enterpreneurs per 1000 inhabitants ",
    "no. of commited crimes",
    "type",
    "gender",
    "age_group",
]

# Encode the predictors of both sets in a single pass so that a given
# category receives the same integer code in train and in test. Encoding the
# two frames separately derives the codes from each frame's own categories,
# so the codes diverge whenever the sets do not contain exactly the same
# categories.
stacked = pd.concat(
    [X_train_full[features], X_test_full[features]],
    keys=["train", "test"],
)
stacked = encode_df(stacked)
encoded_train = stacked.loc["train"]
encoded_test = stacked.loc["test"]
for column in features:
    # Positional write-back (.to_numpy()) avoids any index re-alignment.
    X_train_full[column] = encoded_train[column].to_numpy()
    X_test_full[column] = encoded_test[column].to_numpy()

# Encode any object columns that are not predictors.
X_train_full = encode_df(X_train_full)
X_test_full = encode_df(X_test_full)

# Obtain target and predictors
y_train = X_train_full.status
X_train = X_train_full[features]
X_test = X_test_full[features]

# Create the model and train it
model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)
model.fit(X_train, y_train)

# Predict class probabilities for the test set
p_pred = model.predict_proba(X_test)

# Column 1 of predict_proba is the probability of the second entry of
# model.classes_; that value is exported per loan.
resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:, 1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/logisticRegression.csv", index=False)

## Random Forest

In [62]:
# https://stackoverflow.com/questions/30814231/using-the-predict-proba-function-of-randomforestclassifier-in-the-safe-and-rig
# https://rpmcruz.github.io/machine%20learning/2018/02/09/probabilities-trees.html

from sklearn.ensemble import RandomForestClassifier

# Build a 10000-tree forest with no depth limit and a fixed seed, then fit
# it on the training predictors and target.
clf = RandomForestClassifier(
    n_estimators=10000,
    max_depth=None,
    random_state=0,
)
clf.fit(X_train, y_train)

# Per-class probabilities for every test row.
p_pred = clf.predict_proba(X_test)

# Export the loan id together with the second probability column.
resultData = {
    'Id': X_test_full['loan_id'],
    'Predicted': p_pred[:, 1],
}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/randomForest.csv", index=None)

array([[0.0339, 0.9661],
       [0.5562, 0.4438],
       [0.0326, 0.9674],
       [0.0321, 0.9679],
       [0.3793, 0.6207],
       [0.0148, 0.9852],
       [0.1194, 0.8806],
       [0.1639, 0.8361],
       [0.089 , 0.911 ],
       [0.0721, 0.9279],
       [0.1302, 0.8698],
       [0.13  , 0.87  ],
       [0.4881, 0.5119],
       [0.1673, 0.8327],
       [0.1141, 0.8859],
       [0.4313, 0.5687],
       [0.0469, 0.9531],
       [0.221 , 0.779 ],
       [0.107 , 0.893 ],
       [0.0211, 0.9789],
       [0.1434, 0.8566],
       [0.0653, 0.9347],
       [0.0545, 0.9455],
       [0.2056, 0.7944],
       [0.322 , 0.678 ],
       [0.0274, 0.9726],
       [0.0819, 0.9181],
       [0.1164, 0.8836],
       [0.0899, 0.9101],
       [0.0884, 0.9116],
       [0.0427, 0.9573],
       [0.038 , 0.962 ],
       [0.0752, 0.9248],
       [0.2268, 0.7732],
       [0.0403, 0.9597],
       [0.0133, 0.9867],
       [0.2857, 0.7143],
       [0.0423, 0.9577],
       [0.066 , 0.934 ],
       [0.0797, 0.9203],
