## Logistic Regression
Logistic regression is a machine-learning classification algorithm used to predict the probability that a categorical dependent variable takes a particular value. In binary logistic regression, the dependent variable is coded as 1 (yes, success, etc.) or 0 (no, failure, etc.). In other words, the logistic regression model predicts P(Y=1) as a function of X.

In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import pandas as pd

def encode_df(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each column selected by ``df.select_dtypes(include=['object'])`` is
    replaced by the integer codes returned by ``LabelEncoder.fit_transform``.
    A column that cannot be encoded is reported on stdout and left unchanged.

    The encoder is refitted for every column on every call, so the integer
    given to a category depends only on the values present in the frame that
    is passed in.

    Args:
        df: pandas DataFrame to encode. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    encoder = LabelEncoder()
    for feature in df.select_dtypes(include=['object']).columns:
        try:
            # fit_transform refits from scratch, so one instance can be reused.
            df[feature] = encoder.fit_transform(df[feature])
        except (TypeError, ValueError) as exc:
            # Best effort: keep going with the remaining columns. Only the
            # encoder's own input errors are caught (e.g. a column mixing
            # strings with missing values), so interrupts and unrelated bugs
            # are no longer swallowed. format() also copes with non-string
            # column labels, which '+' concatenation would not.
            print('Error encoding {}: {}'.format(feature, exc))
    return df

# Load the preprocessed training and test sets.
X_train_full = pd.read_csv("./preprocessed/mergedTrainData.csv")
X_test_full = pd.read_csv("./preprocessed/mergedTestData.csv")

# Encode the categorical columns shared by both sets in ONE pass over the
# stacked rows. encode_df refits its encoder on whatever frame it receives, so
# encoding the two sets separately gives the same category different integer
# codes whenever the sets do not contain exactly the same category values.
test_object_columns = set(X_test_full.select_dtypes(include=['object']).columns)
shared_categoricals = [
    column
    for column in X_train_full.select_dtypes(include=['object']).columns
    if column in test_object_columns
]
if shared_categoricals:
    n_train = len(X_train_full)
    stacked = encode_df(
        pd.concat(
            [X_train_full[shared_categoricals], X_test_full[shared_categoricals]],
            ignore_index=True,
        )
    )
    for column in shared_categoricals:
        # to_numpy() drops the stacked index so rows are assigned by position.
        X_train_full[column] = stacked[column].iloc[:n_train].to_numpy()
        X_test_full[column] = stacked[column].iloc[n_train:].to_numpy()

# Encode whatever object columns remain (those not shared by both sets).
X_train_full = encode_df(X_train_full)
X_test_full = encode_df(X_test_full)

# Obtain target and predictors.
# NOTE(review): the names (including trailing spaces and spellings) presumably
# mirror the CSV headers exactly - do not "tidy" them without checking the data.
features = [
    "years_since_loan",
    "amount",
    "duration",
    "payments",
    "balance",
    "frequency",
    "years_since_acc_open",
    "region",
    "no. of inhabitants",
    "no. of municipalities with inhabitants < 499 ",
    "no. of municipalities with inhabitants 500-1999",
    "no. of municipalities with inhabitants > 2000",
    "no. of cities ",
    "ratio of urban inhabitants ",
    "average salary ",
    "unemployment_rate",
    "no. of enterpreneurs per 1000 inhabitants ",
    "no. of commited crimes",
    "type",
    "gender",
    "age_group",
]

y_train = X_train_full.status
X_train = X_train_full[features]
X_test = X_test_full[features]

# Create the model and fit it on the training data.
model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)
model.fit(X_train, y_train)

# Predict class probabilities for the test set (no evaluation happens here).
p_pred = model.predict_proba(X_test)

# Column 1 of predict_proba is the probability of model.classes_[1].
# NOTE(review): confirm that this is the class the submission expects.
resultData = {'Id': X_test_full['loan_id'], 'Predicted': p_pred[:, 1]}
result = pd.DataFrame(data=resultData)
result.to_csv("./results/newLogisticRegression.csv", index=False)