In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

In [2]:
train = pd.read_csv(r"C:\Users\kpehl\Desktop\train.csv")
test = pd.read_csv(r"C:\Users\kpehl\Desktop\test.csv")

In [3]:
train = train.replace(['Y'],1)
train = train.replace(['N'],0)
test = test.replace(['Y'],1)
test = test.replace(['N'],0)

In [4]:
y=train["default"]
X1=train.drop(columns=["default","loan_application_id","loan_amount"])
scaler = StandardScaler().fit(X1)
X1 = scaler.transform(X1)

In [5]:
log = LogisticRegression(solver = 'lbfgs', penalty = 'none', class_weight = "balanced", max_iter = 10000)
log.fit(X1, y)
preds = log.predict_proba(X1)[:,1]
print(roc_auc_score(y, preds))

0.8324955914934664


In [6]:
w = log.coef_

In [7]:
X = pd.DataFrame(X1)

In [8]:
X["loan_amount"]=train["loan_amount"]

In [9]:
y_actual=train["default"]

In [10]:
def loss(w, X, y_actual, mu):
    y_actual=y_actual.values.reshape(-1,1)
    y_pred = (sigmoid(X.drop(columns=["loan_amount"])@w.reshape(-1,1))).to_numpy()
    amounts=X["loan_amount"].values.reshape(-1,1)
    
    return (sum(np.square(y_pred-y_actual)*(y_pred*amounts*0.15+y_actual*amounts))).item() + mu*np.linalg.norm(w.transpose().flatten())

def gradient(w, X, y_actual, mu):
    y_actual=y_actual.values.reshape(-1,1)
    y_pred = (sigmoid(X.drop(columns=["loan_amount"])@w.reshape(-1,1))).to_numpy()
    amounts=X["loan_amount"].values.reshape(-1,1)
    grad_y_pred = (sigmoidgrad(X.drop(columns=["loan_amount"])@w.reshape(-1,1))).to_numpy()
    
    return ((((2*((y_pred-y_actual)*grad_y_pred)*((y_pred*amounts)*0.15+(y_actual*amounts))
 + (((np.square(y_pred-y_actual)*grad_y_pred)*amounts)*0.15)).transpose()@X.drop(columns=["loan_amount"])).transpose()).to_numpy().flatten()
+ (2*mu*w).transpose().flatten())

def sigmoid(x):
    k=40
    return (1/(1+np.exp(-2*k*(x-0.5))))

def sigmoidgrad(x):
    return (sigmoid(x)*(1-sigmoid(x)))

def gradientdescent(X,y_actual,initial_w,mu,step_length):
    w=initial_w
    print("Loss",loss(X, y_actual, w, mu))
    for i in range(1000):
        w=w-(step_length*gradient(X, y_actual, w, mu))
        print("Gradient length",np.linalg.norm(gradient(X, y_actual, w, mu)))
        print("Loss",loss(X, y_actual, w, mu))
        if (np.linalg.norm(gradient(X, y_actual, w, mu))<25):
            break
    return w


In [11]:
np.linalg.norm(w.transpose().flatten())

11.888055991689694

In [12]:
np.linalg.norm(w)

11.888055991689694

In [13]:
loss(w.transpose().flatten(),X,y_actual,0)

  return (1/(1+np.exp(-2*k*(x-0.5))))


323360.7688609789

In [14]:
from scipy.optimize import minimize
res = minimize(loss, w.transpose(), method='BFGS', jac=gradient, args = (X, y_actual,0))
res

  return (1/(1+np.exp(-2*k*(x-0.5))))


      fun: 236122.1416748745
 hess_inv: array([[ 2.53614808e-05,  8.50775241e-05, -1.39786083e-05, ...,
         6.28369243e-05,  1.70761869e-04,  1.74866588e-05],
       [ 8.50775241e-05,  6.12540566e-04, -5.69731858e-05, ...,
         3.05000789e-04,  1.18972017e-03,  9.78908280e-05],
       [-1.39786083e-05, -5.69731858e-05,  3.94908621e-05, ...,
        -3.49312444e-05, -1.22092134e-04, -8.05726974e-06],
       ...,
       [ 6.28369243e-05,  3.05000789e-04, -3.49312444e-05, ...,
         2.21692740e-04,  6.93242946e-04,  6.43432942e-05],
       [ 1.70761869e-04,  1.18972017e-03, -1.22092134e-04, ...,
         6.93242946e-04,  4.99496387e-03,  2.26537222e-04],
       [ 1.74866588e-05,  9.78908280e-05, -8.05726974e-06, ...,
         6.43432942e-05,  2.26537222e-04,  3.92513025e-05]])
      jac: array([-1.50029143e-06, -2.16868555e-06,  1.28833710e-06, -1.49184643e-07,
        1.37065975e-07,  5.31207846e-07, -1.53016481e-06, -1.29321583e-06,
       -3.23830648e-07, -2.74954726e-06, -

In [15]:
X_test=test.drop(columns=["default","loan_application_id","loan_amount"])

In [16]:
scaler = StandardScaler().fit(X_test)
X_test = scaler.transform(X_test)

In [17]:
sig=sigmoid(X_test@res.x)

  return (1/(1+np.exp(-2*k*(x-0.5))))


In [18]:
predictions=[]
for i in range(len(sig)):
    if(sig[i]<0.5):
        predictions.append(0)
    if(sig[i]>=0.5):
        predictions.append(1)
predictions        

[0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,


In [19]:
test["predictions"]=predictions

In [28]:
test_predictions=test[["loan_application_id","predictions"]]

In [30]:
test_predictions.to_csv(r'C:\Users\kpehl\Desktop\test_predictions.csv',index=False)