In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Raise pandas' display limits so wide and long frames are rendered in full.
for option_name, limit in (('display.max_columns', 1000), ('display.max_rows', 2000)):
    pd.set_option(option_name, limit)

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import *

In [2]:
# Read the three input CSVs in one pass: the training frame, the test frame
# and the sample-submission file.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ML_HIRING/train.csv',
        'ML_HIRING/test.csv',
        'ML_HIRING/sample_submission.csv',
    )
)

In [3]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,origination_date,first_payment_date,loan_to_value,number_of_borrowers,debt_to_income_ratio,borrower_credit_score,loan_purpose,insurance_percent,co-borrower_credit_score,insurance_type,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12
0,1,Y,Browning-Hart,3.875,417000,360,01/02/12,Apr-12,75,1,20,790,A23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,X,OTHER,4.5,113000,360,01/02/12,Apr-12,80,2,33,793,C86,0,784,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,Y,OTHER,4.5,72000,360,01/01/12,Mar-12,75,1,34,710,C86,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,X,"Miller, Mcclure and Allen",4.125,123000,180,01/02/12,Apr-12,41,2,24,798,A23,0,813,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,X,Browning-Hart,3.25,166000,180,01/02/12,Apr-12,53,2,12,767,A23,0,768,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# (rows, columns) of the training frame as loaded.
train.shape

(116058, 29)

In [19]:
# Converting loan term to months and years
#
# The multipliers are approximately 1/365 (0.00273973) and 12/365 (0.0328767).
# NOTE(review): that treats loan_term as a day count, while the sample rows
# show values such as 180 and 360 -- confirm the intended unit before changing
# the factors.
#
# Each frame derives its helper columns from its OWN loan_term. Computing the
# test columns from train.loan_term would align on index labels and assign
# the terms of unrelated training loans to the test rows.
for frame in (train, test):
    frame['loanterm_Years'] = frame.loan_term * 0.00273973
    frame['loanterm_Monthly'] = frame.loan_term * 0.0328767

In [20]:
# Derive the monthly instalment (EMI) and the total amount payable for both
# frames, then remove the helper, identifier and date columns.
for frame in (train, test):
    monthly_rate = frame.interest_rate / 1200
    growth = (1 + monthly_rate) ** frame.loanterm_Monthly

    # Annuity formula: P * r * (1 + r)^n / ((1 + r)^n - 1)
    frame['EMI'] = (frame.unpaid_principal_bal * monthly_rate) * growth / (growth - 1)

    # Total amount payable over the term
    frame['Total_amount'] = frame.EMI * frame.loanterm_Monthly

    # The term helper columns are only needed for the two features above
    frame.drop(columns=['loanterm_Monthly', 'loanterm_Years'], inplace=True)

    # Identifier and date columns are removed from the frame
    frame.drop(columns=['loan_id', 'origination_date', 'first_payment_date'], inplace=True)

In [21]:
# Aggregate and ratio features, built the same way on train and test.
# The numeric tags in the trailing comments are carried over from the
# original cell.
for frame in (train, test):
    # months 1: 12 provide some insights so adding all of them
    frame['Sum_all_months'] = frame.loc[:, 'm1':'m12'].sum(axis=1)                      ## 13

    frame['debt_value'] = frame.debt_to_income_ratio * frame.loan_to_value              ### 6
    frame['loan_debt_ratio'] = frame.unpaid_principal_bal / frame.debt_value

    # Disabled feature, kept for reference:
    # frame['EMI_la'] = frame.EMI / frame.debt_value                                    ### 12

    frame['EMI_lala'] = frame.EMI / frame.loan_debt_ratio                               # 1
    frame['total_by_debtvalue'] = frame.Total_amount / frame.debt_value

    frame['balance_value_ratio'] = frame.unpaid_principal_bal / frame.loan_to_value     ### 2
    frame['balance_debt_ratio'] = frame.unpaid_principal_bal / frame.debt_to_income_ratio  ### 9

In [22]:
# mean encoding
#
# Replace each loan_to_value level with the mean of the m13 target observed
# for that level in train, scaled by 100.
# NOTE(review): the train column is encoded with means computed from the same
# rows' own target values.
value = train.groupby('loan_to_value').m13.mean()
train['encod_loanvalue'] = train.loan_to_value.map(value) * 100                   ####  5

# Levels that never occur in train map to NaN; encode them as 0.
# The filled result is assigned back to the frame instead of calling
# fillna(inplace=True) on test.encod_loanvalue: an inplace call on a selected
# column is chained assignment, which pandas warns about and which does not
# update the frame when Copy-on-Write is enabled.
test['encod_loanvalue'] = (test.loan_to_value.map(value) * 100).fillna(0)

In [23]:
# Remove number_of_borrowers from both frames.
for frame in (train, test):
    frame.drop(columns='number_of_borrowers', inplace=True)

In [24]:
# Two further ratios built from the EMI and debt_value columns; the second
# one divides by the first, so the order matters.
for frame in (train, test):
    frame['EMI_by_debtvalue'] = frame.EMI / frame.debt_value
    frame['balance_EMI_by_debtvalue'] = frame.balance_debt_ratio / frame.EMI_by_debtvalue

In [11]:
# one-hot encoding categorical variables
#
# Each categorical column is replaced by its indicator columns, which are
# prepended to the frame (so the column encoded last ends up left-most).
for column in ('source', 'financial_institution', 'loan_purpose'):
    train = pd.concat([pd.get_dummies(train[column]), train], axis=1).drop(column, axis=1)
    test = pd.concat([pd.get_dummies(test[column]), test], axis=1).drop(column, axis=1)

# train and test are encoded independently, so a category that occurs in only
# one of them would leave the two frames with different columns. Align test to
# the training feature columns (everything except the m13 target): categories
# missing from test become all-zero columns, and categories that never occur
# in train are dropped because the model has no feature for them.
test = test.reindex(columns=train.columns.drop('m13'), fill_value=0)

In [26]:
# Preview the first five rows of the training frame.
# NOTE(review): the saved output still lists the raw source /
# financial_institution / loan_purpose columns, so it appears to predate the
# one-hot encoding cell -- re-run the notebook top to bottom to refresh it.
train.head(n=5)

Unnamed: 0,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,loan_to_value,debt_to_income_ratio,borrower_credit_score,loan_purpose,insurance_percent,co-borrower_credit_score,insurance_type,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,EMI,Total_amount,Sum_all_months,debt_value,loan_debt_ratio,EMI_lala,total_by_debtvalue,balance_value_ratio,balance_debt_ratio,encod_loanvalue,EMI_by_debtvalue,balance_EMI_by_debtvalue
0,Z,"Turner, Baldwin and Rhodes",4.25,214000,360,95,22.0,694.0,C86,30.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,1,18494.62601,218895.217538,1,2090.0,102.392344,180.625086,104.734554,2252.631579,9727.272727,0.685871,8.849103,1099.238232
1,Y,"Swanson, Newton and Miller",4.875,144000,360,72,44.0,697.0,B12,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,1,12486.207249,147781.904347,1,3168.0,45.454545,274.696559,46.648328,2000.0,3272.727273,0.538117,3.941353,830.356232
2,Z,Thornton-Davis,3.25,366000,180,49,33.0,780.0,B12,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,62427.906274,369436.238314,0,1617.0,226.345083,275.808537,228.470154,7469.387755,11090.909091,0.371747,38.60724,287.275372
3,X,OTHER,4.75,135000,360,46,44.0,633.0,B12,0.0,638.0,0.0,0,0,0,0,0,0,0,0,1,1,1,1,1,11698.083856,138453.981658,4,2024.0,66.699605,175.384605,68.406117,2934.782609,3068.181818,0.21164,5.779686,530.856171
4,X,OTHER,4.75,124000,360,80,43.0,681.0,C86,0.0,0.0,0.0,0,1,2,3,4,5,6,7,8,9,10,11,1,10744.906653,127172.546116,66,3440.0,36.046512,298.084507,36.968763,1550.0,2883.72093,0.693078,3.123519,923.228123


In [12]:
# Separate the m13 target from the remaining columns, which form the
# feature matrix.
Y = train['m13']
X = train.drop(columns=['m13'])

In [None]:
# Fit a LightGBM classifier on the full training frame; class 1 is weighted
# 11x relative to class 0.
Model = LGBMClassifier(random_state=22, class_weight={0: 1, 1: 11}, learning_rate=0.05)
Model.fit(X, Y)

# Score test on exactly the training feature columns, in training order.
# Passing the whole frame relies on test happening to share X's column layout,
# and once the m13 column below has been added, re-running this cell would
# feed the previous predictions back in as an extra feature.
sss = Model.predict(test[X.columns])

# Store the predicted labels next to the features and write the frame out.
test["m13"] = sss
test.to_csv('loan.csv', index=False)
# sub.m13.value_counts()