In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')
from sklearn.metrics import roc_auc_score, precision_score,recall_score,accuracy_score
from sklearn.model_selection import train_test_split

In [71]:
df = pd.read_csv('Data/train.csv') 

In [72]:
num_cols = [col for col in df.columns if df[col].dtype in ('int64','double','float')]
cat_cols = [col for col in df.columns if df[col].dtype =='object']

In [73]:
df.describe().round(2)

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0
mean,29322.0,27.55,64046.17,4.7,9217.56,10.68,0.16,5.81,0.14
std,16929.5,6.03,37931.11,3.96,5563.81,3.03,0.09,4.03,0.35
min,0.0,20.0,4200.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,14661.0,23.0,42000.0,2.0,5000.0,7.88,0.09,3.0,0.0
50%,29322.0,26.0,58000.0,4.0,8000.0,10.75,0.14,4.0,0.0
75%,43983.0,30.0,75600.0,7.0,12000.0,12.99,0.21,8.0,0.0
max,58644.0,123.0,1900000.0,123.0,35000.0,23.22,0.83,30.0,1.0


In [74]:
for col in ('person_age','person_income','person_emp_length','cb_person_cred_hist_length'):
    print("Applying log to",col)
    df[col] = df[col].replace(0,1)
    df[col] = np.log10(df[col])
    df[col] = df[col].replace(np.nan,0)


Applying log to person_age
Applying log to person_income
Applying log to person_emp_length
Applying log to cb_person_cred_hist_length


In [75]:
df.describe().round(2)

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0
mean,29322.0,1.43,4.76,0.54,9217.56,10.68,0.16,0.67,0.14
std,16929.5,0.08,0.2,0.37,5563.81,3.03,0.09,0.27,0.35
min,0.0,1.3,3.62,0.0,500.0,5.42,0.0,0.3,0.0
25%,14661.0,1.36,4.62,0.3,5000.0,7.88,0.09,0.48,0.0
50%,29322.0,1.41,4.76,0.6,8000.0,10.75,0.14,0.6,0.0
75%,43983.0,1.48,4.88,0.85,12000.0,12.99,0.21,0.9,0.0
max,58644.0,2.09,6.28,2.09,35000.0,23.22,0.83,1.48,1.0


In [76]:
for col in cat_cols:
    df[col] = df[col].astype('category')

In [77]:
X = df.drop(columns=['id','loan_status'],axis=1)
y = df['loan_status']

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [79]:
print('X_train shape: ',X_train.shape)
print('y_train shape: ',y_train.shape)
print('X_test shape: ',X_test.shape)
print('y_test shape: ',y_test.shape)

X_train shape:  (41051, 11)
y_train shape:  (41051,)
X_test shape:  (17594, 11)
y_test shape:  (17594,)


In [80]:
def modelevaluation(y_true,y_pred):
    roc_auc = roc_auc_score(y_true,y_pred)
    accuracy= accuracy_score(y_true,y_pred)
    precision = precision_score(y_true,y_pred)
    recall = recall_score(y_true,y_pred)
    return roc_auc,accuracy,precision,recall

def print_evaluation(y_true,y_pred):
    roc_auc,accuracy,precision,recall = modelevaluation(y_true,y_pred)
    print('Roc_Auc: ',roc_auc)
    print('accuracy: ',accuracy)
    print('precision: ',precision)
    print('recall: ',recall)

In [81]:
model = LGBMClassifier()

In [84]:
model.fit(X_train,y_train,categorical_feature=cat_cols)

TypeError: fit() got an unexpected keyword argument 'max_cat_threshold'

In [68]:
y_pred = model.predict_proba(X_test)
y_img = model.predict(X_test)
y_p = np.argmax(y_pred, axis=1)

In [69]:
print_evaluation(y_test,y_img)

Roc_Auc:  0.8592135623164125
accuracy:  0.9526543139706718
precision:  0.916030534351145
recall:  0.7293354943273906


In [30]:
y_pred.shape

(17594, 2)

In [41]:
test_df = pd.read_csv('Data/test.csv') 

In [42]:
for col in cat_cols:
    test_df[col] = test_df[col].astype('category')

In [43]:
y_pred = model.predict_proba(test_df.drop(columns=['id'],axis=1))

In [45]:
y_pred_class_1 = y_pred[:, 1]

In [46]:
final_df = pd.DataFrame({
    'id': test_df['id'],
    'loan_status': y_pred_class_1
})

In [47]:
print(final_df)

          id  loan_status
0      58645     0.975091
1      58646     0.016753
2      58647     0.578039
3      58648     0.011775
4      58649     0.057694
...      ...          ...
39093  97738     0.070686
39094  97739     0.008185
39095  97740     0.010476
39096  97741     0.200295
39097  97742     0.918237

[39098 rows x 2 columns]


In [48]:
final_df.to_csv('Data/final_predictions.csv', index=False)