In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [2]:
# Let pandas render up to 50 columns before it starts eliding the middle of wide frames.
pd.options.display.max_columns = 50

In [3]:
# Load the competition train/test CSVs and stack them into a single frame so the
# encoding steps that follow are applied to both sets in exactly the same way.
INPUT_DIR = '/kaggle/input/playground-series-s4e10'

train_data = pd.read_csv(f'{INPUT_DIR}/train.csv')
test_data = pd.read_csv(f'{INPUT_DIR}/test.csv')

# Number of training rows; the combined frame is cut back into train/test at this offset.
len_train_data = len(train_data)

# ignore_index=True renumbers the rows 0..N-1, so the training rows keep labels
# 0..len_train_data-1 and the test rows follow directly after them.
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
print(len_train_data, len(test_data))

58645 39098


In [4]:
# Cast loan_grade to str so the dictionary lookup that encodes it works on plain string keys.
data = data.astype({'loan_grade': str})

In [5]:
# Ordinal score per letter grade: position in this string is the score,
# so 'G' -> 0 up to 'A' -> 6.
grade_order = 'GFEDCBA'
grade_mapping = {grade: score for score, grade in enumerate(grade_order)}

# Series.map looks every grade up in the dict; a value that is not a key becomes NaN.
data['loan_grade_encoded'] = data['loan_grade'].map(grade_mapping)

In [6]:
# One-hot encode the nominal categorical columns; dtype=float yields 0.0/1.0 indicators.
categorical_columns = [
    'person_home_ownership',
    'loan_intent',
    'cb_person_default_on_file',
]
data = pd.get_dummies(data, columns=categorical_columns, dtype=float)
data.columns

Index(['id', 'person_age', 'person_income', 'person_emp_length', 'loan_grade',
       'loan_amnt', 'loan_int_rate', 'loan_percent_income',
       'cb_person_cred_hist_length', 'loan_status', 'loan_grade_encoded',
       'person_home_ownership_MORTGAGE', 'person_home_ownership_OTHER',
       'person_home_ownership_OWN', 'person_home_ownership_RENT',
       'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION',
       'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL',
       'loan_intent_PERSONAL', 'loan_intent_VENTURE',
       'cb_person_default_on_file_N', 'cb_person_default_on_file_Y'],
      dtype='object')

In [7]:
# The letter grade has been replaced by loan_grade_encoded, so drop the raw column.
data = data.drop(columns=['loan_grade'])

In [8]:
# Cut the combined frame back apart. Its rows are labelled 0..N-1 (the concat used
# ignore_index=True), so the first len_train_data positions are the training rows
# and everything after them is the test set, taken without the loan_status column.
is_feature_column = data.columns != 'loan_status'
dataset_train = data.iloc[:len_train_data, :]
dataset_test = data.iloc[len_train_data:, is_feature_column]
print(f"{len(dataset_train)} {len(dataset_test)}")

58645 39098


In [9]:
# Preview the training split; the notebook renders only the first and last rows of the frame.
dataset_train

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status,loan_grade_encoded,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,0,37,35000,0.0,6000,11.49,0.17,14,0.0,5,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,22,56000,6.0,4000,13.35,0.07,2,0.0,4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,2,29,28800,8.0,6000,8.90,0.21,10,0.0,6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,3,30,70000,14.0,12000,11.11,0.17,5,0.0,5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,4,22,60000,2.0,6000,6.92,0.10,3,0.0,6,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58640,58640,34,120000,5.0,25000,15.95,0.21,10,0.0,3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
58641,58641,28,28800,0.0,10000,12.73,0.35,8,1.0,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
58642,58642,23,44000,7.0,6800,16.00,0.15,2,1.0,3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
58643,58643,22,30000,2.0,5000,8.90,0.17,3,0.0,6,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
# Separate the target column from the predictors.
# NOTE(review): 'id' stays among the predictors; it looks like a row identifier
# rather than a real feature -- confirm whether it should be excluded.
target_column = 'loan_status'
y = dataset_train[target_column]
X = dataset_train.drop(columns=[target_column])

In [11]:
# Standardise every predictor with a scaler fitted on the training rows.
#
# The inputs are read from dataset_train / dataset_test instead of from X,
# because this cell rebinds X to the scaled array. Reading X back in would make
# a second run of the cell fit the scaler on already-scaled data and leave the
# test rows effectively unscaled.
feature_columns = [col for col in dataset_train.columns if col != 'loan_status']

scaler = StandardScaler()
scaler.fit(dataset_train[feature_columns])
X = scaler.transform(dataset_train[feature_columns])

# Select the test columns by name, in the same order as the fit data, and pass a
# DataFrame so scikit-learn can check the feature names against the fitted ones.
dataset_test_scaled = scaler.transform(dataset_test[feature_columns])

# NOTE(review): the scaler is fitted on all training rows before the hold-out split
# in the next cell, so the validation rows influence the scaling statistics.
# Consider fitting it on the training part of the split only.



In [12]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the split repeatable.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

In [13]:
# Baseline model: logistic regression with default settings, evaluated on the hold-out rows.
log_reg_model = LogisticRegression().fit(X_train, y_train)
y_pred = log_reg_model.predict(X_test)

In [14]:
# Per-class precision / recall / F1 for the predictions currently held in y_pred.
log_reg_report = classification_report(y_test, y_pred)
print(log_reg_report)

              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95     10087
         1.0       0.76      0.46      0.57      1642

    accuracy                           0.90     11729
   macro avg       0.84      0.72      0.76     11729
weighted avg       0.89      0.90      0.89     11729



In [15]:
# Hard class labels for the scaled competition test rows, from the logistic-regression model.
y_pred_test = log_reg_model.predict(X=dataset_test_scaled)

In [16]:
# Encode the predicted classes as the strings 'True' / 'False' for the submission file.
#
# The labels come straight from the model rather than from y_pred_test itself:
# mapping y_pred_test onto itself is not repeatable, because on a second run the
# values are already strings, none of them equals 1, and every row becomes 'False'.
y_pred_test = [
    'True' if label == 1 else 'False'
    for label in log_reg_model.predict(dataset_test_scaled)
]
# NOTE(review): loan_status is numeric (0.0 / 1.0) in the training data while the
# submission carries strings -- confirm the expected submission format.

In [17]:
# Submission frame: the test ids with the predicted loan_status next to them.
result_df = dataset_test[['id']].assign(loan_status=y_pred_test)

In [18]:
# Write the logistic-regression submission; index=False keeps the row index out of the file.
submission_path = 'result_loan_pred.csv'
result_df.to_csv(submission_path, index=False)

In [19]:
# XGBoost classifier with n_estimators=200, trained and evaluated on the same split as the baseline.
xgb_classifier_model = XGBClassifier(n_estimators=200).fit(X_train, y_train)
y_pred = xgb_classifier_model.predict(X_test)

In [20]:
# Per-class precision / recall / F1 for the predictions currently held in y_pred.
xgb_report = classification_report(y_test, y_pred)
print(xgb_report)

              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97     10087
         1.0       0.89      0.75      0.81      1642

    accuracy                           0.95     11729
   macro avg       0.93      0.87      0.89     11729
weighted avg       0.95      0.95      0.95     11729



In [21]:

# Predict the scaled test rows with XGBoost, then encode class 1 as 'True' and anything else as 'False'.
xgb_test_labels = xgb_classifier_model.predict(dataset_test_scaled)
y_pred_xgb_test = np.where(xgb_test_labels == 1, 'True', 'False').tolist()

In [22]:
# Submission frame for the XGBoost model: the test ids with the predicted loan_status.
result_df_xgb = dataset_test[['id']].assign(loan_status=y_pred_xgb_test)

In [23]:
# Write the XGBoost submission; index=False keeps the row index out of the file.
xgb_submission_path = 'result_loan_pred_xgb.csv'
result_df_xgb.to_csv(xgb_submission_path, index=False)

In [24]:
## TODO:
# 1. Learn to perform proper EDA on all features of the input data.
# 2. Predict class probabilities instead of hard labels, and use the ROC curve to choose the best threshold.
# 3. Perform hyperparameter tuning.
# 4. Try LightGBM and CatBoost as well.