In [37]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report


In [21]:
train = pd.read_csv('data/train.csv', index_col=0)
test = pd.read_csv('data/test.csv', index_col=0)
X = train.drop(columns=['loan_status'])
y = train['loan_status']
train.sample(5)
# train['loan_intent'].value_counts()


Unnamed: 0_level_0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1160,24,69000,RENT,0.0,DEBTCONSOLIDATION,D,3800,15.28,0.06,Y,2,1
33391,34,35000,RENT,16.0,DEBTCONSOLIDATION,B,10000,10.25,0.29,N,9,0
19433,32,75000,MORTGAGE,0.0,HOMEIMPROVEMENT,A,7500,8.38,0.1,N,8,0
46250,24,37000,OWN,0.0,EDUCATION,D,8000,15.31,0.22,Y,2,0
44541,22,27000,RENT,5.0,MEDICAL,A,4000,6.76,0.15,N,2,0


In [22]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41051 entries, 52748 to 56422
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  41051 non-null  int64  
 1   person_income               41051 non-null  int64  
 2   person_home_ownership       41051 non-null  object 
 3   person_emp_length           41051 non-null  float64
 4   loan_intent                 41051 non-null  object 
 5   loan_grade                  41051 non-null  object 
 6   loan_amnt                   41051 non-null  int64  
 7   loan_int_rate               41051 non-null  float64
 8   loan_percent_income         41051 non-null  float64
 9   cb_person_default_on_file   41051 non-null  object 
 10  cb_person_cred_hist_length  41051 non-null  int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 3.8+ MB


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train


Unnamed: 0_level_0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
52748,25,60000,RENT,0.0,PERSONAL,A,3500,9.32,0.06,N,4
7648,38,41033,MORTGAGE,2.0,DEBTCONSOLIDATION,C,5700,12.99,0.14,N,14
4217,28,30000,RENT,0.0,MEDICAL,A,14500,9.63,0.47,N,7
37793,23,65000,RENT,7.0,EDUCATION,A,2000,5.42,0.03,N,3
28209,26,49000,RENT,4.0,HOMEIMPROVEMENT,D,5500,15.21,0.11,Y,2
...,...,...,...,...,...,...,...,...,...,...,...
54343,45,60000,RENT,1.0,DEBTCONSOLIDATION,B,15000,10.38,0.25,N,16
38158,22,70000,MORTGAGE,1.0,MEDICAL,A,10000,6.62,0.14,N,3
860,32,90000,MORTGAGE,0.0,PERSONAL,A,3350,5.42,0.03,N,6
15795,27,110000,MORTGAGE,2.0,EDUCATION,C,3000,13.23,0.03,Y,9


In [34]:
num_cols = ['person_age','person_income','person_emp_length','loan_amnt','loan_int_rate','loan_percent_income']
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scalar', StandardScaler())
])

cat_cols = ['person_home_ownership','loan_intent','loan_grade','cb_person_default_on_file','cb_person_cred_hist_length']
cat_transformers = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num',num_transformer, num_cols),
    ('cat', cat_transformers, cat_cols)
])

models = {
    'RandomForest':{
        'model':RandomForestClassifier(random_state=42),
        'params':{
            'model__n_estimators':[50,100,200],
            'model__max_depth':[None, 10, 20],
            'model__min_samples_split':[2, 5, 10]
        }
    },
    'LogisticRegression':{
        'model':LogisticRegression(random_state=42, max_iter=1000),
        'params':{
            'model__C':[0.1,1,10],
            'model__solver':['lbfgs','liblinear']
        }
    },
    'SVC':{
        'model':SVC(random_state=42),
        'params':{
            'model__C':[0.1,1,10],
            'model__kernel':['linear','rbf']
        }
    }
}

best_model = None
best_score = 0

for name, model_info in models.items():
    pipe = Pipeline(steps=[
        ('preprocessor',preprocessor),
        ('model',model_info['model'])
    ])

    grid_search = GridSearchCV(pipe, model_info['params'], cv=5,scoring='accuracy', verbose=1,n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best cross-validation score for {name}: {grid_search.best_score_}")


    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_model = grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters for RandomForest: {'model__max_depth': None, 'model__min_samples_split': 10, 'model__n_estimators': 200}
Best cross-validation score for RandomForest: 0.9501109011099012
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters for LogisticRegression: {'model__C': 10, 'model__solver': 'liblinear'}
Best cross-validation score for LogisticRegression: 0.9127915272448014
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters for SVC: {'model__C': 10, 'model__kernel': 'rbf'}
Best cross-validation score for SVC: 0.942388792195372


In [40]:
y_pred = best_model.predict(X_test)
print('Best Model Performance on Test Set:')
print(classification_report(y_test, y_pred))

Best Model Performance on Test Set:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     15126
           1       0.93      0.71      0.80      2468

    accuracy                           0.95     17594
   macro avg       0.94      0.85      0.89     17594
weighted avg       0.95      0.95      0.95     17594



In [44]:
predictions = best_model.predict(test)
print(len(predictions))

39098
