In [11]:
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [12]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    LabelEncoder,
    OrdinalEncoder,
    StandardScaler,
    FunctionTransformer,
    MinMaxScaler,
)
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.metrics import precision_score, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn import pipeline
from sklearn.feature_selection import f_classif
from sklearn.svm import LinearSVC
import joblib

pd.set_option(
    "display.max_colwidth", 1000, "display.max_rows", None, "display.max_columns", None
)

import warnings

warnings.filterwarnings("ignore")
print("Setup complete")

Setup complete


<IPython.core.display.Javascript object>

In [13]:
def empty_rows(df):
    """Delete rows wit Null values"""
    null_rows = df[df.iloc[:, 1:].isnull().all(axis=1)].index
    df.drop(null_rows, inplace=True)

<IPython.core.display.Javascript object>

In [14]:
def missing_values_summary(df):
    """function gives summary of all features with missing values"""
    missing_values = df.isnull().sum()
    missing_values_percent = 100 * df.isnull().sum() / len(df)
    missing_values_type = df.dtypes
    missing_values_table = pd.concat(
        [missing_values, missing_values_percent, missing_values_type], axis=1
    )
    missing_values_table = missing_values_table.rename(
        columns={0: "Missing Values", 1: "% of Total Values", 2: "type"}
    )
    missing_values_table = (
        missing_values_table[missing_values_table.iloc[:, 1] != 0]
        .sort_values("% of Total Values", ascending=False)
        .round(4)
    )
    return missing_values_table

<IPython.core.display.Javascript object>

In [15]:
def single_value_features(df):
    """delete features that have only single value"""
    single_feature = []
    for column in list(df.columns):
        if df[column].unique().size <= 1:
            single_feature.append(column)
    return single_feature

<IPython.core.display.Javascript object>

In [16]:
def feature_correlation(X_train, y_train):
    """provides feature correlation list"""
    linear_dep = pd.DataFrame()
    for col in X_train.columns:
        linear_dep.loc[col, "pearson_corr"] = X_train[col].corr(y_train)
    linear_dep["abs_pearson_corr"] = abs(linear_dep["pearson_corr"])
    for col in X_train.columns:
        mask = X_train[col].notnull()
        (linear_dep.loc[col, "F"], linear_dep.loc[col, "p_value"]) = f_classif(
            pd.DataFrame(X_train.loc[mask, col]), y_train.loc[mask]
        )
    linear_dep.sort_values("abs_pearson_corr", ascending=False, inplace=True)
    linear_dep.drop("abs_pearson_corr", axis=1, inplace=True)
    linear_dep.reset_index(inplace=True)
    linear_dep.rename(columns={"index": "variable"}, inplace=True)

    return linear_dep

<IPython.core.display.Javascript object>

In [17]:
def replace_outliers_with_iqr(data, column_name, multiplier=1.5):
    """replacing outliers with upper and lower bound"""
    Q1 = data[column_name].quantile(0.25)
    Q3 = data[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    data[column_name] = data[column_name].apply(
        lambda x: lower_bound
        if x < lower_bound
        else (upper_bound if x > upper_bound else x)
    )

    return data

<IPython.core.display.Javascript object>

In [18]:
def encode_categorical(df, columns, mode_value):
    '''Encoding categorical value like OneHotEncoder'''
    encoded_dfs = []
    for column in columns:
        temp = pd.get_dummies(df[column], prefix=column, prefix_sep="__")
        if column + "__" + str(mode_value) in temp.columns:
            temp = temp.drop(column + "__" + str(mode_value), axis=1)
        temp = temp.astype(int)
        encoded_dfs.append(temp)

    df_encoded = pd.concat([df.drop(columns, axis=1)] + encoded_dfs, axis=1)
    return df_encoded


<IPython.core.display.Javascript object>

# LOAN Status prediction

In [19]:
df = pd.read_csv("accepted_2007_to_2018Q4.csv", low_memory=False)
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68407277,,debt_consolidation,Debt consolidation,190xx,PA,5.91,0.0,Aug-2003,675.0,679.0,1.0,30.0,,7.0,0.0,2765.0,29.7,13.0,w,0.0,0.0,4421.723917,4421.72,3600.0,821.72,0.0,0.0,0.0,Jan-2019,122.67,,Mar-2019,564.0,560.0,0.0,30.0,1.0,Individual,,,,0.0,722.0,144904.0,2.0,2.0,0.0,1.0,21.0,4981.0,36.0,3.0,3.0,722.0,34.0,9300.0,3.0,1.0,4.0,4.0,20701.0,1506.0,37.2,0.0,0.0,148.0,128.0,3.0,3.0,1.0,4.0,69.0,4.0,69.0,2.0,2.0,4.0,2.0,5.0,3.0,4.0,9.0,4.0,7.0,0.0,0.0,0.0,3.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68355089,,small_business,Business,577xx,SD,16.06,1.0,Dec-1999,715.0,719.0,4.0,6.0,,22.0,0.0,21470.0,19.2,38.0,w,0.0,0.0,25679.66,25679.66,24700.0,979.66,0.0,0.0,0.0,Jun-2016,926.35,,Mar-2019,699.0,695.0,0.0,,1.0,Individual,,,,0.0,0.0,204396.0,1.0,1.0,0.0,1.0,19.0,18005.0,73.0,2.0,3.0,6472.0,29.0,111800.0,0.0,0.0,6.0,4.0,9733.0,57830.0,27.1,0.0,0.0,113.0,192.0,2.0,2.0,4.0,2.0,,0.0,6.0,0.0,5.0,5.0,13.0,17.0,6.0,20.0,27.0,5.0,22.0,0.0,0.0,0.0,2.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68341763,,home_improvement,,605xx,IL,10.78,0.0,Aug-2000,695.0,699.0,0.0,,,6.0,0.0,7869.0,56.2,18.0,w,0.0,0.0,22705.924294,22705.92,20000.0,2705.92,0.0,0.0,0.0,Jun-2017,15813.3,,Mar-2019,704.0,700.0,0.0,,1.0,Joint App,71000.0,13.85,Not Verified,0.0,0.0,189699.0,0.0,1.0,0.0,4.0,19.0,10827.0,73.0,0.0,2.0,2081.0,65.0,14000.0,2.0,5.0,1.0,6.0,31617.0,2737.0,55.9,0.0,0.0,125.0,184.0,14.0,14.0,5.0,101.0,,10.0,,0.0,2.0,3.0,2.0,4.0,6.0,4.0,7.0,3.0,6.0,0.0,0.0,0.0,0.0,100.0,50.0,0.0,0.0,218418.0,18696.0,6200.0,14877.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,n,https://lendingclub.com/browse/loanDetail.action?loan_id=66310712,,debt_consolidation,Debt consolidation,076xx,NJ,17.06,0.0,Sep-2008,785.0,789.0,0.0,,,13.0,0.0,7802.0,11.6,17.0,w,15897.65,15897.65,31464.01,31464.01,19102.35,12361.66,0.0,0.0,0.0,Feb-2019,829.9,Apr-2019,Mar-2019,679.0,675.0,0.0,,1.0,Individual,,,,0.0,0.0,301500.0,1.0,1.0,0.0,1.0,23.0,12609.0,70.0,1.0,1.0,6987.0,45.0,67300.0,0.0,1.0,0.0,2.0,23192.0,54962.0,12.1,0.0,0.0,36.0,87.0,2.0,2.0,1.0,2.0,,,,0.0,4.0,5.0,8.0,10.0,2.0,10.0,13.0,5.0,13.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,0.0,381215.0,52226.0,62500.0,18000.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68476807,,major_purchase,Major purchase,174xx,PA,25.37,1.0,Jun-1998,695.0,699.0,3.0,12.0,,12.0,0.0,21929.0,64.5,35.0,w,0.0,0.0,11740.5,11740.5,10400.0,1340.5,0.0,0.0,0.0,Jul-2016,10128.96,,Mar-2018,704.0,700.0,0.0,,1.0,Individual,,,,0.0,0.0,331730.0,1.0,3.0,0.0,3.0,14.0,73839.0,84.0,4.0,7.0,9702.0,78.0,34000.0,2.0,1.0,3.0,10.0,27644.0,4567.0,77.5,0.0,0.0,128.0,210.0,4.0,4.0,6.0,4.0,12.0,1.0,12.0,0.0,4.0,6.0,5.0,9.0,10.0,7.0,19.0,6.0,12.0,0.0,0.0,0.0,4.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


<IPython.core.display.Javascript object>

Filtering data by loan status. Since first task is to predict if loan will payed (accepted) or risking to be charged off (rejected:)

In [10]:
df["loan_status"] = df["loan_status"].str.replace(
    "Does not meet the credit policy. Status:", ""
)
df = df[(df["loan_status"] == "Fully Paid") | (df["loan_status"] == "Charged Off")]
df["loan_status"] = np.where(df["loan_status"] == "Fully Paid", 1, 0)

NameError: name 'df' is not defined

<IPython.core.display.Javascript object>

Checking rows with all missing values and droping columns with missing data more than 30%:

In [11]:
empty_rows(df)

<IPython.core.display.Javascript object>

In [12]:
missing_values_summary(df)

Unnamed: 0,Missing Values,% of Total Values,type
member_id,1348059,100.0,float64
next_pymnt_d,1345310,99.7961,object
orig_projected_additional_accrued_interest,1344300,99.7212,float64
hardship_status,1342305,99.5732,object
hardship_length,1342305,99.5732,float64
hardship_type,1342305,99.5732,object
hardship_reason,1342305,99.5732,object
deferral_term,1342305,99.5732,float64
hardship_amount,1342305,99.5732,float64
hardship_start_date,1342305,99.5732,object


<IPython.core.display.Javascript object>

In [12]:
missing_vals = 100 * df.isnull().sum() / len(df)
drop_list = sorted(missing_vals[missing_vals > 30].index)
df.drop(labels=drop_list, axis=1, inplace=True)

<IPython.core.display.Javascript object>

In [13]:
single_value_feature = single_value_features(df)
print(single_value_feature)
df.drop(single_value_feature, axis=1, inplace=True)

['pymnt_plan', 'out_prncp', 'out_prncp_inv', 'policy_code', 'hardship_flag']


<IPython.core.display.Javascript object>

In [14]:
features_keep = [
    "loan_amnt",
    "term",
    "emp_length",
    "home_ownership",
    "annual_inc",
    "purpose",
    "addr_state",
    "dti",
    "earliest_cr_line",
    "fico_range_low",
    "fico_range_high",
    "open_acc",
    "revol_util",
    "total_acc",
    "application_type",
    "tot_cur_bal",
    "acc_open_past_24mths",
    "mo_sin_old_rev_tl_op",
    "mo_sin_rcnt_rev_tl_op",
    "mort_acc",
    "mths_since_recent_inq",
    "num_actv_bc_tl",
    "pub_rec_bankruptcies",
    "issue_d",
    "loan_status",
]
drop_list = [col for col in df.columns if col not in features_keep]
df.drop(labels=drop_list, axis=1, inplace=True)
df.shape

(1348059, 25)

<IPython.core.display.Javascript object>

Dealing with datatype columns and keeping only year, and creating new column for credit history in years and FICO score mean:

In [15]:
df["issue_d"] = pd.to_datetime(df["issue_d"], format="%b-%Y", errors="coerce")

<IPython.core.display.Javascript object>

In [16]:
df["earliest_cr_line"] = pd.to_datetime(
    df["earliest_cr_line"], format="%b-%Y", errors="coerce"
)
date_today = dt.datetime.now()
df["credit_history"] = (date_today - df["earliest_cr_line"]).dt.days // 365
df["credit_history"].fillna(0, inplace=True)
df["credit_history"] = df["credit_history"].astype(int)
df.drop("earliest_cr_line", axis=1, inplace=True)

<IPython.core.display.Javascript object>

In [17]:
df["fico_mean"] = df[["fico_range_low", "fico_range_high"]].mean(axis=1)
df.drop(["fico_range_low", "fico_range_high"], axis=1, inplace=True)

<IPython.core.display.Javascript object>

column term keeping only number:

In [18]:
df["term"] = df["term"].str.extract("(\d+)").astype("category")
df["emp_length"] = df["emp_length"].str.replace("+", "_more").str.replace("<", "less_")

<IPython.core.display.Javascript object>

to save memory space changing dtype from object to category:

In [19]:
df = df.apply(lambda x: x.astype("category") if x.dtype == "object" else x)

<IPython.core.display.Javascript object>

Creating sampled data based on issue date:

In [20]:
sample_data = df[df["issue_d"].dt.year == 2015]

sample_data.reset_index(drop=True, inplace=True)

print("Sample Data Shape:", sample_data.shape)

Sample Data Shape: (375545, 24)


<IPython.core.display.Javascript object>

In [21]:
sample_data["loan_status"].value_counts(normalize=True, dropna=False)

loan_status
1    0.798152
0    0.201848
Name: proportion, dtype: float64

<IPython.core.display.Javascript object>

In [22]:
X = sample_data.drop(["loan_status", "issue_d"], axis=1)
y = sample_data["loan_status"]

<IPython.core.display.Javascript object>

In [24]:
print("unique term : %d" % len(df["term"].unique()))
print("unique emp_length : %d" % len(df["emp_length"].unique()))
print("unique home_ownership : %d" % len(df["home_ownership"].unique()))
print("unique purpose : %d" % len(df["purpose"].unique()))
print("unique addr_state : %d" % len(df["addr_state"].unique()))
print("unique application_type : %d" % len(df["application_type"].unique()))

unique term : 2
unique emp_length : 12
unique home_ownership : 6
unique purpose : 14
unique addr_state : 51
unique application_type : 2


<IPython.core.display.Javascript object>

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
y_train.value_counts(), y_test.value_counts()

(loan_status
 1    239794
 0     60642
 Name: count, dtype: int64,
 loan_status
 1    59948
 0    15161
 Name: count, dtype: int64)

<IPython.core.display.Javascript object>

In [24]:
categorical_columns = X_train.select_dtypes(include=["category"]).columns
numeric_columns = X_train.select_dtypes(exclude=["category", "datetime64[ns]"]).columns

<IPython.core.display.Javascript object>

Replacing outliers with upper and lower bounds

In [25]:
for feature in numeric_columns:
    X_train = replace_outliers_with_iqr(X_train, feature)

for feature in numeric_columns:
    X_test = replace_outliers_with_iqr(X_test, feature)

<IPython.core.display.Javascript object>

Imputing median or mode to numeric and categorical columns instead of missing values:

In [26]:
categorical_imputer = SimpleImputer(strategy="most_frequent")
numeric_imputer = SimpleImputer(strategy="median")
categorical_imputer.fit(X_train[categorical_columns])
numeric_imputer.fit(X_train[numeric_columns])

X_train[numeric_columns] = numeric_imputer.transform(X_train[numeric_columns])
X_train[categorical_columns] = categorical_imputer.transform(
    X_train[categorical_columns]
)

X_test[numeric_columns] = numeric_imputer.transform(X_test[numeric_columns])
X_test[categorical_columns] = categorical_imputer.transform(X_test[categorical_columns])

<IPython.core.display.Javascript object>

Encoding categorical values and to avoid overfitting deleting most popular value:

In [27]:
mode_value = X_train[categorical_columns[0]].mode()[0]

X_train = encode_categorical(X_train, categorical_columns, mode_value)
X_test = encode_categorical(X_test, categorical_columns, mode_value)

<IPython.core.display.Javascript object>

In case if column number in train data and test data do not match - I add column filled with zeroes:

In [28]:
train_columns = set(X_train.columns)
test_columns = set(X_test.columns)

missing_columns_test = train_columns - test_columns
for column in missing_columns_test:
    X_test[column] = 0

missing_columns_train = test_columns - train_columns
for column in missing_columns_train:
    X_train[column] = 0

<IPython.core.display.Javascript object>

Correlation after encoding (will be needed when choosing k number of features in SelectKBest):

In [31]:
feature_correlation(X_train, y_train)

Unnamed: 0,variable,pearson_corr,F,p_value
0,term__60,-0.230103,16796.529297,0.0
1,fico_mean,0.131948,5323.326633,0.0
2,acc_open_past_24mths,-0.128468,5041.549918,0.0
3,dti,-0.117602,4213.322924,0.0
4,mo_sin_rcnt_rev_tl_op,0.072799,1600.681351,0.0
5,tot_cur_bal,0.069214,1446.163678,1.110479e-315
6,mort_acc,0.067401,1371.071541,1.936302e-299
7,annual_inc,0.066901,1350.710489,4.915995e-295
8,home_ownership__MORTGAGE,0.063802,1227.987183,1.7673570000000002e-268
9,home_ownership__RENT,-0.063433,1213.731445,2.1523410000000002e-265


<IPython.core.display.Javascript object>

In [32]:
X_train.head()

Unnamed: 0,loan_amnt,annual_inc,dti,open_acc,revol_util,total_acc,tot_cur_bal,acc_open_past_24mths,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mort_acc,mths_since_recent_inq,num_actv_bc_tl,pub_rec_bankruptcies,credit_history,fico_mean,term__60,emp_length__1 year,emp_length__10_more years,emp_length__2 years,emp_length__3 years,emp_length__4 years,emp_length__5 years,emp_length__6 years,emp_length__7 years,emp_length__8 years,emp_length__9 years,emp_length__less_ 1 year,home_ownership__ANY,home_ownership__MORTGAGE,home_ownership__OWN,home_ownership__RENT,purpose__car,purpose__credit_card,purpose__debt_consolidation,purpose__home_improvement,purpose__house,purpose__major_purchase,purpose__medical,purpose__moving,purpose__other,purpose__renewable_energy,purpose__small_business,purpose__vacation,purpose__wedding,addr_state__AK,addr_state__AL,addr_state__AR,addr_state__AZ,addr_state__CA,addr_state__CO,addr_state__CT,addr_state__DC,addr_state__DE,addr_state__FL,addr_state__GA,addr_state__HI,addr_state__IL,addr_state__IN,addr_state__KS,addr_state__KY,addr_state__LA,addr_state__MA,addr_state__MD,addr_state__ME,addr_state__MI,addr_state__MN,addr_state__MO,addr_state__MS,addr_state__MT,addr_state__NC,addr_state__ND,addr_state__NE,addr_state__NH,addr_state__NJ,addr_state__NM,addr_state__NV,addr_state__NY,addr_state__OH,addr_state__OK,addr_state__OR,addr_state__PA,addr_state__RI,addr_state__SC,addr_state__SD,addr_state__TN,addr_state__TX,addr_state__UT,addr_state__VA,addr_state__VT,addr_state__WA,addr_state__WI,addr_state__WV,addr_state__WY,application_type__Individual,application_type__Joint App,purpose__educational
203695,9000.0,60000.0,17.6,18.0,66.5,26.0,147777.0,3.0,222.0,3.0,2.0,4.0,4.0,0.0,26.0,712.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
94569,15000.0,75000.0,21.02,25.5,67.6,45.0,269965.0,6.0,303.0,7.0,4.0,7.0,5.0,0.0,40.0,672.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
14618,8500.0,58300.0,15.62,11.0,26.8,26.0,95823.0,2.0,120.0,14.0,1.0,6.0,3.0,0.0,20.0,717.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
349867,15600.0,73000.0,14.96,8.0,91.7,17.0,160142.0,4.0,301.0,13.0,2.0,8.0,5.0,0.0,33.0,677.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
327493,7200.0,31000.0,10.69,14.0,25.5,34.0,166952.0,5.0,140.0,16.0,3.0,2.0,1.0,0.0,20.0,672.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


<IPython.core.display.Javascript object>

To avoid system crashing I train models separately:

In [34]:
ros = RandomUnderSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)

pipeline_log = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        (
            "classifier",
            LogisticRegression(solver="saga", random_state=42),
        ),
    ]
)
pipeline_log.fit(X_train, y_train)
y_pred = pipeline_log.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


param_grid_log = {
    "classifier__C": [
        0.001,
        0.01,
        0.5,
    ],
    "classifier__max_iter": [100, 200, 300, 500, 1000],
    "classifier__penalty": ["l1", "l2", None],
}
grid_log = GridSearchCV(
    estimator=pipeline_log,
    param_grid=param_grid_log,
    scoring="f1",
    n_jobs=-1,
    cv=3,
    verbose=1,
    return_train_score=False,
)
grid_log.fit(X_train, y_train)
y_pred = grid_log.predict(X_test)
print(f"Best parameters for LogisticRegression: {grid_log.best_params_}")
print(f"Best Mean F1 Score: {grid_log.best_score_}")
report = classification_report(y_test, y_pred)
print("LogisticRegression Classification Report with Best Parameters:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.49      0.43     15161
           1       0.86      0.80      0.83     59948

    accuracy                           0.74     75109
   macro avg       0.62      0.64      0.63     75109
weighted avg       0.76      0.74      0.75     75109

Fitting 3 folds for each of 45 candidates, totalling 135 fits
Best parameters for LogisticRegression: {'classifier__C': 0.001, 'classifier__max_iter': 100, 'classifier__penalty': 'l1'}
Best Mean F1 Score: 0.678821119732215
LogisticRegression Classification Report with Best Parameters:
               precision    recall  f1-score   support

           0       0.35      0.52      0.42     15161
           1       0.86      0.76      0.81     59948

    accuracy                           0.71     75109
   macro avg       0.61      0.64      0.61     75109
weighted avg       0.76      0.71      0.73     75109



<IPython.core.display.Javascript object>

In [36]:
model_rf = RandomForestClassifier(n_jobs=-1, random_state=42)
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


param_grid_rf = {
    "n_estimators": [50, 100, 200],
    "max_depth": [10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}
grid_rf = GridSearchCV(
    estimator=model_rf,
    param_grid=param_grid_rf,
    scoring="f1",
    n_jobs=-1,
    cv=3,
    verbose=1,
    return_train_score=False,
)
grid_rf.fit(X_train, y_train)
y_pred = grid_rf.predict(X_test)
print(f"Best parameters for RandomForestClassifier: {grid_rf.best_params_}")
print(f"Best Mean F1 Score: {grid_rf.best_score_}")
report = classification_report(y_test, y_pred)
print("RandomForestClassifier Report with Best Parameters:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.62      0.43     15161
           1       0.88      0.68      0.77     59948

    accuracy                           0.67     75109
   macro avg       0.60      0.65      0.60     75109
weighted avg       0.77      0.67      0.70     75109

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters for RandomForestClassifier: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best Mean F1 Score: 0.6653136915942454
RandomForestClassifier Report with Best Parameters:
               precision    recall  f1-score   support

           0       0.35      0.58      0.44     15161
           1       0.87      0.73      0.79     59948

    accuracy                           0.70     75109
   macro avg       0.61      0.65      0.62     75109
weighted avg       0.77      0.70      0.72     75109



<IPython.core.display.Javascript object>

In [36]:
pipeline_svc = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        (
            "classifier",
            LinearSVC(random_state=42),
        ),
    ]
)
pipeline_svc.fit(X_train, y_train)
y_pred = pipeline_svc.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

param_grid_svc = {
    "classifier__C": [1, 2, 3],
    "classifier__penalty": ["l1", "l2"],
    "classifier__max_iter": [3000, 4000, 5000],
}

grid_svc = GridSearchCV(
    estimator=pipeline_svc,
    param_grid=param_grid_svc,
    scoring="f1",
    n_jobs=-1,
    cv=3,
    verbose=1,
    return_train_score=False,
)
grid_svc.fit(X_train, y_train)
y_pred = grid_svc.predict(X_test)
print(f"Best parameters for LinearSVC: {grid_svc.best_params_}")
print(f"Best Mean F1 Score: {grid_svc.best_score_}")
report = classification_report(y_test, y_pred)
print("LinearSVC Report with Best Parameters:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.36      0.53      0.43     15161
           1       0.87      0.76      0.81     59948

    accuracy                           0.72     75109
   macro avg       0.61      0.65      0.62     75109
weighted avg       0.76      0.72      0.73     75109

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best parameters for LinearSVC: {'classifier__C': 2, 'classifier__max_iter': 3000, 'classifier__penalty': 'l2'}
Best Mean F1 Score: 0.6658996478564498
LinearSVC Report with Best Parameters:
               precision    recall  f1-score   support

           0       0.36      0.53      0.43     15161
           1       0.87      0.76      0.81     59948

    accuracy                           0.72     75109
   macro avg       0.61      0.65      0.62     75109
weighted avg       0.76      0.72      0.73     75109



<IPython.core.display.Javascript object>

Esemble model with best performace on F1 score:

In [40]:
classifiers = [
    (
        "Random Forest",
        RandomForestClassifier(
            random_state=42,
            max_depth=10,
            min_samples_leaf=2,
            min_samples_split=2,
            n_estimators=100,
            n_jobs=-1,
        ),
    ),
    (
        "LogClassifier",
        LogisticRegression(solver="saga", random_state=42),
    ),
]


pipeline_list = []
for name, classifier in classifiers:
    pipeline = Pipeline(
        [
            ("scaler", MinMaxScaler()),
            ("select", SelectKBest(k=20, score_func=f_regression)),
            ("classifier", classifier),
        ]
    )

    pipeline_list.append((name, pipeline))

classifier = VotingClassifier(estimators=pipeline_list, voting="hard")

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred)
ensemble_precision = precision_score(y_test, y_pred)
ensemble_f1 = f1_score(y_test, y_pred)

print("Ensemble Model Accuracy:", ensemble_accuracy)
print("Ensemble Model Precision:", ensemble_precision)
print("Ensemble Model F1 Score:", ensemble_f1)

Ensemble Model Accuracy: 0.6545287515477506
Ensemble Model Precision: 0.8841287057122198
Ensemble Model F1 Score: 0.7509884457410464


<IPython.core.display.Javascript object>

In [41]:
joblib.dump(classifier, "loan_status_prediction.joblib")

['loan_status_prediction.joblib']

<IPython.core.display.Javascript object>

I am making separate file to compare with app.py file:

In [46]:
column_names = X_train.columns.tolist()
column_names_df = pd.DataFrame({"FeatureName": column_names})
csv_file_path = "trained_feature_names.csv"
column_names_df.to_csv(csv_file_path, index=False)

<IPython.core.display.Javascript object>