In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    LabelEncoder,
    OrdinalEncoder,
    StandardScaler,
    FunctionTransformer,
    MinMaxScaler,
)
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.metrics import precision_score, f1_score, accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn import pipeline
from sklearn.feature_selection import f_classif
from sklearn.svm import LinearSVC
import joblib
import re
from sklearn.utils import resample

pd.set_option(
    "display.max_colwidth", 1000, "display.max_rows", None, "display.max_columns", None
)

import warnings

warnings.filterwarnings("ignore")
print("Setup complete")

Setup complete


<IPython.core.display.Javascript object>

In [3]:
def empty_rows(df):
    """Delete rows wit Null values"""
    null_rows = df[df.iloc[:, 1:].isnull().all(axis=1)].index
    df.drop(null_rows, inplace=True)

<IPython.core.display.Javascript object>

In [4]:
def missing_values_summary(df):
    """function gives summary of all features with missing values"""
    missing_values = df.isnull().sum()
    missing_values_percent = 100 * df.isnull().sum() / len(df)
    missing_values_type = df.dtypes
    missing_values_table = pd.concat(
        [missing_values, missing_values_percent, missing_values_type], axis=1
    )
    missing_values_table = missing_values_table.rename(
        columns={0: "Missing Values", 1: "% of Total Values", 2: "type"}
    )
    missing_values_table = (
        missing_values_table[missing_values_table.iloc[:, 1] != 0]
        .sort_values("% of Total Values", ascending=False)
        .round(4)
    )
    return missing_values_table

<IPython.core.display.Javascript object>

In [5]:
def single_value_features(df):
    """delete features that have only single value"""
    single_feature = []
    for column in list(df.columns):
        if df[column].unique().size <= 1:
            single_feature.append(column)
    return single_feature

<IPython.core.display.Javascript object>

In [6]:
def feature_correlation(X_train, y_train):
    """provides feature correlation list"""
    linear_dep = pd.DataFrame()
    for col in X_train.columns:
        linear_dep.loc[col, "pearson_corr"] = X_train[col].corr(y_train)
    linear_dep["abs_pearson_corr"] = abs(linear_dep["pearson_corr"])
    for col in X_train.columns:
        mask = X_train[col].notnull()
        (linear_dep.loc[col, "F"], linear_dep.loc[col, "p_value"]) = f_classif(
            pd.DataFrame(X_train.loc[mask, col]), y_train.loc[mask]
        )
    linear_dep.sort_values("abs_pearson_corr", ascending=False, inplace=True)
    linear_dep.drop("abs_pearson_corr", axis=1, inplace=True)
    linear_dep.reset_index(inplace=True)
    linear_dep.rename(columns={"index": "variable"}, inplace=True)

    return linear_dep

<IPython.core.display.Javascript object>

In [7]:
def replace_outliers_with_iqr(data, column_name, multiplier=1.5):
    """replacing outliers with upper and lower bound"""
    Q1 = data[column_name].quantile(0.25)
    Q3 = data[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    data[column_name] = data[column_name].apply(
        lambda x: lower_bound
        if x < lower_bound
        else (upper_bound if x > upper_bound else x)
    )

    return data

<IPython.core.display.Javascript object>

In [8]:
def encode_categorical(df, columns, mode_value):
    '''Encoding categorical value like OneHotEncoder'''
    encoded_dfs = []
    for column in columns:
        temp = pd.get_dummies(df[column], prefix=column, prefix_sep="__")
        if column + "__" + str(mode_value) in temp.columns:
            temp = temp.drop(column + "__" + str(mode_value), axis=1)
        temp = temp.astype(int)
        encoded_dfs.append(temp)

    df_encoded = pd.concat([df.drop(columns, axis=1)] + encoded_dfs, axis=1)
    return df_encoded


<IPython.core.display.Javascript object>

### SUB_Grade Prediction

In [9]:
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import f_classif

<IPython.core.display.Javascript object>

In [10]:
df = pd.read_csv("accepted_2007_to_2018Q4.csv", low_memory=False)
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68407277,,debt_consolidation,Debt consolidation,190xx,PA,5.91,0.0,Aug-2003,675.0,679.0,1.0,30.0,,7.0,0.0,2765.0,29.7,13.0,w,0.0,0.0,4421.723917,4421.72,3600.0,821.72,0.0,0.0,0.0,Jan-2019,122.67,,Mar-2019,564.0,560.0,0.0,30.0,1.0,Individual,,,,0.0,722.0,144904.0,2.0,2.0,0.0,1.0,21.0,4981.0,36.0,3.0,3.0,722.0,34.0,9300.0,3.0,1.0,4.0,4.0,20701.0,1506.0,37.2,0.0,0.0,148.0,128.0,3.0,3.0,1.0,4.0,69.0,4.0,69.0,2.0,2.0,4.0,2.0,5.0,3.0,4.0,9.0,4.0,7.0,0.0,0.0,0.0,3.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68355089,,small_business,Business,577xx,SD,16.06,1.0,Dec-1999,715.0,719.0,4.0,6.0,,22.0,0.0,21470.0,19.2,38.0,w,0.0,0.0,25679.66,25679.66,24700.0,979.66,0.0,0.0,0.0,Jun-2016,926.35,,Mar-2019,699.0,695.0,0.0,,1.0,Individual,,,,0.0,0.0,204396.0,1.0,1.0,0.0,1.0,19.0,18005.0,73.0,2.0,3.0,6472.0,29.0,111800.0,0.0,0.0,6.0,4.0,9733.0,57830.0,27.1,0.0,0.0,113.0,192.0,2.0,2.0,4.0,2.0,,0.0,6.0,0.0,5.0,5.0,13.0,17.0,6.0,20.0,27.0,5.0,22.0,0.0,0.0,0.0,2.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68341763,,home_improvement,,605xx,IL,10.78,0.0,Aug-2000,695.0,699.0,0.0,,,6.0,0.0,7869.0,56.2,18.0,w,0.0,0.0,22705.924294,22705.92,20000.0,2705.92,0.0,0.0,0.0,Jun-2017,15813.3,,Mar-2019,704.0,700.0,0.0,,1.0,Joint App,71000.0,13.85,Not Verified,0.0,0.0,189699.0,0.0,1.0,0.0,4.0,19.0,10827.0,73.0,0.0,2.0,2081.0,65.0,14000.0,2.0,5.0,1.0,6.0,31617.0,2737.0,55.9,0.0,0.0,125.0,184.0,14.0,14.0,5.0,101.0,,10.0,,0.0,2.0,3.0,2.0,4.0,6.0,4.0,7.0,3.0,6.0,0.0,0.0,0.0,0.0,100.0,50.0,0.0,0.0,218418.0,18696.0,6200.0,14877.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,n,https://lendingclub.com/browse/loanDetail.action?loan_id=66310712,,debt_consolidation,Debt consolidation,076xx,NJ,17.06,0.0,Sep-2008,785.0,789.0,0.0,,,13.0,0.0,7802.0,11.6,17.0,w,15897.65,15897.65,31464.01,31464.01,19102.35,12361.66,0.0,0.0,0.0,Feb-2019,829.9,Apr-2019,Mar-2019,679.0,675.0,0.0,,1.0,Individual,,,,0.0,0.0,301500.0,1.0,1.0,0.0,1.0,23.0,12609.0,70.0,1.0,1.0,6987.0,45.0,67300.0,0.0,1.0,0.0,2.0,23192.0,54962.0,12.1,0.0,0.0,36.0,87.0,2.0,2.0,1.0,2.0,,,,0.0,4.0,5.0,8.0,10.0,2.0,10.0,13.0,5.0,13.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,0.0,381215.0,52226.0,62500.0,18000.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68476807,,major_purchase,Major purchase,174xx,PA,25.37,1.0,Jun-1998,695.0,699.0,3.0,12.0,,12.0,0.0,21929.0,64.5,35.0,w,0.0,0.0,11740.5,11740.5,10400.0,1340.5,0.0,0.0,0.0,Jul-2016,10128.96,,Mar-2018,704.0,700.0,0.0,,1.0,Individual,,,,0.0,0.0,331730.0,1.0,3.0,0.0,3.0,14.0,73839.0,84.0,4.0,7.0,9702.0,78.0,34000.0,2.0,1.0,3.0,10.0,27644.0,4567.0,77.5,0.0,0.0,128.0,210.0,4.0,4.0,6.0,4.0,12.0,1.0,12.0,0.0,4.0,6.0,5.0,9.0,10.0,7.0,19.0,6.0,12.0,0.0,0.0,0.0,4.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


<IPython.core.display.Javascript object>

Filtering only fully payed loans for thuther prediction modeling:

In [11]:
df["loan_status"] = df["loan_status"].str.replace(
    "Does not meet the credit policy. Status:", ""
)
df = df[(df["loan_status"] == "Fully Paid")]

<IPython.core.display.Javascript object>

In [12]:
df.shape

(1078739, 151)

<IPython.core.display.Javascript object>

In [12]:
empty_rows(df)

<IPython.core.display.Javascript object>

In [14]:
missing_values_summary(df)

Unnamed: 0,Missing Values,% of Total Values,type
member_id,1078739,100.0,float64
debt_settlement_flag_date,1078734,99.9995,object
settlement_percentage,1078734,99.9995,float64
settlement_amount,1078734,99.9995,float64
settlement_date,1078734,99.9995,object
settlement_status,1078734,99.9995,object
settlement_term,1078734,99.9995,float64
orig_projected_additional_accrued_interest,1077137,99.8515,float64
deferral_term,1077045,99.843,float64
hardship_dpd,1077045,99.843,float64


<IPython.core.display.Javascript object>

Deleting missing values (more than 30%) and single value features:

In [13]:
missing_vals = 100 * df.isnull().sum() / len(df)
drop_list = sorted(missing_vals[missing_vals > 30].index)
df.drop(labels=drop_list, axis=1, inplace=True)

<IPython.core.display.Javascript object>

In [14]:
single_value_feature = single_value_features(df)
print(single_value_feature)
df.drop(single_value_feature, axis=1, inplace=True)

['loan_status', 'pymnt_plan', 'out_prncp', 'out_prncp_inv', 'recoveries', 'collection_recovery_fee', 'policy_code', 'hardship_flag']


<IPython.core.display.Javascript object>

In [15]:
features_keep = [
    "loan_amnt",
    "term",
    "emp_length",
    "home_ownership",
    "annual_inc",
    "purpose",
    "addr_state",
    "dti",
    "earliest_cr_line",
    "fico_range_low",
    "fico_range_high",
    "open_acc",
    "revol_util",
    "total_acc",
    "application_type",
    "tot_cur_bal",
    "acc_open_past_24mths",
    "mo_sin_old_rev_tl_op",
    "mo_sin_rcnt_rev_tl_op",
    "mort_acc",
    "mths_since_recent_inq",
    "num_actv_bc_tl",
    "pub_rec_bankruptcies",
    "issue_d",
    "sub_grade",
]
drop_list = [col for col in df.columns if col not in features_keep]
df.drop(labels=drop_list, axis=1, inplace=True)
df.shape

(1078739, 25)

<IPython.core.display.Javascript object>

Encoding target column:

In [16]:
label_encoder = LabelEncoder()
df["sub_grade"] = label_encoder.fit_transform(df["sub_grade"])
target_mappings = {index: label for index, label in enumerate(label_encoder.classes_)}
target_mappings

{0: 'A1',
 1: 'A2',
 2: 'A3',
 3: 'A4',
 4: 'A5',
 5: 'B1',
 6: 'B2',
 7: 'B3',
 8: 'B4',
 9: 'B5',
 10: 'C1',
 11: 'C2',
 12: 'C3',
 13: 'C4',
 14: 'C5',
 15: 'D1',
 16: 'D2',
 17: 'D3',
 18: 'D4',
 19: 'D5',
 20: 'E1',
 21: 'E2',
 22: 'E3',
 23: 'E4',
 24: 'E5',
 25: 'F1',
 26: 'F2',
 27: 'F3',
 28: 'F4',
 29: 'F5',
 30: 'G1',
 31: 'G2',
 32: 'G3',
 33: 'G4',
 34: 'G5'}

<IPython.core.display.Javascript object>

In [17]:
df_target_mappings = pd.DataFrame(
    list(target_mappings.items()), columns=["Index", "Label"]
)
df_target_mappings.to_csv("subgrade_target_mappings.csv", index=False)

<IPython.core.display.Javascript object>

Creating new features and changing "object" dtype into "category" to save memory space:

In [18]:
df["issue_d"] = pd.to_datetime(df["issue_d"], format="%b-%Y", errors="coerce")
df["earliest_cr_line"] = df["earliest_cr_line"].astype(str)
df["earliest_cr_line"] = pd.to_datetime(
    df["earliest_cr_line"], format="%b-%Y", errors="coerce"
)
date_today = dt.datetime.now()
df["credit_history"] = (date_today - df["earliest_cr_line"]).dt.days // 365
df["credit_history"].fillna(0, inplace=True)
df["credit_history"] = df["credit_history"].astype(int)
df.drop("earliest_cr_line", axis=1, inplace=True)
df["term"] = df["term"].str.extract("(\d+)").astype("category")
df["emp_length"] = df["emp_length"].str.replace("+", "_more").str.replace("<", "less_")
df["fico_mean"] = df[["fico_range_low", "fico_range_high"]].mean(axis=1)
df.drop(["fico_range_low", "fico_range_high"], axis=1, inplace=True)
df = df.apply(lambda x: x.astype("category") if x.dtype == "object" else x)

<IPython.core.display.Javascript object>

In [19]:
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
df.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in df.columns.values
]

<IPython.core.display.Javascript object>

Sampling data for training only 2015 year

In [20]:
sample_data = df[df["issue_d"].dt.year == 2015]
print("Sample Data Shape:", sample_data.shape)

Sample Data Shape: (299742, 24)


<IPython.core.display.Javascript object>

In [21]:
X = sample_data.drop(["sub_grade", "issue_d"], axis=1)
y = sample_data["sub_grade"]
X.shape

(299742, 22)

<IPython.core.display.Javascript object>

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((239793, 22), (239793,), (59949, 22), (59949,))

<IPython.core.display.Javascript object>

In [23]:
categorical_columns = X_train.select_dtypes(include=["category", "object"]).columns
numeric_columns = X_train.select_dtypes(
    exclude=["category", "datetime64[ns]", "object"]
).columns

<IPython.core.display.Javascript object>

Replacing outliers:

In [24]:
for feature in numeric_columns:
    X_train = replace_outliers_with_iqr(X_train, feature)

for feature in numeric_columns:
    X_test = replace_outliers_with_iqr(X_test, feature)

<IPython.core.display.Javascript object>

Imputing missing values:

In [25]:
categorical_imputer = SimpleImputer(strategy="most_frequent")
numeric_imputer = SimpleImputer(strategy="median")
categorical_imputer.fit(X_train[categorical_columns])
numeric_imputer.fit(X_train[numeric_columns])

X_train[numeric_columns] = numeric_imputer.transform(X_train[numeric_columns])
X_train[categorical_columns] = categorical_imputer.transform(
    X_train[categorical_columns]
)

X_test[numeric_columns] = numeric_imputer.transform(X_test[numeric_columns])
X_test[categorical_columns] = categorical_imputer.transform(X_test[categorical_columns])

<IPython.core.display.Javascript object>

Encoding categorical values:

In [26]:
mode_value = X_train[categorical_columns[0]].mode()[0]

X_train = encode_categorical(X_train, categorical_columns, mode_value)
X_test = encode_categorical(X_test, categorical_columns, mode_value)

<IPython.core.display.Javascript object>

In case column number mismatch after split, I am adding column with 0 values:

In [27]:
train_columns = set(X_train.columns)
test_columns = set(X_test.columns)

missing_columns_test = train_columns - test_columns
for column in missing_columns_test:
    X_test[column] = 0

missing_columns_train = test_columns - train_columns
for column in missing_columns_train:
    X_train[column] = 0
X_test = X_test[X_train.columns]

<IPython.core.display.Javascript object>

In [29]:
feature_correlation(X_train, y_train)

Unnamed: 0,variable,pearson_corr,F,p_value
0,term__60,0.405437,1545.776733,0.0
1,fico_mean,-0.392939,2171.871174,0.0
2,acc_open_past_24mths,0.226318,389.756081,0.0
3,purpose__credit_card,-0.216237,369.716797,0.0
4,revol_util,0.204276,483.715846,0.0
5,dti,0.18369,278.550866,0.0
6,mths_since_recent_inq,-0.168695,212.512896,0.0
7,mo_sin_old_rev_tl_op,-0.166305,239.5039,0.0
8,annual_inc,-0.158358,276.448763,0.0
9,mo_sin_rcnt_rev_tl_op,-0.153011,182.964964,0.0


<IPython.core.display.Javascript object>

In [28]:
def feature_correlation(X_train, y_train, threshold=0.02):
    """Taking only features that correlate 0,02 coef."""
    linear_dep = pd.DataFrame()
    for col in X_train.columns:
        linear_dep.loc[col, "pearson_corr"] = X_train[col].corr(y_train)
    linear_dep["abs_pearson_corr"] = abs(linear_dep["pearson_corr"])

    for col in X_train.columns:
        mask = X_train[col].notnull()
        (linear_dep.loc[col, "F"], linear_dep.loc[col, "p_value"]) = f_classif(
            pd.DataFrame(X_train.loc[mask, col]), y_train.loc[mask]
        )
    selected_features = linear_dep[linear_dep["abs_pearson_corr"] > threshold]
    selected_features.drop("abs_pearson_corr", axis=1, inplace=True)
    selected_features.reset_index(inplace=True)
    selected_features.rename(columns={"index": "variable"}, inplace=True)

    return selected_features

<IPython.core.display.Javascript object>

In [29]:
selected_features = feature_correlation(X_train, y_train, threshold=0.02)
selected_features

Unnamed: 0,variable,pearson_corr,F,p_value
0,loan_amnt,0.089454,176.575586,0.0
1,annual_inc,-0.158358,276.448763,0.0
2,dti,0.18369,278.550866,0.0
3,open_acc,-0.038793,40.646046,7.130347e-268
4,revol_util,0.204276,483.715846,0.0
5,total_acc,-0.055006,50.956947,0.0
6,tot_cur_bal,-0.087567,113.637807,0.0
7,acc_open_past_24mths,0.226318,389.756081,0.0
8,mo_sin_old_rev_tl_op,-0.166305,239.5039,0.0
9,mo_sin_rcnt_rev_tl_op,-0.153011,182.964964,0.0


<IPython.core.display.Javascript object>

Using SMOTE to normalize sub_grade classes (target) from filtered data with selected features:

In [30]:
selected_feature_names = selected_features["variable"].tolist()
X_train_selected = X_train[selected_feature_names]
X_test_selected = X_test[selected_feature_names]

smote = SMOTE(sampling_strategy="auto", random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_selected, y_train)

<IPython.core.display.Javascript object>

For further modeling taking 35000 entries dataset (for 1000 entries for each target category):

In [31]:
num_categories = len(y_train_resampled.value_counts())
samples_per_category = 1000
total_samples = num_categories * samples_per_category

X_train_resampled.reset_index(drop=True, inplace=True)
y_train_resampled.reset_index(drop=True, inplace=True)

subsampled_X = np.empty((0, X_train_resampled.shape[1]))
subsampled_y = np.array([])

for category in range(num_categories):
    category_indices = np.where(y_train_resampled == category)[0]
    sub_indices = resample(
        category_indices, n_samples=samples_per_category, random_state=42
    )
    subsampled_X = np.vstack((subsampled_X, X_train_resampled.iloc[sub_indices]))
    subsampled_y = np.append(subsampled_y, y_train_resampled.iloc[sub_indices])

<IPython.core.display.Javascript object>

In [34]:
feature_names = X_train_resampled.columns.tolist()
feature_names_df = pd.DataFrame({"SubgradeFeatureName": feature_names})
feature_names_df.to_csv("sub_grade_feature_names.csv", index=False)

<IPython.core.display.Javascript object>

In [105]:
scorer = make_scorer(f1_score, average="macro")
pipeline_rf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "classifier",
            RandomForestClassifier(random_state=42),
        ),
    ]
)
pipeline_rf.fit(subsampled_X, subsampled_y)
y_pred = pipeline_rf.predict(X_test_selected)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


param_grid_rf = {
    "classifier__n_estimators": [100, 200, 300],
    "classifier__min_samples_leaf": [1, 2, 3],
    "classifier__min_samples_split": [7, 8, 9],
    "classifier__max_depth": [10, 20, 30],
}
scorer = make_scorer(f1_score, average="macro")
grid_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    scoring=scorer,
    n_jobs=-1,
    cv=5,
    verbose=1,
    return_train_score=False,
)
grid_rf.fit(subsampled_X, subsampled_y)
y_pred = grid_rf.predict(X_test_selected)
print(f"Best parameters for RandomForestClassifier: {grid_rf.best_params_}")
print(f"Best Mean F1 Score: {grid_rf.best_score_}")
report = classification_report(y_test, y_pred)
print("RandomForestClassifier Report with Best Parameters:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.31      0.57      0.40      2407
           1       0.13      0.17      0.15      2215
           2       0.09      0.15      0.11      2069
           3       0.12      0.14      0.13      2857
           4       0.12      0.11      0.12      4032
           5       0.14      0.16      0.15      3639
           6       0.11      0.10      0.10      3653
           7       0.10      0.08      0.09      3839
           8       0.08      0.07      0.08      3808
           9       0.09      0.08      0.09      3658
          10       0.09      0.08      0.09      3806
          11       0.10      0.07      0.08      3572
          12       0.08      0.05      0.06      3305
          13       0.10      0.06      0.07      3100
          14       0.07      0.05      0.05      2451
          15       0.07      0.06      0.06      2071
          16       0.04      0.03      0.04      1507
   

<IPython.core.display.Javascript object>

In [116]:
pipeline_log = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        (
            "classifier",
            LogisticRegression(multi_class="multinomial", random_state=42),
        ),
    ]
)
pipeline_log.fit(subsampled_X, subsampled_y)
y_pred = pipeline_log.predict(X_test_selected)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


param_grid_log = {
    "classifier__solver": ["sag", "saga"],
    "classifier__C": [0.01, 0.1, 1],
    "classifier__max_iter": [100, 200, 300],
    "classifier__penalty": ["l1", "l2", "elasticnet"],
}
scorer = make_scorer(f1_score, average="macro")
grid_log = GridSearchCV(
    estimator=pipeline_log,
    param_grid=param_grid_log,
    scoring=scorer,
    n_jobs=-1,
    cv=5,
    verbose=1,
    return_train_score=False,
)
grid_log.fit(subsampled_X, subsampled_y)
y_pred = grid_log.predict(X_test_selected)
print(f"Best parameters for LogisticRegression: {grid_log.best_params_}")
print(f"Best Score: {grid_log.best_score_}")
report = classification_report(y_test, y_pred)
print("LogisticRegression Classification Report with Best Parameters:\n", report)


Classification Report:
               precision    recall  f1-score   support

           0       0.31      0.63      0.42      2407
           1       0.14      0.14      0.14      2215
           2       0.09      0.06      0.08      2069
           3       0.12      0.14      0.13      2857
           4       0.14      0.19      0.16      4032
           5       0.15      0.15      0.15      3639
           6       0.11      0.08      0.09      3653
           7       0.10      0.06      0.08      3839
           8       0.10      0.08      0.09      3808
           9       0.10      0.10      0.10      3658
          10       0.10      0.11      0.11      3806
          11       0.09      0.07      0.08      3572
          12       0.09      0.06      0.07      3305
          13       0.11      0.09      0.10      3100
          14       0.07      0.08      0.07      2451
          15       0.07      0.04      0.05      2071
          16       0.05      0.05      0.05      1507
   

<IPython.core.display.Javascript object>

In [120]:
scorer = make_scorer(f1_score, average="macro")
pipeline_xgb = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "classifier",
            XGBClassifier(random_state=42, n_jobs=-1, objective="multi:softmax"),
        ),
    ]
)
pipeline_xgb.fit(subsampled_X, subsampled_y)
y_pred = pipeline_xgb.predict(X_test_selected)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


param_grid_xgb = {
    "classifier__max_depth": [8, 10, 12],
    "classifier__min_child_weight": range(2, 6, 2),
    "classifier__n_estimators": [30, 40, 50],
}
grid_xgb = GridSearchCV(
    estimator=pipeline_xgb,
    param_grid=param_grid_xgb,
    scoring=scorer,
    n_jobs=-1,
    cv=5,
    verbose=1,
    return_train_score=False,
)
grid_xgb.fit(subsampled_X, subsampled_y)
y_pred = grid_xgb.predict(X_test_selected)
print(f"Best parameters for XGBClassifier: {grid_xgb.best_params_}")
print(f"Best Mean F1 Score: {grid_xgb.best_score_}")
report = classification_report(y_test, y_pred)
print("XGBClassifier Report with Best Parameters:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.49      0.43      2407
           1       0.17      0.19      0.18      2215
           2       0.12      0.14      0.12      2069
           3       0.13      0.14      0.13      2857
           4       0.14      0.16      0.14      4032
           5       0.15      0.17      0.16      3639
           6       0.12      0.12      0.12      3653
           7       0.10      0.10      0.10      3839
           8       0.09      0.09      0.09      3808
           9       0.10      0.11      0.10      3658
          10       0.10      0.11      0.10      3806
          11       0.10      0.09      0.09      3572
          12       0.09      0.08      0.08      3305
          13       0.10      0.09      0.10      3100
          14       0.07      0.06      0.07      2451
          15       0.07      0.07      0.07      2071
          16       0.05      0.03      0.04      1507
   

<IPython.core.display.Javascript object>

In [124]:
classifiers = [
    (
        "Random Forest",
        RandomForestClassifier(
            random_state=42,
            max_depth=30,
            n_estimators=300,
            min_samples_leaf=1,
            min_samples_split=7,
            n_jobs=-1,
        ),
    ),
    (
        "XGBClassifier",
        XGBClassifier(
            min_child_weight=2,
            max_depth=12,
            n_estimators=50,
        ),
    ),
]


pipeline_list = []
for name, classifier in classifiers:
    pipeline = Pipeline(
        [
            ("scaler", StandardScaler()),
            ("select", SelectKBest(k=10, score_func=f_regression)),
            ("classifier", classifier),
        ]
    )

    pipeline_list.append((name, pipeline))

classifier = VotingClassifier(estimators=pipeline_list, voting="hard")

classifier.fit(subsampled_X, subsampled_y)

y_pred = classifier.predict(X_test_selected)
ensemble_accuracy = accuracy_score(y_test, y_pred)
ensemble_f1 = f1_score(y_test, y_pred, average="micro")

print("Ensemble Model Accuracy:", ensemble_accuracy)
print("Ensemble Model F1 Score (Macro):", ensemble_f1)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Ensemble Model Accuracy: 0.10557307044321006
Ensemble Model F1 Score (Macro): 0.10557307044321006
Classification Report:
               precision    recall  f1-score   support

           0       0.27      0.54      0.36      2407
           1       0.11      0.17      0.14      2215
           2       0.09      0.14      0.11      2069
           3       0.10      0.14      0.12      2857
           4       0.11      0.13      0.12      4032
           5       0.14      0.18      0.15      3639
           6       0.10      0.11      0.11      3653
           7       0.09      0.07      0.08      3839
           8       0.09      0.09      0.09      3808
           9       0.09      0.09      0.09      3658
          10       0.10      0.09      0.09      3806
          11       0.09      0.07      0.08      3572
          12       0.09      0.06      0.07      3305
          13       0.10      0.08      0.08      3100
          14       0.07      0.04      0.05      2451
          15 

<IPython.core.display.Javascript object>

In [126]:
joblib.dump(classifier, "subgrade_prediction.joblib")

['subgrade_prediction.joblib']

<IPython.core.display.Javascript object>

In [None]:
column_names = X_train.columns.tolist()
column_names_df = pd.DataFrame({"FeatureName": column_names})
csv_file_path = "trained_feature_names.csv"
column_names_df.to_csv(csv_file_path, index=False)