In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    LabelEncoder,
    OrdinalEncoder,
    StandardScaler,
    FunctionTransformer,
    MinMaxScaler,
)
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.metrics import precision_score, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn import pipeline
from sklearn.feature_selection import f_classif
from sklearn.svm import LinearSVC
import joblib
import re

pd.set_option(
    "display.max_colwidth", 1000, "display.max_rows", None, "display.max_columns", None
)

import warnings

warnings.filterwarnings("ignore")
print("Setup complete")

Setup complete


<IPython.core.display.Javascript object>

In [3]:
def empty_rows(df):
    """Delete rows wit Null values"""
    null_rows = df[df.iloc[:, 1:].isnull().all(axis=1)].index
    df.drop(null_rows, inplace=True)

<IPython.core.display.Javascript object>

In [4]:
def missing_values_summary(df):
    """function gives summary of all features with missing values"""
    missing_values = df.isnull().sum()
    missing_values_percent = 100 * df.isnull().sum() / len(df)
    missing_values_type = df.dtypes
    missing_values_table = pd.concat(
        [missing_values, missing_values_percent, missing_values_type], axis=1
    )
    missing_values_table = missing_values_table.rename(
        columns={0: "Missing Values", 1: "% of Total Values", 2: "type"}
    )
    missing_values_table = (
        missing_values_table[missing_values_table.iloc[:, 1] != 0]
        .sort_values("% of Total Values", ascending=False)
        .round(4)
    )
    return missing_values_table

<IPython.core.display.Javascript object>

In [5]:
def single_value_features(df):
    """delete features that have only single value"""
    single_feature = []
    for column in list(df.columns):
        if df[column].unique().size <= 1:
            single_feature.append(column)
    return single_feature

<IPython.core.display.Javascript object>

In [6]:
def feature_correlation(X_train, y_train):
    """provides feature correlation list"""
    linear_dep = pd.DataFrame()
    for col in X_train.columns:
        linear_dep.loc[col, "pearson_corr"] = X_train[col].corr(y_train)
    linear_dep["abs_pearson_corr"] = abs(linear_dep["pearson_corr"])
    for col in X_train.columns:
        mask = X_train[col].notnull()
        (linear_dep.loc[col, "F"], linear_dep.loc[col, "p_value"]) = f_classif(
            pd.DataFrame(X_train.loc[mask, col]), y_train.loc[mask]
        )
    linear_dep.sort_values("abs_pearson_corr", ascending=False, inplace=True)
    linear_dep.drop("abs_pearson_corr", axis=1, inplace=True)
    linear_dep.reset_index(inplace=True)
    linear_dep.rename(columns={"index": "variable"}, inplace=True)

    return linear_dep

<IPython.core.display.Javascript object>

In [7]:
def replace_outliers_with_iqr(data, column_name, multiplier=1.5):
    """replacing outliers with upper and lower bound"""
    Q1 = data[column_name].quantile(0.25)
    Q3 = data[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    data[column_name] = data[column_name].apply(
        lambda x: lower_bound
        if x < lower_bound
        else (upper_bound if x > upper_bound else x)
    )

    return data

<IPython.core.display.Javascript object>

In [8]:
def encode_categorical(df, columns, mode_value):
    '''Encoding categorical value like OneHotEncoder'''
    encoded_dfs = []
    for column in columns:
        temp = pd.get_dummies(df[column], prefix=column, prefix_sep="__")
        if column + "__" + str(mode_value) in temp.columns:
            temp = temp.drop(column + "__" + str(mode_value), axis=1)
        temp = temp.astype(int)
        encoded_dfs.append(temp)

    df_encoded = pd.concat([df.drop(columns, axis=1)] + encoded_dfs, axis=1)
    return df_encoded


<IPython.core.display.Javascript object>

# Interest rate prediction

In [9]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.feature_selection import SelectKBest, f_regression

<IPython.core.display.Javascript object>

In [10]:
df_rate = pd.read_csv("accepted_2007_to_2018Q4.csv", low_memory=False)
df_rate.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68407277,,debt_consolidation,Debt consolidation,190xx,PA,5.91,0.0,Aug-2003,675.0,679.0,1.0,30.0,,7.0,0.0,2765.0,29.7,13.0,w,0.0,0.0,4421.723917,4421.72,3600.0,821.72,0.0,0.0,0.0,Jan-2019,122.67,,Mar-2019,564.0,560.0,0.0,30.0,1.0,Individual,,,,0.0,722.0,144904.0,2.0,2.0,0.0,1.0,21.0,4981.0,36.0,3.0,3.0,722.0,34.0,9300.0,3.0,1.0,4.0,4.0,20701.0,1506.0,37.2,0.0,0.0,148.0,128.0,3.0,3.0,1.0,4.0,69.0,4.0,69.0,2.0,2.0,4.0,2.0,5.0,3.0,4.0,9.0,4.0,7.0,0.0,0.0,0.0,3.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68355089,,small_business,Business,577xx,SD,16.06,1.0,Dec-1999,715.0,719.0,4.0,6.0,,22.0,0.0,21470.0,19.2,38.0,w,0.0,0.0,25679.66,25679.66,24700.0,979.66,0.0,0.0,0.0,Jun-2016,926.35,,Mar-2019,699.0,695.0,0.0,,1.0,Individual,,,,0.0,0.0,204396.0,1.0,1.0,0.0,1.0,19.0,18005.0,73.0,2.0,3.0,6472.0,29.0,111800.0,0.0,0.0,6.0,4.0,9733.0,57830.0,27.1,0.0,0.0,113.0,192.0,2.0,2.0,4.0,2.0,,0.0,6.0,0.0,5.0,5.0,13.0,17.0,6.0,20.0,27.0,5.0,22.0,0.0,0.0,0.0,2.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68341763,,home_improvement,,605xx,IL,10.78,0.0,Aug-2000,695.0,699.0,0.0,,,6.0,0.0,7869.0,56.2,18.0,w,0.0,0.0,22705.924294,22705.92,20000.0,2705.92,0.0,0.0,0.0,Jun-2017,15813.3,,Mar-2019,704.0,700.0,0.0,,1.0,Joint App,71000.0,13.85,Not Verified,0.0,0.0,189699.0,0.0,1.0,0.0,4.0,19.0,10827.0,73.0,0.0,2.0,2081.0,65.0,14000.0,2.0,5.0,1.0,6.0,31617.0,2737.0,55.9,0.0,0.0,125.0,184.0,14.0,14.0,5.0,101.0,,10.0,,0.0,2.0,3.0,2.0,4.0,6.0,4.0,7.0,3.0,6.0,0.0,0.0,0.0,0.0,100.0,50.0,0.0,0.0,218418.0,18696.0,6200.0,14877.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,n,https://lendingclub.com/browse/loanDetail.action?loan_id=66310712,,debt_consolidation,Debt consolidation,076xx,NJ,17.06,0.0,Sep-2008,785.0,789.0,0.0,,,13.0,0.0,7802.0,11.6,17.0,w,15897.65,15897.65,31464.01,31464.01,19102.35,12361.66,0.0,0.0,0.0,Feb-2019,829.9,Apr-2019,Mar-2019,679.0,675.0,0.0,,1.0,Individual,,,,0.0,0.0,301500.0,1.0,1.0,0.0,1.0,23.0,12609.0,70.0,1.0,1.0,6987.0,45.0,67300.0,0.0,1.0,0.0,2.0,23192.0,54962.0,12.1,0.0,0.0,36.0,87.0,2.0,2.0,1.0,2.0,,,,0.0,4.0,5.0,8.0,10.0,2.0,10.0,13.0,5.0,13.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,0.0,381215.0,52226.0,62500.0,18000.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68476807,,major_purchase,Major purchase,174xx,PA,25.37,1.0,Jun-1998,695.0,699.0,3.0,12.0,,12.0,0.0,21929.0,64.5,35.0,w,0.0,0.0,11740.5,11740.5,10400.0,1340.5,0.0,0.0,0.0,Jul-2016,10128.96,,Mar-2018,704.0,700.0,0.0,,1.0,Individual,,,,0.0,0.0,331730.0,1.0,3.0,0.0,3.0,14.0,73839.0,84.0,4.0,7.0,9702.0,78.0,34000.0,2.0,1.0,3.0,10.0,27644.0,4567.0,77.5,0.0,0.0,128.0,210.0,4.0,4.0,6.0,4.0,12.0,1.0,12.0,0.0,4.0,6.0,5.0,9.0,10.0,7.0,19.0,6.0,12.0,0.0,0.0,0.0,4.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


<IPython.core.display.Javascript object>

In [11]:
df_rate["loan_status"] = df_rate["loan_status"].str.replace(
    "Does not meet the credit policy. Status:", ""
)
df_rate = df_rate[
    (df_rate["loan_status"] == "Fully Paid") | (df_rate["loan_status"] == "Charged Off")
]
df_rate["loan_status"] = np.where(df_rate["loan_status"] == "Fully Paid", 1, 0)

<IPython.core.display.Javascript object>

In [12]:
empty_rows(df_rate)
missing_vals = 100 * df_rate.isnull().sum() / len(df_rate)
drop_list = sorted(missing_vals[missing_vals > 30].index)
df_rate.drop(labels=drop_list, axis=1, inplace=True)

<IPython.core.display.Javascript object>

In [13]:
single_value_feature = single_value_features(df_rate)
print(single_value_feature)
df_rate.drop(single_value_feature, axis=1, inplace=True)

['pymnt_plan', 'out_prncp', 'out_prncp_inv', 'policy_code', 'hardship_flag']


<IPython.core.display.Javascript object>

In [14]:
features_keep = [
    "loan_amnt",
    "term",
    "emp_length",
    "home_ownership",
    "annual_inc",
    "purpose",
    "addr_state",
    "dti",
    "earliest_cr_line",
    "fico_range_low",
    "fico_range_high",
    "open_acc",
    "revol_util",
    "total_acc",
    "application_type",
    "tot_cur_bal",
    "acc_open_past_24mths",
    "mo_sin_old_rev_tl_op",
    "mo_sin_rcnt_rev_tl_op",
    "mort_acc",
    "mths_since_recent_inq",
    "num_actv_bc_tl",
    "pub_rec_bankruptcies",
    "issue_d",
    "grade",
    "int_rate",
]
drop_list = [col for col in df_rate.columns if col not in features_keep]
df_rate.drop(labels=drop_list, axis=1, inplace=True)
df_rate.shape

(1348059, 26)

<IPython.core.display.Javascript object>

In [15]:
df_rate["issue_d"] = pd.to_datetime(df_rate["issue_d"], format="%b-%Y", errors="coerce")
df_rate["issue_d"] = df_rate["issue_d"].dt.year.astype("category")
df_rate["earliest_cr_line"] = pd.to_datetime(
    df_rate["earliest_cr_line"], format="%b-%Y", errors="coerce"
)
date_today = dt.datetime.now()
df_rate["credit_history"] = (date_today - df_rate["earliest_cr_line"]).dt.days // 365
df_rate["credit_history"].fillna(0, inplace=True)
df_rate["credit_history"] = df_rate["credit_history"].astype(int)
df_rate.drop("earliest_cr_line", axis=1, inplace=True)
df_rate["fico_mean"] = df_rate[["fico_range_low", "fico_range_high"]].mean(axis=1)
df_rate.drop(["fico_range_low", "fico_range_high"], axis=1, inplace=True)
df_rate["term"] = df_rate["term"].str.extract("(\d+)")
df_rate["emp_length"] = (
    df_rate["emp_length"].str.replace("+", "_more").str.replace("<", "less_")
)
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
df_rate.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in df_rate.columns.values
]
df_rate = df_rate.apply(lambda x: x.astype("category") if x.dtype == "object" else x)

<IPython.core.display.Javascript object>

In [16]:
train_data = df_rate[df_rate["issue_d"] == 2015]
test_data = df_rate[df_rate["issue_d"] == 2016]

<IPython.core.display.Javascript object>

In [17]:
if set(train_data.columns) == set(test_data.columns):
    print("Column names match between train_data and test_data.")
else:
    print("Column names do not match between train_data and test_data.")

Column names match between train_data and test_data.


<IPython.core.display.Javascript object>

In [18]:
X_train = train_data.drop(columns=["int_rate", "issue_d"], axis=1)
y_train = train_data["int_rate"]
X_test = test_data.drop(columns=["int_rate", "issue_d"], axis=1)
y_test = test_data["int_rate"]


<IPython.core.display.Javascript object>

In [19]:
categorical_columns = X_train.select_dtypes(include=["category"]).columns
numeric_columns = X_train.select_dtypes(exclude=["category", "datetime64[ns]"]).columns
for feature in numeric_columns:
    X_train = replace_outliers_with_iqr(X_train, feature)

for feature in numeric_columns:
    X_test = replace_outliers_with_iqr(X_test, feature)

<IPython.core.display.Javascript object>

In [20]:
categorical_imputer = SimpleImputer(strategy="most_frequent")
numeric_imputer = SimpleImputer(strategy="median")
categorical_imputer.fit(X_train[categorical_columns])
numeric_imputer.fit(X_train[numeric_columns])

X_train[numeric_columns] = numeric_imputer.transform(X_train[numeric_columns])
X_train[categorical_columns] = categorical_imputer.transform(
    X_train[categorical_columns]
)

X_test[numeric_columns] = numeric_imputer.transform(X_test[numeric_columns])
X_test[categorical_columns] = categorical_imputer.transform(X_test[categorical_columns])

<IPython.core.display.Javascript object>

In [21]:
mode_value = X_train[categorical_columns[0]].mode()[0]

X_train = encode_categorical(X_train, categorical_columns, mode_value)
X_test = encode_categorical(X_test, categorical_columns, mode_value)

<IPython.core.display.Javascript object>

In [22]:
train_columns = set(X_train.columns)
test_columns = set(X_test.columns)

missing_columns_test = train_columns - test_columns
for column in missing_columns_test:
    X_test[column] = 0
missing_columns_train = test_columns - train_columns
for column in missing_columns_train:
    X_train[column] = 0
X_test = X_test[X_train.columns]

<IPython.core.display.Javascript object>

In [23]:
X_train.columns = [str(i) for i in X_train.columns]
X_test.columns = [str(i) for i in X_test.columns]

<IPython.core.display.Javascript object>

In [24]:
feature_correlation(X_train, y_train)

Unnamed: 0,variable,pearson_corr,F,p_value
0,grade__A,-0.615739,23908440.0,0.0
1,grade__E,0.459113,204077.6,0.0
2,term__60,0.449381,962.0421,0.0
3,grade__D,0.409237,1419337.0,0.0
4,grade__F,0.389882,64869.5,0.0
5,fico_mean,-0.381244,1018.489,0.0
6,grade__B,-0.342872,207132.4,0.0
7,acc_open_past_24mths,0.245393,228.4931,0.0
8,grade__G,0.238547,94359.8,0.0
9,purpose__credit_card,-0.217306,183.0927,0.0


<IPython.core.display.Javascript object>

In [26]:
feature_names = X_train.columns.tolist()
feature_names_df = pd.DataFrame({"IntFeatureName": feature_names})
feature_names_df.to_csv("interest_feature_names.csv", index=False)

<IPython.core.display.Javascript object>

Data is still too big for faster computation, therefore I make sample of train data:

In [24]:
sample_size = int(0.35 * len(X_train))
random_indices = np.random.choice(len(X_train), sample_size, replace=False)
X_train_sampled = X_train.iloc[random_indices]
y_train_sampled = y_train.iloc[random_indices]

<IPython.core.display.Javascript object>

In [25]:
pipeline_lrg = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "regressor",
            LinearRegression(n_jobs=-1),
        ),
    ]
)
pipeline_lrg.fit(X_train, y_train)
y_pred = pipeline_lrg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

scorer = make_scorer(mean_squared_error, greater_is_better=False)

param_grid_lrg = {
    "regressor__fit_intercept": [True, False],
}

grid_lrg = GridSearchCV(
    estimator=pipeline_lrg,
    param_grid=param_grid_lrg,
    scoring=scorer,
    n_jobs=-1,
    cv=5,
    verbose=1,
    return_train_score=False,
)
grid_lrg.fit(X_train, y_train)
y_pred = grid_lrg.predict(X_test)
print(f"Best parameters for LinearRegression: {grid_lrg.best_params_}")

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

Mean Squared Error: 1.6388562043535834e+22
R-squared: -6.448664895333815e+20
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters for LinearRegression: {'regressor__fit_intercept': True}
Mean Squared Error: 1.6388562043535834e+22
R-squared: -6.448664895333815e+20


<IPython.core.display.Javascript object>

In [32]:
scorer = make_scorer(mean_squared_error, greater_is_better=False)

model = DecisionTreeRegressor()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

param_grid_dt = {
    "max_depth": [
        10,
        30,
        40,
    ],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

grid_dt = GridSearchCV(
    estimator=model,
    param_grid=param_grid_dt,
    scoring=scorer,
    n_jobs=-1,
    cv=5,
    verbose=1,
    return_train_score=False,
)
grid_dt.fit(X_train, y_train)
y_pred = grid_dt.predict(X_test)
print(f"Best parameters for DecisionTreeRegressor: {grid_dt.best_params_}")

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

Mean Squared Error: 3.626764247428308
R-squared: 0.8572920111971167
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters for DecisionTreeRegressor: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Mean Squared Error: 2.705101054434348
R-squared: 0.893558140355933


<IPython.core.display.Javascript object>

In [33]:
scorer = make_scorer(mean_squared_error, greater_is_better=False)
model = XGBRegressor(learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

param_grid_xbr = {
    "n_estimators": [50, 100],
    "max_depth": [3, 7],
    "gamma": [0.1, 0.2],
}

grid_xbr = GridSearchCV(
    estimator=model,
    param_grid=param_grid_xbr,
    scoring=scorer,
    n_jobs=-1,
    cv=5,
    verbose=1,
    return_train_score=False,
)
grid_xbr.fit(X_train, y_train)
y_pred = grid_xbr.predict(X_test)
print(f"Best parameters for XGBRegressor: {grid_xbr.best_params_}")

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

Mean Squared Error: 2.666417786498488
R-squared: 0.8950802716528221
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters for XGBRegressor: {'gamma': 0.1, 'max_depth': 7, 'n_estimators': 100}
Mean Squared Error: 2.668914151956607
R-squared: 0.8949820432405129


<IPython.core.display.Javascript object>

In [37]:
scorer = make_scorer(mean_squared_error, greater_is_better=False)

pipeline_fr = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "regressor",
            RandomForestRegressor(random_state=42),
        ),
    ]
)

pipeline_fr.fit(X_train, y_train)
y_pred = pipeline_fr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

param_grid_rf = {
    "regressor__n_estimators": [50, 100, 150],
    "regressor__max_depth": [10, 20, 30],
    "regressor__min_samples_split": [2, 5, 10],
    "regressor__min_samples_leaf": [1, 2, 3],
    "regressor__max_features": ["Auto", "sqrt", "log2"],
}

random_search_rf = RandomizedSearchCV(
    estimator=pipeline_fr,
    param_distributions=param_grid_rf,
    scoring=scorer,
    n_jobs=-1,
    cv=5,
    verbose=1,
    return_train_score=False,
    n_iter=10,
)
random_search_rf.fit(X_train, y_train)
y_pred = random_search_rf.predict(X_test)
print(f"Best parameters for RandomForestRegressor: {random_search_rf.best_params_}")

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

Mean Squared Error: 2.689889584819468
R-squared: 0.8941566899410114
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for RandomForestRegressor: {'regressor__n_estimators': 150, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 2, 'regressor__max_features': 'sqrt', 'regressor__max_depth': 30}
Mean Squared Error: 2.9667049498838485
R-squared: 0.8832644010236699


<IPython.core.display.Javascript object>

In [39]:
regressors = [
    (
        "Random Forest",
        RandomForestRegressor(random_state=42),
    ),
    (
        "XGB Boosting",
        XGBRegressor(random_state=42, gamma=0.1, max_depth=7, n_estimators=100),
    ),
    (
        "Decision Tree",
        DecisionTreeRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=10),
    ),
]
pipeline_list = []
for name, regressor in regressors:
    pipeline = Pipeline(
        [
            ("scaler", StandardScaler()),
            ("select", SelectKBest(score_func=f_regression, k=31)),
            ("regressor", regressor),
        ]
    )
    pipeline_list.append((name, pipeline))

regressor = VotingRegressor(estimators=pipeline_list)
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error for Ensemble Model:", mse)
print("R-squared for Ensemble Model:", r2)

Mean Squared Error for Ensemble Model: 2.6839646326443862
R-squared for Ensemble Model: 0.8943898283395877


<IPython.core.display.Javascript object>

Ensemble model has a relatively low MSE, indicating good accuracy in predicting interest rates. High R-squared value shows that the model explains 89% of the variability in interest rates. These results shows that ensemble model is performing rather well in regression task for predicting loan interest rates.

In [40]:
joblib.dump(regressor, "interest_prediction.joblib")

['interest_prediction.joblib']

<IPython.core.display.Javascript object>