In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    OneHotEncoder,
    LabelEncoder,
    OrdinalEncoder,
    StandardScaler,
    FunctionTransformer,
    MinMaxScaler,
)
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.metrics import precision_score, f1_score, accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn import pipeline
from sklearn.feature_selection import f_classif
from sklearn.svm import LinearSVC
import joblib
import re

pd.set_option(
    "display.max_colwidth", 1000, "display.max_rows", None, "display.max_columns", None
)

import warnings

warnings.filterwarnings("ignore")
print("Setup complete")

Setup complete


<IPython.core.display.Javascript object>

In [3]:
def empty_rows(df):
    """Delete rows wit Null values"""
    null_rows = df[df.iloc[:, 1:].isnull().all(axis=1)].index
    df.drop(null_rows, inplace=True)

<IPython.core.display.Javascript object>

In [4]:
def missing_values_summary(df):
    """function gives summary of all features with missing values"""
    missing_values = df.isnull().sum()
    missing_values_percent = 100 * df.isnull().sum() / len(df)
    missing_values_type = df.dtypes
    missing_values_table = pd.concat(
        [missing_values, missing_values_percent, missing_values_type], axis=1
    )
    missing_values_table = missing_values_table.rename(
        columns={0: "Missing Values", 1: "% of Total Values", 2: "type"}
    )
    missing_values_table = (
        missing_values_table[missing_values_table.iloc[:, 1] != 0]
        .sort_values("% of Total Values", ascending=False)
        .round(4)
    )
    return missing_values_table

<IPython.core.display.Javascript object>

In [5]:
def single_value_features(df):
    """delete features that have only single value"""
    single_feature = []
    for column in list(df.columns):
        if df[column].unique().size <= 1:
            single_feature.append(column)
    return single_feature

<IPython.core.display.Javascript object>

In [6]:
def feature_correlation(X_train, y_train):
    """provides feature correlation list"""
    linear_dep = pd.DataFrame()
    for col in X_train.columns:
        linear_dep.loc[col, "pearson_corr"] = X_train[col].corr(y_train)
    linear_dep["abs_pearson_corr"] = abs(linear_dep["pearson_corr"])
    for col in X_train.columns:
        mask = X_train[col].notnull()
        (linear_dep.loc[col, "F"], linear_dep.loc[col, "p_value"]) = f_classif(
            pd.DataFrame(X_train.loc[mask, col]), y_train.loc[mask]
        )
    linear_dep.sort_values("abs_pearson_corr", ascending=False, inplace=True)
    linear_dep.drop("abs_pearson_corr", axis=1, inplace=True)
    linear_dep.reset_index(inplace=True)
    linear_dep.rename(columns={"index": "variable"}, inplace=True)

    return linear_dep

<IPython.core.display.Javascript object>

In [7]:
def replace_outliers_with_iqr(data, column_name, multiplier=1.5):
    """replacing outliers with upper and lower bound"""
    Q1 = data[column_name].quantile(0.25)
    Q3 = data[column_name].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    data[column_name] = data[column_name].apply(
        lambda x: lower_bound
        if x < lower_bound
        else (upper_bound if x > upper_bound else x)
    )

    return data

<IPython.core.display.Javascript object>

In [8]:
def encode_categorical(df, columns, mode_value):
    '''Encoding categorical value like OneHotEncoder'''
    encoded_dfs = []
    for column in columns:
        temp = pd.get_dummies(df[column], prefix=column, prefix_sep="__")
        if column + "__" + str(mode_value) in temp.columns:
            temp = temp.drop(column + "__" + str(mode_value), axis=1)
        temp = temp.astype(int)
        encoded_dfs.append(temp)

    df_encoded = pd.concat([df.drop(columns, axis=1)] + encoded_dfs, axis=1)
    return df_encoded


<IPython.core.display.Javascript object>

# Loan Grade Prediction

All steps are recreated as in the loan_status_modeling file before:

In [9]:
df_grade = pd.read_csv("accepted_2007_to_2018Q4.csv", low_memory=False)
df_grade.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68407277,,debt_consolidation,Debt consolidation,190xx,PA,5.91,0.0,Aug-2003,675.0,679.0,1.0,30.0,,7.0,0.0,2765.0,29.7,13.0,w,0.0,0.0,4421.723917,4421.72,3600.0,821.72,0.0,0.0,0.0,Jan-2019,122.67,,Mar-2019,564.0,560.0,0.0,30.0,1.0,Individual,,,,0.0,722.0,144904.0,2.0,2.0,0.0,1.0,21.0,4981.0,36.0,3.0,3.0,722.0,34.0,9300.0,3.0,1.0,4.0,4.0,20701.0,1506.0,37.2,0.0,0.0,148.0,128.0,3.0,3.0,1.0,4.0,69.0,4.0,69.0,2.0,2.0,4.0,2.0,5.0,3.0,4.0,9.0,4.0,7.0,0.0,0.0,0.0,3.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68355089,,small_business,Business,577xx,SD,16.06,1.0,Dec-1999,715.0,719.0,4.0,6.0,,22.0,0.0,21470.0,19.2,38.0,w,0.0,0.0,25679.66,25679.66,24700.0,979.66,0.0,0.0,0.0,Jun-2016,926.35,,Mar-2019,699.0,695.0,0.0,,1.0,Individual,,,,0.0,0.0,204396.0,1.0,1.0,0.0,1.0,19.0,18005.0,73.0,2.0,3.0,6472.0,29.0,111800.0,0.0,0.0,6.0,4.0,9733.0,57830.0,27.1,0.0,0.0,113.0,192.0,2.0,2.0,4.0,2.0,,0.0,6.0,0.0,5.0,5.0,13.0,17.0,6.0,20.0,27.0,5.0,22.0,0.0,0.0,0.0,2.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68341763,,home_improvement,,605xx,IL,10.78,0.0,Aug-2000,695.0,699.0,0.0,,,6.0,0.0,7869.0,56.2,18.0,w,0.0,0.0,22705.924294,22705.92,20000.0,2705.92,0.0,0.0,0.0,Jun-2017,15813.3,,Mar-2019,704.0,700.0,0.0,,1.0,Joint App,71000.0,13.85,Not Verified,0.0,0.0,189699.0,0.0,1.0,0.0,4.0,19.0,10827.0,73.0,0.0,2.0,2081.0,65.0,14000.0,2.0,5.0,1.0,6.0,31617.0,2737.0,55.9,0.0,0.0,125.0,184.0,14.0,14.0,5.0,101.0,,10.0,,0.0,2.0,3.0,2.0,4.0,6.0,4.0,7.0,3.0,6.0,0.0,0.0,0.0,0.0,100.0,50.0,0.0,0.0,218418.0,18696.0,6200.0,14877.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,n,https://lendingclub.com/browse/loanDetail.action?loan_id=66310712,,debt_consolidation,Debt consolidation,076xx,NJ,17.06,0.0,Sep-2008,785.0,789.0,0.0,,,13.0,0.0,7802.0,11.6,17.0,w,15897.65,15897.65,31464.01,31464.01,19102.35,12361.66,0.0,0.0,0.0,Feb-2019,829.9,Apr-2019,Mar-2019,679.0,675.0,0.0,,1.0,Individual,,,,0.0,0.0,301500.0,1.0,1.0,0.0,1.0,23.0,12609.0,70.0,1.0,1.0,6987.0,45.0,67300.0,0.0,1.0,0.0,2.0,23192.0,54962.0,12.1,0.0,0.0,36.0,87.0,2.0,2.0,1.0,2.0,,,,0.0,4.0,5.0,8.0,10.0,2.0,10.0,13.0,5.0,13.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,0.0,381215.0,52226.0,62500.0,18000.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=68476807,,major_purchase,Major purchase,174xx,PA,25.37,1.0,Jun-1998,695.0,699.0,3.0,12.0,,12.0,0.0,21929.0,64.5,35.0,w,0.0,0.0,11740.5,11740.5,10400.0,1340.5,0.0,0.0,0.0,Jul-2016,10128.96,,Mar-2018,704.0,700.0,0.0,,1.0,Individual,,,,0.0,0.0,331730.0,1.0,3.0,0.0,3.0,14.0,73839.0,84.0,4.0,7.0,9702.0,78.0,34000.0,2.0,1.0,3.0,10.0,27644.0,4567.0,77.5,0.0,0.0,128.0,210.0,4.0,4.0,6.0,4.0,12.0,1.0,12.0,0.0,4.0,6.0,5.0,9.0,10.0,7.0,19.0,6.0,12.0,0.0,0.0,0.0,4.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


<IPython.core.display.Javascript object>

In [10]:
df_grade["loan_status"] = df_grade["loan_status"].str.replace(
    "Does not meet the credit policy. Status:", ""
)
df_grade = df_grade[
    (df_grade["loan_status"] == "Fully Paid")
    | (df_grade["loan_status"] == "Charged Off")
]
df_grade["loan_status"] = np.where(df_grade["loan_status"] == "Fully Paid", 1, 0)

<IPython.core.display.Javascript object>

Cleaning missing values, single rows:

In [11]:
empty_rows(df_grade)
missing_vals = 100 * df_grade.isnull().sum() / len(df_grade)
drop_list = sorted(missing_vals[missing_vals > 30].index)
df_grade.drop(labels=drop_list, axis=1, inplace=True)

<IPython.core.display.Javascript object>

In [12]:
single_value_feature = single_value_features(df_grade)
print(single_value_feature)
df_grade.drop(single_value_feature, axis=1, inplace=True)

['pymnt_plan', 'out_prncp', 'out_prncp_inv', 'policy_code', 'hardship_flag']


<IPython.core.display.Javascript object>

In [13]:
features_keep = [
    "loan_amnt",
    "term",
    "emp_length",
    "home_ownership",
    "annual_inc",
    "purpose",
    "addr_state",
    "dti",
    "earliest_cr_line",
    "fico_range_low",
    "fico_range_high",
    "open_acc",
    "revol_util",
    "total_acc",
    "application_type",
    "tot_cur_bal",
    "acc_open_past_24mths",
    "mo_sin_old_rev_tl_op",
    "mo_sin_rcnt_rev_tl_op",
    "mort_acc",
    "mths_since_recent_inq",
    "num_actv_bc_tl",
    "pub_rec_bankruptcies",
    "issue_d",
    "grade",
]
drop_list = [col for col in df_grade.columns if col not in features_keep]
df_grade.drop(labels=drop_list, axis=1, inplace=True)
df_grade.shape

(1348059, 25)

<IPython.core.display.Javascript object>

In [14]:
df_grade["issue_d"] = pd.to_datetime(
    df_grade["issue_d"], format="%b-%Y", errors="coerce"
)
df_grade["issue_d"] = df_grade["issue_d"].dt.year.astype("category")
df_grade["earliest_cr_line"] = pd.to_datetime(
    df_grade["earliest_cr_line"], format="%b-%Y", errors="coerce"
)
date_today = dt.datetime.now()
df_grade["credit_history"] = (date_today - df_grade["earliest_cr_line"]).dt.days // 365
df_grade["credit_history"].fillna(0, inplace=True)
df_grade["credit_history"] = df_grade["credit_history"].astype(int)
df_grade.drop("earliest_cr_line", axis=1, inplace=True)
df_grade["fico_mean"] = df_grade[["fico_range_low", "fico_range_high"]].mean(axis=1)
df_grade.drop(["fico_range_low", "fico_range_high"], axis=1, inplace=True)
df_grade["term"] = df_grade["term"].str.extract("(\d+)")
df_grade["emp_length"] = (
    df_grade["emp_length"].str.replace("+", "_more").str.replace("<", "less_")
)
df_grade = df_grade.apply(lambda x: x.astype("category") if x.dtype == "object" else x)

<IPython.core.display.Javascript object>

In [15]:
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
df_grade.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in df_grade.columns.values
]

<IPython.core.display.Javascript object>

In [16]:
sample_data = df_grade[df_grade["issue_d"] == 2015]
sample_data.head()

Unnamed: 0,loan_amnt,term,grade,emp_length,home_ownership,annual_inc,issue_d,purpose,addr_state,dti,open_acc,revol_util,total_acc,application_type,tot_cur_bal,acc_open_past_24mths,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mort_acc,mths_since_recent_inq,num_actv_bc_tl,pub_rec_bankruptcies,credit_history,fico_mean
0,3600.0,36,C,10_more years,MORTGAGE,55000.0,2015,debt_consolidation,PA,5.91,7.0,29.7,13.0,Individual,144904.0,4.0,128.0,3.0,1.0,4.0,2.0,0.0,20,677.0
1,24700.0,36,C,10_more years,MORTGAGE,65000.0,2015,small_business,SD,16.06,22.0,19.2,38.0,Individual,204396.0,4.0,192.0,2.0,4.0,0.0,5.0,0.0,23,717.0
2,20000.0,60,B,10_more years,MORTGAGE,63000.0,2015,home_improvement,IL,10.78,6.0,56.2,18.0,Joint App,189699.0,6.0,184.0,14.0,5.0,10.0,2.0,0.0,23,697.0
4,10400.0,60,F,3 years,MORTGAGE,104433.0,2015,major_purchase,PA,25.37,12.0,64.5,35.0,Individual,331730.0,10.0,210.0,4.0,6.0,1.0,4.0,0.0,25,697.0
5,11950.0,36,C,4 years,RENT,34000.0,2015,debt_consolidation,GA,10.2,5.0,68.4,6.0,Individual,12798.0,0.0,54.0,32.0,0.0,,2.0,0.0,36,692.0


<IPython.core.display.Javascript object>

Encoding target values in column "grade":

In [17]:
label_encoder = LabelEncoder()
sample_data["grade"] = label_encoder.fit_transform(sample_data["grade"])
target_mappings = {index: label for index, label in enumerate(label_encoder.classes_)}

<IPython.core.display.Javascript object>

In [18]:
df_target_mappings = pd.DataFrame(
    list(target_mappings.items()), columns=["Index", "Label"]
)
df_target_mappings.to_csv("grade_target_mappings.csv", index=False)

<IPython.core.display.Javascript object>

In [40]:
X = sample_data.drop(columns=["grade", "issue_d"], axis=1)
y = sample_data["grade"]

<IPython.core.display.Javascript object>

In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

<IPython.core.display.Javascript object>

In [42]:
categorical_columns = X_train.select_dtypes(include=["category"]).columns
numeric_columns = X_train.select_dtypes(exclude=["category", "datetime64[ns]"]).columns
for feature in numeric_columns:
    X_train = replace_outliers_with_iqr(X_train, feature)

for feature in numeric_columns:
    X_test = replace_outliers_with_iqr(X_test, feature)

<IPython.core.display.Javascript object>

In [43]:
categorical_imputer = SimpleImputer(strategy="most_frequent")
numeric_imputer = SimpleImputer(strategy="median")
categorical_imputer.fit(X_train[categorical_columns])
numeric_imputer.fit(X_train[numeric_columns])

X_train[numeric_columns] = numeric_imputer.transform(X_train[numeric_columns])
X_train[categorical_columns] = categorical_imputer.transform(
    X_train[categorical_columns]
)

X_test[numeric_columns] = numeric_imputer.transform(X_test[numeric_columns])
X_test[categorical_columns] = categorical_imputer.transform(X_test[categorical_columns])

<IPython.core.display.Javascript object>

In [44]:
mode_value = X_train[categorical_columns[0]].mode()[0]

X_train = encode_categorical(X_train, categorical_columns, mode_value)
X_test = encode_categorical(X_test, categorical_columns, mode_value)

<IPython.core.display.Javascript object>

In [65]:
train_columns = set(X_train.columns)
test_columns = set(X_test.columns)

missing_columns_test = train_columns - test_columns
for column in missing_columns_test:
    X_test[column] = 0
missing_columns_train = test_columns - train_columns
for column in missing_columns_train:
    X_train[column] = 0
X_test = X_test[X_train.columns]

<IPython.core.display.Javascript object>

In [55]:
feature_correlation(X_train, y_train)

Unnamed: 0,variable,pearson_corr,F,p_value
0,term__60,0.6254581,1194.362793,0.0
1,fico_mean,-0.3839323,506.551508,0.0
2,acc_open_past_24mths,0.3140695,194.94373,1.5923079999999999e-236
3,purpose__credit_card,-0.2910071,173.852615,0.0
4,loan_amnt,0.2608854,157.394206,3.355735e-192
5,mths_since_recent_inq,-0.2360097,107.27617,9.385186e-132
6,mo_sin_old_rev_tl_op,-0.2152367,92.298531,2.09539e-113
7,mo_sin_rcnt_rev_tl_op,-0.2008406,78.972284,5.667316e-97
8,credit_history,-0.1988047,79.400943,1.672242e-97
9,dti,0.162676,76.939306,1.8564939999999998e-94


<IPython.core.display.Javascript object>

In [89]:
ros = RandomUnderSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)

pipeline_log = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "classifier",
            LogisticRegression(multi_class="multinomial", random_state=42),
        ),
    ]
)
pipeline_log.fit(X_train, y_train)
y_pred = pipeline_log.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


param_grid_log = {
    "classifier__solver": ["sag", "saga"],
    "classifier__C": [0.0, 0.001, 0.01, 0.1, 1],
    "classifier__max_iter": [800, 900, 1000],
    "classifier__penalty": ["l1", "l2", "elasticnet"],
}
scorer = make_scorer(f1_score, average="macro")
grid_log = GridSearchCV(
    estimator=pipeline_log,
    param_grid=param_grid_log,
    scoring=scorer,
    n_jobs=-1,
    cv=5,
    verbose=1,
    return_train_score=False,
)
grid_log.fit(X_train, y_train)
y_pred = grid_log.predict(X_test)
print(f"Best parameters for LogisticRegression: {grid_log.best_params_}")
print(f"Best Score: {grid_log.best_score_}")
report = classification_report(y_test, y_pred)
print("LogisticRegression Classification Report with Best Parameters:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.71      0.64     14378
           1       0.46      0.40      0.43     21383
           2       0.42      0.27      0.33     20923
           3       0.26      0.28      0.27     10696
           4       0.20      0.21      0.20      5707
           5       0.10      0.31      0.16      1643
           6       0.05      0.47      0.10       379

    accuracy                           0.39     75109
   macro avg       0.30      0.38      0.30     75109
weighted avg       0.41      0.39      0.39     75109

Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best parameters for LogisticRegression: {'classifier__C': 1, 'classifier__max_iter': 800, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
Best Score: 0.382198732405055
LogisticRegression Classification Report with Best Parameters:
               precision    recall  f1-score   support

           0       

<IPython.core.display.Javascript object>

In [66]:
model_xgb = XGBClassifier(objective="multi:softmax", random_state=42, n_jobs=-1)

model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

param_grid_xgb = {
    "n_estimators": [100, 200, 300, 400],
    "max_depth": [3, 4, 5, 6],
    "learning_rate": [0.1, 0.2, 0.3, 0.4],
}
grid_xgb = GridSearchCV(
    estimator=model_xgb,
    param_grid=param_grid_xgb,
    scoring="f1",
    n_jobs=-1,
    cv=5,
    verbose=1,
    return_train_score=False,
)
grid_xgb.fit(X_train, y_train)
y_pred = grid_xgb.predict(X_test)
print(f"Best parameters for XGBClassifier: {grid_xgb.best_params_}")
print(f"Best Score: {grid_xgb.best_score_}")
report = classification_report(y_test, y_pred)
print("XGBClassifier Report with Best Parameters:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.70      0.64     14378
           1       0.46      0.41      0.44     21383
           2       0.41      0.28      0.34     20923
           3       0.26      0.24      0.25     10696
           4       0.21      0.26      0.23      5707
           5       0.11      0.31      0.16      1643
           6       0.06      0.49      0.11       379

    accuracy                           0.39     75109
   macro avg       0.30      0.39      0.31     75109
weighted avg       0.42      0.39      0.40     75109

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters for XGBClassifier: {'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best Mean F1 Score: nan
XGBClassifier Report with Best Parameters:
               precision    recall  f1-score   support

           0       0.49      0.68      0.57     14378
           1       0.41      0

<IPython.core.display.Javascript object>

In [77]:
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

param_grid_rf = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, 30],
    "min_samples_split": [3, 4, 5],
    "min_samples_leaf": [1, 3, 5],
}
grid_rf = GridSearchCV(
    estimator=model_rf,
    param_grid=param_grid_rf,
    scoring="f1",
    n_jobs=-1,
    cv=5,
    verbose=1,
    return_train_score=False,
)
grid_rf.fit(X_train, y_train)
y_pred = grid_rf.predict(X_test)
print(f"Best parameters for RandomForestClassifier: {grid_rf.best_params_}")
print(f"Best Mean F1 Score: {grid_rf.best_score_}")
report = classification_report(y_test, y_pred)
print("RandomForestClassifier Report with Best Parameters:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.73      0.61     14378
           1       0.44      0.37      0.40     21383
           2       0.41      0.25      0.31     20923
           3       0.26      0.22      0.24     10696
           4       0.19      0.19      0.19      5707
           5       0.09      0.30      0.14      1643
           6       0.05      0.51      0.09       379

    accuracy                           0.37     75109
   macro avg       0.28      0.37      0.28     75109
weighted avg       0.39      0.37      0.37     75109

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters for RandomForestClassifier: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
Best Mean F1 Score: nan
RandomForestClassifier Report with Best Parameters:
               precision    recall  f1-score   support

           0       0.50      0.76      0.61     14378
 

<IPython.core.display.Javascript object>

In [79]:
pipeline_rf = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        (
            "classifier",
            KNeighborsClassifier(n_jobs=-1),
        ),
    ]
)
pipeline_rf.fit(X_train, y_train)
y_pred = pipeline_rf.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


param_grid_rf = {
    "classifier__n_neighbors": [3, 5, 7, 9],
    "classifier__weights": ["uniform", "distance"],
    "classifier__leaf_size": [10, 20, 30, 40],
}
grid_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    scoring="f1",
    n_jobs=-1,
    cv=5,
    verbose=1,
    return_train_score=False,
)
grid_rf.fit(X_train, y_train)
y_pred = grid_rf.predict(X_test)
print(f"Best parameters for KNeighborsClassifier: {grid_rf.best_params_}")
print(f"Best Mean F1 Score: {grid_rf.best_score_}")
report = classification_report(y_test, y_pred)
print("KNeighborsClassifier Report with Best Parameters:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.37      0.57      0.45     14378
           1       0.34      0.27      0.30     21383
           2       0.33      0.19      0.24     20923
           3       0.19      0.15      0.17     10696
           4       0.15      0.18      0.17      5707
           5       0.07      0.24      0.11      1643
           6       0.02      0.23      0.04       379

    accuracy                           0.28     75109
   macro avg       0.21      0.26      0.21     75109
weighted avg       0.30      0.28      0.28     75109

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best parameters for KNeighborsClassifier: {'classifier__leaf_size': 10, 'classifier__n_neighbors': 3, 'classifier__weights': 'uniform'}
Best Mean F1 Score: nan
KNeighborsClassifier Report with Best Parameters:
               precision    recall  f1-score   support

           0       0.33      0.60      0.43     143

<IPython.core.display.Javascript object>

In [90]:
classifiers = [
    (
        "Random Forest",
        RandomForestClassifier(
            random_state=42,
            max_depth=10,
            n_estimators=100,
            min_samples_leaf=1,
            min_samples_split=3,
            n_jobs=-1,
        ),
    ),
    (
        "LogisticRegression",
        LogisticRegression(
            multi_class="multinomial",
            solver="saga",
            random_state=42,
            C=1,
            max_iter=800,
            penalty="l2",
        ),
    ),
]


pipeline_list = []
for name, classifier in classifiers:
    pipeline = Pipeline(
        [
            ("scaler", StandardScaler()),
            ("select", SelectKBest(k=25, score_func=f_regression)),
            ("classifier", classifier),
        ]
    )

    pipeline_list.append((name, pipeline))

classifier = VotingClassifier(estimators=pipeline_list, voting="hard")

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, y_pred)
ensemble_f1 = f1_score(y_test, y_pred, average="macro")

print("Ensemble Model Accuracy:", ensemble_accuracy)
print("Ensemble Model F1 Score (Macro):", ensemble_f1)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Ensemble Model Accuracy: 0.4057037106072508
Ensemble Model F1 Score (Macro): 0.3095676297768745
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.78      0.63     14378
           1       0.44      0.44      0.44     21383
           2       0.44      0.27      0.34     20923
           3       0.32      0.20      0.25     10696
           4       0.21      0.23      0.22      5707
           5       0.13      0.30      0.18      1643
           6       0.07      0.45      0.12       379

    accuracy                           0.41     75109
   macro avg       0.30      0.38      0.31     75109
weighted avg       0.41      0.41      0.40     75109



<IPython.core.display.Javascript object>

In summary, the model has an overall accuracy of about 41%, with varying precision, recall, and F1-scores for each class.  Model performs relatively well for some classes (class 0) but less so for others (e.g., class 6). The macro F1-score of 0.31 shows need of further improvement in the model's overall performance.

In [91]:
joblib.dump(classifier, "grade_prediction.joblib")

['grade_prediction.joblib']

<IPython.core.display.Javascript object>