# Model Selection and Model Evaluation

## Model Selection

In [1]:
# Import important dependencies
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_validate
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_curve, roc_auc_score, make_scorer, f1_score, precision_recall_curve, average_precision_score

In [2]:
# Load the cleaned data
loans_info = pd.read_csv("../Data/CleanData/Cleaned_Loans_Data.csv")

# Preview
loans_info.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,56,85994,50587,520,80,4,15.23,36,0.44,Bachelor's,Full-time,Divorced,Yes,Yes,Other,Yes,0
1,69,50432,124440,458,15,1,4.81,60,0.68,Master's,Full-time,Married,No,No,Other,Yes,0
2,46,84208,129188,451,26,3,21.17,24,0.31,Master's,Unemployed,Divorced,Yes,Yes,Auto,No,1
3,32,31713,44799,743,0,3,7.07,24,0.23,High School,Full-time,Married,No,No,Business,No,0
4,60,20437,9139,633,8,4,6.51,48,0.73,Bachelor's,Unemployed,Divorced,No,Yes,Auto,No,0


In [3]:
# Assign predictors and target variable to their respective variables.
X = loans_info.drop("Default", axis=1)
y = loans_info["Default"]

# Split the data into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
# Identify column types
categorical_cols = X.select_dtypes(include='object').columns.tolist()

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(), categorical_cols)
    ]
)

# Model pipeline
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('SMOTE', SMOTE(random_state=42)),
    ('forest', RandomForestClassifier(criterion= 'entropy', min_samples_split= 2, min_samples_leaf= 1, max_depth= 20, class_weight= 'balanced', n_estimators= 100))
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.54      0.68     45139
           1       0.13      0.53      0.21      5931

    accuracy                           0.54     51070
   macro avg       0.52      0.54      0.45     51070
weighted avg       0.81      0.54      0.62     51070

