In [82]:
import numpy as np
import joblib
import os
from custom_transformers import SimpleImputerWithMapping, CustomBinning
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials, space_eval
from sklearn.model_selection import cross_val_score


In [3]:
preprocessor = joblib.load("loan_preprocessor.pkl")
preprocessor

In [7]:
loan_train = pd.read_csv(os.path.abspath(os.path.join(os.getcwd(), 'data\\train_loanpred.csv')))
loan_train.drop(columns=['Loan_ID'], axis=1, inplace=True)
loan_train["Loan_Amount_Term"] = loan_train["Loan_Amount_Term"].astype('category')
loan_train.shape

(614, 12)

In [23]:
X = loan_train.loc[:, ~loan_train.columns.isin(["Loan_Status"])]
y = loan_train["Loan_Status"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 20, stratify=y)

In [24]:
loan_train_preprocessed = preprocessor.fit_transform(X_train)
loan_train_preprocessed.shape

(491, 15)

In [25]:
column_transformer = preprocessor.named_steps["combine_transforms"]
cat_pipeline = next(pipe for name, pipe, cols in column_transformer.transformers if name == "cat_pipeline")
onehot_encoder = cat_pipeline.named_steps['onehot']
cat_columns = onehot_encoder.get_feature_names_out(X_train.select_dtypes(include=['category', 'object']).columns.to_list())
numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
bin_cols = ["Credit_History"]

feature_names = numeric_cols.copy()
feature_names.extend(cat_columns)
feature_names.extend(bin_cols)
feature_names

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Gender_Male',
 'Married_Yes',
 'Dependents_1',
 'Dependents_2',
 'Dependents_3+',
 'Education_Not Graduate',
 'Self_Employed_Yes',
 'Loan_Amount_Term_Medium',
 'Loan_Amount_Term_Short',
 'Property_Area_Semiurban',
 'Property_Area_Urban',
 'Credit_History']

In [27]:
X_df = pd.DataFrame(loan_train_preprocessed, columns=feature_names, index=X_train.index)
X_df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Loan_Amount_Term_Medium,Loan_Amount_Term_Short,Property_Area_Semiurban,Property_Area_Urban,Credit_History
104,8.24722,6.626718,5.081404,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
132,7.908019,0.0,4.26268,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
247,8.796188,0.0,4.927254,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
395,8.094684,6.184149,4.912655,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
506,9.944342,8.805075,6.175867,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [36]:
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators=500, random_state=1, oob_score=True)
rf_pipe = make_pipeline(preprocessor, random_forest)
rf_pipe.fit(X_train, y_train)

In [41]:
#Grid search CV
param_grid = {
    'randomforestclassifier__max_depth': [3, 5, 10, None],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 5],
    'randomforestclassifier__max_leaf_nodes': [2, 5, 10, 15],
}

grid_search = GridSearchCV(rf_pipe, param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print(f'Best Score: {grid_search.best_score_}')

180 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\EdwinVivekN\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\EdwinVivekN\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\E

Best Parameters: {'randomforestclassifier__max_depth': 5, 'randomforestclassifier__max_leaf_nodes': 10, 'randomforestclassifier__min_samples_leaf': 1, 'randomforestclassifier__min_samples_split': 2}
Best Score: 0.8125747268604411


In [None]:
#Randomize Search CV

In [92]:
#Hyperopt
#1. initialize space
space = {
    "n_estimators": hp.choice("n_estimators", [100, 200, 300, 400,500,600]),
    "max_depth": hp.randint("max_depth", 1, 15),
    'min_samples_split': hp.randint("min_samples_split", 2, 11),
    'min_samples_leaf': hp.randint('min_samples_leaf', 1, 6),
    'max_leaf_nodes': hp.randint("max_leaf_nodes", 2, 16),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
}

In [93]:
#2. define objective funciton
def objective(params):
    clf = RandomForestClassifier(**params,n_jobs=-1)
    acc = cross_val_score(clf, X_df, y_train,scoring="accuracy", error_score='raise').mean()
    return {"loss": -acc, "status": STATUS_OK}

In [95]:
#3. initialize trials
trials = Trials()

In [96]:
#4. Minimization function
best = fmin(
    fn=objective,
    space = space, 
    algo=tpe.suggest, 
    max_evals=100, 
    trials=trials
)

100%|███████████████████████████████████████████████████████████████████████| 100/100 [05:22<00:00,  3.22s/trial, best loss: -0.8125747268604411]


In [97]:
print("Best: {}".format(space_eval(space, best)))


Best: {'criterion': 'gini', 'max_depth': 9, 'max_leaf_nodes': 11, 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 500}


In [86]:
trials.results

[{'loss': -0.792269635126778, 'status': 'ok'},
 {'loss': -0.8125747268604411, 'status': 'ok'},
 {'loss': -0.7983920841063699, 'status': 'ok'},
 {'loss': -0.8024324881467738, 'status': 'ok'},
 {'loss': -0.8085137085137084, 'status': 'ok'},
 {'loss': -0.7860853432282002, 'status': 'ok'},
 {'loss': -0.8125747268604411, 'status': 'ok'},
 {'loss': -0.8105339105339106, 'status': 'ok'},
 {'loss': -0.8125747268604411, 'status': 'ok'},
 {'loss': -0.8125747268604411, 'status': 'ok'},
 {'loss': -0.8064935064935064, 'status': 'ok'},
 {'loss': -0.8105339105339106, 'status': 'ok'},
 {'loss': -0.686353329210472, 'status': 'ok'},
 {'loss': -0.802473716759431, 'status': 'ok'},
 {'loss': -0.8125747268604411, 'status': 'ok'},
 {'loss': -0.8024531024531025, 'status': 'ok'},
 {'loss': -0.8105339105339106, 'status': 'ok'},
 {'loss': -0.8125747268604411, 'status': 'ok'},
 {'loss': -0.8125747268604411, 'status': 'ok'},
 {'loss': -0.7861265718408575, 'status': 'ok'},
 {'loss': -0.8125747268604411, 'status': 'o

In [98]:
trials.best_trial

{'state': 2,
 'tid': 1,
 'spec': None,
 'result': {'loss': -0.8125747268604411, 'status': 'ok'},
 'misc': {'tid': 1,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'criterion': [1],
   'max_depth': [1],
   'max_leaf_nodes': [1],
   'min_samples_leaf': [1],
   'min_samples_split': [1],
   'n_estimators': [1]},
  'vals': {'criterion': [0],
   'max_depth': [9],
   'max_leaf_nodes': [11],
   'min_samples_leaf': [2],
   'min_samples_split': [6],
   'n_estimators': [4]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2025, 3, 24, 9, 9, 27, 503000),
 'refresh_time': datetime.datetime(2025, 3, 24, 9, 9, 31, 475000)}

In [None]:
#Hyperopt-Sklearn