# Personal Bank Loan Classification

**Aim:** To use different classification models to predict the likelihood that a customer will buy a personal loan.

# Importing Libraries

In [None]:
#For reading data visualisation
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import seaborn as sns

#Preprocessing of Data
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler

#Models for Predictions 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

#Model Evaluation
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report, RocCurveDisplay, ConfusionMatrixDisplay
from scipy import stats
from sklearn.base import clone 
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


# Reading The Data

In [None]:
# Load the 'Data' sheet of the workbook. The path is an absolute Kaggle input path; adjust it when running elsewhere.
df = pd.read_excel('/kaggle/input/bank-loan-modelling/Bank_Personal_Loan_Modelling.xlsx', sheet_name='Data')

In [None]:
df.head()

In [None]:
df.describe(include='all')

**Dataset basic information:**
* The dataset has data on 5000 customers.
* We have 14 variables including 13 independent variables and 1 dependent variable which is Personal Loan. 
* We have 6 numeric variables: ID, Age, Experience, Income, CCAvg, Mortgage
* We have 3 categorical variables: Family, Education, ZIP Code
* We have 5 Boolean variables: Personal Loan, Securities Account, CD_Account, Online, Credit_Card

# Correlation Analysis

In [None]:
corr_matrix = df.corr()

In [None]:
# Annotated heatmap of the pairwise correlations stored in corr_matrix
fig, ax = plt.subplots(figsize=(16, 9))
sns.heatmap(corr_matrix, annot=True, cmap='Greens', cbar=True, ax=ax)
ax.set_title('Heatmap of Feature Correlations')
plt.show()

**Conclusion**
* Personal Loan is highly correlated with Income, CD_Account and CCAvg.
* Experience is highly correlated with Age (ρ = 0.99).
* CCAvg is correlated with Income to a good extent (ρ = 0.58).

# Data Cleansing

**1.Noise Treatment**

In [None]:
# Number of customers for each value of the Personal Loan target
ax = sns.countplot(x=df['Personal Loan'])
ax.set_title('No of People who took the loan')
ax.grid()

We can see that more than 4000 people who had their account in the bank didn't take the loan and only around 500 people took the loan.

In [None]:
# Age plotted against the Personal Loan flag
fig, ax = plt.subplots(figsize=(10, 6), dpi=90)
ax.scatter(df['Personal Loan'], df['Age'], color='red')
ax.set_title("Loan WRT Age")
ax.set_xlabel('Personal Loan')
ax.set_ylabel('Age')
ax.set_xticks(np.arange(0, 2, 1))
ax.set_yticks(np.arange(15, 80, 5))
ax.grid()
plt.show()

Customers who did not accept the loan are between 23 and about 67 years old, while those who took the loan are between 25 and 65.

In [None]:
# Experience plotted against the Personal Loan flag
fig, ax = plt.subplots(figsize=(10, 6), dpi=90)
ax.scatter(df['Personal Loan'], df['Experience'], color='red')
ax.set_title("Chart")
ax.set_xlabel('Personal Loan')
ax.set_ylabel('Experience')
ax.set_xticks(np.arange(0, 2, 1))
ax.grid()
plt.show()

There are some negative Experience values, which need to be fixed.

In [None]:
df[df['Experience']<0]['Experience'].count()

In [None]:
df[df['Experience']<0]['Experience'].value_counts()

These values may be the result of incorrect input or readings, and can be corrected simply by taking their absolute value.


In [None]:
# Replace negative experience values with their magnitude (vectorised absolute value)
df['Experience'] = df['Experience'].abs()

In [None]:
df[df['Experience']<0]['Experience'].count()

In [None]:
# Income plotted against the Personal Loan flag
fig, ax = plt.subplots(figsize=(10, 6), dpi=90)
ax.scatter(df['Personal Loan'], df['Income'], color='red')
ax.set_title("Chart")
ax.set_xlabel('Personal Loan')
ax.set_ylabel('Income')
ax.set_xticks(np.arange(0, 2, 1))
ax.set_yticks(np.arange(5, 300, 50))
ax.grid()
plt.show()

We can conclude from the graph that people who took loan made around 55 to 205, while people who didn't take the loan have income between 5 to 255.

In [None]:
# ZIP Code plotted against the Personal Loan flag (used to spot out-of-range codes)
fig, ax = plt.subplots(figsize=(10, 6), dpi=90)
ax.scatter(df['Personal Loan'], df['ZIP Code'], color='red')
ax.set_title("Chart")
ax.set_xlabel('Personal Loan')
ax.set_ylabel('ZIP Code')
ax.grid()
plt.show()

There is noise in this feature: all other ZIP codes are above 90,000, but one sample is around 9,000.

In [None]:
df[df['ZIP Code']<90000]

We found one noisy record in ZIP Code. We drop that sample because its ZIP code has only 4 digits, while all other values of this feature have 5 digits:

In [None]:
# Remove the rows whose ZIP Code is below 20000, then renumber the index from 0
noisy_zip_rows = df.index[np.asarray(df['ZIP Code'] < 20000)]
df.drop(index=noisy_zip_rows, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Number of customers for each family size
ax = sns.countplot(x=df['Family'])
ax.set_title('No of family members')
ax.grid()

In [None]:
# Customers per family size, split by the Personal Loan flag.
# (The former `grouped_df` aggregate was computed but never read, so it has been removed.)
plt.figure(figsize=(10, 6))
sns.countplot(x='Family', hue='Personal Loan', data=df, palette='viridis')
plt.xlabel('No of Family Members')
plt.ylabel('Count')
plt.title('No of Family Members and Loan Status')
plt.legend(title='Loan')
plt.show()

The number of family members does not help us much in telling apart the people who took the loan from those who did not.

In [None]:
# CCAvg plotted against the Personal Loan flag
fig, ax = plt.subplots(figsize=(10, 6), dpi=90)
ax.scatter(df['Personal Loan'], df['CCAvg'], color='red')
ax.set_title("Chart")
ax.set_xlabel('Personal Loan')
ax.set_ylabel('CCAvg')
ax.set_xticks(np.arange(0, 2, 1))
ax.grid()
plt.show()

In [None]:
# Number of customers for each Education level
ax = sns.countplot(x=df['Education'])
ax.set_title('No of educated people')
ax.grid()

More than 2000 individuals hold undergraduate degrees, while approximately 1500 people are in PhD positions. Additionally, around 1300 individuals have obtained master's degrees.

In [None]:
# Histogram of mortgage amounts with the number of customers printed above each bar
data = df['Mortgage']
bins = 10
plt.figure(figsize=(10,6),dpi=90)
hist, edges, _ = plt.hist(data, bins=bins, edgecolor='black')

# plt.hist returns the counts as floats; cast to int so the labels read "105" instead of "105.0".
# Each label is centred horizontally on its bar and sits just above the bar top.
for left, right, count in zip(edges[:-1], edges[1:], hist):
    plt.text(left + (right - left)/2, count, str(int(count)), ha='center', va='bottom')

plt.xticks(np.arange(0,700,50))
plt.title("Chart")
plt.xlabel('amount of mortgage')
plt.ylabel('No of people')
plt.show()

In [None]:
# Mortgage distribution: histogram on the left, box plot on the right
sns.set(rc={'axes.labelsize': 15})
fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), dpi=120)
sns.histplot(data=df, x='Mortgage', color='royalblue', ax=hist_ax)
sns.boxplot(data=df, x='Mortgage', color='royalblue', ax=box_ax)
fig.suptitle('Mortgage Distribution', fontsize=20)
fig.tight_layout()
plt.show()

In [None]:
# Number of rows whose Mortgage z-score is greater than 3
(stats.zscore(df['Mortgage']) > 3).sum()

We found 105 records with a Z-score mortgage value greater than 3. Therefore, we consider these 105 records as outliers and filter out these records from our dataset:

In [None]:
# Drop every row whose Mortgage z-score exceeds 3, then renumber the index from 0
mortgage_is_outlier = np.asarray(stats.zscore(df['Mortgage']) > 3)
outlier_indexes = df.index[mortgage_is_outlier]
df.drop(index=outlier_indexes, inplace=True)
df.reset_index(drop=True, inplace=True)

# Missing Value Treatment

In [None]:
df.isnull().sum().sum()

# Duplicate Values Treatment

In [None]:
# Number of rows that are an exact copy of at least one other row (0 means no duplicates).
# The previous expression, df[df.duplicated(keep=False)].sum().sum(), added up the cell
# *values* of the duplicated rows instead of counting the rows.
# NOTE(review): the ID column is still part of df here, so rows only count as duplicates
# when their IDs match too — confirm whether ID should be excluded from this check.
df.duplicated(keep=False).sum()


# Feature Transformation

In our dataset, CCAvg represents average monthly credit card spending, whereas Income represents annual income. To put both features on the same time basis, we convert the average monthly credit card spending to an annual figure.

In [None]:
# Convert monthly credit-card spending to an annual figure.
# NOTE: this cell is not idempotent — re-running it multiplies CCAvg by 12 again.
df['CCAvg'] = df['CCAvg']*12

# Model section

In [None]:
# Move the current row index into a new 'index' column (that column is dropped again in the
# next cell), then display the frame.
df.reset_index(inplace=True)
df

In [None]:
# Remove the helper 'index' column created by the preceding reset_index call
df.drop(columns=['index'], inplace=True)

# TRAIN TEST SPLIT

In [None]:
# Target is the Personal Loan column; every remaining column is used as a feature
y = df['Personal Loan']
X = df.drop(columns='Personal Loan')

In [None]:
plt.figure(dpi=80)

# Share of each target class in percent, drawn as horizontal bars
class_share = df['Personal Loan'].value_counts(normalize=True).mul(100)
class_share.plot(kind='barh', width=0.8, figsize=(10,6))

# Write the rounded percentage at the end of each bar
labels = class_share.round(1)
for class_value, pct in labels.items():
    plt.text(pct, class_value, str(pct) + '%', fontsize=15, weight='bold')

plt.xlim([0, 110])
plt.xlabel('Frequency Percentage', fontsize=15)
plt.ylabel('Personal Loan', fontsize=15)
plt.title('Frequency Percentage of Target Classes', fontsize=15)
plt.show()

We can see that our dataset is very imbalanced.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [None]:
# Class percentages of the target in the full data, the training split and the test split
df_perc = pd.concat(
    [labels.value_counts(normalize=True).mul(100).round(1) for labels in (y, y_train, y_test)],
    axis=1,
)
df_perc.columns = ['Dataset', 'Training', 'Test']
df_perc = df_perc.T

# One stacked horizontal bar per row of df_perc
df_perc.plot(kind='barh', stacked=True, figsize=(10,5), width=0.6)

# Print each percentage roughly in the middle of its bar segment:
# the cumulative sum gives the segment's right edge, from which the centre is derived
for row_position, split_name in enumerate(df_perc.index):
    row = df_perc.loc[split_name]
    for percentage, right_edge in zip(row, row.cumsum()):
        plt.text(x=(right_edge - percentage) + (percentage / 2)-3,
                 y=row_position - 0.05,
                 s=f'{percentage}%',
                 color="black",
                 fontsize=12,
                 fontweight="bold")
plt.legend(title='Personal Loan', loc=(1.01,0.8))
plt.xlabel('Frequency Percentage', fontsize=15)
plt.title('Frequency Percentage of Target Classes among Training and Test Sets', fontsize=15)
plt.show()

# F1 Score function
To prevent rewriting the same function over and over.


In [None]:
def f1_metric(model, X_train, y_train):
    """Return the binary F1-score of `model`'s predictions on the given data.

    The ``(estimator, X, y)`` call shape lets this function be passed as the
    ``scoring`` argument of ``GridSearchCV`` (see ``tune_clf_hyperparameters``).
    Despite the parameter names, the score is computed on whatever features and
    labels the caller supplies.
    """
    predictions = model.predict(X_train)
    return f1_score(y_train, predictions, average='binary')

# Drop-column Feature

In [None]:
def drop_column_importance(model, X_train, y_train, random_state=0):
    """Estimate feature importance as the cross-validated F1 lost when a column is dropped.

    A fresh clone of `model` is scored with 5-fold stratified cross-validation on
    all columns (the benchmark) and then once per column with that column removed.
    The importance of a column is ``benchmark_f1 - f1_without_column``: positive
    values mean the column helps, negative values mean the model scores better
    without it.

    Parameters
    ----------
    model : estimator
        Only clones of it are used; `model` itself is neither fitted nor modified.
    X_train : pandas.DataFrame
        Feature matrix; each of its columns is dropped in turn.
    y_train : array-like
        Target labels passed to ``cross_val_score``.
    random_state : int, default 0
        Seed for the fold shuffling, and for the clones if they expose a
        ``random_state`` attribute.

    Returns
    -------
    pandas.DataFrame
        Columns ``'feature'`` and ``'feature importance'``, sorted by importance
        in descending order.
    """
    # Same folds for the benchmark and for every reduced feature set
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    def _fresh_model():
        # Unfitted copy of the model with a fixed seed where the estimator supports one
        candidate = clone(model)
        if hasattr(candidate, 'random_state'):
            candidate.random_state = random_state
        return candidate

    def _cv_f1(features):
        # cross_val_score clones and fits the estimator on each fold itself, so
        # fitting on the whole training set beforehand would only cost time.
        return cross_val_score(_fresh_model(), features, y_train, cv=cv, scoring='f1').mean()

    benchmark_score = _cv_f1(X_train)

    # Importance = drop in mean F1 caused by removing the column
    importances = [benchmark_score - _cv_f1(X_train.drop(col, axis=1))
                   for col in X_train.columns]

    importances_df = pd.DataFrame({'feature': X_train.columns, 'feature importance': importances}) \
                     .sort_values('feature importance', ascending=False).reset_index(drop=True)

    return importances_df


In [None]:
def drop_column_importance_plot(model, X_train, y_train):
    """Plot the drop-column feature importances of `model` as a horizontal bar chart."""
    # Importance table from drop_column_importance, with display-friendly column names
    importance_table = drop_column_importance(model, X_train, y_train, random_state=0).rename(
        columns={'feature': 'Feature', 'feature importance': 'Feature Importance'})

    fig, ax = plt.subplots(figsize=(12,10))
    sns.barplot(data=importance_table, x='Feature Importance', y='Feature',
                orient='h', color='royalblue', ax=ax)
    ax.set_title('Drop Column Feature Importance', fontsize=20)
    plt.show()

# Model Evaluation

In [None]:
def metrics_calculator(clf, X_test, y_test, model_name):
    '''
    Build a one-column table of performance metrics for a fitted classifier.

    Accuracy, precision, recall and F1 are computed from clf.predict(X_test); AUC is
    computed from the second column of clf.predict_proba(X_test). The result is a
    DataFrame indexed by metric name with a single column called `model_name`, whose
    values are strings of the form '<percentage rounded to 2 decimals>%'.
    '''
    predicted_labels = clf.predict(X_test)

    scores = {
        'Accuracy': accuracy_score(y_test, predicted_labels),
        'Precision': precision_score(y_test, predicted_labels, average='binary'),
        'Recall': recall_score(y_test, predicted_labels, average='binary'),
        'F1-score': f1_score(y_test, predicted_labels, average='binary'),
        'AUC': roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]),
    }
    table = pd.DataFrame(data=list(scores.values()),
                         index=list(scores.keys()),
                         columns=[model_name])

    # Express every metric as a percentage string
    percentages = (table * 100).round(2)
    return percentages.astype(str) + '%'

In [None]:
def model_evaluation(clf, X_train, X_test, y_train, y_test, model_name):
    '''
    Print and plot a performance report for a fitted classifier.

    Prints the classification report for the training set and for the test set, then
    draws one figure with three panels for the test set: confusion matrix, ROC curve,
    and the metric table produced by metrics_calculator (labelled with `model_name`).
    '''
    sns.set(font_scale=1.2)

    # Text reports: training set first, then test set
    reports = (
        ("\n\t  Classification report for training set", X_train, y_train),
        ("\n\t   Classification report for test set", X_test, y_test),
    )
    for heading, features, labels in reports:
        predicted = clf.predict(features)
        print(heading)
        print("-"*55)
        print(classification_report(labels, predicted))

    # Three side-by-side panels; the table panel is half as wide as the two plots
    fig, (cm_ax, roc_ax, table_ax) = plt.subplots(
        1, 3, figsize=(15, 5), dpi=100, gridspec_kw={'width_ratios': [2, 2, 1]})

    # Panel 1: confusion matrix on the test set
    ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test, colorbar=False, ax=cm_ax)
    cm_ax.set_title('Confusion Matrix for Test Data')
    cm_ax.grid(False)

    # Panel 2: ROC curve on the test set
    RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=roc_ax)
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('ROC Curve for Test Data (Positive label: 1)')

    # Panel 3: metric table rendered on an otherwise empty axis
    summary = metrics_calculator(clf, X_test, y_test, model_name)
    table = table_ax.table(cellText=summary.values, colLabels=summary.columns,
                           rowLabels=summary.index, loc='center')
    table.scale(0.6, 2)
    table.set_fontsize(12)
    table_ax.axis('tight')
    table_ax.axis('off')

    # Colour the cells in table row 0
    for (row, _column), cell in table.get_celld().items():
        if row == 0:
            cell.set_color('royalblue')

    plt.tight_layout()
    plt.show()

In [None]:
def discretization_report(df, clf):
    '''
    Grid-search the KBinsDiscretizer settings (n_bins and strategy) that give `clf` the
    best cross-validated F1-score on the discretized continuous features.

    Only the columns Age, Income, CCAvg and Mortgage of `df` are used as features and
    'Personal Loan' as target. The search runs on the training part of a stratified
    80/20 split. Prints the best parameters and score and returns (n_bins, strategy).
    '''
    continuous_cols = ['Age', 'Income', 'CCAvg', 'Mortgage']
    target = df['Personal Loan']

    # Only the training part of the split is used for the search
    X_train, _, y_train, _ = train_test_split(
        df[continuous_cols], target, test_size=0.2, random_state=0, stratify=target)

    # Bin each feature to an ordinal code, one-hot encode the codes, then classify
    pipeline = Pipeline([
        ('discretizer', KBinsDiscretizer(encode='ordinal')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first')),
        ('clf', clf),
    ])

    search_space = {'discretizer__strategy': ['uniform', 'quantile', 'kmeans'],
                    'discretizer__n_bins': np.arange(2,11)}

    # Stratified folds keep the class distribution the same in every fold
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    grid_search = GridSearchCV(pipeline, search_space, cv=folds, scoring='f1')
    grid_search.fit(X_train, y_train)

    best = grid_search.best_params_
    print("Best discretization parameters:", best)
    print("Best score:", grid_search.best_score_)

    return best['discretizer__n_bins'], best['discretizer__strategy']

# Hyperparameter

In [None]:
def tune_clf_hyperparameters(clf, param_grid, X_train, y_train):
    '''
    Search `param_grid` for the hyperparameters that give `clf` the highest F1-score.

    Uses GridSearchCV with 5-fold shuffled StratifiedKFold cross-validation (seed 0),
    the f1_metric scorer and all available cores. Prints the best hyperparameters and
    returns the search's best_estimator_.
    '''
    # Stratified folds keep the class distribution the same in every fold
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    search = GridSearchCV(estimator=clf, param_grid=param_grid,
                          scoring=f1_metric, cv=folds, n_jobs=-1)
    search.fit(X_train, y_train)

    print("Best hyperparameters:\n", search.best_params_)

    return search.best_estimator_

# KNN Model Building

In [None]:
# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [None]:
# Standardise the features: fit the scaler on the training split only, then apply the
# same transformation to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames. The original row index of each split is
# kept so the scaled features stay label-aligned with y_train / y_test (without it the
# frames would get a fresh 0..n-1 index that no longer matches the targets).
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

In [None]:
# KNN search space, split in two sub-grids so that the Minkowski power `p`
# is only searched together with metric='minkowski'
neighbor_counts = np.arange(2, 30)
param_grid = [
    {'n_neighbors': neighbor_counts, 'metric': ['euclidean', 'manhattan'], 'weights': ['uniform']},
    {'n_neighbors': neighbor_counts, 'metric': ['minkowski'], 'p': [3, 4, 5], 'weights': ['uniform']},
]

In [None]:
# Create a KNN classifier object
knn = KNeighborsClassifier()

# Find the best classifier with the optimal hyperparameters
knn_opt = tune_clf_hyperparameters(knn, param_grid, X_train, y_train)

In [None]:
drop_column_importance_plot(knn_opt, X_train, y_train)

In [None]:
# Keep only the features with a positive drop-column importance,
# i.e. those whose removal lowers the cross-validated F1-score
feature_importances = drop_column_importance(knn_opt, X_train, y_train, 0)
has_positive_importance = feature_importances['feature importance'] > 0
selected_features = feature_importances.loc[has_positive_importance, 'feature']

# Restrict both splits to the selected columns
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [None]:
# Create a KNN classifier object
knn = KNeighborsClassifier()

# Find the best classifier with the optimal hyperparameters
knn_opt = tune_clf_hyperparameters(knn, param_grid, X_train, y_train)

In [None]:
model_evaluation(knn_opt, X_train, X_test, y_train, y_test, 'KNN')

In [None]:
knn_result = metrics_calculator(knn_opt, X_test, y_test, 'K-Nearest Neighbors')

# SVM Model Building

In [None]:
# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)


In [None]:
# Standardise the features: fit the scaler on the training split only, then apply the
# same transformation to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames. The original row index of each split is
# kept so the scaled features stay label-aligned with y_train / y_test (without it the
# frames would get a fresh 0..n-1 index that no longer matches the targets).
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

In [None]:
# Candidate class weights {0: x, 1: 1 - x} for 12 evenly spaced x in [0.001, 0.5]
class_weights = [{0:x, 1:1.0-x} for x in np.linspace(0.001,0.5,12)]

# Wide SVM search space with one sub-grid per kernel family, so each kernel is only
# combined with the parameters listed for it.
# NOTE: param_grid is reassigned to a narrower rbf-only grid in the next cell, so this
# wide version is not the one passed to the search.
c_values = [0.01,0.1,1, 10, 100, 1000]
gamma_values = [1, 0.1, 0.01, 0.001, 0.0001]

poly_grid = dict(kernel=['poly'], degree=[2,3,4,5], gamma=list(gamma_values),
                 C=list(c_values), class_weight=class_weights)
rbf_sigmoid_grid = dict(kernel=['rbf','sigmoid'], gamma=list(gamma_values),
                        C=list(c_values), class_weight=class_weights)
linear_grid = dict(kernel=['linear'], C=list(c_values), class_weight=class_weights)

param_grid = [poly_grid, rbf_sigmoid_grid, linear_grid]

In [None]:
# Candidate class weights {0: x, 1: 1 - x} for 12 evenly spaced x in [0.001, 0.5]
class_weights = [{0:x, 1:1.0-x} for x in np.linspace(0.001,0.5,12)]

# SVM search space restricted to the rbf kernel
rbf_grid = dict(kernel=['rbf'],
                gamma=[0.1, 0.01, 0.001, 0.0001],
                C=[0.1, 1, 10, 100, 1000],
                class_weight=class_weights)
param_grid = [rbf_grid]

In [None]:
# Create a SVC object
svm = SVC(probability=True, random_state=0)

# Find the best classifier with the optimal hyperparameters
svm_opt = tune_clf_hyperparameters(svm, param_grid, X_train, y_train)

In [None]:
drop_column_importance_plot(svm_opt, X_train, y_train)

In [None]:
# Keep only the features whose drop-column importance exceeds the 0.01 threshold
feature_importances = drop_column_importance(svm_opt, X_train, y_train, 0)
above_threshold = feature_importances['feature importance'] > 0.01
selected_features = feature_importances.loc[above_threshold, 'feature']

# Restrict both splits to the selected columns
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [None]:
# Create a SVC object
svm = SVC(probability=True, random_state=0)

# Find the best classifier with the optimal hyperparameters
svm_opt = tune_clf_hyperparameters(svm, param_grid, X_train, y_train)

In [None]:
model_evaluation(svm_opt, X_train, X_test, y_train, y_test, 'SVM')

In [None]:
# Save the final performance of SVM classifier
svm_result = metrics_calculator(svm_opt, X_test, y_test, 'SVM')

# Decision Tree Model Building

In [None]:
# Candidate class weights {0: x, 1: 1 - x} for 20 evenly spaced x in [0.001, 1].
# NOTE(review): the last candidate (x = 1) gives class 1 a weight of 0 — confirm this is intended.
class_weights = [{0:x, 1:1.0-x} for x in np.linspace(0.001,1,20)]

# Define the hyperparameter grid.
# min_samples_split starts at 2: scikit-learn rejects an integer value of 1, so every
# candidate using it would fail to fit and only waste search time.
param_grid = {'criterion': ['gini', 'entropy', 'log_loss'],
              'max_depth': np.arange(1, 10),
              'min_samples_split': np.arange(2, 10),
              'min_samples_leaf': np.arange(1, 10),
              'max_features': [None, 'sqrt', 'log2'],
              'class_weight': class_weights}

In [None]:
# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# Create a  Decision Tree Classifier object
dt = DecisionTreeClassifier(random_state=0)

# Find the best classifier with the optimal hyperparameters
dt_opt = tune_clf_hyperparameters(dt, param_grid, X_train, y_train)

In [None]:
drop_column_importance_plot(dt_opt, X_train, y_train)

In [None]:
# Keep only the features whose drop-column importance exceeds the 0.01 threshold
feature_importances = drop_column_importance(dt_opt, X_train, y_train, 0)
above_threshold = feature_importances['feature importance'] > 0.01
selected_features = feature_importances.loc[above_threshold, 'feature']

# Restrict both splits to the selected columns
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [None]:
# Create a  Decision Tree Classifier object
dt = DecisionTreeClassifier(random_state=0)

# Find the best classifier with the optimal hyperparameters
dt_opt = tune_clf_hyperparameters(dt, param_grid, X_train, y_train)

In [None]:
model_evaluation(dt_opt, X_train, X_test, y_train, y_test, 'Decision Tree')

In [None]:
dt_result = metrics_calculator(dt_opt, X_test, y_test, 'Decision Tree')

# Random Forest Model Building

In [None]:
# Candidate class weights {0: x, 1: 1 - x} for 20 evenly spaced x in [0.001, 1].
# NOTE(review): the last candidate (x = 1) gives class 1 a weight of 0 — confirm this is intended.
class_weights = [{0:x, 1:1.0-x} for x in np.linspace(0.001,1,20)]

# Define the hyperparameter grid to search.
# min_samples_split no longer contains 1: scikit-learn rejects that integer value, so
# every candidate using it would fail to fit and only waste search time.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': np.arange(5, 12),
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2, 3],
    'class_weight': class_weights
}

In [None]:
# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# Create a random forest classifier object
rf = RandomForestClassifier(criterion='gini', max_features=None, bootstrap=True, random_state=0)

# Find the best classifier with the optimal hyperparameters
rf_opt = tune_clf_hyperparameters(rf, param_grid, X_train, y_train)

In [None]:
drop_column_importance_plot(rf_opt, X_train, y_train)

In [None]:
model_evaluation(rf_opt, X_train, X_test, y_train, y_test, 'Primary RF')

In [None]:
# Final random forest with explicitly fixed hyperparameters.
# NOTE(review): min_samples_leaf=6 lies outside the grid searched above ([1, 2, 3]), so
# these values are not purely the grid-search optimum — confirm where they come from.
rf_final = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=9,
    min_samples_split=2,
    min_samples_leaf=6,
    max_features=None,
    bootstrap=True,
    class_weight={0: 0.58, 1: 0.42},
    random_state=0,
)

# Train the final Random Forest model
rf_final.fit(X_train, y_train)

In [None]:
model_evaluation(rf_final, X_train, X_test, y_train, y_test, 'Random Forest')

In [None]:
rf_result = metrics_calculator(rf_final, X_test, y_test, 'Random Forest')

# AdaBoost Model Building

In [None]:
# Define the hyperparameter grid for AdaBoost.
# Keys prefixed with `base_estimator__` are routed to the decision tree that is passed to
# AdaBoostClassifier as `base_estimator`; the remaining keys configure AdaBoost itself.
# NOTE(review): scikit-learn 1.2 renamed `base_estimator` to `estimator` (the old name was
# removed in 1.4) — confirm the installed version still accepts these keys.
ada_param_grid = {
    'base_estimator__max_depth': [3, 5, 7],
    'base_estimator__min_samples_split': [3, 5, 7],
    'base_estimator__min_samples_leaf': [1, 2, 3],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.8, 0.9, 1]
}

In [None]:
warnings.filterwarnings("ignore")

In [None]:
# Re-create the stratified 80/20 split (same seed as before) so this section starts from
# the full feature matrix X, whatever earlier sections did to X_train / X_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# Decision tree used as the weak learner; its depth and split/leaf sizes are tuned through
# the `base_estimator__*` entries of ada_param_grid
dt = DecisionTreeClassifier(criterion='gini', max_features=None, random_state=0)

# AdaBoost ensemble built on top of the decision tree
ada = AdaBoostClassifier(base_estimator=dt, random_state=0)

# Grid-search ada_param_grid and keep the best estimator found
ada_opt = tune_clf_hyperparameters(ada, ada_param_grid, X_train, y_train)

In [None]:
drop_column_importance_plot(ada_opt, X_train, y_train)

In [None]:
model_evaluation(ada_opt, X_train, X_test, y_train, y_test, 'Primary AdaBoost')

In [None]:
dt = DecisionTreeClassifier(criterion='gini', max_features=None, random_state=0, max_depth=5, min_samples_leaf=2, min_samples_split=5)

# Create the AdaBoost classifier using Decision Tree as base estimator
ada_final = AdaBoostClassifier(base_estimator=dt, random_state=0, learning_rate=0.8, n_estimators=100)

# Train the final AdaBoost classifier
ada_final.fit(X_train, y_train)

In [None]:
model_evaluation(ada_final, X_train, X_test, y_train, y_test, 'AdaBoost')

In [None]:
ada_result = metrics_calculator(ada_final, X_test, y_test, 'AdaBoost')

# Gradient Boosting Model Building

In [None]:
# Wide exploratory hyperparameter grid for Gradient Boosting.
# NOTE: gbc_param_grid is reassigned to a much smaller grid in the next cell, so this wide
# version is not the one passed to the search.
gbc_param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_depth': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_features': [None, 'sqrt', 'log2'],
    'loss': ['deviance', 'exponential'],
    'criterion': ['friedman_mse', 'squared_error'],
    'subsample': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3]
}

In [None]:
# Reduced hyperparameter grid that is actually passed to the Gradient Boosting search
# (it replaces the wider grid assigned to gbc_param_grid in the previous cell)
gbc_param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [4, 5, 6],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'subsample': [0.9, 1.0],
    'learning_rate': [0.3, 0.4, 0.5]
}

In [None]:
# Re-create the stratified 80/20 split (same seed as before) so this section starts from
# the full feature matrix X
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# Initialize the Gradient Boosting Classifier; loss, criterion and max_features are fixed
# here and everything in gbc_param_grid is tuned.
# NOTE(review): loss='deviance' was deprecated in scikit-learn 1.1 in favour of 'log_loss'
# and later removed — confirm the installed version still accepts it.
gbc = GradientBoostingClassifier(max_features=None, loss='deviance', criterion='friedman_mse', random_state=0)

# Grid-search gbc_param_grid and keep the best estimator found
gbc_opt = tune_clf_hyperparameters(gbc, gbc_param_grid, X_train, y_train)


In [None]:
drop_column_importance_plot(gbc_opt, X_train, y_train)

In [None]:
model_evaluation(gbc_opt, X_train, X_test, y_train, y_test, 'Primary Grad. Boosting')

In [None]:
# Final Gradient Boosting classifier with explicitly fixed hyperparameters.
# NOTE(review): learning_rate=0.2 lies outside the grid searched above ([0.3, 0.4, 0.5]),
# so these values are not purely the grid-search optimum — confirm where they come from.
gbc_final = GradientBoostingClassifier(max_features=None, loss='deviance', criterion='friedman_mse',
                                 learning_rate=0.2, max_depth=5, n_estimators=100, subsample=1.0,
                                 min_samples_leaf=4, min_samples_split=2, random_state=0)

# Train the final Gradient Boosting classifier
gbc_final.fit(X_train, y_train)

In [None]:
model_evaluation(gbc_final, X_train, X_test, y_train, y_test, 'Gradient Boosting')

In [None]:
gbc_result = metrics_calculator(gbc_final, X_test, y_test, 'Gradient Boosting')

# XGBoost Model Building

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [None]:
# Imbalance ratio: number of class-0 samples per class-1 sample in the training split
ratio = (y_train == 0).sum() / (y_train == 1).sum()

# Candidate search space for XGBoost.
# NOTE(review): nothing in this notebook passes xgb_param_grid to a search; the XGBoost
# models below are built with fixed values.
# NOTE(review): 'max_leaf_nodes' looks like the scikit-learn tree parameter name; XGBoost's
# equivalent appears to be 'max_leaves' — confirm before using this grid.
xgb_param_grid = dict(
    max_depth=[5, 6, 7],
    learning_rate=[0.1, 0.2, 0.3],
    n_estimators=[50, 100, 200],
    min_child_weight=[1, 5, 10],
    scale_pos_weight=[ratio, ratio*1.3, ratio*1.5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    colsample_bylevel=[0.6, 0.8, 1.0],
    reg_alpha=[0, 0.1, 1],
    reg_lambda=[0, 0.1, 1],
    max_delta_step=[0, 1, 2],
    gamma=[0, 0.1, 1],
    max_leaf_nodes=[2, 4, 6],
)

In [None]:
# XGBoost classifier with fixed hyperparameters, trained on all features; it is used
# below to compute drop-column feature importances.
# NOTE(review): scale_pos_weight is 1.5 here, whereas the grid cell above lists multiples
# of the class ratio (ratio, ratio*1.3, ratio*1.5) — confirm which was intended.
xgb_opt = XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='error',
    n_estimators=200,
    learning_rate=0.3,
    max_depth=5,
    min_child_weight=1,
    gamma=0.1,
    colsample_bytree=0.8,
    scale_pos_weight=1.5,
    random_state=0,
)

# Train the XGBoost Classifier
xgb_opt.fit(X_train, y_train)

In [None]:
drop_column_importance_plot(xgb_opt, X_train, y_train)

In [None]:
# Keep only the features whose drop-column importance exceeds the 0.002 threshold
feature_importances = drop_column_importance(xgb_opt, X_train, y_train, 0)
above_threshold = feature_importances['feature importance'] > 0.002
selected_features = feature_importances.loc[above_threshold, 'feature']

# Restrict both splits to the selected columns
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [None]:
# Same fixed XGBoost configuration as xgb_opt (max_depth=5), now trained on the
# reduced feature set selected in the previous cell
xgb = XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='error',
    n_estimators=200,
    learning_rate=0.3,
    max_depth=5,
    min_child_weight=1,
    gamma=0.1,
    colsample_bytree=0.8,
    scale_pos_weight=1.5,
    random_state=0,
)

# Train the XGBoost Classifier
xgb.fit(X_train, y_train)

In [None]:
model_evaluation(xgb, X_train, X_test, y_train, y_test, 'Primary XGBoost')

In [None]:
# Final XGBoost classifier: identical to the primary model above except for max_depth=4
xgb_final = XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='error',
    n_estimators=200,
    learning_rate=0.3,
    max_depth=4,
    min_child_weight=1,
    gamma=0.1,
    colsample_bytree=0.8,
    scale_pos_weight=1.5,
    random_state=0,
)

# Train the XGBoost Classifier
xgb_final.fit(X_train, y_train)

In [None]:
model_evaluation(xgb_final, X_train, X_test, y_train, y_test, 'XGBoost')


In [None]:
xgb_result = metrics_calculator(xgb_final, X_test, y_test, 'XGBoost')

# Conclusion

In [None]:
# Concatenate the per-model performance tables into a single dataframe (one row per model)
results = pd.concat([knn_result,svm_result,
                     dt_result, rf_result,ada_result, gbc_result, xgb_result], axis=1).T

# Sort the dataframe in descending order of F1-score.
# metrics_calculator returns the metrics as strings such as '93.5%', so they are converted
# to floats for the sort key; sorting the raw strings would order them lexicographically
# (for example '100.0%' would rank below '93.5%').
results.sort_values(by='F1-score', ascending=False, inplace=True,
                    key=lambda column: column.str.rstrip('%').astype(float))

# Color the F1-score column
results.style.applymap(lambda x: 'background-color: royalblue', subset='F1-score')