# Configuring libraries & utilities

In [None]:
# Upgrade scikit-learn
!pip install --upgrade scikit-learn -q

# Install my custom module
!pip install git+https://github.com/Althaf9900/flash.git -q

In [None]:
# Standard Library Imports
import os
import json
import math

# Google Drive Integration
from google.colab import drive

# Data Manipulation and Preprocessing
import numpy as np
import pandas as pd

# Data Visualization
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Data Preprocessing and Transformation
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    FunctionTransformer, PowerTransformer, QuantileTransformer,
    StandardScaler, MinMaxScaler, RobustScaler,
    LabelEncoder, OneHotEncoder
)

# Machine Learning Models
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier,
    VotingClassifier
)

# Model Evaluation and Hyperparameter Tuning
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Saving
import joblib

# Custom module for additional functionalities
import flash as fz

In [None]:
# Mount Google Drive if it's not already mounted
mount_point = "/content/drive"

if not os.path.ismount(mount_point):
    print("Mounting Google Drive...")
    drive.mount(mount_point)
else:
    print("Google Drive is already mounted.")

%cd /content/drive/MyDrive/Projects/loan-sanction-prediction


# Initial dataset assessment & preparation


In [None]:
# Loading the dataset
df_copy = pd.read_csv('loan_sanction_train.csv')
# Work on an independent copy: a plain `df = df_copy` would only alias the same object,
# so every in-place edit made to `df` would also change `df_copy`.
df = df_copy.copy()

In [None]:
# Understanding structure of the dataset
df.sample(5)

In [None]:
# Checking whether Loan_ID contains duplicate IDs
df['Loan_ID'].duplicated().sum()

In [None]:
# Checking whether the target feature is imbalanced or not
# Labels are taken from the value_counts index so each wedge is paired with its own class;
# `unique()` returns classes in order of appearance, which need not match frequency order.
loan_status_counts = df['Loan_Status'].value_counts()
plt.pie(loan_status_counts, labels=loan_status_counts.index, autopct='%0.2f%%',
        shadow=True, explode=(0, 0.1), counterclock=False, colors=['lime', 'cyan'])
plt.show()

In [None]:
# Columns of the dataset
print(df.columns)

In [None]:
# Getting some information about the dataset
df.info()

In [None]:
# Dropping identifier features that do not provide any predictive value for model training.
# A non-in-place drop with errors='ignore' keeps this cell safe to re-run and leaves any
# other reference to the original frame untouched.
df = df.drop(columns=['Loan_ID'], errors='ignore')

In [None]:
# Extracting numerical features & categorical features from the dataset using a custom made module
num_cols = fz.get_num_col(df)
cat_cols = fz.get_cat_col(df, ignore_cols=['Loan_Status'])

# Print
print(num_cols)
print(cat_cols)

In [None]:
# Count the number of categorical and numerical features
n_cat_cols = len(cat_cols)
n_num_cols = len(num_cols)

# Print
print(f'Number of numerical features: {n_num_cols}')
print(f'Number of categorical features: {n_cat_cols}')


# EDA (Before data cleaning)



## Outlier analysis


In [None]:
# Statistical measures
df[num_cols].describe().T

In [None]:
# Histogram & Box-plot

def hist_box_plt(df, num_feature_list, figsize=None, title=None, hist_xlabel=None,
                hist_ylabel=None, box_xlabel=None, box_ylabel=None):
    """Draw a histogram (with KDE) and a boxplot side by side for each numerical feature.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    num_feature_list : sequence of column names; one subplot row is drawn per name.
    figsize : optional (width, height); defaults to (13, 3 * n_features + 1).
    title : optional figure-level title.
    hist_xlabel, hist_ylabel, box_xlabel, box_ylabel : optional axis labels; an empty
        label is used when omitted.
    """
    # Number of features
    n_num_cols = len(num_feature_list)

    # Dynamic figure size based on the number of numerical columns
    if not figsize:
        figsize = (13, n_num_cols * 3 + 1)

    # squeeze=False keeps `axs` two-dimensional even when only one feature is plotted
    fig, axs = plt.subplots(n_num_cols, 2, figsize=figsize, squeeze=False)

    # Plotting histograms and boxplots
    for (hist_ax, box_ax), col in zip(axs, num_feature_list):
        # Histogram
        sns.histplot(df[col], kde=True, ax=hist_ax)
        hist_ax.set_title(f'Histogram of {col}')
        hist_ax.set_xlabel(hist_xlabel if hist_xlabel else '')
        hist_ax.set_ylabel(hist_ylabel if hist_ylabel else '')
        hist_ax.grid(True)

        # Boxplot
        sns.boxplot(data=df, x=col, ax=box_ax)
        box_ax.set_title(f'Boxplot of {col}')
        box_ax.set_xlabel(box_xlabel if box_xlabel else '')
        box_ax.set_ylabel(box_ylabel if box_ylabel else '')
        box_ax.grid(True)

    # The `title` argument was previously accepted but never used
    if title:
        fig.suptitle(title)

    # Adjust layout
    plt.tight_layout()
    plt.show()

In [None]:
hist_box_plt(df, num_cols)

In [None]:
# Features with outliers
ftrs_with_outliers = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

In [None]:
def find_outliers(df, features_with_outliers):
    """Collect the IQR-rule outliers of each feature into one DataFrame.

    A value is an outlier when it lies below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.

    Parameters
    ----------
    df : DataFrame containing the features.
    features_with_outliers : iterable of column names to inspect.

    Returns
    -------
    DataFrame with one column per feature that has at least one outlier. Each column
    lists that feature's outliers in ascending order; shorter columns are padded with
    NaN. Empty when no feature has outliers.
    """
    outliers_by_feature = {}
    for feature in features_with_outliers:
        values = df[feature]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        outliers = values[(values < lower_bound) | (values > upper_bound)]

        # Keep only features that actually have outliers
        if not outliers.empty:
            outliers_by_feature[feature] = outliers.sort_values().reset_index(drop=True)

    # Building the frame in one step takes the union of all column indexes. Assigning
    # columns one by one would align each new column to the first column's index and
    # silently drop the extra outliers of any longer column.
    return pd.DataFrame(outliers_by_feature)

In [None]:
outlier_df = find_outliers(df, ftrs_with_outliers)
outlier_df

Conclusions:

- There are many outliers on the upper side of all numerical features.

- None of the numerical features have outliers on the lower side.

- Since we only have a few data points, we can't afford to drop any of them.
    
- None of the numerical features follow a normal distribution.

- The outliers appear to be valid and are not due to data entry issues.

- Since the outliers are valid, apply capping methods, such as:

    - Custom threshold capping: Set a threshold value based on analysis of the boxplots.
    - Percentile-based capping: Limit outliers to a specified percentile range.
    - Median imputation: Replace extreme values with the median.

- After building the predictive model, evaluate the accuracy of all capping methods.


## Missing value analysis


In [None]:
def calc_na_values(df, features, pct=True):
    """Summarise missing values for the given features.

    Parameters
    ----------
    df : DataFrame to inspect.
    features : list of column names to check.
    pct : when True (default) report percentages of rows (rounded to 2 decimals),
        otherwise report raw counts.

    Returns
    -------
    (summary, names) where `summary` is a Series restricted to features that have at
    least one missing value and `names` is the list of those feature names.
    """
    na_counts = df[features].isna().sum()

    # Features without any missing value are not reported
    na_counts = na_counts.loc[na_counts > 0]
    na_features = na_counts.index.to_list()

    if not pct:
        return na_counts, na_features

    # Share of rows (in percent) that are missing for each remaining feature
    n_rows = df.shape[0]
    na_share = (na_counts / n_rows * 100).round(2)
    return na_share, na_features

In [None]:
# Numerical features
num_miss_pct, num_ftrs_with_na = calc_na_values(df, num_cols)

# Percentage of missing values in numerical features
print(num_miss_pct)

# Numerical features with missing values
print(num_ftrs_with_na)

In [None]:
# Categorical features
cat_miss_pct, cat_ftrs_with_na = calc_na_values(df, cat_cols)

# Percentage of missing values in categorical features
print(cat_miss_pct)

# Categorical features with missing values
print(cat_ftrs_with_na)

In [None]:
def na_value_viz(df, figsize=None, cmap='Blues', xticks_rotation=None):
    """Show a heatmap of missing-value positions (rows x columns) for `df`.

    Parameters
    ----------
    df : DataFrame whose `isna()` mask is plotted.
    figsize : optional (width, height); (15, 4) is used when not given.
    cmap : colormap passed to the heatmap.
    xticks_rotation : rotation applied to the x tick labels.
    """
    chosen_size = figsize if figsize else (15, 4)
    plt.figure(figsize=chosen_size)

    # Row labels are hidden; only the column names are shown on the x axis
    na_mask = df.isna()
    sns.heatmap(na_mask, cbar=False, cmap=cmap, yticklabels=False)
    plt.xticks(rotation=xticks_rotation)
    plt.show()

In [None]:
# Visualizing whether the missing values are missing at random or not
na_value_viz(df, xticks_rotation=45)

In [None]:
# Plotting histogram of numerical features that have missing values to decide whether to use mean or median
plt.figure(figsize=(6, 4))
sns.histplot(df['LoanAmount'], kde=True)
plt.show()

Conclusions:

- Only one numerical feature (['LoanAmount']) has missing values.

- Six categorical features (['Gender', 'Married', 'Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History']) have missing values.

- Since we only have few data points, we cannot afford to drop any data points.

- The percentage of missing values is low across all features, so there is no need to drop any columns.

- It appears that the missingness of values is random.

- Missing value handling:

    - Median imputation (for numerical features that are not normally distributed):
        - Loan amount

    - Mode imputation:
        - Categorical features



# Data Cleaning


Data cleaning steps:

- Outlier handling:

    - Since the outliers are valid, apply capping methods, such as:

        - Custom threshold capping: Set a threshold value based on analysis of the boxplots.
        - Percentile-based capping: Limit outliers to a specified percentile range.
        - Median imputation: Replace outliers with the median.


- Missing value handling:

    - Median imputation (for numerical features that are not normally distributed):
        - Loan amount

    - Mode imputation:
        - Categorical features

- Data type adjustments:

    - Data type compatibility:
        - Applicant income: float
        - Loan_Amount_Term & Credit_History: int, then str

    - Memory usage optimization:
        - Categorical features: category

In [None]:
def custom_threshold_capping(df, features_with_outliers, cap_values):
    """Cap each listed feature of `df` at its own upper threshold (in place).

    Parameters
    ----------
    df : DataFrame to modify; the listed columns are overwritten.
    features_with_outliers : sequence of column names to cap.
    cap_values : sequence of upper limits, positionally matched to the features.

    Raises
    ------
    ValueError : if fewer cap values than features are supplied.
    """
    # Validate before touching `df` so a short `cap_values` cannot leave it half-capped
    if len(cap_values) < len(features_with_outliers):
        raise ValueError(
            f"Expected at least {len(features_with_outliers)} cap values, "
            f"got {len(cap_values)}."
        )

    # Iterate over the function's own argument, not a notebook-level global
    for feature, cap in zip(features_with_outliers, cap_values):
        # Cap the values
        df[feature] = df[feature].clip(upper=cap)

In [None]:
# Outlier handling: Custom threshold capping

# Define the cap value
cap_values = [20833, 8980, 500]
custom_threshold_capping(df, ftrs_with_outliers, cap_values)

In [None]:
# Missing value handling

# Imputing missing values in LoanAmount feature with median
median_imputer = SimpleImputer(strategy='median')
df['LoanAmount'] = median_imputer.fit_transform(df[['LoanAmount']])

# Imputing missing values in categorical features with mode
mode_imputer = SimpleImputer(strategy='most_frequent')
df[cat_ftrs_with_na] = mode_imputer.fit_transform(df[cat_ftrs_with_na])

# Test
if df.isna().sum().sum() == 0:
    print("There are no missing values left in the DataFrame.")
else:
    print("There are still missing values in the DataFrame.")

In [None]:
# Checking memory usage before dtype adjustments
print("Memory usage before adjustment:", df.memory_usage(deep=True).sum())
print()

# Data type adjustments

# Data type compatibility
df['ApplicantIncome'] = df['ApplicantIncome'].astype(float)

# Converting numerical categorical features to int and then to str
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].astype(int).astype(str)
df['Credit_History'] = df['Credit_History'].astype(int).astype(str)

# Memory usage optimization
df[cat_cols] = df[cat_cols].astype('category')

# Print data types to confirm changes
print(df.dtypes)

# Checking memory usage after dtype adjustments
print()
print("Memory usage after adjustment:", df.memory_usage(deep=True).sum())


# EDA (On Independent features)



## Univariate analysis



### Numerical


In [None]:
# Statistical measures
df[num_cols].describe().T

In [None]:
def calc_stats_moments(df, features):
    """Tabulate mean, standard deviation, skewness and kurtosis for each feature.

    Parameters
    ----------
    df : DataFrame containing the features.
    features : list of column names.

    Returns
    -------
    DataFrame indexed by feature name with the columns 'Mean', 'Standard deviation',
    'Skewness' and 'Kurtosis', each rounded to 2 decimals.
    """
    # Column label -> function computing that moment from a Series
    moment_getters = {
        'Mean': lambda series: series.mean(),
        'Standard deviation': lambda series: series.std(),
        'Skewness': lambda series: series.skew(),
        'Kurtosis': lambda series: series.kurtosis(),
    }

    rows = []
    for col in features:
        series = df[col]
        rows.append({
            label: round(float(getter(series)), 2)
            for label, getter in moment_getters.items()
        })

    return pd.DataFrame(rows, index=features)

In [None]:
# Statistical moments
calc_stats_moments(df, num_cols)

In [None]:
# Plotting histogram & boxplot
hist_box_plt(df, num_cols)

Conclusions:

- The distributions of applicant income and loan amount are right-skewed (positively skewed).
- Feature transformation is required for all numerical features to address this skewness.
- It looks like people with a co-applicant income of 0 don't have a co-applicant. So, we should create a new feature called 'Has_coapplicant'. For this feature, set the value to 'No' for individuals with a co-applicant income of 0, and 'Yes' for those with a non-zero co-applicant income.


### Categorical


In [None]:
# Statistical measures
df[cat_cols].describe().T

In [None]:
def countplots(df, features, n_cols=3, figsize=None, rotate_x_labels=None, rotation=45):
    """Draw one countplot per feature on a grid with `n_cols` columns.

    Parameters
    ----------
    df : DataFrame containing the features.
    features : sequence of column names to plot.
    n_cols : number of subplot columns.
    figsize : optional (width, height); derived from the grid size when omitted.
    rotate_x_labels : feature name(s) whose x tick labels should be rotated; None
        (default) rotates nothing.
    rotation : angle used for the rotated tick labels.
    """
    n_features = len(features)

    # Normalise `rotate_x_labels` so the membership test below is always valid:
    # None -> nothing to rotate, a bare string -> a single feature name.
    if rotate_x_labels is None:
        rotate_x_labels = ()
    elif isinstance(rotate_x_labels, str):
        rotate_x_labels = (rotate_x_labels,)

    # Calculate number of rows needed for subplots
    n_rows = math.ceil(n_features / n_cols)

    # Create subplots
    if not figsize:
        figsize = (n_cols * 4 + 1, n_rows * 3)

    # squeeze=False guarantees an array of axes even for a 1x1 grid
    fig, axs = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)

    # Flatten axs array
    axs = axs.flatten()

    # Plot countplots
    for ax, feature in zip(axs, features):
        sns.countplot(data=df, x=feature, ax=ax)
        ax.set_title(feature)
        ax.set_xlabel('')
        ax.set_ylabel('')

        if feature in rotate_x_labels:
            ax.tick_params(axis='x', rotation=rotation)

    # Turn off any unused subplots
    for unused_ax in axs[n_features:]:
        unused_ax.axis('off')

    # Adjust layout
    plt.tight_layout()
    plt.show()

In [None]:
# Countplots
countplots(df, cat_cols, rotate_x_labels=['Loan_Amount_Term'])

Conclusions:

- Males take out more loans compared to females.
- Married individuals take out more loans compared to unmarried individuals.
- People without dependents take out more loans compared to those with dependents.
- Graduates take out more loans compared to non-graduates.
- Non-self-employed individuals take out more loans compared to self-employed individuals.
- Most people opt for a loan term of 360 months (30 years), followed by 180 months (15 years).
- People with a credit history of 1 take out more loans compared to those with a credit history of 0.
- People living in semi-urban areas take out more loans compared to those living in rural and urban areas. Rural residents take out the fewest loans. Although these relationships aren't strong, they may represent general trends.


## Bivariate analysis



### Numerical - Numerical


In [None]:
def hide_current_axis(*args, **kwds):
    """Make the currently active axes invisible.

    All positional and keyword arguments are accepted and ignored, so the function can
    be used as a grid-mapping callback.
    """
    current_ax = plt.gca()
    current_ax.set_visible(False)

In [None]:
def pairplot(df, features, kind='scatter', diag_kind='kde', plot_kws=None, fig_width=12.5,
             fig_height=None):
    """Draw a seaborn pairplot of `features` with the upper triangle hidden.

    Parameters
    ----------
    df : DataFrame containing the features.
    features : list of column names to include.
    kind, diag_kind, plot_kws : forwarded to `sns.pairplot`. For kind='reg' with no
        `plot_kws`, the regression line is drawn in red.
    fig_width : intended overall figure width.
    fig_height : intended overall figure height; defaults to len(features) + 3.
    """
    n_features = len(features)
    fig_height = fig_height or (n_features + 3)

    # seaborn sizes each facet individually: per-facet height and width/height ratio
    facet_height = fig_height / n_features
    facet_aspect = fig_width / fig_height

    use_default_reg_style = plot_kws is None and kind == 'reg'
    if use_default_reg_style:
        plot_kws = {'line_kws':{'color':'red'}}

    grid = sns.pairplot(df[features], kind=kind, diag_kind=diag_kind, plot_kws=plot_kws,
                        height=facet_height, aspect=facet_aspect)

    # The upper triangle mirrors the lower one, so hide it
    grid.map_upper(hide_current_axis)

    plt.show()

In [None]:
# Scatter plots with pairplot
pairplot(df, num_cols)

In [None]:
# Regplots with pairplot
pairplot(df, num_cols, kind='reg')

In [None]:
def create_custom_cmap(colors=["#FF0000", "#FFFF00", "#00FF00"]):
    """Build a linear segmented colormap named "custom_cmap" from a list of colors.

    The default color list runs red -> yellow -> green.
    """
    return mcolors.LinearSegmentedColormap.from_list("custom_cmap", colors)

In [None]:
# Function to create a mask for the upper triangle
def create_upper_triangle_mask(df, method):
    """Return a boolean array that is True strictly above the diagonal.

    The array has the shape of `df.corr(method=method)`; the diagonal and the lower
    triangle are False, so they stay visible when the array is used as a heatmap mask.
    """
    corr_shape = df.corr(method=method).shape
    # k=1 starts the kept triangle one step above the main diagonal
    return np.triu(np.ones(corr_shape, dtype=bool), k=1)

# Function to plot heatmap
def plot_corr_heatmap(df, method='pearson', cmap=None, title=None, ax=None):
    """Plot an annotated correlation heatmap with the upper triangle masked out.

    Parameters
    ----------
    df : DataFrame whose pairwise column correlations are plotted.
    method : correlation method passed to `DataFrame.corr`.
    cmap : colormap; defaults to the one built by `create_custom_cmap()`.
    title : axes title; when omitted a title is derived from `method`.
    ax : axes to draw on; when None, seaborn picks the axes to draw on.
    """
    mask = create_upper_triangle_mask(df, method)

    if cmap is None:
        cmap = create_custom_cmap()

    # sns.heatmap returns the axes it drew on; capturing it keeps the set_title call
    # below valid when the caller did not pass `ax`.
    ax = sns.heatmap(df.corr(method=method), mask=mask, annot=True, cmap=cmap, ax=ax,
                     cbar=False)

    if title is None:
        if method in ['pearson', 'spearman']:
            title = f'{method.capitalize()} Correlation Heatmap'
        else:
            title = 'Correlation Heatmap'

    ax.set_title(title)

In [None]:
# Heatmap

# Create subplots
fig, axs = plt.subplots(1, 3, figsize=(13, 5), gridspec_kw={'width_ratios': [1, 1, 0.05]})

# Plot Pearson and Spearman heatmaps
plot_corr_heatmap(df[num_cols], ax=axs[0])
plot_corr_heatmap(df[num_cols], method='spearman', ax=axs[1])

# Create a common colorbar for both heatmaps
cbar = fig.colorbar(axs[0].collections[0], cax=axs[2])

# Adjust layout to prevent overlapping
plt.tight_layout()
plt.show()

Conclusions:

- None of the features show a strong linear relationship with each other. However, there is a moderate relationship between applicant income and loan amount. This makes sense because individuals with higher incomes often need larger loan amounts.

- Both Pearson and Spearman correlation coefficients show similar patterns, but their values are slightly different. Since the heatmaps from both are similar, the exact values are less important. In this case, Spearman's correlation is more suitable because the data isn't normally distributed, doesn't have a linear relationship between features, and has outliers.


### Categorical - Categorical


In [None]:
def crosstab_heatmap(df, features, target = None, cmap=None, fig_width=12.5,
                     fig_height=None, annot=True, cbar=False):
    """Plot row- and column-normalised crosstab heatmaps (in percent) for feature pairs.

    Each subplot row shows one pair: the left panel is normalised by index, the right
    panel by columns.

    Parameters
    ----------
    df : DataFrame containing the features (and the target, if given).
    features : list of column names.
    target : when None, every unordered pair of `features` is plotted; otherwise each
        feature is paired with `target`.
    cmap : colormap; defaults to the one built by `create_custom_cmap()`.
    fig_width : figure width.
    fig_height : figure height; defaults to 5 per plotted pair.
    annot, cbar : forwarded to `sns.heatmap`.
    """
    cmap = create_custom_cmap() if cmap is None else cmap

    # Work out which (row variable, column variable) pairs are plotted, in order
    if target is None:
        pairs = [
            (row_name, col_name)
            for pos, row_name in enumerate(features)
            for col_name in features[pos + 1:]
        ]
    else:
        pairs = [(feature, target) for feature in features]
    n_plots = len(pairs)

    # Automatically adjust fig_height if not provided
    if not fig_height:
        fig_height = n_plots * 5

    fig, axs = plt.subplots(n_plots, 2, figsize=(fig_width, fig_height))
    axs = axs.reshape(-1, 2)  # Always (n_plots, 2), even for a single pair

    heatmap_kws = dict(annot=annot, cmap=cmap, cbar=cbar, fmt='0.2f',
                       xticklabels=True, yticklabels=True)

    for axes_row, (row_name, col_name) in zip(axs, pairs):
        by_index = pd.crosstab(df[row_name], df[col_name], normalize='index') * 100
        by_column = pd.crosstab(df[row_name], df[col_name], normalize='columns') * 100
        panels = (
            (axes_row[0], by_index, f"{row_name} vs {col_name} (Index Normalized)"),
            (axes_row[1], by_column, f"{row_name} vs {col_name} (Column Normalized)"),
        )
        for panel_ax, table, panel_title in panels:
            sns.heatmap(table, ax=panel_ax, **heatmap_kws)
            panel_ax.set_title(panel_title)

    # Adjust layout
    plt.tight_layout()
    plt.show()

In [None]:
# Heatmap
crosstab_heatmap(df, cat_cols)


### Numerical - Categorical


In [None]:
def num_cat_analysis(df, num_feature, cat_features, fig_width=13, fig_height=None,
                     mean_color='blue', median_color='red'):
    """Compare one numerical feature across several categorical features.

    One subplot row is drawn per categorical feature: a violinplot on the left and a
    pointplot of the per-category mean and median on the right.

    Parameters
    ----------
    df : DataFrame containing the features.
    num_feature : name of the numerical column.
    cat_features : list of categorical column names.
    fig_width : figure width.
    fig_height : figure height; defaults to 4 per categorical feature.
    mean_color, median_color : colors of the mean and median pointplots.
    """
    n_cat_features = len(cat_features)

    # Set default figure height based on number of categorical features
    if not fig_height:
        fig_height = n_cat_features * 4

    # squeeze=False keeps `axs` two-dimensional even for a single categorical feature
    fig, axs = plt.subplots(n_cat_features, 2, figsize=(fig_width, fig_height),
                            squeeze=False)

    for (violin_ax, point_ax), cat_feature in zip(axs, cat_features):
        # Plot violinplot
        sns.violinplot(df, x=cat_feature, y=num_feature, hue=cat_feature,
                       ax=violin_ax)
        violin_ax.set_title(f'Violinplot of {num_feature} by {cat_feature}')

        # Plot pointplot
        sns.pointplot(df, x=cat_feature, y=num_feature, errorbar=None, color=mean_color,
                      ax=point_ax, label='Mean')
        sns.pointplot(df, x=cat_feature, y=num_feature, errorbar=None, color=median_color,
                      estimator='median', ax=point_ax, label='Median')
        point_ax.set_title(f'Pointplot of {num_feature} by {cat_feature}')
        # The 'Mean'/'Median' labels only become visible once a legend is drawn
        point_ax.legend(title='Estimation type')

    # Adjust layout
    plt.tight_layout()
    plt.show()

In [None]:
# Applicant income
num_cat_analysis(df, 'ApplicantIncome', cat_cols)

In [None]:
# Coapplicant income
num_cat_analysis(df, 'CoapplicantIncome', cat_cols)

In [None]:
# Loan amount
num_cat_analysis(df, 'LoanAmount', cat_cols)


# Feature construction


- It appears that individuals with a co-applicant income of 0 do not have a co-applicant. Therefore, create a feature named 'Has_coapplicant'. In this feature, set individuals with a co-applicant income of 0 to 'No', and those with a non-zero co-applicant income to 'Yes'.

In [None]:
df['Has_coapplicant'] = np.where(df['CoapplicantIncome'] == 0, 'No', 'Yes')
df['Has_coapplicant']

In [None]:
# Appending newly constructed features to feature lists based on their type
# (guarded so that re-running this cell does not add the same name twice)
if 'Has_coapplicant' not in cat_cols:
    cat_cols.append('Has_coapplicant')


# EDA (On newly constructed features)



## Univariate analysis


In [None]:
df['Has_coapplicant'].describe().T

In [None]:
sns.countplot(x = df['Has_coapplicant'])
plt.show()


## Bivariate analysis



#### Categorical - Categorical

In [None]:
# Heatmap

# Relate every other categorical feature to the newly constructed 'Has_coapplicant'.
# The earlier hand-written loop passed the `create_custom_cmap` function itself as `cmap`
# (it was never called) and relied on `n_cat_cols` having been computed before the new
# feature was appended. Reusing `crosstab_heatmap` avoids both problems and draws the
# same index- and column-normalised panels with the same titles.
features_vs_coapplicant = [col for col in cat_cols if col != 'Has_coapplicant']
crosstab_heatmap(df, features_vs_coapplicant, target='Has_coapplicant',
                 fig_width=13, fig_height=40)


#### Numerical - Categorical


In [None]:
# Violinplot & Pointplot

# Create subplots
fig, axs = plt.subplots(n_num_cols, 2, figsize=(12, 9))

for i in range(n_num_cols):

    # Plot violinplot
    sns.violinplot(data=df, x='Has_coapplicant', y=num_cols[i], ax=axs[i, 0])
    axs[i, 0].set_title(f'Violinplot of {num_cols[i]} by Has_coapplicant')

    # Plot pointplot
    sns.pointplot(data=df, x='Has_coapplicant', y=num_cols[i], errorbar=None, color='blue', ax=axs[i, 1])
    sns.pointplot(data=df, x='Has_coapplicant', y=num_cols[i], errorbar=None, color='red', estimator='median', ax=axs[i, 1])
    axs[i, 1].set_title(f'Pointplot of {num_cols[i]} by Has_coapplicant')
    handles = [
        plt.Line2D([0], [0], color='blue', marker='o', markersize=8, label='Mean'),
        plt.Line2D([0], [0], color='red', marker='o', markersize=8, label='Median')
    ]
    axs[i, 1].legend(handles=handles, title='Estimation type')

# Adjust layout
plt.tight_layout()
plt.show()


# EDA (Between independent features and target feature)


## Categorical - Categorical


In [None]:
crosstab_heatmap(df, cat_cols, target='Loan_Status')


## Numerical - Categorical


In [None]:
def num_cat_analysis(df, num_feature, cat_feature, fig_width=13, fig_height=None,
                     mean_color='blue', median_color='red'):
    """Compare numerical features across categorical features.

    One subplot row is drawn per (numerical, categorical) pair: a violinplot on the
    left and a pointplot of the per-category mean and median on the right.

    Parameters
    ----------
    df : DataFrame containing the features.
    num_feature : a numerical column name or a list of them.
    cat_feature : a categorical column name or a list of them.
        When both arguments are lists, every combination is plotted.
    fig_width : figure width.
    fig_height : figure height; defaults to 4 per plotted pair.
    mean_color, median_color : colors of the mean and median pointplots.
    """
    # Accept a single name or a collection for either argument
    num_features = [num_feature] if isinstance(num_feature, str) else list(num_feature)
    cat_features = [cat_feature] if isinstance(cat_feature, str) else list(cat_feature)

    # One subplot row per (numerical, categorical) pair
    pairs = [(num, cat) for num in num_features for cat in cat_features]
    n_rows = len(pairs)

    # Set default figure height based on number of plotted pairs
    if not fig_height:
        fig_height = n_rows * 4

    # squeeze=False keeps `axs` two-dimensional even for a single pair
    fig, axs = plt.subplots(n_rows, 2, figsize=(fig_width, fig_height), squeeze=False)

    for (violin_ax, point_ax), (num, cat) in zip(axs, pairs):
        # Plot violinplot
        sns.violinplot(df, x=cat, y=num, hue=cat, ax=violin_ax)
        violin_ax.set_title(f'Violinplot of {num} by {cat}')

        # Plot pointplot
        sns.pointplot(df, x=cat, y=num, errorbar=None, color=mean_color,
                      ax=point_ax, label='Mean')
        sns.pointplot(df, x=cat, y=num, errorbar=None, color=median_color,
                      estimator='median', ax=point_ax, label='Median')
        point_ax.set_title(f'Pointplot of {num} by {cat}')
        # The 'Mean'/'Median' labels only become visible once a legend is drawn
        point_ax.legend(title='Estimation type')

    # Adjust layout
    plt.tight_layout()
    plt.show()

In [None]:
num_cat_analysis(df, num_feature=num_cols, cat_feature='Loan_Status')

In [None]:
num_cat_analysis(df, cat_feature=cat_cols, num_feature='ApplicantIncome')

In [None]:
# Violinplot & Pointplot

# Create subplots
fig, axs = plt.subplots(n_num_cols, 2, figsize=(12, 9))

for i in range(n_num_cols):

    # Plot violinplot
    sns.violinplot(data=df, x='Loan_Status', y=num_cols[i], ax=axs[i, 0])
    axs[i, 0].set_title(f'Violinplot of {num_cols[i]} by Loan_Status')

    # Plot pointplot
    sns.pointplot(data=df, x='Loan_Status', y=num_cols[i], errorbar=None, color='blue', ax=axs[i, 1])
    sns.pointplot(data=df, x='Loan_Status', y=num_cols[i], errorbar=None, color='red', estimator='median', ax=axs[i, 1])
    axs[i, 1].set_title(f'Pointplot of {num_cols[i]} by Loan_Status')
    handles = [
        plt.Line2D([0], [0], color='blue', marker='o', markersize=8, label='Mean'),
        plt.Line2D([0], [0], color='red', marker='o', markersize=8, label='Median')
    ]
    axs[i, 1].legend(handles=handles, title='Estimation type')

# Adjust layout
plt.tight_layout()
plt.show()



# Feature transformation


In [None]:
# Epsilon to avoid log(0) and sqrt(0)
epsilon = 1e-10

# Initialize transformers
transformers = {
    'Log': FunctionTransformer(func=lambda X: np.log(X + epsilon), validate=False),
    'Square Root': FunctionTransformer(func=lambda X: np.sqrt(X + epsilon),
                                       validate=False),
    'Square': FunctionTransformer(func=np.square, validate=False),
    'Reciprocal': FunctionTransformer(func=lambda X: np.reciprocal(X + epsilon),
                                      validate=False),
    'Yeo-Johnson': PowerTransformer(standardize=False),
    'Quantile': QuantileTransformer(n_quantiles=df.shape[0], output_distribution='normal')
}

# Apply transformations
# Each result keeps df's own index so that assigning a transformed column back into
# `df` aligns row by row, instead of depending on `df` having a default RangeIndex.
transformed_data = {}
for name, transformer in transformers.items():
    transformed_data[name] = pd.DataFrame(transformer.fit_transform(df[num_cols]),
                                          columns=num_cols, index=df.index)

In [None]:
# Histogram

# Create subplots
fig, axs = plt.subplots(n_num_cols, len(transformers) + 1, figsize=(26, 9))

# Plot histograms
for i, col in enumerate(num_cols):
    # Original
    sns.histplot(df[col], kde=True, ax=axs[i, 0])
    axs[i, 0].set_title(f'Original {col}')

    for j, (name, transformed_df) in enumerate(transformed_data.items()):
        sns.histplot(transformed_df[col], kde=True, ax=axs[i, j + 1])
        axs[i, j + 1].set_title(f'{name} {col}')

# Turn off y axis labels for all subplots
axs = axs.flatten()
for ax in axs:
    ax.set_ylabel('')

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# QQ plot

# Create subplots
fig, axs = plt.subplots(n_num_cols, len(transformers) + 1, figsize=(26, 9))

# Plot Q-Q plots
for i, col in enumerate(num_cols):
    # Original
    stats.probplot(df[col], dist="norm", plot=axs[i, 0])
    axs[i, 0].set_title(f'Original {col}')
    axs[i, 0].get_lines()[1].set_color('red')  # Make the reference line red

    for j, (name, transformed_df) in enumerate(transformed_data.items()):
        stats.probplot(transformed_df[col], dist="norm", plot=axs[i, j + 1])
        axs[i, j + 1].set_title(f'{name} {col}')
        axs[i, j + 1].get_lines()[1].set_color('red')  # Make the reference line red

# Turn off x and y axis labels for all subplots
axs = axs.flatten()
for ax in axs:
    ax.set_xlabel('')
    ax.set_ylabel('')

# Adjust layout
plt.tight_layout()
plt.show()

Conclusions:

- ApplicantIncome & LoanAmount: Quantile Transform
- CoapplicantIncome: Reciprocal Transform

In [None]:
df['ApplicantIncome'] = transformed_data['Quantile']['ApplicantIncome']
df['CoapplicantIncome'] = transformed_data['Reciprocal']['CoapplicantIncome']
df['LoanAmount'] = transformed_data['Quantile']['LoanAmount']


# Building predictive model


## Preparing the data


In [None]:
# Splitting the data into features and target
X_train = df.drop('Loan_Status', axis=1)
X_train = X_train[num_cols + cat_cols] # To place numerical columns first
y_train = df['Loan_Status']

In [None]:
# Label encode the target feature

# Initialize the LabelEncoder
le = LabelEncoder()

# Fit and transform the target feature
y_train = le.fit_transform(y_train)

In [None]:
# Getting indices of numerical and categorical features of the X_train dataframe
indices_num_cols = X_train.columns.get_indexer(num_cols)
indices_cat_cols = X_train.columns.get_indexer(cat_cols)

In [None]:
# Column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), indices_num_cols), # Scaling numerical columns
        ('encoder', OneHotEncoder(drop='first', sparse_output=False), indices_cat_cols) # OneHotEncoding categorical columns
    ],
    remainder='passthrough'  # Keep the columns not listed in num_cols or cat_cols as is
)

In [None]:
# Pipeline
pipe = Pipeline([
    ('preprocessor', preprocessor)
])

In [None]:
# Preprocessing and Transforming training data using pipeline
X_train_transformed = pipe.fit_transform(X_train)


### Handling imbalanced dataset


In [None]:
# Oversampling the dataset using SMOTE
# NOTE(review): the resampling runs on the already-transformed training matrix and
# overwrites both X_train_transformed and y_train, before any cross-validation split.
# Presumably the folds scored later then contain synthetic rows in their validation
# parts - confirm whether SMOTE should instead be applied inside each CV fold.
smote = SMOTE(random_state=42)  # Initialize SMOTE with optional random_state for reproducibility
X_train_transformed, y_train = smote.fit_resample(X_train_transformed, y_train)

In [None]:
# Test
unique_values, counts = np.unique(y_train, return_counts=True)

# Print the counts of each class
for value, count in zip(unique_values, counts):
    print(f"Class {value}: {count}")


## Model selection (Before hyperparameter tuning)


In [None]:
# Define models
models = {
    'Logistic Regression': LogisticRegression,
    'Random Forest': RandomForestClassifier,
    'Gradient Boosting': GradientBoostingClassifier,
    'Support Vector Machine': SVC,
    'KNN': KNeighborsClassifier,
    'Decision Trees': DecisionTreeClassifier,
    'Xgboost': XGBClassifier,
    'Extra Trees': ExtraTreesClassifier
}
# Define metric functions
metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score
}

In [None]:
def _resolve_model_params(model_params, model_name):
    """Return the constructor keyword arguments for ``model_name`` (``{}`` if none)."""
    if not isinstance(model_params, dict):
        return {}

    params = model_params.get(model_name) or {}

    # hyperparameter_tuning() stores the winning grid point under 'Best Parameters'
    # next to a score entry; unwrap it so only real keyword arguments remain.
    if 'Best Parameters' in params:
        params = params['Best Parameters']

    return dict(params)


def eval_models_across_metrics(models, metrics, X_train, y_train, cv=5, sort=False,
                               model_params=None):
    """Cross-validate every model on every metric and collect the mean scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator *class* (it is instantiated here).
    metrics : iterable of str
        Scoring names passed to ``cross_val_score(scoring=...)``.
    X_train, y_train
        Training features and target handed to ``cross_val_score``.
    cv : int, default 5
        Cross-validation setting forwarded to ``cross_val_score``.
    sort : bool, default False
        When True, the models of each metric are ordered from best to worst score.
    model_params : dict, optional
        Per-model constructor keyword arguments, keyed by model name. Each value
        may be a flat ``{param: value}`` dict or an entry produced by
        ``hyperparameter_tuning`` (a dict holding a 'Best Parameters' key).
        Models without an entry are built with their default parameters.

    Returns
    -------
    dict
        ``{metric: {model_name: mean_cv_score}}`` with scores rounded to 3 decimals.
    """
    # Materialize once: `metrics` is iterated twice and may be a one-shot iterable.
    metrics = list(metrics)
    models_across_metrics = {metric: {} for metric in metrics}

    for metric in metrics:
        for model_name, model in models.items():
            estimator = model(**_resolve_model_params(model_params, model_name))
            cv_scores = cross_val_score(estimator, X_train, y_train,
                                        cv=cv, scoring=metric)
            models_across_metrics[metric][model_name] = round(float(cv_scores.mean()), 3)

    if sort:
        for metric, model_scores in models_across_metrics.items():
            models_across_metrics[metric] = dict(
                sorted(model_scores.items(), key=lambda item: item[1], reverse=True)
            )

    return models_across_metrics

In [None]:
# Baseline comparison: every candidate model with default hyperparameters,
# ranked from best to worst within each metric
models_across_metrics = eval_models_across_metrics(
    models=models,
    metrics=metrics.keys(),
    X_train=X_train_transformed,
    y_train=y_train,
    sort=True,
)
print(json.dumps(models_across_metrics, indent=4))

Conclusions:

After evaluating the metrics, I decided to focus on the top three models: the Random Forest, Extra Trees, and XGBoost classifiers. These models performed strongly across all four metrics (accuracy, precision, recall, and F1), which makes them the best candidates for hyperparameter tuning.


## Hyperparameter tuning


In [None]:
# Narrow `models` down to the three best performers; this rebinds the name,
# so the cells below only see these estimators
models = dict([
    ('Random Forest', RandomForestClassifier),
    ('Xgboost', XGBClassifier),
    ('Extra Trees', ExtraTreesClassifier),
])

In [None]:
# Search space shared by the two scikit-learn tree ensembles
_tree_ensemble_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Model name -> hyperparameter grid explored by GridSearchCV
param_grids = {
    'Random Forest': {name: list(values) for name, values in _tree_ensemble_grid.items()},
    'Xgboost': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2],
    },
    'Extra Trees': {
        **{name: list(values) for name, values in _tree_ensemble_grid.items()},
        'bootstrap': [True, False],  # Extra Trees additionally toggles bootstrapping
    },
}

In [None]:
def _perform_grid_search(models_dict, param_grids, X_train, y_train):
    """Grid-search every model and collect its best parameters and mean CV accuracy."""
    required = {
        'models_dict': models_dict,
        'param_grids': param_grids,
        'X_train': X_train,
        'y_train': y_train,
    }
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise ValueError(
            "Cannot run the grid search: missing argument(s) " + ", ".join(missing)
            + ". Pass them explicitly, or point file_path at an existing results file."
        )

    best_params = {}
    for model_name, model in models_dict.items():
        print(f"Processing {model_name}...")
        grid_search = GridSearchCV(estimator=model(), param_grid=param_grids[model_name],
                                   scoring='accuracy', cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_params[model_name] = {
            'Best Parameters': grid_search.best_params_,
            'Average accuracy score on the best parameters': round(float(grid_search.best_score_), 3)
        }
    return best_params


def hyperparameter_tuning(models_dict = None, param_grids = None, X_train = None,
                          y_train = None, file_path=None, force_overwrite = False):
    """Find the best hyperparameters per model with GridSearchCV, with optional caching.

    Parameters
    ----------
    models_dict : dict, optional
        Maps a model name to an estimator class (instantiated with defaults for the search).
    param_grids : dict, optional
        Maps the same model names to their GridSearchCV parameter grids.
    X_train, y_train : optional
        Training data used to fit the grid searches.
    file_path : str, optional
        joblib file used as a cache. If it exists and ``force_overwrite`` is False,
        the stored results are returned and no search is run (the other arguments
        are then not needed). Otherwise the search runs and its results are saved here.
    force_overwrite : bool, default False
        Re-run the search and overwrite ``file_path`` even if it already exists.

    Returns
    -------
    dict
        ``{model_name: {'Best Parameters': {...},
        'Average accuracy score on the best parameters': float}}``.

    Raises
    ------
    ValueError
        If a search has to be run but a required argument was left as None.
    """
    use_cache = bool(file_path) and os.path.exists(file_path) and not force_overwrite
    if use_cache:
        # NOTE: joblib.load unpickles the file; only load result files you created yourself.
        # The cached results are returned as-is, even if the data or grids changed since.
        return joblib.load(file_path)

    best_params = _perform_grid_search(models_dict, param_grids, X_train, y_train)

    if file_path:
        joblib.dump(best_params, file_path)

    return best_params

In [None]:
# Finding best hyperparameters on top models using GridSearchCV.
# The models, grids and data are passed explicitly so the search can actually run
# when 'best_params.joblib' does not exist yet (e.g. on a fresh checkout); when the
# file exists, the cached results are loaded and no search is performed.
best_params = hyperparameter_tuning(models, param_grids, X_train_transformed, y_train,
                                    file_path='best_params.joblib')
print(json.dumps(best_params, indent=4))

In [None]:
# Comparing top models across metrics after hyperparameter tuning.
# Each best_params entry also holds a score, so extract only the 'Best Parameters'
# dict: that is what the model constructors accept as keyword arguments.
tuned_params = {
    model_name: result['Best Parameters'] for model_name, result in best_params.items()
}
models_across_metrics = eval_models_across_metrics(models, metrics.keys(), X_train_transformed,
                                                   y_train, sort=True, model_params=tuned_params)
print(json.dumps(models_across_metrics, indent=4))


## Model training


In [None]:
# (name, tuned estimator) pairs for the voting ensemble, one per top model
estimators = [
    (model_name, model(**best_params[model_name]['Best Parameters']))
    for model_name, model in models.items()
]

In [None]:
def eval_voting_clf(estimators, X_train, y_train, cv = 5):
    """Cross-validate a hard-voting and a soft-voting ensemble of ``estimators``.

    Parameters
    ----------
    estimators : list of (name, estimator) tuples
        Passed to ``VotingClassifier(estimators=...)``.
    X_train, y_train
        Training data handed to ``cross_val_score``.
    cv : int, default 5
        Cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    dict
        Mean CV accuracy rounded to 3 decimals, under the keys 'Hard Margin'
        (``voting='hard'``) and 'Soft Margin' (``voting='soft'``).
    """
    accuracy_results = {}

    # Result label -> VotingClassifier voting mode, evaluated in this order
    for label, voting_mode in (('Hard Margin', 'hard'), ('Soft Margin', 'soft')):
        ensemble = VotingClassifier(estimators=estimators, voting=voting_mode)
        fold_scores = cross_val_score(ensemble, X_train, y_train, cv=cv, scoring='accuracy')
        accuracy_results[label] = round(fold_scores.mean(), 3)

    return accuracy_results

In [None]:
# Cross-validated accuracy of the hard- and soft-voting ensembles
accuracy = eval_voting_clf(
    estimators=estimators,
    X_train=X_train_transformed,
    y_train=y_train,
)
accuracy

In [None]:
# Train the final ensemble (hard voting) on the full resampled training set
voting_clf = VotingClassifier(estimators=estimators, voting='hard')
voting_clf.fit(X=X_train_transformed, y=y_train)

## Saving

In [None]:
# Persist the fitted voting ensemble to disk
joblib.dump(value=voting_clf, filename='model.joblib')

In [None]:
# Persist the fitted preprocessing pipeline so the same transformation can be reapplied later
joblib.dump(value=pipe, filename='preprocessor.joblib')