In [None]:
# Basic Libraries
import pandas as pd
import numpy as np
from scipy import stats

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import PolynomialFeatures
from sklearn.utils import resample

# For data manipulation
import pandas as pd
import numpy as np

# For splitting the data
from sklearn.model_selection import train_test_split

# For resampling (handling class imbalance)
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

# For scaling numeric features
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from lightgbm import LGBMClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from lightgbm.callback import early_stopping  # Correct import for the early stopping callback
import xgboost as xgb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier


# MLFlow for MLOps
import mlflow
import mlflow.sklearn  # For scikit-learn models
import mlflow.catboost  # For CatBoost models
import mlflow.lightgbm  # For LightGBM models
import mlflow.xgboost  # For XGBoost models

# Configurations (optional)
# Remove the column cap so wide DataFrames are displayed with every column.
pd.options.display.max_columns = None

import warnings
# NOTE(review): this silences *all* warnings for the rest of the notebook
# (including deprecation notices from pandas / seaborn) — consider narrowing the filter.
warnings.filterwarnings('ignore')



In [None]:
# Read the train and test CSV files from the local data/ folder
train, test = (pd.read_csv(csv_path) for csv_path in ('data/Train.csv', 'data/Test.csv'))

In [None]:
# First five rows of the training frame
train.head(n=5)

In [None]:
# Number of rows per Loan_Status value
train.loc[:, 'Loan_Status'].value_counts()

## EDA

### Inspect The Data

In [None]:
# Display the first few rows
print(train.head())

# Get a concise summary of the DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
train.info()

# Check for missing values (count of nulls per column)
missing_values = train.isnull().sum()
print("Missing Values:\n", missing_values)


Observations:

1. No Missing Values: None of the columns have missing values, so we don’t need imputation for this dataset.
2. Data Types:
    * Most columns are numeric (int64, float64).
    * Loan_ID and Dependents are object types and may need preprocessing (e.g., encoding or converting to numerical values where necessary).
3. Features:
    * ID and Loan_ID appear to be identifiers and might not contribute to the predictive power of the model.
    * Categorical features such as Gender, Married, Dependents, Education, Self_Employed, and Property_Area might need encoding or exploration to understand their distributions.
    * Numerical features (ApplicantIncome, CoapplicantIncome, LoanAmount, etc.) can be further analyzed for distributions and relationships with Loan_Status.


### Univariate, Bivariate, and Multivariate Exploration

##### Exploring the Loan_ID column

In [None]:
# Step 1: Look at a handful of raw identifiers to see their format
print("Sample Loan_IDs:")
print(train['Loan_ID'].head(10))

# Step 2: Split each ID into its first run of letters and its first run of digits
train['Loan_ID_Prefix'] = train['Loan_ID'].str.extract(r'([A-Za-z]+)', expand=False)
train['Loan_ID_Suffix'] = train['Loan_ID'].str.extract(r'(\d+)', expand=False)

# Step 3: Summarise both components (the suffix is still text here, hence the cast)
print("\nUnique prefixes:")
print(train['Loan_ID_Prefix'].unique())

print("\nSummary of suffixes:")
print(train['Loan_ID_Suffix'].astype(int).describe())

# Step 4: Bar chart of prefix frequencies, most common first
# (matplotlib / seaborn are already imported in the first cell)
prefix_order = train['Loan_ID_Prefix'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train, x='Loan_ID_Prefix', order=prefix_order, ax=ax)
ax.set(title='Distribution of Loan_ID Prefixes', xlabel='Loan_ID Prefix', ylabel='Count')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

# Step 5: Cross-tabulate prefix against Loan_Status (missing combinations count as 0)
prefix_status = (
    train.groupby(['Loan_ID_Prefix', 'Loan_Status'])
    .size()
    .unstack(fill_value=0)
)
print("\nLoan Status Distribution by Prefix:")
print(prefix_status)

# Row-normalise the counts and show them as stacked proportions per prefix
row_totals = prefix_status.sum(axis=1)
prefix_status_normalized = prefix_status.div(row_totals, axis=0)
ax = prefix_status_normalized.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='viridis')
ax.set(title='Loan Status Proportions by Loan_ID Prefix', ylabel='Proportion', xlabel='Loan_ID Prefix')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [None]:
# Step 1: Extract numeric suffix (the trailing digits of the ID) as an integer column
train['Loan_ID_Suffix'] = train['Loan_ID'].str.extract(r'(\d+)$', expand=False).astype(int)
test['Loan_ID_Suffix'] = test['Loan_ID'].str.extract(r'(\d+)$', expand=False).astype(int)

# Step 2: Analyze the suffix
print("Train Loan_ID_Suffix Statistics:")
print(train['Loan_ID_Suffix'].describe())

print("\nTest Loan_ID_Suffix Statistics:")
print(test['Loan_ID_Suffix'].describe())

# Visualize distribution of suffix
plt.figure(figsize=(12, 6))
sns.histplot(train['Loan_ID_Suffix'], bins=20, kde=True, color="purple")
plt.title("Distribution of Loan_ID Suffix in Train Dataset")
plt.xlabel("Loan_ID Suffix")
plt.ylabel("Frequency")
plt.show()

# Step 3: Correlation with Loan_Status
suffix_corr = train[['Loan_ID_Suffix', 'Loan_Status']].corr()
print("\nCorrelation between Loan_ID_Suffix and Loan_Status:")
print(suffix_corr)

# Step 4: Create bins for Loan_ID_Suffix.
# The quartile edges are learned on train only and then reused for test, so a
# given suffix lands in the same bin in both frames. (Calling qcut on each frame
# separately would give every frame its own, different edges.)
suffix_bin_labels = ['Early', 'Mid-Early', 'Mid-Late', 'Late']
train['Loan_ID_Suffix_Bin'], suffix_bin_edges = pd.qcut(
    train['Loan_ID_Suffix'], q=4, labels=suffix_bin_labels, retbins=True
)
# Keep the interior train quartiles but open the outer edges, so test suffixes
# below the train minimum / above the train maximum still receive a bin.
suffix_bin_edges = np.concatenate(
    ([-np.inf], np.asarray(suffix_bin_edges, dtype=float)[1:-1], [np.inf])
)
test['Loan_ID_Suffix_Bin'] = pd.cut(
    test['Loan_ID_Suffix'], bins=suffix_bin_edges, labels=suffix_bin_labels
)

# Visualize Loan_Status counts within each Loan_ID_Suffix_Bin
plt.figure(figsize=(10, 6))
sns.countplot(data=train, x='Loan_ID_Suffix_Bin', hue='Loan_Status', palette='viridis')
plt.title("Loan_Status Proportions by Loan_ID Suffix Bins")
plt.xlabel("Loan_ID Suffix Bins")
plt.ylabel("Count")
plt.show()

# Step 5: Feature encoding for Loan_ID_Suffix_Bin
# (drop_first removes the 'Early' dummy; the source column itself is replaced by the dummies)
train = pd.get_dummies(train, columns=['Loan_ID_Suffix_Bin'], drop_first=True)
test = pd.get_dummies(test, columns=['Loan_ID_Suffix_Bin'], drop_first=True)


Observations

1. Distribution:
    * The Loan_ID_Suffix values are not evenly distributed; they exhibit a clustering pattern.
    * Most suffixes are concentrated around higher values, as evident from the histogram.

2. Binned Suffix Categories:
    * Suffixes were grouped into four bins: Early, Mid-Early, Mid-Late, and Late.
    * These bins correspond to ranges within the Loan_ID_Suffix and allow us to analyze loan approval rates across different segments.

3. Loan Approval by Suffix Bins:
    * Loan_Status proportions in all suffix bins are consistent, with a higher number of approved loans (Loan_Status = 1) compared to rejected loans (Loan_Status = 0).
    * However, no significant variation is observed between bins, suggesting that Loan_ID_Suffix may not strongly influence loan approval directly.

4. Correlation with Loan_Status:
    * The correlation coefficient between Loan_ID_Suffix and Loan_Status is -0.029, indicating a negligible linear relationship.

5. Train vs Test Statistics:
    * The summary statistics (mean, median, range) for Loan_ID_Suffix in the train and test datasets are very similar, suggesting a consistent data distribution across both datasets.

Next Steps:
* Feature Engineering: While the Loan_ID_Suffix feature doesn't directly correlate with Loan_Status, its bins can be used as categorical features in the modeling process to capture any non-linear trends.
* Validation: Assess the importance of this feature during model training to decide whether to keep or drop it.


In [None]:
# Ensure bins are consistent for train and test datasets.
# The interior edges (1500 / 2000 / 2500) are unchanged. The outer edges are
# open-ended instead of the former hard-coded 1000 and 2602, so a suffix outside
# that range is still assigned to 'Early' / 'Late' rather than becoming NaN
# (which would later produce an all-zero one-hot row).
bins = [-np.inf, 1500, 2000, 2500, np.inf]
labels = ['Early', 'Mid-Early', 'Mid-Late', 'Late']

# Apply binning (right-closed intervals; the same edges are used for both frames)
train['Loan_ID_Bin'] = pd.cut(train['Loan_ID_Suffix'], bins=bins, labels=labels)
test['Loan_ID_Bin'] = pd.cut(test['Loan_ID_Suffix'], bins=bins, labels=labels)


In [None]:
# Perform one-hot encoding
train = pd.get_dummies(train, columns=['Loan_ID_Bin'], prefix='LoanID')
test = pd.get_dummies(test, columns=['Loan_ID_Bin'], prefix='LoanID')

# Columns that exist only in train and are not LoanID dummies: the Loan_Status
# target and train-only exploration helpers such as Loan_ID_Prefix. An outer
# align would fabricate them in test as all-zero columns, so they are recorded
# here and removed from test again after aligning.
train_only_cols = [
    col for col in train.columns.difference(test.columns)
    if not col.startswith('LoanID_')
]

# Ensure train and test have the same dummy columns (in case one bin is missing in the test set)
train, test = train.align(test, join='outer', axis=1, fill_value=0)
test = test.drop(columns=train_only_cols)


In [None]:
# Print the head of train, then test, to eyeball the newly added columns
for frame in (train, test):
    print(frame.head())


##### Exploring the Gender column

In [None]:
# Row count per Gender value
print(train['Gender'].value_counts())

# Loan_Status counts within each Gender
# (seaborn / matplotlib are already imported in the first cell)
fig, ax = plt.subplots()
sns.countplot(x='Gender', hue='Loan_Status', data=train, palette='viridis', ax=ax)
ax.set(title='Gender Distribution by Loan Status', xlabel='Gender (0=Female, 1=Male)', ylabel='Count')
ax.legend(title='Loan Status', labels=['Not Approved', 'Approved'])
plt.show()


Observations:
1. Gender Distribution:
    * The dataset is heavily skewed towards males (1), with a count of 5372 compared to 526 females (0).
    * This imbalance might mean that the model could learn more about the loan approval patterns of males than females unless balanced in some way.

2. Loan Status by Gender:
    * Both males and females have higher counts in the approved (1) class than in the not approved (0) class.
    * However, the majority of loan applicants and approvals are male, which might reflect trends in the dataset or underlying societal patterns.


Discussion:
While Gender is somewhat imbalanced, it might still provide useful information for the model, particularly in conjunction with other features (e.g., Married, ApplicantIncome, etc.). The following steps are possible:

1. Retain Gender as-is:
    * If no significant preprocessing is required, we can keep the feature as-is for now. It is already encoded as 0 and 1.

2. Transformations:
    * If we suspect imbalance might affect model predictions, consider balancing methods later (e.g., SMOTE for oversampling the minority class).
3. Feature Interaction:
    * Investigate how Gender interacts with other features (e.g., ApplicantIncome, Married, etc.) to uncover deeper insights.


In [None]:
# Married counts split by Gender
fig, ax = plt.subplots()
sns.countplot(x='Married', hue='Gender', data=train, palette='viridis', ax=ax)
ax.set(title='Gender vs Married', xlabel='Married (0=No, 1=Yes)', ylabel='Count')
ax.legend(title='Gender', labels=['Female', 'Male'])
plt.show()

# Share of each Loan_Status value inside every (Gender, Married) group
gender_married = (
    train.groupby(['Gender', 'Married'])['Loan_Status']
    .value_counts(normalize=True)
    .unstack()
)
print("Proportion of Loan Status by Gender and Married:\n", gender_married)


In [None]:
# Box plot of ApplicantIncome for each Gender value
fig, ax = plt.subplots()
sns.boxplot(x='Gender', y='ApplicantIncome', data=train, palette='viridis', ax=ax)
ax.set(title='Applicant Income Distribution by Gender', xlabel='Gender (0=Female, 1=Male)', ylabel='Applicant Income')
plt.show()


In [None]:
# Credit_History counts split by Gender
fig, ax = plt.subplots()
sns.countplot(x='Credit_History', hue='Gender', data=train, palette='viridis', ax=ax)
ax.set(title='Gender vs Credit History', xlabel='Credit History (0=No, 1=Yes)', ylabel='Count')
ax.legend(title='Gender', labels=['Female', 'Male'])
plt.show()

# Share of each Loan_Status value inside every (Gender, Credit_History) group
gender_credit = (
    train.groupby(['Gender', 'Credit_History'])['Loan_Status']
    .value_counts(normalize=True)
    .unstack()
)
print("Proportion of Loan Status by Gender and Credit History:\n", gender_credit)


In [None]:
# Build string-valued interaction columns of the form "<Gender>_<other>"
gender_text = train['Gender'].astype(str)

# Gender x Married
train['Gender_Married'] = gender_text.str.cat(train['Married'].astype(str), sep="_")

# Gender x Credit_History
train['Gender_Credit_History'] = gender_text.str.cat(train['Credit_History'].astype(str), sep="_")


In [None]:
# Quick feature-importance probe with a default random forest.
# (RandomForestClassifier and train_test_split are already imported in the first cell.)

# Target and predictors; the two identifier columns are left out of the feature matrix
y = train['Loan_Status']
X = train.drop(['Loan_Status', 'ID', 'Loan_ID'], axis=1)

# One-hot encode the non-numeric columns, dropping the first level of each
X = pd.get_dummies(X, drop_first=True)

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the forest on the training part only
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Importances labelled by column name, largest first
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

# Bar chart of the ten largest importances
ax = importances.head(10).plot(kind='bar', figsize=(10, 6), title="Top 10 Feature Importances")
ax.set_ylabel("Importance Score")
plt.show()


Analysis of Gender Interactions
1. Gender vs Married:
The majority of applicants are males who are married.
Loan approval rates are similar across categories, with no significant difference between married and unmarried individuals, regardless of gender (around 83%-84% approval rate for all groups).
2. Gender vs ApplicantIncome:
Males have higher median ApplicantIncome compared to females.
There are several outliers for both genders, with males having some exceptionally high incomes (>80,000).
Income might be more predictive of loan status when combined with gender, as males appear to earn more overall.
3. Gender vs Credit History:
Most applicants (both male and female) have a credit history (Credit_History = 1), and the approval rate is highest among those with a credit history (>83%).
Gender does not seem to significantly impact the approval rate within the same credit history category.
4. Feature Importance (Random Forest):
ApplicantIncome and Loan_Amount_Term are the most important features.
The new interaction features (Gender_Married and Gender_Credit_History) are not in the top 10, indicating that Gender alone or its interactions might have limited predictive power.

Recommendations for Gender Feature
1. Retain Gender as-is:
It might still contribute to the model in combination with other features like income or credit history.
2. Drop Interaction Features:
The interaction features (Gender_Married and Gender_Credit_History) did not show strong predictive power based on feature importance. We can remove them for simplicity unless further exploration proves otherwise.
3. Transform Applicant Income: To reduce the effect of outliers, we can apply a log transformation to ApplicantIncome before using it in the model.


Implementation Steps: 
Let’s apply the transformations and clean up interaction features:

> Step 1: Remove Interaction Features

In [None]:
# Drop Gender_Married and Gender_Credit_History if created.
# errors='ignore' makes the "if created" part true: the cell no longer raises a
# KeyError when the columns are absent, e.g. when it is run a second time.
train = train.drop(columns=['Gender_Married', 'Gender_Credit_History'], errors='ignore')


> Step 2: Log Transform Applicant Income

In [None]:
# Store log(1 + ApplicantIncome) in a new column; the raw column is kept
train['Log_ApplicantIncome'] = train['ApplicantIncome'].pipe(np.log1p)


> Step 3: Validate Changes

In [None]:
# Show the first five rows to confirm the dropped / added columns
train.head(n=5)

##### Exploring Married Feature

> Step 1: Distribution of Married: 
We’ll begin by analyzing the distribution of Married and its relationship with Loan_Status. Run the following code:



In [None]:
# Row count per Married value
print(train['Married'].value_counts())

# Loan_Status counts within each Married value
fig, ax = plt.subplots()
sns.countplot(x='Married', hue='Loan_Status', data=train, palette='viridis', ax=ax)
ax.set(title='Married Distribution by Loan Status', xlabel='Married (0=No, 1=Yes)', ylabel='Count')
ax.legend(title='Loan Status', labels=['Not Approved', 'Approved'])
plt.show()


> Step 2: Interaction with Other Features

In [None]:
# Married counts split by Gender
fig, ax = plt.subplots()
sns.countplot(x='Married', hue='Gender', data=train, palette='viridis', ax=ax)
ax.set(title='Married vs Gender', xlabel='Married (0=No, 1=Yes)', ylabel='Count')
ax.legend(title='Gender', labels=['Female', 'Male'])
plt.show()


In [None]:
# Box plot of ApplicantIncome for each Married value
fig, ax = plt.subplots()
sns.boxplot(x='Married', y='ApplicantIncome', data=train, palette='viridis', ax=ax)
ax.set(title='Applicant Income Distribution by Married Status', xlabel='Married (0=No, 1=Yes)', ylabel='Applicant Income')
plt.show()


In [None]:
# Credit_History counts split by Married
fig, ax = plt.subplots()
sns.countplot(x='Credit_History', hue='Married', data=train, palette='viridis', ax=ax)
ax.set(title='Married vs Credit History', xlabel='Credit History (0=No, 1=Yes)', ylabel='Count')
ax.legend(title='Married', labels=['No', 'Yes'])
plt.show()


In [None]:
# Share of each Loan_Status value within every Married group (rows sum to 1)
married_loan = (
    train.groupby('Married')['Loan_Status']
    .value_counts(normalize=True)
    .unstack()
)
print("Proportion of Loan Status by Married:\n", married_loan)


Analysis of Married Feature
1. Marital Status Distribution:
    * The majority of applicants are married (5040 vs. 858 not married).
    * Married applicants have a slightly lower loan approval rate (83.17%) compared to unmarried applicants (84.03%). However, this difference is negligible.
2. Married vs Gender:
    * Most married applicants are male, reinforcing the observation from the Gender analysis.
    * Since both Gender and Married have a strong imbalance, any interaction between them might not add significant value.
3. Married vs Applicant Income:
    * Married applicants tend to have higher median incomes than unmarried applicants.
    * Outliers exist for both groups, but these outliers are more prominent for married individuals.
4. Married vs Credit History:
    * The majority of married applicants have a credit history (Credit_History=1), and the approval rates are consistently higher for applicants with a credit history.
    * Being married doesn’t appear to strongly influence the loan outcome when compared to credit history.
5. Loan Approval Proportions:
    * The loan approval rates are similar across married and unmarried groups, with no significant variation.


Recommendations for Married
1. Retain Married:
    * Despite the limited influence, the feature may still contribute when combined with other variables, so we retain it as-is for now.
2. No Interaction Features:
    * Interactions such as Married_Gender or Married_Credit_History don’t seem to add significant value based on the current observations and approval proportions. We can avoid creating these features unless further modeling experiments suggest otherwise.
3. No Transformations Needed:
    * Married is already encoded (0=No, 1=Yes), and no additional transformations are required.


##### Exploring the Dependents feature

In [None]:
# Row count per Dependents category
print(train['Dependents'].value_counts())

# Loan_Status counts within each Dependents category
fig, ax = plt.subplots()
sns.countplot(x='Dependents', hue='Loan_Status', data=train, palette='viridis', ax=ax)
ax.set(title='Dependents Distribution by Loan Status', xlabel='Dependents', ylabel='Count')
ax.legend(title='Loan Status', labels=['Not Approved', 'Approved'])
plt.show()


In [None]:
# Box plot of ApplicantIncome for each Dependents category
fig, ax = plt.subplots()
sns.boxplot(x='Dependents', y='ApplicantIncome', data=train, palette='viridis', ax=ax)
ax.set(title='Applicant Income Distribution by Dependents', xlabel='Dependents', ylabel='Applicant Income')
plt.show()


In [None]:
# Dependents counts split by Married
fig, ax = plt.subplots()
sns.countplot(x='Dependents', hue='Married', data=train, palette='viridis', ax=ax)
ax.set(title='Dependents Distribution by Marital Status', xlabel='Dependents', ylabel='Count')
ax.legend(title='Married', labels=['No', 'Yes'])
plt.show()


In [None]:
# Dependents counts split by Credit_History
fig, ax = plt.subplots()
sns.countplot(x='Dependents', hue='Credit_History', data=train, palette='viridis', ax=ax)
ax.set(title='Dependents vs Credit History', xlabel='Dependents', ylabel='Count')
ax.legend(title='Credit History', labels=['No', 'Yes'])
plt.show()


In [None]:
# Share of each Loan_Status value within every Dependents group (rows sum to 1)
dependents_loan = (
    train.groupby('Dependents')['Loan_Status']
    .value_counts(normalize=True)
    .unstack()
)
print("Proportion of Loan Status by Dependents:\n", dependents_loan)


Analysis of Dependents Feature
1. Dependents Distribution:
    * The majority of applicants have no dependents (0 = 3659), followed by 1 dependent (1157).
    * Categories 2 and 3+ are smaller, but still relevant.
2. Dependents and Loan Status:
    * Loan approval rates decrease slightly as the number of dependents increases:
        * 0 dependents: ~84% approval.
        * 3+ dependents: ~81.7% approval.
    * This suggests a possible inverse relationship between the number of dependents and loan approval probability.
3. Dependents vs Applicant Income:
    * Applicants with more dependents tend to have slightly higher median incomes. This makes sense as families with more dependents may have more earning members or require higher incomes to support their households.
4. Dependents vs Marital Status:
    * Married applicants are more likely to report dependents, especially in the 1, 2, and 3+ categories.
    * Single applicants overwhelmingly fall into the 0 dependents category.
5. Dependents vs Credit History:
    * Applicants with a credit history dominate across all dependent categories.
    * The proportion of applicants with a good credit history does not vary significantly across categories.


Recommendations for Dependents
1. Retain Dependents as-is:
The feature shows some predictive power, especially with its slight inverse relationship to loan approval.
2. Group Rare Categories:
Consider grouping 2 and 3+ into a single category (2+) to simplify the feature and address the smaller size of these groups.
3. No Interaction Features:
While Dependents interacts with Married and Credit_History, these interactions do not appear to add significant predictive power. For now, avoid creating interaction features.
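4. Encode as Integer:
Convert Dependents into integers to make it compatible with ML algorithms. Note that the cell below maps 3+ to 3 and keeps 2 as its own level, so the 2/3+ grouping suggested above is not actually applied: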

In [None]:
# Encode Dependents as integers in a single pass: '0'->0, '1'->1, '2'->2, '3+'->3.
# Note: '2' and '3+' remain separate levels (2 and 3). The intermediate '2+' label
# of the two-step version only renamed '3+' before mapping it to 3; a literal '2+'
# is still accepted and mapped to 3 as before.
dependents_codes = {'3+': 3, '2+': 3, '0': 0, '1': 1, '2': 2}
train['Dependents'] = train['Dependents'].replace(dependents_codes).astype(int)

##### Exploring the Education feature

In [None]:
# Row count per Education value
print(train['Education'].value_counts())

# Loan_Status counts within each Education value
fig, ax = plt.subplots()
sns.countplot(x='Education', hue='Loan_Status', data=train, palette='viridis', ax=ax)
ax.set(title='Education Distribution by Loan Status', xlabel='Education (0=Undergraduate, 1=Graduate)', ylabel='Count')
ax.legend(title='Loan Status', labels=['Not Approved', 'Approved'])
plt.show()


In [None]:
# Box plot of ApplicantIncome for each Education value
fig, ax = plt.subplots()
sns.boxplot(x='Education', y='ApplicantIncome', data=train, palette='viridis', ax=ax)
ax.set(title='Applicant Income Distribution by Education Level', xlabel='Education (0=Undergraduate, 1=Graduate)', ylabel='Applicant Income')
plt.show()


In [None]:
# Dependents counts split by Education
fig, ax = plt.subplots()
sns.countplot(x='Dependents', hue='Education', data=train, palette='viridis', ax=ax)
ax.set(title='Dependents Distribution by Education Level', xlabel='Dependents', ylabel='Count')
ax.legend(title='Education', labels=['Undergraduate', 'Graduate'])
plt.show()


In [None]:
# Credit_History counts split by Education
fig, ax = plt.subplots()
sns.countplot(x='Credit_History', hue='Education', data=train, palette='viridis', ax=ax)
ax.set(title='Education vs Credit History', xlabel='Credit History (0=No, 1=Yes)', ylabel='Count')
ax.legend(title='Education', labels=['Undergraduate', 'Graduate'])
plt.show()


In [None]:
# Share of each Loan_Status value within every Education group (rows sum to 1)
education_loan = (
    train.groupby('Education')['Loan_Status']
    .value_counts(normalize=True)
    .unstack()
)
print("Proportion of Loan Status by Education:\n", education_loan)


Analysis of Education Feature
1. Education Distribution:
    1. The majority of applicants are undergraduates (category 0), significantly outnumbering graduates.
    2. This suggests that the dataset is imbalanced in terms of education level.
2. Education and Loan Status:
    * Graduates have a slightly lower loan approval rate (~82.2%) compared to undergraduates (~83.4%).
    * The difference is small but may still hold predictive value.
3. Education vs Applicant Income:
    * Graduates tend to have slightly higher median incomes than undergraduates, as expected.
    * However, the difference is not substantial, and both groups show a similar range of incomes.
4. Education vs Dependents:
    * A majority of applicants with no dependents are undergraduates, while graduates are distributed more evenly across dependent categories.
    * This could indicate that graduates are more likely to have families.
5. Education vs Credit History:
    * Undergraduates dominate the population with a credit history (Credit_History = 1), reflecting the general imbalance in education levels in the dataset.
    * There is no significant variation in credit history proportions between education groups.

Recommendations for Education
1. Retain Education as-is:
The feature is already binary (0 = Undergraduate, 1 = Graduate), which is optimal for most machine learning algorithms.
2. No Interaction Features:
While Education interacts with other features like Dependents and Credit_History, these interactions do not show significant predictive potential for Loan_Status. No additional interaction features are recommended for now.
3. No Additional Transformations:
Education is already encoded and well-suited for modeling.


##### Exploring the self_employed column

In [None]:
# Row count per Self_Employed value
print(train['Self_Employed'].value_counts())

# Loan_Status counts within each Self_Employed value
fig, ax = plt.subplots()
sns.countplot(x='Self_Employed', hue='Loan_Status', data=train, palette='viridis', ax=ax)
ax.set(title='Self_Employed Distribution by Loan Status', xlabel='Self_Employed (0=No, 1=Yes)', ylabel='Count')
ax.legend(title='Loan Status', labels=['Not Approved', 'Approved'])
plt.show()


In [None]:
# Box plot of ApplicantIncome for each Self_Employed value
fig, ax = plt.subplots()
sns.boxplot(x='Self_Employed', y='ApplicantIncome', data=train, palette='viridis', ax=ax)
ax.set(title='Applicant Income Distribution by Self-Employment Status', xlabel='Self_Employed (0=No, 1=Yes)', ylabel='Applicant Income')
plt.show()


In [None]:
# Credit_History counts split by Self_Employed
fig, ax = plt.subplots()
sns.countplot(x='Credit_History', hue='Self_Employed', data=train, palette='viridis', ax=ax)
ax.set(title='Self_Employed vs Credit History', xlabel='Credit History (0=No, 1=Yes)', ylabel='Count')
ax.legend(title='Self_Employed', labels=['No', 'Yes'])
plt.show()


In [None]:
# Education counts split by Self_Employed
fig, ax = plt.subplots()
sns.countplot(x='Education', hue='Self_Employed', data=train, palette='viridis', ax=ax)
ax.set(title='Self_Employed vs Education', xlabel='Education (0=Undergraduate, 1=Graduate)', ylabel='Count')
ax.legend(title='Self_Employed', labels=['No', 'Yes'])
plt.show()


In [None]:
# Share of each Loan_Status value within every Self_Employed group (rows sum to 1)
self_employed_loan = (
    train.groupby('Self_Employed')['Loan_Status']
    .value_counts(normalize=True)
    .unstack()
)
print("Proportion of Loan Status by Self_Employed:\n", self_employed_loan)


Analysis of Self_Employed Feature
1. Self-Employed Distribution:
    * The majority of applicants are not self-employed (0), while a small portion is self-employed (1).
    * This imbalance may indicate limited predictive power for Self_Employed on its own.
2. Self-Employed and Loan Status:
    * Both groups have similar loan approval rates:
        * Non-self-employed: ~83.2% approval.
        * Self-employed: ~83.9% approval.
    * The difference is negligible and may not be a strong differentiator.
3. Self-Employed vs Applicant Income:
    * Self-employed individuals generally have slightly higher median incomes compared to non-self-employed individuals.
    * However, the difference in income distribution between the two groups is not significant.
4. Self-Employed vs Credit History:
    * Most applicants with a credit history are not self-employed, reflecting the overall distribution imbalance.
    * Self-employment status does not seem to strongly correlate with having a credit history.
5. Self-Employed vs Education:
    * The majority of self-employed individuals are undergraduates, aligning with the dominance of undergraduates in the dataset.


Recommendations for Self_Employed
1. Retain Self_Employed as-is:
Despite the imbalance, the feature may still hold some predictive power when combined with other features like income or credit history.
2. No Interaction Features:
Interactions such as Self_Employed_Education or Self_Employed_Credit_History don’t show significant value based on current analysis. No additional interaction features are recommended for now.
3. No Transformations Required:
The feature is already binary (0=No, 1=Yes), and no additional preprocessing is needed.


##### Exploring the ApplicantIncome feature

In [None]:
# Histogram (30 bins) with a KDE overlay of the raw ApplicantIncome values
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(train['ApplicantIncome'], kde=True, bins=30, color='blue', ax=ax)
ax.set(title='Distribution of Applicant Income', xlabel='Applicant Income', ylabel='Frequency')
plt.show()


In [None]:
# Box plot of ApplicantIncome for each Loan_Status value, with readable tick labels
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x='Loan_Status', y='ApplicantIncome', data=train, palette='viridis', ax=ax)
ax.set(title='Applicant Income Distribution by Loan Status', xlabel='Loan Status (0=Not Approved, 1=Approved)', ylabel='Applicant Income')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Approved', 'Approved'])
plt.show()


In [None]:
# ApplicantIncome spread for self-employed vs not self-employed applicants
fig, ax = plt.subplots()
sns.boxplot(x='Self_Employed', y='ApplicantIncome', data=train, palette='viridis', ax=ax)
ax.set_title('Applicant Income Distribution by Self-Employment Status')
ax.set_xlabel('Self_Employed (0=No, 1=Yes)')
ax.set_ylabel('Applicant Income')
plt.show()


In [None]:
# ApplicantIncome spread for each Education value
fig, ax = plt.subplots()
sns.boxplot(x='Education', y='ApplicantIncome', data=train, palette='viridis', ax=ax)
ax.set_title('Applicant Income Distribution by Education Level')
ax.set_xlabel('Education (0=Undergraduate, 1=Graduate)')
ax.set_ylabel('Applicant Income')
plt.show()


In [None]:
# ApplicantIncome spread for each Dependents value
fig, ax = plt.subplots()
sns.boxplot(x='Dependents', y='ApplicantIncome', data=train, palette='viridis', ax=ax)
ax.set_title('Applicant Income Distribution by Number of Dependents')
ax.set_xlabel('Dependents')
ax.set_ylabel('Applicant Income')
plt.show()


In [None]:
# (Re)compute log(1 + ApplicantIncome); the raw column is left in place
train['Log_ApplicantIncome'] = np.log1p(train['ApplicantIncome'])

# Histogram (30 bins) with a KDE overlay of the transformed values
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(train['Log_ApplicantIncome'], kde=True, bins=30, color='green', ax=ax)
ax.set(title='Log-Transformed Distribution of Applicant Income', xlabel='Log Applicant Income', ylabel='Frequency')
plt.show()


Analysis of ApplicantIncome

1. Distribution
    * The raw ApplicantIncome is highly skewed with a long tail of high incomes.
    * The log transformation significantly normalizes the distribution, making it more suitable for machine learning models.
2. Loan Status
    * Loan approval is not strongly differentiated by applicant income. Both approved and not-approved groups show a similar income distribution.
    * Outliers with very high incomes do not appear to significantly influence loan approval.
3. Interactions
    * Dependents: Income levels slightly increase with the number of dependents, but the variance within each group is large.
    * Education: Graduates tend to have higher incomes compared to undergraduates, as expected.
    * Self-Employment: Self-employed individuals generally earn more than non-self-employed ones, aligning with expectations.
4. Insights on Transformation
    * Using the log-transformed version (Log_ApplicantIncome) is preferred for modeling due to its normalized distribution.
    * Creating interaction features (e.g., Income_Education, Income_SelfEmployed) could improve model performance as income patterns vary across these groups.

Recommendations
1. Retain Log_ApplicantIncome:

    * Replace the raw ApplicantIncome with the log-transformed version in the dataset.
2. Create Interaction Features:
    * Interaction features such as:
        * Income_Education = Log_ApplicantIncome * Education
        * Income_SelfEmployed = Log_ApplicantIncome * Self_Employed
    * These features could add value by capturing combined effects of income and categorical attributes.

3. Optional Binning:
If interpretability is a priority, binning income into categories (e.g., low, medium, high) can make the feature easier to explain. However, this may lead to loss of information.


In [None]:
# Derive income-by-category interaction columns from the log-scaled income.
# NOTE(review): the products assume Education and Self_Employed are already
# numerically encoded at this point — confirm against the earlier encoding cell.
log_income = train['Log_ApplicantIncome']
train['Income_Education'] = log_income * train['Education']
train['Income_SelfEmployed'] = log_income * train['Self_Employed']

# Show the inputs next to the derived columns as a quick sanity check
preview_columns = ['Log_ApplicantIncome', 'Education', 'Self_Employed', 'Income_Education', 'Income_SelfEmployed']
print(train[preview_columns].head())


In [None]:
# Compare the Income_Education interaction between the two loan outcomes
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=train, x='Loan_Status', y='Income_Education', palette='viridis', ax=ax)
ax.set_title('Income_Education Distribution by Loan Status')
ax.set_xlabel('Loan Status (0=Not Approved, 1=Approved)')
ax.set_ylabel('Income_Education')
# Replace the 0/1 tick labels with readable outcome names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Approved', 'Approved'])
plt.show()


In [None]:
# Compare the Income_SelfEmployed interaction between the two loan outcomes
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=train, x='Loan_Status', y='Income_SelfEmployed', palette='viridis', ax=ax)
ax.set_title('Income_SelfEmployed Distribution by Loan Status')
ax.set_xlabel('Loan Status (0=Not Approved, 1=Approved)')
ax.set_ylabel('Income_SelfEmployed')
# Replace the 0/1 tick labels with readable outcome names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Approved', 'Approved'])
plt.show()


In [None]:
# Pearson correlation of each income interaction feature with the target,
# printed from strongest positive to strongest negative.
interaction_features = ['Income_Education', 'Income_SelfEmployed']
correlation = train[[*interaction_features, 'Loan_Status']].corr()
target_correlation = correlation['Loan_Status']
print(target_correlation.sort_values(ascending=False))


##### Exploring the CoapplicantIncome

In [None]:
# Histogram with KDE overlay of the untransformed coapplicant income
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=train, x='CoapplicantIncome', kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Coapplicant Income')
ax.set_xlabel('Coapplicant Income')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Spread of raw coapplicant income within each loan outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train, x='Loan_Status', y='CoapplicantIncome', palette='Set2', ax=ax)
ax.set_title('Coapplicant Income Distribution by Loan Status')
ax.set_xlabel('Loan Status (0=Not Approved, 1=Approved)')
ax.set_ylabel('Coapplicant Income')
plt.show()


In [None]:
# log1p computes log(1 + x), so zero incomes map to 0 instead of -inf
coapplicant_income = train['CoapplicantIncome']
train['Log_CoapplicantIncome'] = np.log1p(coapplicant_income)

# Histogram with KDE overlay of the transformed values
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=train, x='Log_CoapplicantIncome', kde=True, color='green', ax=ax)
ax.set_title('Log-Transformed Distribution of Coapplicant Income')
ax.set_xlabel('Log Coapplicant Income')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Spread of log-scaled coapplicant income within each loan outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train, x='Loan_Status', y='Log_CoapplicantIncome', palette='Set3', ax=ax)
ax.set_title('Log Coapplicant Income Distribution by Loan Status')
ax.set_xlabel('Loan Status (0=Not Approved, 1=Approved)')
ax.set_ylabel('Log Coapplicant Income')
plt.show()


In [None]:
# Per-outcome location and spread statistics for raw coapplicant income
summary_functions = ['mean', 'median', 'std', 'min', 'max']
coapplicant_grouped = (
    train
    .groupby('Loan_Status')['CoapplicantIncome']
    .agg(summary_functions)
)
print(coapplicant_grouped)


In [None]:
# Combined household income: applicant plus coapplicant
train['TotalIncome'] = train[['ApplicantIncome', 'CoapplicantIncome']].sum(axis=1, skipna=False)

# Histogram with KDE overlay of the combined income
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=train, x='TotalIncome', kde=True, color='purple', ax=ax)
ax.set_title('Distribution of Total Income')
ax.set_xlabel('Total Income')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Spread of combined income within each loan outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train, x='Loan_Status', y='TotalIncome', palette='coolwarm', ax=ax)
ax.set_title('Total Income Distribution by Loan Status')
ax.set_xlabel('Loan Status (0=Not Approved, 1=Approved)')
ax.set_ylabel('Total Income')
plt.show()


In [None]:
# Pearson correlation of the coapplicant/total income variants with the target
income_columns = ['Loan_Status', 'CoapplicantIncome', 'Log_CoapplicantIncome', 'TotalIncome']
correlation = train[income_columns].corr()
print(correlation['Loan_Status'])


Observations from Visualizations and Statistics:

1. Distribution:
    * CoapplicantIncome has a right-skewed distribution with most values concentrated at 0, as seen in the histograms.
    * The log transformation (Log_CoapplicantIncome) slightly normalizes the distribution but retains a large mass at 0 due to the presence of many zero-income co-applicants.

2. Loan Status:
    * Boxplots indicate slight differences in CoapplicantIncome and Log_CoapplicantIncome between approved and non-approved loans, though the medians and interquartile ranges suggest limited predictive power.
3. Correlation:
    * Very low correlations of CoapplicantIncome, Log_CoapplicantIncome, and TotalIncome with Loan_Status suggest weak direct influence.
4. Summary Statistics:
    * Mean and median differences between loan-approved and non-approved cases for CoapplicantIncome are negligible, supporting the weak predictive potential.

Recommendations

1. Feature Transformation:
    * Consider retaining both Log_CoapplicantIncome and TotalIncome for modeling as they capture different aspects of the income structure, despite weak individual correlation.
2. Interaction Features:
    * Explore additional interactions with other features, such as the relationship between CoapplicantIncome and marital status, dependents, or education.


##### Exploring the LoanAmount feature

In [None]:
# 1. Raw LoanAmount distribution (histogram with KDE overlay)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=train, x='LoanAmount', kde=True, bins=30, color='blue', ax=ax)
ax.set_title('Distribution of Loan Amount')
ax.set_xlabel('Loan Amount')
ax.set_ylabel('Frequency')
plt.show()

# 2. Log-scale the amount to reduce skew; log1p keeps a zero amount finite
train['Log_LoanAmount'] = np.log1p(train['LoanAmount'])

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=train, x='Log_LoanAmount', kde=True, bins=30, color='green', ax=ax)
ax.set_title('Log-Transformed Distribution of Loan Amount')
ax.set_xlabel('Log Loan Amount')
ax.set_ylabel('Frequency')
plt.show()

# 3. Raw amount split by loan outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train, x='Loan_Status', y='LoanAmount', palette='pastel', ax=ax)
ax.set_title('Loan Amount Distribution by Loan Status')
ax.set_xlabel('Loan Status (0=Not Approved, 1=Approved)')
ax.set_ylabel('Loan Amount')
plt.show()

# 4. Log-scaled amount split by loan outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train, x='Loan_Status', y='Log_LoanAmount', palette='Set3', ax=ax)
ax.set_title('Log Loan Amount Distribution by Loan Status')
ax.set_xlabel('Loan Status (0=Not Approved, 1=Approved)')
ax.set_ylabel('Log Loan Amount')
plt.show()

# 5. Per-outcome summary statistics for both versions of the amount
amount_columns = ['LoanAmount', 'Log_LoanAmount']
summary_stats = (
    train
    .groupby('Loan_Status')[amount_columns]
    .agg(['mean', 'median', 'std', 'min', 'max'])
)
print(summary_stats)

# 6. Pearson correlation of both versions with the target
correlation_matrix_amount = train[['Loan_Status', 'LoanAmount', 'Log_LoanAmount']].corr()
correlations = correlation_matrix_amount['Loan_Status']
print(correlations)


Observations:

1. Distribution Analysis:

    * The LoanAmount feature shows a right-skewed distribution, meaning most values are concentrated towards lower loan amounts, with a long tail towards higher values.
    * The log-transformed distribution (Log_LoanAmount) reduces skewness, resulting in a more normalized distribution.
2. By Loan Status:

    * The mean and median loan amounts are slightly higher for loans that were approved (Loan_Status = 1) compared to those not approved (Loan_Status = 0).
    * The standard deviation is high for both groups, indicating variability in loan amounts.
    * The log-transformed boxplots further confirm that approved loans tend to have slightly higher loan amounts, but the difference is not stark.
3. Summary Statistics:

    * LoanAmount has a mean of 98.08 for approved loans and 94.03 for not approved loans.
    * Log-transformed values show reduced variability with a mean around 4 for both categories.
    * Minimum values (log) and maximum values do not differ significantly across approval categories.
4. Correlation with Target:

    * The correlation values between Loan_Status and LoanAmount (0.0146) or Log_LoanAmount (0.0102) indicate very weak positive relationships, suggesting minimal direct influence on loan approval.

Recommendations for the CoapplicantIncome Feature (follow-up to the CoapplicantIncome analysis in the previous section):

1. Retain the Feature:
Despite the weak correlation, CoapplicantIncome could still provide useful information in combination with other features, especially interaction terms (e.g., TotalIncome, Income_SelfEmployed, or Income_Education).

2. Log Transformation:
Use the log-transformed version (Log_CoapplicantIncome) to reduce skewness and normalize the distribution.

3. Consider Binning:
To capture non-linear relationships, bin Log_CoapplicantIncome into categories (e.g., low, medium, high) and test whether these categories reveal patterns in loan approval.

4. Interaction Features:
    * Create interaction terms involving CoapplicantIncome, such as:
        * TotalIncome = ApplicantIncome + CoapplicantIncome.
        * Ratios between CoapplicantIncome and loan amount, or other relevant features.

5. Feature Engineering for Zeros:
* Since many coapplicants have zero income, create a binary feature like HasCoapplicantIncome to differentiate between those who contribute financially and those who don’t.

6. Model Testing:
    * Test the feature's contribution during feature importance analysis after training the model. If it remains insignificant, consider dropping it for simplification.


##### Exploring the Loan_Amount_Term feature

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Missing-value count for the loan term
loan_term = train['Loan_Amount_Term']
print("Missing values in Loan_Amount_Term:", loan_term.isna().sum())

# Frequency of each distinct term
print("\nUnique values in Loan_Amount_Term:")
print(loan_term.value_counts())

# Histogram of the observed (non-missing) terms
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(loan_term.dropna(), kde=False, bins=10, color="purple", ax=ax)
ax.set_title("Distribution of Loan_Amount_Term")
ax.set_xlabel("Loan Amount Term (in months)")
ax.set_ylabel("Frequency")
plt.show()

# Term spread within each loan outcome
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=train, x="Loan_Status", y="Loan_Amount_Term", palette="pastel", ax=ax)
ax.set_title("Loan_Amount_Term by Loan_Status")
ax.set_xlabel("Loan Status (0=Not Approved, 1=Approved)")
ax.set_ylabel("Loan Amount Term (in months)")
plt.show()

# Per-outcome summary statistics
grouped = (
    train
    .groupby("Loan_Status")['Loan_Amount_Term']
    .agg(['mean', 'median', 'std', 'min', 'max'])
)
print("\nSummary statistics for Loan_Amount_Term grouped by Loan_Status:")
print(grouped)

# Pearson correlation between the term and the target
correlation = train[['Loan_Status', 'Loan_Amount_Term']].corr()
print("\nCorrelation between Loan_Status and Loan_Amount_Term:")
print(correlation)


Observations: 

1. Distribution Observations:
    * The distribution is highly concentrated around a specific value (e.g., 372 or 373 months) with several outliers, suggesting that most loans have similar repayment terms.
    * There is a large number of unique values (259), indicating that Loan_Amount_Term is numeric but may have some anomalies or rare values.

2. Relationship with Loan_Status:
    * Both approved (Loan_Status=1) and not approved (Loan_Status=0) loans have similar mean and median Loan_Amount_Term values (mean ~358-359 months, median ~367-368 months).
    * The standard deviations are slightly higher for unapproved loans, but the overall pattern is consistent.
    * Boxplots show that the majority of values cluster tightly around the median, with a few extreme outliers.

3. Correlation:
    * There is an extremely weak positive correlation (0.0083) between Loan_Status and Loan_Amount_Term, suggesting almost no linear relationship.

4. Missing Values:
    * No missing values were observed, which simplifies preprocessing for this feature.


Recommendations for Loan_Amount_Term:

1. Feature Transformation:

    * Since the feature has weak correlation and limited variability for Loan_Status, we can either:
        * Leave it as-is (retain as numeric).
        * Bin the values into broader categories (e.g., short-term, medium-term, long-term) to simplify the feature and potentially make it more interpretable.

2. Handling Outliers:
    * Outliers should be considered carefully. For example:
        * Check for business logic around extremely small or large values (e.g., terms under 100 months or over 400 months).
        * Remove or cap outliers if they are deemed unrealistic.

3. Interaction Terms:
    * Create interaction features, such as Loan_Amount_Term multiplied or divided by LoanAmount, to explore if repayment terms relative to the loan amount influence Loan_Status.

4. Retention Decision:
    * Retain this feature in the model as it might contribute marginally to prediction despite weak correlation.


In [None]:
# Bucket the loan term into three ordered categories.
# pd.cut intervals are right-closed, so with include_lowest=True the buckets are
# [0, 180], (180, 360] and (360, 480]; terms above 480 fall outside every bucket
# and become NaN.
bins = [0, 180, 360, 480]
labels = ['Short-term (<180)', 'Medium-term (180-360)', 'Long-term (>360)']
train['Loan_Term_Bins'] = pd.cut(
    train['Loan_Amount_Term'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Count loans per (bucket, outcome) and pivot outcomes into columns
term_status_dist = (
    train
    .groupby(['Loan_Term_Bins', 'Loan_Status'])['Loan_ID']
    .count()
    .unstack()
)

# Stacked bars: one bar per bucket, split by loan outcome
ax = term_status_dist.plot(kind='bar', stacked=True, figsize=(10, 6), color=['lightblue', 'salmon'], rot=0)
ax.set_title('Distribution of Loan Status by Loan Term Bins')
ax.set_xlabel('Loan Term Bins')
ax.set_ylabel('Count')
ax.legend(['Not Approved', 'Approved'], title='Loan Status')
plt.tight_layout()
plt.show()


In [None]:
# Loan term spread within each loan outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train, x='Loan_Status', y='Loan_Amount_Term', palette='coolwarm', ax=ax)
ax.set_title('Boxplot of Loan Amount Term by Loan Status')
ax.set_xlabel('Loan Status (0=Not Approved, 1=Approved)')
ax.set_ylabel('Loan Amount Term')
fig.tight_layout()
plt.show()


In [None]:
# Loan amount per unit of term, compared across loan outcomes
train['Loan_Term_Ratio'] = train['LoanAmount'].div(train['Loan_Amount_Term'])

fig, ax = plt.subplots()
sns.boxplot(data=train, x='Loan_Status', y='Loan_Term_Ratio', palette='viridis', ax=ax)
ax.set_title('Loan Amount to Loan Term Ratio by Loan Status')
ax.set_xlabel('Loan Status (0=Not Approved, 1=Approved)')
ax.set_ylabel('Loan Amount / Loan Term Ratio')
fig.tight_layout()
plt.show()


In [None]:
# Ten most common loan terms, most frequent first
top_terms = train['Loan_Amount_Term'].value_counts().head(10)

ax = top_terms.plot.bar(color='purple', figsize=(8, 5))
ax.set_title('Top 10 Most Frequent Loan Terms')
ax.set_xlabel('Loan Amount Term')
ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()


In [None]:
# Pairwise Pearson correlations between the loan term and the other numeric inputs
numeric_columns = ['Loan_Amount_Term', 'LoanAmount', 'ApplicantIncome', 'CoapplicantIncome']
correlations = train[numeric_columns].corr()

# Pin the colour scale to [-1, 1] so colour intensity is comparable across heatmaps
fig, ax = plt.subplots()
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation of Loan_Amount_Term with Other Numerical Features')
fig.tight_layout()
plt.show()


Key Observations:

1. Distribution:
    * The Loan_Amount_Term feature is highly skewed, with most loans concentrated around 360 months (30 years).
    * A few shorter-term loans exist (12 to 180 months), but these are rare.

2. Correlation:
    * Loan_Amount_Term shows minimal correlation with Loan_Status (0.008), indicating limited direct influence on loan approval.

3. Interaction with Other Numerical Features:
    * Weak correlations exist between Loan_Amount_Term and other numerical features like LoanAmount (0.05) and ApplicantIncome (-0.04).
    * The boxplot of the LoanAmount / Loan_Amount_Term ratio highlights outliers in loan term and amount.

4. Most Frequent Terms:
    * The top 10 most frequent loan terms are around 360 months, suggesting a standard term length for loans.

5. Loan Term Bins:
    * Binning into categories like "Short-term (<180 months)," "Medium-term (180-360 months)," and "Long-term (>360 months)" highlights that most loans fall under long-term loans.

6. Loan Term Ratio:
    * The ratio of LoanAmount to Loan_Amount_Term helps standardize loan amount comparisons and reveals potential outliers.

Recommendations:

1. Feature Engineering:
    * Bin Loan Terms: Create categorical bins for loan terms (e.g., short, medium, long-term) to better capture patterns in loan approval.
    * Loan Term Ratio: The ratio of LoanAmount to Loan_Amount_Term can be used as a normalized feature representing the loan amount spread over the term.

2. Outlier Handling:
    * Investigate outliers in both Loan_Amount_Term and the LoanAmount/Loan_Amount_Term ratio for potential data issues or anomalies.

3. Interaction Features:
    * Combine Loan_Amount_Term with features like LoanAmount or ApplicantIncome to derive meaningful interaction terms that could provide predictive value.
4. Modeling Considerations:
    * Due to the low correlation with Loan_Status, Loan_Amount_Term may not be a strong predictor. However, interaction features or bins could reveal hidden patterns.








In [None]:
# Step 1: Binning Loan_Amount_Term
def bin_loan_term(term):
    """Map a loan term to a coarse duration label.

    Parameters
    ----------
    term : float or int
        Loan term; may be NaN/None when the value is missing.

    Returns
    -------
    str or float
        'Short-term' for terms below 180, 'Medium-term' for 180 through 360
        (both inclusive), 'Long-term' for anything above 360, and NaN when the
        term itself is missing.
    """
    # Comparisons against NaN are always False, so without this guard a missing
    # term would fall through to the final branch and be mislabelled 'Long-term'.
    if pd.isna(term):
        return np.nan
    if term < 180:
        return 'Short-term'
    if term <= 360:
        return 'Medium-term'
    return 'Long-term'

# Label every loan with its term bucket (replaces any earlier Loan_Term_Bins column)
train['Loan_Term_Bins'] = train['Loan_Amount_Term'].apply(bin_loan_term)

# Visualize the distribution of bins
bin_counts = train['Loan_Term_Bins'].value_counts()
plt.bar(bin_counts.index, bin_counts.values, color='purple')
plt.title("Loan Term Bins Distribution")
plt.ylabel("Count")
plt.xlabel("Loan Term Bins")
plt.show()

# Step 2: Creating the Loan Amount to Loan Term Ratio
# A zero term yields +/-inf (or NaN for 0/0) rather than raising, and fillna alone
# never touches inf. Convert inf to NaN first, then fill, so every undefined ratio
# becomes 0. The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on a column selection, which is deprecated and does not
# update the frame under pandas Copy-on-Write.
loan_amount_term_ratio = train['LoanAmount'] / train['Loan_Amount_Term']
train['LoanAmount_Term_Ratio'] = (
    loan_amount_term_ratio
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Visualize the LoanAmount_Term_Ratio distribution
sns.histplot(train['LoanAmount_Term_Ratio'], kde=True, color='blue')
plt.title("Distribution of Loan Amount to Loan Term Ratio")
plt.xlabel("Loan Amount / Loan Term Ratio")
plt.ylabel("Frequency")
plt.show()


Observations from the Visuals:

1. Loan Term Bins Distribution:
    * The majority of loans fall under the "Long-term" category (>360 months).
    * A significant proportion is in the "Medium-term" range (180–360 months), while "Short-term" loans (<180 months) are rare.

2. Loan Amount to Loan Term Ratio Distribution:
    * The ratio is heavily right-skewed: most values are concentrated near zero, with a long tail toward larger ratios. This suggests that for most loans, the amount is small relative to the term.

Recommendations:

1. Loan Term Bins:
    * Retain Loan_Term_Bins as a categorical feature for the model, as it provides interpretable groupings of loan durations.
    * Perform additional analysis to check if loan approval rates differ significantly across these bins.

2. Loan Amount to Term Ratio:
    * Consider applying a logarithmic transformation to normalize the skewness, or bin this feature into categories to simplify its use in the model.

3. Further Analysis:
    * Investigate how Loan_Term_Bins and LoanAmount_Term_Ratio correlate with Loan_Status to confirm their predictive value.
    * Perform bivariate analysis to explore interactions between Loan_Term_Bins and other features.


In [None]:
# 1. Loan counts per (term bucket, outcome), drawn as stacked bars
loan_term_bin_status = (
    train
    .groupby(['Loan_Term_Bins', 'Loan_Status'])
    .size()
    .unstack()
)
ax = loan_term_bin_status.plot.bar(stacked=True, figsize=(10, 6), colormap='coolwarm')
ax.set_title('Loan Term Bins Distribution by Loan Status')
ax.set_xlabel('Loan Term Bins')
ax.set_ylabel('Count')
ax.legend(['Not Approved', 'Approved'], title='Loan Status')
plt.show()

# 2. Amount-to-term ratio spread within each loan outcome
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=train, x='Loan_Status', y='LoanAmount_Term_Ratio', palette='Set2', ax=ax)
ax.set_title('Loan Amount to Loan Term Ratio by Loan Status')
ax.set_xlabel('Loan Status (0=Not Approved, 1=Approved)')
ax.set_ylabel('Loan Amount / Loan Term Ratio')
plt.show()

# 3. Correlation heatmap: one-hot term buckets (first level dropped as the
#    reference category) alongside the ratio and the target
loan_term_bins_encoded = pd.get_dummies(train['Loan_Term_Bins'], drop_first=True)
heatmap_input = pd.concat(
    [loan_term_bins_encoded, train[['LoanAmount_Term_Ratio', 'Loan_Status']]],
    axis=1,
)
correlation_matrix = heatmap_input.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Heatmap Including Loan Term Features')
plt.show()

# 4. Ratio against applicant income, coloured by loan outcome
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=train, x='LoanAmount_Term_Ratio', y='ApplicantIncome', hue='Loan_Status', palette='viridis', ax=ax)
ax.set_title('Interaction of Loan Amount/Term Ratio and Applicant Income')
ax.set_xlabel('Loan Amount / Loan Term Ratio')
ax.set_ylabel('Applicant Income')
ax.legend(title='Loan Status', loc='upper right')
plt.show()


Key Findings:
1. Correlation Heatmap:
    * The Loan_Amount_Term and its derived features (bins and ratio) show weak correlations with Loan_Status. This aligns with earlier observations that loan term features individually may not be highly predictive.

2. Loan Term Bins Distribution:
    * A majority of loans fall under the "Long-term" category, followed by "Medium-term." Short-term loans are rare.
    * Approved-loan counts are highest for long-term loans, but the chart shows counts rather than approval rates, so this largely reflects the data being skewed toward long-term loans. Approval rates per bin would need to be computed separately.

3. Loan Amount/Term Ratio:
    * Most observations cluster near lower values of the ratio, with a few outliers reaching high values. Approved and non-approved loans appear similar when analyzed against the ratio.

4. Applicant Income Interaction:
    * When plotting Loan Amount/Term Ratio against Applicant Income, we notice that higher ratios often align with lower incomes, indicating that lower-income applicants might request higher amounts relative to their loan terms. This could influence loan decisions but needs further statistical testing.

Recommendations:

1. Feature Selection:
    * Retain the binned categories (Loan Term Bins) as they introduce categorical information that might enhance the interpretability of models.
    * Include the Loan Amount/Term Ratio as a numerical feature to capture the relative size of the loan to its term, which could provide more nuanced insights in modeling.

2. Interaction Terms:
    * Consider creating interaction terms between Loan Term Bins and other numerical features like Applicant Income or Loan Amount. These could highlight patterns that are not obvious in univariate analysis.

3. Model Testing:
    * Use these features in preliminary model testing to evaluate their predictive power. The weak correlations observed so far may not fully capture non-linear relationships that more complex models like XGBoost or Random Forest could uncover.

4. Statistical Testing:
    * Perform hypothesis tests to check if the mean of Loan Amount/Term Ratio significantly differs between approved and non-approved loans. This could validate its inclusion in the model.


In [None]:
# Ordinal code for each term bucket; labels absent from the mapping become NaN
term_bucket_rank = train['Loan_Term_Bins'].map({
    'Short-term': 1, 'Medium-term': 2, 'Long-term': 3})

# Bucket rank scaled by applicant income
train['Interaction_LoanBins_Income'] = term_bucket_rank * train['ApplicantIncome']

# Bucket rank scaled by loan amount
train['Interaction_LoanBins_Amount'] = term_bucket_rank * train['LoanAmount']

# Amount-to-term ratio scaled by applicant income
train['Interaction_Ratio_Income'] = train['LoanAmount_Term_Ratio'] * train['ApplicantIncome']

# Descriptive statistics for the three new columns (rendered as the cell output)
interaction_columns = ['Interaction_LoanBins_Income', 'Interaction_LoanBins_Amount', 'Interaction_Ratio_Income']
interaction_summary = train[interaction_columns].describe()
interaction_summary


In [None]:
from scipy.stats import ttest_ind

# Two-sample t-tests comparing approved (Loan_Status == 1) against not-approved
# (Loan_Status == 0) loans. NaN observations are dropped via nan_policy='omit'.

# T-test for Loan Amount/Term Ratio between Loan Status groups
approved_ratio = train.loc[train['Loan_Status'] == 1, 'LoanAmount_Term_Ratio']
not_approved_ratio = train.loc[train['Loan_Status'] == 0, 'LoanAmount_Term_Ratio']
t_stat, p_value = ttest_ind(approved_ratio, not_approved_ratio, nan_policy='omit')

# T-test for Interaction_Ratio_Income
approved_ratio_income = train.loc[train['Loan_Status'] == 1, 'Interaction_Ratio_Income']
not_approved_ratio_income = train.loc[train['Loan_Status'] == 0, 'Interaction_Ratio_Income']
t_stat_income, p_value_income = ttest_ind(approved_ratio_income, not_approved_ratio_income, nan_policy='omit')

# Jupyter renders only the LAST expression of a cell, so a bare dict in the middle
# of the cell is silently discarded. Collect both results in one table and make
# that table the final expression so neither test result is lost.
ttest_results = pd.DataFrame(
    {
        't_stat': [t_stat, t_stat_income],
        'p_value': [p_value, p_value_income],
    },
    index=['LoanAmount_Term_Ratio', 'Interaction_Ratio_Income'],
)
ttest_results


Analysis of Interaction Terms

1. Summary Statistics:
    * The mean values for the interaction terms (Interaction_LoanBins_Income, Interaction_LoanBins_Amount, and Interaction_Ratio_Income) are quite large, especially for Interaction_LoanBins_Income (mean = ~20,521) and Interaction_Ratio_Income (mean = ~2,312).
    * There is significant variability in the data as indicated by the standard deviation (std) values, particularly for Interaction_Ratio_Income (std = ~12,231).
    * Extreme values are present (e.g., max = 243,000 for Interaction_LoanBins_Income and 497,068 for Interaction_Ratio_Income), which may indicate potential outliers.

2. T-Test for Interaction_Ratio_Income:
    * The t-statistic is -2.28, and the corresponding p-value is 0.0228.
    * Since the p-value is below the commonly used significance level of 0.05, the interaction term Interaction_Ratio_Income is statistically significant. This suggests that there is a significant difference in the mean Interaction_Ratio_Income between the Loan_Status groups (Approved vs. Not Approved).

Recommendations

1. Keep Interaction_Ratio_Income:
    * Given its statistical significance, it should be considered as a feature in the modeling phase.
    * However, due to the presence of extreme values, you might consider scaling this feature (e.g., MinMaxScaler or StandardScaler) before using it in a model.

2. Monitor Other Interaction Terms:
    * While Interaction_LoanBins_Income and Interaction_LoanBins_Amount were not directly tested for significance, their variability and descriptive statistics suggest potential utility. These features can be included as candidates in the feature selection phase.

3. Handle Outliers:
    * The high maximum values for the interaction terms may cause issues in modeling. You could cap these extreme values (e.g., Winsorization) or transform the data further.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Interaction columns whose distributions are inspected below
interaction_terms = ['Interaction_LoanBins_Income', 'Interaction_LoanBins_Amount', 'Interaction_Ratio_Income']

# One stacked panel per interaction term
fig, axes = plt.subplots(3, 1, figsize=(15, 10))

for ax, feature in zip(axes, interaction_terms):
    sns.histplot(train[feature], kde=True, bins=30, color='purple', ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    # Light horizontal guides make the bar heights easier to read
    ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()


In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Log-scale each interaction term; log1p computes log(1 + x), so zeros stay finite
log_column_sources = {
    'Log_Interaction_LoanBins_Income': 'Interaction_LoanBins_Income',
    'Log_Interaction_LoanBins_Amount': 'Interaction_LoanBins_Amount',
    'Log_Interaction_Ratio_Income': 'Interaction_Ratio_Income',
}
for log_column, raw_column in log_column_sources.items():
    train[log_column] = np.log1p(train[raw_column])

# Transformed features to compare across loan outcomes
interaction_features = list(log_column_sources)

# One stacked boxplot panel per transformed feature
fig, axes = plt.subplots(3, 1, figsize=(12, 12))

for ax, feature in zip(axes, interaction_features):
    sns.boxplot(data=train, x='Loan_Status', y=feature, palette='pastel', ax=ax)
    ax.set_title(f'{feature} Distribution by Loan_Status')
    ax.set_xlabel('Loan Status (0=Not Approved, 1=Approved)')
    ax.set_ylabel(feature)

fig.tight_layout()
plt.show()


Observations:

1. Log_Interaction_LoanBins_Income:
    * Approved loans (Loan_Status = 1) tend to have a higher median and a wider range of values compared to not approved loans.
    * There are a few outliers on both ends, but the overall distribution suggests this feature might have some predictive power.

2. Log_Interaction_LoanBins_Amount:
    * Similar to the income-based interaction, approved loans show higher median values and a broader range.
    * The feature seems to differentiate well between the two loan status categories.

3. Log_Interaction_Ratio_Income:
    * This feature appears to show a strong difference between the approved and not approved loans, with approved loans having a higher median.
    * The range of values for approved loans is more spread out, indicating potential variability.


Recommendations:

* Feature Importance Check: Use these transformed features in a preliminary model (like logistic regression or feature importance from XGBoost) to evaluate their significance.
* Further Testing: Perform hypothesis testing (e.g., t-tests or Mann-Whitney U tests) to assess the statistical significance of the differences between loan status categories for these features.
* Scaling: Since these features are skewed, ensure they are appropriately scaled when used in models.


##### Exploring the Credit_History feature

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Missing-value count and the set of distinct values
credit_history = train['Credit_History']
print("Missing values in Credit_History:", credit_history.isnull().sum())
print("Unique values in Credit_History:", credit_history.unique())

# 2. How many applicants fall under each Credit_History value
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train, x='Credit_History', palette='viridis', ax=ax)
ax.set_title('Distribution of Credit_History')
ax.set_xlabel('Credit History')
ax.set_ylabel('Count')
plt.show()

# 3. Loan counts per (Credit_History, outcome), drawn as stacked bars
credit_loan_status = (
    train
    .groupby(['Credit_History', 'Loan_Status'])
    .size()
    .unstack()
)
ax = credit_loan_status.plot.bar(stacked=True, figsize=(10, 6), colormap='viridis')
ax.set_title('Loan Status by Credit History')
ax.set_xlabel('Credit History')
ax.set_ylabel('Count')
ax.legend(['Not Approved', 'Approved'], title='Loan Status')
plt.show()

# 4. Share of each Credit_History value WITHIN each Loan_Status group.
#    Each row sums to 1; this is not the approval rate per Credit_History value.
summary_credit = (
    train
    .groupby('Loan_Status')['Credit_History']
    .value_counts(normalize=True)
    .unstack()
)
print("Proportion of Credit_History by Loan_Status:\n", summary_credit)


Observations 

1. Distribution of Credit_History:
    * The vast majority of applicants have a Credit_History value of 1 (positive credit history).
    * Only a small portion of the dataset has a Credit_History value of 0 (negative credit history).

2. Relationship with Loan_Status:

    * Applicants with a Credit_History value of 1 have a significantly higher likelihood of loan approval compared to those with a Credit_History of 0.
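    * Among approved loans (Loan_Status = 1), approximately 92% of applicants have Credit_History = 1 and only about 8% have Credit_History = 0. Note that the summary table reports the distribution of Credit_History within each Loan_Status group, not the approval rate for each Credit_History value; to obtain approval rates, group by Credit_History and normalize Loan_Status instead.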
    * The relationship between Credit_History and Loan_Status appears strong and could be a key predictive feature.

Recommendations:

1. Feature Importance:
    * Since Credit_History shows a clear and strong relationship with Loan_Status, it should be retained as an essential feature in the model.
    * Consider using this feature as a categorical variable with appropriate encoding (e.g., one-hot or binary).

2. Interaction Features:
    * Create interaction terms between Credit_History and numerical features (e.g., ApplicantIncome, LoanAmount) to explore if they add predictive power to the model.

3. Handling Imbalances:
    * If class imbalance exists (e.g., significantly fewer applicants with Credit_History = 0), sampling techniques or careful model evaluation metrics (like F1-score) should be considered.


In [None]:
# Credit history multiplied by continuous features.
# NOTE(review): this reads 'Total_Income', not the 'TotalIncome' column derived
# earlier in the notebook — presumably a column shipped with the dataset; confirm.
credit_history = train['Credit_History']
train['Interaction_CreditIncome'] = credit_history * train['Total_Income']
train['Interaction_CreditLoan'] = credit_history * train['LoanAmount']

# Credit history combined with a categorical feature as a "<credit>_<area>" label
train['Interaction_CreditProperty'] = credit_history.astype(str) + "_" + train['Property_Area'].astype(str)

# Side-by-side histograms of the two continuous interactions
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
income_ax, loan_ax = axes
sns.histplot(train['Interaction_CreditIncome'], bins=30, kde=True, ax=income_ax, color="purple")
income_ax.set_title("Distribution of Credit History × Total Income")
sns.histplot(train['Interaction_CreditLoan'], bins=30, kde=True, ax=loan_ax, color="green")
loan_ax.set_title("Distribution of Credit History × Loan Amount")
fig.tight_layout()
plt.show()

# Frequency of each credit-history / property-area combination
credit_property_counts = train['Interaction_CreditProperty'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=credit_property_counts.index, y=credit_property_counts.values, palette="muted", ax=ax)
ax.set_title("Counts of Credit History × Property Area Interactions")
# Tilt the labels so the combined category names do not overlap
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Statistical testing (example for Interaction_CreditIncome)
from scipy.stats import ttest_ind

group_0 = train[train['Loan_Status'] == 0]['Interaction_CreditIncome']
group_1 = train[train['Loan_Status'] == 1]['Interaction_CreditIncome']
t_stat, p_value = ttest_ind(group_0, group_1, equal_var=False)

# Output the t-test results
print(f"T-statistic for Interaction_CreditIncome: {t_stat}")
print(f"P-value for Interaction_CreditIncome: {p_value}")


Observations:

1. Distributions of Continuous Interactions:
    * The interaction between Credit_History and Total_Income shows a bimodal distribution, likely driven by the binary nature of Credit_History (0 or 1).
    * The interaction between Credit_History and LoanAmount is highly right-skewed, with most values concentrated near zero due to smaller loan amounts.

2. Counts of Credit_History × Property Area Interaction:
    * The most frequent interaction categories are 1_Urban, 1_Semiurban, and 1_Rural, corresponding to Credit_History = 1 and different Property_Area values.
    * Interactions involving Credit_History = 0 are significantly fewer, reflecting the imbalanced distribution of the Credit_History feature.

3. Statistical Test Results:
    * For the Interaction_CreditIncome feature, the t-test between approved (Loan_Status = 1) and not approved (Loan_Status = 0) loans shows no significant difference (p-value = 0.891), indicating that this interaction may not contribute much to differentiating loan approval status.

Recommendations:

1. Continuous Interactions:
    * The Interaction_CreditLoan feature might be more promising for capturing variability due to its distribution. Perform further statistical testing to confirm its significance.

2. Categorical Interaction:
    * The Interaction_CreditProperty feature could capture meaningful trends when analyzed using a model that can handle categorical variables effectively, such as tree-based algorithms or embedding techniques.

Next Steps:

Perform similar analyses for the remaining features, including Property_Area and Total_Income.
Include interaction terms in feature selection pipelines to identify their potential contribution to predictive performance.

##### Exploring the Property_Area feature

In [None]:
# 1. Basic Exploration
print(f"Missing values in Property_Area: {train['Property_Area'].isnull().sum()}")
print(f"Unique values in Property_Area: {train['Property_Area'].unique()}")
# Row-normalised crosstab: each Property_Area row sums to 1 across Loan_Status values.
proportion_area = pd.crosstab(train['Property_Area'], train['Loan_Status'], normalize='index')
print("\nProportion of Property_Area by Loan_Status:")
print(proportion_area)

# 2. Visualize Distributions
fig, axs = plt.subplots(2, 1, figsize=(8, 12))
sns.countplot(data=train, x='Property_Area', ax=axs[0], palette='viridis')
axs[0].set_title("Overall Distribution of Property_Area")
axs[0].set_xlabel("Property Area")
axs[0].set_ylabel("Count")

sns.countplot(data=train, x='Property_Area', hue='Loan_Status', ax=axs[1], palette='viridis')
axs[1].set_title("Loan Status by Property_Area")
axs[1].set_xlabel("Property Area")
axs[1].set_ylabel("Count")
plt.tight_layout()
plt.show()

# 3. Interaction Features
# These are string labels of the form "<area>_<value>". Because the right-hand side is a
# continuous amount, most labels are close to unique per row; they are exploratory only.
train['Interaction_PropertyIncome'] = train['Property_Area'].astype(str) + "_" + train['ApplicantIncome'].astype(str)
train['Interaction_PropertyCoapplicant'] = train['Property_Area'].astype(str) + "_" + train['CoapplicantIncome'].astype(str)
train['Interaction_PropertyLoan'] = train['Property_Area'].astype(str) + "_" + train['LoanAmount'].astype(str)

# 4. Statistical Testing
# The Welch t-test below compares log-transformed ApplicantIncome between approved and
# rejected loans. Property_Area does not enter the test, so the result is labelled as an
# income-by-status test rather than as an "Interaction_PropertyIncome" result.
property_income = train[['Interaction_PropertyIncome', 'Loan_Status']].copy()
property_income['PropertyArea_Num'] = train['Property_Area'].factorize()[0]
property_income['Log_ApplicantIncome'] = np.log1p(train['ApplicantIncome'])
t_stat, p_value = ttest_ind(
    property_income[property_income['Loan_Status'] == 1]['Log_ApplicantIncome'],
    property_income[property_income['Loan_Status'] == 0]['Log_ApplicantIncome'],
    equal_var=False
)
print(f"T-statistic for Log_ApplicantIncome by Loan_Status: {t_stat}")
print(f"P-value for Log_ApplicantIncome by Loan_Status: {p_value}")


Observations from Property_Area Analysis:

1. Overall Distribution:
    * The Property_Area feature contains three categories: 0, 1, and 2.
    * The count distribution shows that categories 1 and 2 have a higher frequency than 0.

2. Loan Status by Property Area:
    * Across all categories, the proportion of loan approvals (Loan_Status = 1) is higher than rejections.
    * The highest approval rate is observed in Property_Area = 2 (84.48%), followed by Property_Area = 1 (82.95%), and then Property_Area = 0 (81.79%).

3. Interaction with Income:
    * A Welch t-test comparing log-transformed ApplicantIncome between approved and rejected loans (Property_Area itself does not enter the test as coded) yielded:
        * T-statistic: 0.554
        * P-value: 0.580
    * The p-value is greater than 0.05, indicating no significant difference in mean log applicant income between loan statuses. The test says nothing about differences across property areas.

Recommendations for the Property_Area Feature:

1. Encoding:
    * Since Property_Area is categorical, it can be encoded using one-hot encoding or ordinal encoding based on model preference.

2. Interactions:
    * The interaction with income does not show statistical significance. However, it might still contribute to non-linear patterns, so consider keeping the interactions (Property_Area × Income or Property_Area × LoanAmount) for further modeling.
    
3. Feature Usefulness:
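    * Approval proportions differ only slightly across Property_Area categories (81.79%–84.48%), so on its own the feature shows a weak association with loan status. Retain it as a candidate feature and let model-based feature selection decide whether it adds value.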


In [None]:
from scipy.stats import ttest_ind

# Numeric interactions: Property_Area multiplied by LoanAmount and by Credit_History.
train['Interaction_PropertyLoan'] = train['Property_Area'].mul(train['LoanAmount'])
train['Interaction_PropertyCredit'] = train['Property_Area'].mul(train['Credit_History'])

property_interactions = ['Interaction_PropertyLoan', 'Interaction_PropertyCredit']

# Descriptive statistics and per-class means of both interactions.
interaction_summary = train[property_interactions].describe()
grouped_interactions = train.groupby('Loan_Status')[property_interactions].mean()

# One histogram per interaction, stacked vertically in a single figure.
plt.figure(figsize=(12, 8))
histogram_panels = [
    ('Interaction_PropertyLoan', 'purple', 'Distribution of Property_Area × LoanAmount Interaction'),
    ('Interaction_PropertyCredit', 'green', 'Distribution of Property_Area × Credit_History Interaction'),
]
for position, (column, colour, heading) in enumerate(histogram_panels, start=1):
    plt.subplot(2, 1, position)
    sns.histplot(train[column], kde=True, color=colour, bins=30)
    plt.title(heading)
    plt.xlabel(column)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

# Student t-tests (scipy default, equal variances assumed): approved vs. not approved.
approved_rows = train.loc[train['Loan_Status'] == 1, property_interactions]
rejected_rows = train.loc[train['Loan_Status'] == 0, property_interactions]

t_stat_property_loan, p_value_property_loan = ttest_ind(
    approved_rows['Interaction_PropertyLoan'],
    rejected_rows['Interaction_PropertyLoan'],
)
t_stat_property_credit, p_value_property_credit = ttest_ind(
    approved_rows['Interaction_PropertyCredit'],
    rejected_rows['Interaction_PropertyCredit'],
)

# Collect the test outcomes keyed by interaction name.
interaction_tests = {
    'Interaction_PropertyLoan': {'t_stat': t_stat_property_loan, 'p_value': p_value_property_loan},
    'Interaction_PropertyCredit': {'t_stat': t_stat_property_credit, 'p_value': p_value_property_credit},
}

print("Interaction Summary Statistics:\n", interaction_summary)
print("\nGrouped Interactions by Loan_Status:\n", grouped_interactions)
print("\nInteraction Statistical Tests:\n", interaction_tests)


Analysis of Interaction Features for Property_Area

1. Distribution Insights:
    * Interaction_PropertyLoan:
        * Skewed towards smaller values, with most interactions concentrated below 200.
        * Maximum value is 1400, indicating some extreme cases.

    * Interaction_PropertyCredit:
        * Bimodal distribution reflecting the binary nature of Credit_History and discrete Property_Area values.
        * Most values are concentrated at 0, 1, or 2.

2. Summary Statistics:
    * Interaction_PropertyLoan has a mean of 114.86, with a wide range (0–1400) and high variability (std = 160.01).
    * Interaction_PropertyCredit is mostly clustered around its discrete levels (mean = 1.09, std = 0.77).

3. Grouped Insights by Loan_Status:
    * For Interaction_PropertyLoan, loans that are approved (1) have slightly higher mean values (116.35 vs. 107.42).
    * For Interaction_PropertyCredit, approved loans also have a slightly higher mean (1.10 vs. 1.04).

4. Statistical Significance:
    * Interaction_PropertyLoan:
        * p_value = 0.1098, indicating no statistically significant difference between groups.
    * Interaction_PropertyCredit:
        * p_value = 0.0478, indicating a statistically significant difference between approved and non-approved loans.

Recommendations:

* Interaction_PropertyLoan: This feature may not add substantial predictive power due to the lack of statistical significance. It can be considered for removal unless further feature engineering or transformations improve its relevance.
* Interaction_PropertyCredit: Given its statistical significance, this feature should be retained as it demonstrates a meaningful relationship with the target variable.


##### Exploring Total_Income feature

In [None]:
def _show_histogram(column, colour, heading):
    """Draw a 30-bin histogram with KDE for `train[column]` in its own figure."""
    plt.figure(figsize=(10, 5))
    sns.histplot(train[column], kde=True, color=colour, bins=30)
    plt.title(heading)
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.show()


def _show_boxplot_by_status(column, palette_name, heading):
    """Draw a boxplot of `train[column]` grouped by Loan_Status in its own figure."""
    plt.figure(figsize=(10, 5))
    sns.boxplot(x="Loan_Status", y=column, data=train, palette=palette_name)
    plt.title(heading)
    plt.xlabel("Loan_Status")
    plt.ylabel(column)
    plt.show()


# 1. Summary statistics and raw distribution
print("Summary statistics for Total_Income:")
print(train["Total_Income"].describe())
_show_histogram("Total_Income", "purple", "Distribution of Total_Income")

# 2. Raw distribution split by target
_show_boxplot_by_status("Total_Income", "pastel", "Total_Income Distribution by Loan_Status")

# 3. log1p transform, then the same two views on the transformed column
train["Log_Total_Income"] = np.log1p(train["Total_Income"])
_show_histogram("Log_Total_Income", "green", "Distribution of Log-Transformed Total_Income")
_show_boxplot_by_status("Log_Total_Income", "muted", "Log_Total_Income Distribution by Loan_Status")


Key Observations:

1. Distribution:
    * The feature has a right-skewed distribution, as evident from the raw histogram.
    * Log transformation significantly normalizes the distribution, which can be useful for modeling.

2. Relationship with Loan_Status:
    * Higher-income applicants (both raw and log-transformed) appear to have higher loan approval rates.
    * Approved applicants tend to have slightly higher median total income compared to not-approved applicants.

3. Statistics:
    * Total income ranges from 1963 to 22,500, with a median of 6000.
    * The log-transformed values are more centered, making them potentially better for feature representation in models.

Next Steps:

1. Interactions:
    * Create interaction terms for Total_Income with other features, such as:
        * Loan_Amount (e.g., TotalIncome_to_LoanAmount_Ratio).
        * Credit_History (e.g., CreditHistory_TotalIncome).
        * Loan_Amount_Term (e.g., Income_Per_LoanTerm).
    * Visualize these interactions and assess their importance.

2. Feature Engineering:
    * Decide whether to use the raw or log-transformed feature based on correlation and model performance.

3. Statistical Testing:
    * Perform statistical tests (e.g., t-test or ANOVA) to determine if Total_Income significantly impacts Loan_Status.


In [None]:
# Derived Total_Income features. The 1e-6 offset keeps the denominators non-zero.
train['TotalIncome_to_LoanAmount_Ratio'] = train['Total_Income'].div(train['LoanAmount'] + 1e-6)
train['Income_Per_LoanTerm'] = train['Total_Income'].div(train['Loan_Amount_Term'] + 1e-6)
train['CreditHistory_TotalIncome'] = train['Credit_History'].mul(train['Total_Income'])

# (column, histogram colour, histogram title, boxplot title) for each derived feature
interaction_panels = [
    ('TotalIncome_to_LoanAmount_Ratio', "purple",
     "Distribution of Total Income to Loan Amount Ratio",
     "Total Income to Loan Amount Ratio by Loan Status"),
    ('Income_Per_LoanTerm', "blue",
     "Distribution of Income Per Loan Term",
     "Income Per Loan Term by Loan Status"),
    ('CreditHistory_TotalIncome', "green",
     "Distribution of Credit History × Total Income",
     "Credit History × Total Income by Loan Status"),
]

# Histograms, one row per feature
fig, axes = plt.subplots(3, 1, figsize=(10, 15))
for ax, (column, colour, hist_heading, _) in zip(axes, interaction_panels):
    sns.histplot(train[column], kde=True, ax=ax, color=colour)
    ax.set_title(hist_heading)
plt.tight_layout()
plt.show()

# Boxplots split by Loan_Status, one row per feature
fig, axes = plt.subplots(3, 1, figsize=(10, 15))
for ax, (column, _, _, box_heading) in zip(axes, interaction_panels):
    sns.boxplot(data=train, x='Loan_Status', y=column, ax=ax, palette="Set2")
    ax.set_title(box_heading)
plt.tight_layout()
plt.show()

# t-test of each feature between approved (1) and not-approved (0) rows; NaNs are omitted.
approved_rows = train[train['Loan_Status'] == 1]
rejected_rows = train[train['Loan_Status'] == 0]
stats_results = {}
for col in [panel[0] for panel in interaction_panels]:
    t_stat, p_value = ttest_ind(approved_rows[col], rejected_rows[col], nan_policy='omit')
    stats_results[col] = {'t_stat': t_stat, 'p_value': p_value}

# Report the results
print("Statistical Test Results for Interactions:")
for key, value in stats_results.items():
    print(f"{key}: T-stat = {value['t_stat']:.4f}, P-value = {value['p_value']:.4f}")


Observation: 

1. TotalIncome_to_LoanAmount_Ratio
    * Visual Interpretation:
        * The distribution shows a right-skewed pattern, with most values concentrated in the lower range.
        * When stratified by Loan_Status, both approved (1) and not approved (0) loans show similar distributions, but the approved group seems to have slightly higher ratios in its upper range.
    * Statistical Test:
        * T-statistic: -0.7150, P-value: 0.4746
        * Interpretation: There is no statistically significant difference in the TotalIncome_to_LoanAmount_Ratio between approved and not approved loans (p > 0.05). This suggests that this ratio might not strongly influence the loan approval outcome.

2. Income_Per_LoanTerm
    * Visual Interpretation:
        * The boxplot shows most values concentrated at the lower end, with some significant outliers in both loan statuses.
        * Approved loans (1) have a slightly higher median Income_Per_LoanTerm compared to not approved loans (0), but the difference is not visually striking.
    * Statistical Test:
        * T-statistic: -1.8450, P-value: 0.0651
        * Interpretation: The difference in Income_Per_LoanTerm between approved and not approved loans is borderline significant (p ≈ 0.065). While not statistically significant at the 0.05 level, the near-significance suggests this feature might contribute to the loan approval process, especially in models that capture non-linear relationships.

3. CreditHistory_TotalIncome
    * Visual Interpretation:
        * The distribution highlights peaks corresponding to the primary income categories in the dataset, showing a bimodal pattern.
        * Boxplots reveal that the distribution of CreditHistory_TotalIncome is fairly similar for both loan statuses, with approved loans showing slightly higher median values.
    * Statistical Test:
        * T-statistic: 0.1369, P-value: 0.8911
        * Interpretation: There is no statistically significant difference in CreditHistory_TotalIncome between approved and not approved loans (p > 0.05). This suggests that this feature, in isolation, might not strongly differentiate the loan approval status.

Summary of Insights:
* Among the three interactive features analyzed, Income_Per_LoanTerm appears to show the most potential for distinguishing between loan statuses, albeit borderline significant.
* TotalIncome_to_LoanAmount_Ratio and CreditHistory_TotalIncome do not exhibit significant differences between loan statuses, indicating they might have limited direct influence on loan approval. However, these features could still be useful in combination with others in a model that captures complex patterns.


## Feature Engineering & Selection

##### Step 1: Set ID column as Index and Drop Loan_ID

In [None]:
# Apply the same structural clean-up to both datasets.
for frame in (train, test):
    # Use ID as the row index and discard the redundant Loan_ID identifier.
    frame.set_index('ID', inplace=True)
    frame.drop(columns='Loan_ID', inplace=True)

    # Dependents: map the '3+' label to 3 and store the column as integers.
    frame['Dependents'] = frame['Dependents'].replace('3+', 3).astype(int)

    # Dependents_Grouped: merge 2 and 3 into a single "2+" level (values 0, 1, 2).
    frame['Dependents_Grouped'] = frame['Dependents'].clip(upper=2)


##### Step 2: Log Transformation
We will apply log transformation to continuous features with skewed distributions in both train and test. Features like ApplicantIncome, CoapplicantIncome, LoanAmount, and Total_Income are good candidates for log transformation.


In [None]:
# Skewed monetary columns that receive a log-transformed companion column
log_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Total_Income']

# log1p(x) = log(1 + x) stays finite at x = 0 (log1p(0) == 0).
# It is not defined for x <= -1, so the inputs are expected to be non-negative.
for frame in (train, test):
    for feature in log_features:
        frame[f'Log_{feature}'] = np.log1p(frame[feature])


##### Step 3: Interaction Features
Create the interaction features we identified during analysis. Apply the same transformations to both datasets.



In [None]:
# Build the same three interaction features on train and test.
# The divisions have no guard for Loan_Amount_Term == 0; such rows would yield inf/NaN.
for frame in (train, test):
    loan_term = frame['Loan_Amount_Term']
    frame['Loan_Amount_to_Loan_Term_Ratio'] = frame['LoanAmount'] / loan_term
    frame['CreditHistory_TotalIncome'] = frame['Credit_History'] * frame['Total_Income']
    frame['Income_Per_LoanTerm'] = frame['Total_Income'] / loan_term


##### Step 4: Loan Term Binning
Create categorical bins for Loan_Amount_Term in both datasets.



In [None]:
# Right-closed intervals: (0, 180] -> Short-term, (180, 360] -> Medium-term, (360, inf] -> Long-term
bins = [0, 180, 360, float('inf')]
labels = ['Short-term', 'Medium-term', 'Long-term']

# Bin Loan_Amount_Term identically on both datasets
for frame in (train, test):
    frame['Loan_Term_Bin'] = pd.cut(frame['Loan_Amount_Term'], bins=bins, labels=labels)


##### Step 5: Categorical Encoding
Convert categorical variables into numeric format using one-hot or label encoding. Apply the encoding to both datasets consistently.

In [None]:
# Define categorical features
categorical_features = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Term_Bin']

# Columns that exist only on `train` at this point were created during exploration and
# have no counterpart on `test`. Zero-filling them on `test` would make the model see
# real values at training time and constant zeros at prediction time, and some of them
# are free-text labels that numeric estimators cannot consume. Drop them before encoding.
train_only_cols = [
    col for col in train.columns
    if col not in test.columns and col != 'Loan_Status'
]
if train_only_cols:
    print(f"Dropping {len(train_only_cols)} train-only column(s) with no test counterpart: {train_only_cols}")
    train = train.drop(columns=train_only_cols)

# One-hot encode the features (the first level of each feature is the dropped baseline)
train = pd.get_dummies(train, columns=categorical_features, drop_first=True)
test = pd.get_dummies(test, columns=categorical_features, drop_first=True)

# Dummy levels that appear in train but not in test; these are legitimately all-zero on test.
missing_cols = set(train.columns) - set(test.columns) - {'Loan_Status'}

# Align test to the train feature columns (same set, same order); absent dummies become 0
# and any test-only dummy column is discarded.
test = test.reindex(columns=train.columns.drop('Loan_Status'), fill_value=0)


##### Step 6: Scaling
Normalize continuous features using Min-Max scaling or Standard scaling. Scaling should be fit on the train set and applied to both datasets.

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise
continuous_features = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Total_Income',
    'Loan_Amount_to_Loan_Term_Ratio',
    'CreditHistory_TotalIncome',
    'Income_Per_LoanTerm',
]

# Learn the scaling statistics from the training data only
scaler = StandardScaler().fit(train[continuous_features])

# Apply the train-fitted scaler to both datasets
for frame in (train, test):
    frame[continuous_features] = scaler.transform(frame[continuous_features])


##### Step 7: Convert Booleans to Integers

In [None]:
# One-hot indicator columns produced by the encoding step
boolean_columns = [
    'Gender_1',
    'Married_1',
    'Education_1',
    'Self_Employed_1',
    'Property_Area_1',
    'Property_Area_2',
    'Loan_Term_Bin_Medium-term',
    'Loan_Term_Bin_Long-term',
]

# Cast every indicator from True/False to 1/0 on both datasets
for frame in (train, test):
    for col in boolean_columns:
        frame[col] = frame[col].astype(int)


In [None]:
# Preview the first rows of the training frame after the feature-engineering steps.
train.head()

In [None]:
# Preview the first rows of the test frame after the feature-engineering steps.
test.head()

##### Step 8: Save Processed Data
Save the transformed datasets for use in modeling.



In [None]:
# Persist both engineered frames, keeping the ID index as the first CSV column
for frame, destination in (
    (train, 'data/Train_Processed.csv'),
    (test, 'data/Test_Processed.csv'),
):
    frame.to_csv(destination, index=True)


## Base Model & Feature Engineering & Selection

### Splitting the Data
Since the Loan_Status column is the target in the train dataset, we need to split it into features (X_train) and the target (y_train). Additionally, we would want to create a validation set for model evaluation.


In [None]:
# Reload the processed datasets written by the "Save Processed Data" step,
# restoring the ID column as the index.
train = pd.read_csv('data/Train_Processed.csv', index_col='ID')
test = pd.read_csv('data/Test_Processed.csv', index_col='ID')

In [None]:
# Show a single reloaded training row to check the column layout.
train.head(1)

In [None]:
# Show a single reloaded test row to check the column layout.
test.head(1)

In [None]:
# Pearson correlation of every numeric column with the target.
# numeric_only=True skips any non-numeric (object) column instead of raising,
# which is what DataFrame.corr() does by default on pandas >= 2.0.
train.corr(numeric_only=True)['Loan_Status']

In [None]:
# Separate the target from the feature matrix
y = train['Loan_Status']
X = train.drop('Loan_Status', axis=1)

# Hold out 20% of the rows for validation, preserving the class ratio
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


##### Addressing Imbalance

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of X, y into training and validation parts
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

# Oversample the minority class on the training part only; validation stays untouched
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Report the resampled class counts
print("Class distribution after SMOTE:")
print(y_train_res.value_counts())


##### Base Model

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Initialize LightGBM with default parameters
lgbm_model = LGBMClassifier(random_state=42)

# Train on the SMOTE-resampled training data and monitor AUC on the validation split.
# `early_stopping`'s `verbose` argument is a boolean switch for its log messages; the
# previous value -1 is truthy and therefore turned logging on. False silences it.
# NOTE: the validation split drives early stopping and is also used for the metrics
# below, so those metrics are optimistic estimates.
lgbm_model.fit(
    X_train_res,
    y_train_res,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    callbacks=[early_stopping(stopping_rounds=50, verbose=False)]
)

# Make predictions: hard labels and positive-class probabilities
y_val_pred = lgbm_model.predict(X_val)
y_val_prob = lgbm_model.predict_proba(X_val)[:, 1]

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))
print("\nROC-AUC Score:")
print(roc_auc_score(y_val, y_val_prob))


##### Feature Importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import pandas as pd

# Feature matrix: every column of train except the target
importance_inputs = train.drop('Loan_Status', axis=1)

# Fit a default random forest on the full training frame to rank the features
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(importance_inputs, train['Loan_Status'])

# Pair each feature with its impurity-based importance, most important first
feature_importances = pd.DataFrame(
    {
        'Feature': importance_inputs.columns,
        'Importance': rf_model.feature_importances_,
    }
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature is at the top
plt.figure(figsize=(10, 6))
plt.barh(feature_importances['Feature'], feature_importances['Importance'])
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.gca().invert_yaxis()
plt.show()


##### Re-evaluating Feature Engineering

In [None]:
import seaborn as sns

# One boxplot per numeric feature, split by the target class
num_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Total_Income']
for feature in num_features:
    sns.boxplot(data=train, x='Loan_Status', y=feature)
    plt.title(f"{feature} vs Loan_Status")
    plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Interaction terms under evaluation and the target
interaction_features = ['Income_Per_LoanTerm', 'CreditHistory_TotalIncome']
y = train['Loan_Status']
X_interactions = train[interaction_features]

# Fit a logistic regression on the interaction terms alone and score it on the
# same rows it was fitted on (an in-sample ROC-AUC).
model = LogisticRegression(random_state=42).fit(X_interactions, y)
predictions = model.predict_proba(X_interactions)[:, 1]
auc = roc_auc_score(y, predictions)
print(f"ROC-AUC for Interaction Features: {auc}")


##### Addressing Outliers

In [None]:
# Upper percentile at which each column is capped
caps = dict(
    ApplicantIncome=0.99,
    CoapplicantIncome=0.99,
    LoanAmount=0.99,
    Total_Income=0.99,
)

# The cap value is taken from train and applied to both datasets
for col, cap in caps.items():
    upper_limit = train[col].quantile(cap)
    train[col] = train[col].clip(upper=upper_limit)
    test[col] = test[col].clip(upper=upper_limit)


##### Enhance Interaction Features
Action:
* Introduce and refine interaction terms based on domain knowledge and observed patterns.

Features to Consider:

1. Income Ratios:
    * ApplicantIncome / CoapplicantIncome
    * ApplicantIncome / LoanAmount
    * CoapplicantIncome / LoanAmount

2. Loan Term Ratios:
    * LoanAmount / Loan_Amount_Term
    * Total_Income / Loan_Amount_Term

In [None]:
def _raw_magnitude(frame, column):
    """Return `column` on its original (unscaled) scale.

    The income and loan-amount columns were standardised in the scaling step, so they
    hold z-scores here. Their `Log_<column>` companions were computed with log1p before
    scaling, so expm1 recovers the original values. If no companion exists, the column
    is returned unchanged.
    """
    log_column = f'Log_{column}'
    if log_column in frame.columns:
        return np.expm1(frame[log_column])
    return frame[column]


def _add_ratio_features(frame):
    """Add the income / loan ratio features to `frame` in place."""
    applicant = _raw_magnitude(frame, 'ApplicantIncome')
    coapplicant = _raw_magnitude(frame, 'CoapplicantIncome')
    loan_amount = _raw_magnitude(frame, 'LoanAmount')
    total_income = _raw_magnitude(frame, 'Total_Income')
    loan_term = frame['Loan_Amount_Term']  # never scaled, used as stored

    # With non-negative original-scale inputs, the "+ 1" keeps every denominator >= 1.
    # On z-scores it would not, because z-scores can be -1 or lower.
    frame['ApplicantIncome_to_CoapplicantIncome'] = applicant / (coapplicant + 1)
    frame['ApplicantIncome_to_LoanAmount'] = applicant / (loan_amount + 1)
    frame['CoapplicantIncome_to_LoanAmount'] = coapplicant / (loan_amount + 1)

    frame['LoanAmount_to_LoanTerm'] = loan_amount / loan_term
    frame['Total_Income_to_LoanTerm'] = total_income / loan_term


# Create the interaction features identically on train and test.
# NOTE: values recovered from the Log_ columns are not affected by percentile capping
# applied to the scaled columns.
for _frame in (train, test):
    _add_ratio_features(_frame)


**Rationale**: These features explore relationships between income, loan amount, and loan terms that directly influence loan approvals.

##### Reviewed Engineering Steps

In [None]:
# Reload original dataset if needed
# Both names are rebound to the raw CSV contents, so every engineered column and
# transformation applied to `train` / `test` above is discarded from this point on.
train = pd.read_csv('data/Train.csv')
test = pd.read_csv('data/Test.csv')

# Preview the raw training rows.
train.head()

In [None]:
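# # NOTE: this entire cell is commented out (disabled draft); the active version of this pipeline is the next cell.
# # Reload original dataset if needed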
# train = pd.read_csv('data/Train.csv')
# test = pd.read_csv('data/Test.csv')

# # Set ID column as index
# train.set_index('ID', inplace=True)
# test.set_index('ID', inplace=True)

# # Drop Loan_ID column
# train.drop('Loan_ID', axis=1, inplace=True)
# test.drop('Loan_ID', axis=1, inplace=True)

# # Handle Dependents column
# train['Dependents'] = train['Dependents'].replace('3+', 3).astype(int)
# test['Dependents'] = test['Dependents'].replace('3+', 3).astype(int)
# train['Dependents_Grouped'] = train['Dependents'].apply(lambda x: 2 if x >= 2 else x)
# test['Dependents_Grouped'] = test['Dependents'].apply(lambda x: 2 if x >= 2 else x)

# # Calculate Total_Income (if not already done)
# # NOTE: do not add the existing Total_Income column into the sum; doing so double-counts income.
# train['Total_Income'] = train['ApplicantIncome'] + train['CoapplicantIncome']
# test['Total_Income'] = test['ApplicantIncome'] + test['CoapplicantIncome']

# # Log transform features
# log_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Total_Income']
# for feature in log_features:
#     train[f'Log_{feature}'] = np.log1p(train[feature])
#     test[f'Log_{feature}'] = np.log1p(test[feature])

# # Interaction features
# train['Loan_Amount_to_Loan_Term_Ratio'] = train['LoanAmount'] / train['Loan_Amount_Term']
# test['Loan_Amount_to_Loan_Term_Ratio'] = test['LoanAmount'] / test['Loan_Amount_Term']

# train['CreditHistory_TotalIncome'] = train['Credit_History'] * train['Total_Income']
# test['CreditHistory_TotalIncome'] = test['Credit_History'] * test['Total_Income']

# train['Income_Per_LoanTerm'] = train['Total_Income'] / train['Loan_Amount_Term']
# test['Income_Per_LoanTerm'] = test['Total_Income'] / test['Loan_Amount_Term']

# # Add missing interaction features
# train['ApplicantIncome_Squared'] = train['ApplicantIncome'] ** 2
# test['ApplicantIncome_Squared'] = test['ApplicantIncome'] ** 2

# train['LoanAmount_TotalIncome_Interaction'] = train['LoanAmount'] * train['Total_Income']
# test['LoanAmount_TotalIncome_Interaction'] = test['LoanAmount'] * test['Total_Income']

# # Polynomial features
# poly = PolynomialFeatures(degree=2, include_bias=False)
# train_poly_features = poly.fit_transform(train[['ApplicantIncome', 'LoanAmount']])
# test_poly_features = poly.transform(test[['ApplicantIncome', 'LoanAmount']])

# poly_feature_names = poly.get_feature_names_out(['ApplicantIncome', 'LoanAmount'])
# train[poly_feature_names] = train_poly_features
# test[poly_feature_names] = test_poly_features

# # Binning Loan_Amount_Term
# bins = [0, 180, 360, float('inf')]
# labels = ['Short-term', 'Medium-term', 'Long-term']
# train['Loan_Term_Bin'] = pd.cut(train['Loan_Amount_Term'], bins=bins, labels=labels)
# test['Loan_Term_Bin'] = pd.cut(test['Loan_Amount_Term'], bins=bins, labels=labels)

# # One-hot encode categorical features
# categorical_features = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Term_Bin']
# train = pd.get_dummies(train, columns=categorical_features, drop_first=True)
# test = pd.get_dummies(test, columns=categorical_features, drop_first=True)

# # Align columns
# test = test.reindex(columns=train.columns.drop('Loan_Status'), fill_value=0)

# # Continuous features
# continuous_features = [
#     'ApplicantIncome', 
#     'CoapplicantIncome', 
#     'LoanAmount', 
#     'Total_Income', 
#     'Loan_Amount_to_Loan_Term_Ratio', 
#     'CreditHistory_TotalIncome', 
#     'Income_Per_LoanTerm',
#     'ApplicantIncome_Squared',               # Newly added squared feature
#     'LoanAmount_TotalIncome_Interaction',    # Newly added interaction term
# ] + list(poly_feature_names)                # Add polynomial feature names dynamically

# # Scale continuous features
# scaler = StandardScaler()
# train[continuous_features] = scaler.fit_transform(train[continuous_features])
# test[continuous_features] = scaler.transform(test[continuous_features])

# # Save processed datasets
# train.to_csv('data/Train_Processed.csv', index=True)
# test.to_csv('data/Test_Processed.csv', index=True)


In [None]:
# Step 1: Reload Original Dataset
train = pd.read_csv('data/Train.csv')
hidden_set = pd.read_csv('data/Test.csv')  # Rename test set to hidden_set

# Step 2: Rename and set ID columns
train.set_index('ID', inplace=True)
hidden_set.set_index('ID', inplace=True)

# Step 3: Preprocess Dataset
# Drop Loan_ID column
train.drop('Loan_ID', axis=1, inplace=True)
hidden_set.drop('Loan_ID', axis=1, inplace=True)

# Feature engineering. Every derived column is added to both `train` and
# `hidden_set`, so the steps are written once and applied to each frame in
# turn (each frame is modified in place).
log_features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Total_Income']

for frame in (train, hidden_set):
    # Dependents: turn the '3+' category into 3, cast to int, then build a
    # coarser version in which every value of 2 or more becomes 2.
    frame['Dependents'] = frame['Dependents'].replace('3+', 3).astype(int)
    frame['Dependents_Grouped'] = frame['Dependents'].apply(lambda x: 2 if x >= 2 else x)

    # Applicant plus co-applicant income
    frame['Total_Income'] = frame['ApplicantIncome'] + frame['CoapplicantIncome']

    # log(1 + x) copies of the income and loan-amount columns
    for feature in log_features:
        frame[f'Log_{feature}'] = np.log1p(frame[feature])

    # Ratios against the loan term, and a credit-history / income product
    frame['Loan_Amount_to_Loan_Term_Ratio'] = frame['LoanAmount'] / frame['Loan_Amount_Term']
    frame['CreditHistory_TotalIncome'] = frame['Credit_History'] * frame['Total_Income']
    frame['Income_Per_LoanTerm'] = frame['Total_Income'] / frame['Loan_Amount_Term']

    # Squared income and further interaction terms
    frame['ApplicantIncome_Squared'] = frame['ApplicantIncome'] ** 2
    frame['LoanAmount_TotalIncome_Interaction'] = frame['LoanAmount'] * frame['Total_Income']
    frame["Income_Credit_Interaction"] = frame["ApplicantIncome"] * frame["CreditHistory_TotalIncome"]

    # Square root of the loan amount
    frame["LoanAmount_Sqrt"] = np.sqrt(frame["LoanAmount"])

    # Dependents per unit of total income; the 1e-6 keeps the denominator
    # away from zero when Total_Income is 0.
    frame["Dependents_Income_Ratio"] = frame["Dependents"] / (frame["Total_Income"] + 1e-6)

# Degree-2 polynomial expansion of ApplicantIncome and LoanAmount.
# The expansion is fitted on `train` and re-applied unchanged to `hidden_set`.
poly_inputs = ['ApplicantIncome', 'LoanAmount']
poly = PolynomialFeatures(degree=2, include_bias=False)
train_poly_features = poly.fit_transform(train[poly_inputs])
hidden_set_poly_features = poly.transform(hidden_set[poly_inputs])

# Write the expanded columns back under the names reported by the transformer.
poly_feature_names = poly.get_feature_names_out(poly_inputs)
hidden_set[poly_feature_names] = hidden_set_poly_features
train[poly_feature_names] = train_poly_features


# Binning Loan_Amount_Term into three labelled ranges:
# (0, 180] -> 'Short-term', (180, 360] -> 'Medium-term', (360, inf) -> 'Long-term'
bins = [0, 180, 360, float('inf')]
labels = ['Short-term', 'Medium-term', 'Long-term']
train['Loan_Term_Bin'] = pd.cut(train['Loan_Amount_Term'], bins=bins, labels=labels)
hidden_set['Loan_Term_Bin'] = pd.cut(hidden_set['Loan_Amount_Term'], bins=bins, labels=labels)

# One-hot encode categorical features (first level of each is dropped).
# `dtype=int` makes only the generated indicator columns integer. Casting the
# whole frame with `.astype(int)` would also truncate every engineered float
# column (log transforms, ratios, square roots), e.g. turning
# Loan_Amount_to_Loan_Term_Ratio values below 1 into 0.
categorical_features = ['Education', 'Self_Employed', 'Property_Area', 'Dependents_Grouped', 'Loan_Term_Bin']
train = pd.get_dummies(train, columns=categorical_features, drop_first=True, dtype=int)
hidden_set = pd.get_dummies(hidden_set, columns=categorical_features, drop_first=True, dtype=int)


# Outlier treatment: cap ApplicantIncome and LoanAmount at the 99th percentile
# of the training data. The same training-derived caps are applied to the
# hidden set; only the upper tail is clipped.
cap_thresholds = {
    feature: train[feature].quantile(0.99)
    for feature in ('ApplicantIncome', 'LoanAmount')
}
for feature, cap in cap_thresholds.items():
    train[feature] = train[feature].clip(upper=cap)
    hidden_set[feature] = hidden_set[feature].clip(upper=cap)

# Continuous features to standardise. Each name appears exactly once:
# 'ApplicantIncome' used to be listed twice, which made the scaler fit the
# same column twice and forced later cells to de-duplicate this list.
continuous_features = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Total_Income',
    'Log_ApplicantIncome',
    'Log_CoapplicantIncome',
    'Log_LoanAmount',
    'Log_Total_Income',
    'Loan_Amount_to_Loan_Term_Ratio',
    'CreditHistory_TotalIncome',
    'Income_Per_LoanTerm',
    'ApplicantIncome_Squared',
    'LoanAmount_TotalIncome_Interaction',
    'Income_Credit_Interaction',
    'LoanAmount_Sqrt',
    'Dependents_Income_Ratio',
    'ApplicantIncome LoanAmount',
    'LoanAmount^2'
]

# Scale continuous features: the scaler is fitted on `train` only and the
# fitted statistics are reused for `hidden_set`.
scaler = StandardScaler()
train[continuous_features] = scaler.fit_transform(train[continuous_features])
hidden_set[continuous_features] = scaler.transform(hidden_set[continuous_features])

# Step 4: Split the labelled data into a training split and a held-out test split
from sklearn.model_selection import train_test_split

# Separate features and target
X = train.drop('Loan_Status', axis=1)
y = train['Loan_Status']

# Stratified 90% / 10% split: test_size=0.1 holds out 10% of the rows as
# X_test / y_test, keeping the Loan_Status class proportions in both parts.
# NOTE(review): an earlier version of this comment described a 70% / 30%
# split, which does not match test_size=0.1 - confirm which was intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

# resampler = SMOTE(random_state=42)
# X_train, y_train = resampler.fit_resample(X_train, y_train)

# Display sizes of each split (no separate validation split is created here)
print(f"Train set size: {X_train.shape}, {y_train.shape}")
# print(f"Validation set size: {X_val.shape}, {y_val.shape}")
print(f"Test set size: {X_test.shape}, {y_test.shape}")
print(f"Hidden set size: {hidden_set.shape}")

# # Step 3: Save processed datasets
# train_data = X_train.copy()
# train_data['Loan_Status'] = y_train
# train_data.to_csv('data/Train_Split.csv', index=True)

# val_data = X_val.copy()
# val_data['Loan_Status'] = y_val
# val_data.to_csv('data/Val_Split.csv', index=True)

# hidden_set.to_csv('data/Hidden_Set_Processed.csv', index=True)


In [None]:
# Summary statistics for every column of the processed training frame,
# transposed so that each feature is one row.
train.describe(include='all').T

In [None]:
# Partition the columns of `train` by cardinality: columns with at most five
# distinct values go to `encode_list`, all others to `numerical_list`.
# (`numerical_list` was previously declared but never filled.)
encode_list = []
numerical_list = []

for column in train.columns:
    n_unique = train[column].nunique()  # computed once per column
    print(f"{column}: {n_unique}")
    if n_unique <= 5:
        encode_list.append(column)
    else:
        numerical_list.append(column)


In [None]:
# Relative frequency of each target class in the training and test splits.
for split_name, y_split in (('Train', y_train), ('Test', y_test)):
    class_share = y_split.value_counts(normalize=True)
    print(f"{split_name} class distribution:\n{class_share}\n")


In [None]:
# Hard-coded LightGBM hyperparameters kept for reference.
best_params_1 = dict(
    n_estimators=992,
    learning_rate=0.024273593878709077,
    num_leaves=103,
    max_depth=12,
    min_child_samples=37,
    feature_fraction=0.6137915003281622,
    bagging_fraction=0.6072555152182413,
    bagging_freq=6,
)

In [None]:
# Number of feature columns in the training split.
X_train.shape[1]

In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from lightgbm import early_stopping
from imblearn.over_sampling import SMOTE
import numpy as np

def objective(trial, X_train, y_train):
    """Optuna objective: mean 5-fold cross-validated accuracy of a LightGBM model.

    Parameters
    ----------
    trial : optuna trial
        Used to sample the LightGBM hyperparameters.
    X_train : DataFrame
        Feature matrix; folds are selected positionally with ``.iloc``.
    y_train : Series
        Target values, row-aligned with ``X_train``.

    Returns
    -------
    float
        Mean accuracy over the five validation folds (higher is better, so
        the study calling this should maximise it).
    """
    # Define hyperparameters. `suggest_float` replaces the deprecated
    # `suggest_loguniform` / `suggest_uniform` calls with the same ranges.
    params = {
        'objective': 'binary',
        'metric': 'binary_error',  # Accuracy-focused metric
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
    }

    # Initialize SMOTE
    smote = SMOTE(random_state=42)

    # Stratified Cross-Validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Oversample only the training part of the fold, so the validation
        # fold contains real rows only.
        X_tr_smote, y_tr_smote = smote.fit_resample(X_tr, y_tr)

        # Train LightGBM model.
        # NOTE(review): early stopping monitors the same fold that is scored
        # below, so the reported accuracy is likely optimistic.
        model = LGBMClassifier(**params, random_state=42)
        model.fit(
            X_tr_smote,
            y_tr_smote,
            eval_set=[(X_val, y_val)],
            eval_metric='binary_error',
            callbacks=[early_stopping(stopping_rounds=50, verbose=False)]
        )

        # Validate the model
        val_preds = model.predict(X_val)
        cv_scores.append(accuracy_score(y_val, val_preds))

    return np.mean(cv_scores)

# Tune on the training split (X_train / y_train) produced by train_test_split.
# They are deliberately NOT rebuilt from the full `train` frame here: doing so
# would put the X_test rows into the data the final model is fitted on, and
# the later evaluation on X_test would then be measured on rows seen in training.

# Run Optuna optimization. The objective returns mean accuracy, so the study
# must maximise it; 'minimize' would select the worst-performing trial.
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=200)

# Best parameters and accuracy
print("Best Parameters:", study.best_params)
print("Best Accuracy:", study.best_value)


In [None]:
# Display the hyperparameters of the best trial found by the study.
study.best_params

In [None]:
# Hard-coded LightGBM hyperparameters kept for reference.
best_params = dict(
    n_estimators=547,
    learning_rate=0.08066562616278325,
    num_leaves=103,
    max_depth=15,
    min_child_samples=18,
    feature_fraction=0.5453253046633786,
    bagging_fraction=0.9107055079871386,
    bagging_freq=8,
)

In [None]:
# Fit the final LightGBM classifier with the study's best hyperparameters,
# adding class_weight='balanced' and a fixed random_state.
# NOTE(review): this reads `study.best_params` from the live study object, not
# the hard-coded `best_params` dict defined in the notebook - confirm which
# set of parameters is meant to be used.
final_model = LGBMClassifier(**study.best_params, class_weight='balanced', random_state=42)
final_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Hard-label predictions for the held-out test split
y_test_pred = final_model.predict(X_test)

# Compute the three summaries first, then print them under their headings.
test_confusion = confusion_matrix(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Test Set Results:")
print("Confusion Matrix:")
print(test_confusion)
print("\nClassification Report:")
print(test_report)
print("\nAccuracy:", test_accuracy)


In [None]:
from sklearn.metrics import precision_recall_curve

# Predicted probability of the positive class (label 1) on the test split
y_probs = final_model.predict_proba(X_test)[:, 1]

# Compute precision-recall pairs
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs, pos_label=1)

# Compute F1 scores; the 1e-6 avoids division by zero when precision and
# recall are both 0.
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-6)

# Find the threshold with the highest F1 score
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
print("Optimal Threshold:", optimal_threshold)

# Make predictions based on the optimal threshold.
# NOTE(review): the threshold is selected on the same test split that is then
# used to report accuracy, so this figure is optimistic.
y_val_pred = (y_probs >= optimal_threshold).astype(int)

# Report the accuracy of the thresholded predictions. This previously scored
# `y_test_pred`, i.e. the default-threshold predictions, so the printed value
# did not reflect the threshold chosen above.
print("\nAccuracy:", accuracy_score(y_test, y_val_pred))


In [None]:
# Confusion matrix and per-class metrics for the thresholded predictions
# (`y_val_pred`), evaluated against the test-split labels.
threshold_confusion = confusion_matrix(y_test, y_val_pred)
threshold_report = classification_report(y_test, y_val_pred, digits=4)

print("Confusion Matrix with Optimal Threshold:")
print(threshold_confusion)

print("\nClassification Report with Optimal Threshold:")
print(threshold_report)


In [None]:
# Predict on the hidden set.
# Select exactly the columns the model was fitted on (X_train's columns, in
# the same order). This keeps the cell re-runnable after it, or a later cell,
# has added prediction columns to `hidden_set`, and fills any indicator
# column absent from the hidden set with 0.
hidden_features = hidden_set.reindex(columns=X_train.columns, fill_value=0)
hidden_predictions = final_model.predict(hidden_features)

# Since the hidden set does not have true labels, output predictions for submission
hidden_set['Loan_Status_Prediction'] = hidden_predictions

# Save predictions for submission or further analysis.
# NOTE(review): 'submssions' looks like a typo for 'submissions'; it is left
# unchanged because the directory on disk may carry this name - confirm.
submission_path = 'submssions/lgbm_hse_006.csv'
hidden_set[['Loan_Status_Prediction']].to_csv(submission_path, index=True)

# Report the path that was actually written (the message used to name a
# different file, 'Hidden_Set_Predictions.csv').
print(f"Predictions for the hidden set saved to '{submission_path}'.")


### Reanalysing Overfitting

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Features whose distributions are compared across the data sets
key_features = ['ApplicantIncome', 'LoanAmount', 'Total_Income', 'Credit_History']

# (frame, legend label) pairs, drawn in this order on every figure
distribution_sources = [
    (train, 'Train'),
    (X_val, 'Validation'),
    (X_test, 'Test'),
    (hidden_set, 'Hidden'),
]

# One figure per feature with one filled density curve per data set
for feature in key_features:
    plt.figure(figsize=(10, 6))
    for frame, set_label in distribution_sources:
        sns.kdeplot(frame[feature], label=set_label, fill=True, alpha=0.5)
    plt.title(f'Distribution of {feature}')
    plt.legend()
    plt.show()


##### Analysis of Distributions
The density plots reveal that some features (e.g., ApplicantIncome, Total_Income, Credit_History) exhibit discrepancies between the hidden test set and the other sets. These differences could explain the poor performance on the hidden test set.

Action Plan:

* Quantify these differences using statistical tests (e.g., KS-test, Wasserstein distance) to measure the divergence between distributions.
* Investigate why the hidden test set differs. This could be due to data leakage, preprocessing differences, or shifts in feature distributions.

In [None]:
# Positive-class probabilities for the hidden set; the previously stored
# prediction column is removed before calling the model.
hidden_model_inputs = hidden_set.drop(columns='Loan_Status_Prediction')
hidden_preds_proba = final_model.predict_proba(hidden_model_inputs)[:, 1]

# Histogram (30 bins, with a KDE overlay) of the predicted probabilities
plt.figure(figsize=(8, 6))
confidence_ax = sns.histplot(hidden_preds_proba, kde=True, bins=30)
confidence_ax.set_title('Prediction Confidence on Hidden Set')
confidence_ax.set_xlabel('Predicted Probability')
confidence_ax.set_ylabel('Frequency')
plt.show()


##### Model Calibration
The calibration curve shows the model is overconfident for certain probability ranges. Miscalibration can lead to suboptimal decisions based on predicted probabilities.

Action Plan:

* Recalibrate the model using techniques like Platt Scaling or Isotonic Regression on the validation set.
* Validate calibration improvements by re-evaluating the model's probability predictions on the hidden test set.


In [None]:
# Pair each feature name with the importance reported by the fitted model,
# then order the table from most to least important.
features = X_test.columns
importances = final_model.feature_importances_
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the 20 highest-ranked features
top_importances = importance_df.head(20)
plt.figure(figsize=(10, 8))
importance_ax = sns.barplot(data=top_importances, x='Importance', y='Feature')
importance_ax.set_title('Top 20 Feature Importances')
plt.show()


##### Feature Importance Insights
The feature importance plot highlights the top contributors (e.g., LoanAmount_TotalIncome_Interaction, Loan_Amount_to_Loan_Term_Ratio).

Action Plan:

* Check the stability of these features across train/validation/test/hidden sets.
* Test whether removing or transforming top features improves generalization. For instance, some features may have noise or overfitting tendencies.

In [None]:
from sklearn.calibration import calibration_curve

# Positive-class probabilities for the validation rows
y_val_pred_proba = final_model.predict_proba(X_val)[:, 1]

# Observed positive rate vs. mean predicted probability, in 10 bins
prob_true, prob_pred = calibration_curve(y_val, y_val_pred_proba, n_bins=10)

# Model curve against the diagonal that a perfectly calibrated model would follow
plt.figure(figsize=(8, 6))
calibration_ax = plt.gca()
calibration_ax.plot(prob_pred, prob_true, marker='o', label='Model Calibration')
calibration_ax.plot([0, 1], [0, 1], linestyle='--', label='Perfect Calibration')
calibration_ax.set_xlabel('Mean Predicted Probability')
calibration_ax.set_ylabel('Fraction of Positives')
calibration_ax.set_title('Calibration Curve')
calibration_ax.legend()
plt.show()


##### Regularization and Complexity Control
Overfitting may arise from a lack of regularization or overly complex models.

Action Plan:

* Tune hyperparameters like num_leaves, max_depth, and min_data_in_leaf to reduce model complexity.
* Use techniques like dropout, shuffling, or cross-validation to better generalize the model.

##### Address Hidden Test Set Challenges
Since we don't have true labels for the hidden set, here's how we would proceed:

* Use pseudo-labeling: Predict on the hidden test set, assign high-confidence predictions as pseudo-labels, and fine-tune the model.
* Split the hidden test set into subsets. For example, use a small part for manual validation if possible.


#### Second Attempt at Improving the Model

##### Step 1: Evaluate Feature Distributions Across Sets
We'll check if there are major distributional differences between the training, validation, test, and hidden sets.



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# List of continuous numeric features to analyze. `dict.fromkeys` removes
# repeated names while keeping order: the polynomial feature names repeat
# 'ApplicantIncome' and 'LoanAmount', which would otherwise be plotted twice.
numeric_features = list(dict.fromkeys([
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Total_Income',
    'Loan_Amount_to_Loan_Term_Ratio',
    'CreditHistory_TotalIncome',
    'Income_Per_LoanTerm',
    'ApplicantIncome_Squared',
    'LoanAmount_TotalIncome_Interaction'
] + list(poly_feature_names)))

# Combine all datasets for comparison. `.assign` tags a copy of each frame, so
# X_train / X_val / X_test / hidden_set are never modified and no clean-up is
# needed afterwards, even if plotting fails part-way through.
# ignore_index=True gives the combined frame a unique index, since the same ID
# may occur in more than one of the source frames.
combined_data = pd.concat(
    [
        X_train.assign(Dataset='Train'),
        X_val.assign(Dataset='Validation'),
        X_test.assign(Dataset='Test'),
        hidden_set.assign(Dataset='Hidden'),
    ],
    ignore_index=True,
)

# Plot distributions for each feature. seaborn draws the hue legend itself
# (titled 'Dataset'); calling plt.legend() afterwards would replace it with an
# empty one, so no explicit legend call is made.
for feature in numeric_features:
    plt.figure(figsize=(10, 6))
    sns.kdeplot(data=combined_data, x=feature, hue='Dataset', fill=True)
    plt.title(f'Distribution of {feature} Across Datasets')
    plt.xlabel(feature)
    plt.ylabel('Density')
    plt.show()


##### Distributional Issues:

From the density plots, certain features like LoanAmount, ApplicantIncome_Squared, and interaction terms show skewed distributions across datasets. This could be contributing to poor generalization, as the hidden set may not align with training distributions.


In [None]:
from sklearn.calibration import calibration_curve

# Positive-class probabilities for the validation rows
y_val_prob = final_model.predict_proba(X_val)[:, 1]

# Calibration curve over 10 equal-width probability bins
prob_true, prob_pred = calibration_curve(y_val, y_val_prob, n_bins=10, strategy='uniform')

# Model curve against the diagonal of a perfectly calibrated model
plt.figure(figsize=(8, 6))
calibration_ax = plt.gca()
calibration_ax.plot(prob_pred, prob_true, marker='o', label='Model Calibration')
calibration_ax.plot([0, 1], [0, 1], linestyle='--', label='Perfect Calibration')
calibration_ax.set_title('Calibration Curve (Validation Set)')
calibration_ax.set_xlabel('Mean Predicted Probability')
calibration_ax.set_ylabel('Fraction of Positives')
calibration_ax.legend()
calibration_ax.grid()
plt.show()


##### Insights from Calibration Curve:

The model is overconfident in its predictions, as seen from the sharp deviations from the perfect calibration line. This indicates potential overfitting or issues in probability calibration.


In [None]:
# Positive-class probabilities for the hidden set; the stored prediction
# column is removed before the frame is passed to the model.
hidden_model_inputs = hidden_set.drop(columns='Loan_Status_Prediction')
y_hidden_prob = final_model.predict_proba(hidden_model_inputs)[:, 1]

# Histogram (20 bins, with a KDE overlay) of the predicted probabilities
plt.figure(figsize=(10, 6))
probability_ax = sns.histplot(y_hidden_prob, bins=20, kde=True, color='blue')
probability_ax.set_title('Prediction Confidence on Hidden Set')
probability_ax.set_xlabel('Predicted Probability')
probability_ax.set_ylabel('Frequency')
probability_ax.grid()
plt.show()


##### Low Confidence Predictions:

There is a significant number of low-confidence predictions (185 cases). These might stem from overlap between the classes or a lack of discriminating power in certain features, suggesting areas for further feature engineering or model refinement.


In [None]:
# Importances from the fitted model and the matching feature names
features = X_train.columns
importances = final_model.feature_importances_

# Positions of the features ordered from most to least important
sorted_indices = np.argsort(importances)[::-1]
sorted_importances = importances[sorted_indices]
sorted_features = features[sorted_indices]

# Top 20, reversed so the most important feature is drawn at the top of the
# horizontal bar chart
top_features = sorted_features[:20][::-1]
top_values = sorted_importances[:20][::-1]

plt.figure(figsize=(10, 8))
plt.barh(top_features, top_values)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Top 20 Feature Importances')
plt.show()


##### Feature Importance:

The top features like LoanAmount_TotalIncome_Interaction and Loan_Amount_to_Loan_Term_Ratio are highly influential, but some mid-ranked features might require further exploration for their interaction effects or transformation.


In [None]:
# Store the positive-class probabilities on the hidden set.
# Note: this adds a 'Predicted_Probability' column to `hidden_set`, so the
# frame must not be passed to the model again without dropping it.
hidden_set['Predicted_Probability'] = y_hidden_prob

# Select every row whose positive-class probability is below the threshold.
# NOTE(review): `p < 0.6` also selects rows the model confidently assigns to
# class 0 (e.g. p = 0.05), so the result is not limited to uncertain
# predictions; a later cell uses the band 0.4 <= p <= 0.6 instead. Confirm
# which definition is intended before relying on the count printed below.
low_confidence_threshold = 0.6  # Example threshold
outliers = hidden_set[hidden_set['Predicted_Probability'] < low_confidence_threshold]

# Print how many rows were selected and preview the first few
print("Number of low-confidence predictions:", len(outliers))
print(outliers.head())


#### Deep Dive into Low Confidence Predictions

##### Steps for Low-Confidence Analysis
1. Basic Statistics for Low-Confidence Cases:
Compare the feature means, medians, and ranges of low-confidence samples to the full dataset.
2. Feature Distribution Comparison:
Plot the feature distributions for low-confidence predictions versus the full dataset. Identify features where the distributions diverge significantly.
3. Correlation Analysis:
Calculate correlations between features for low-confidence samples and compare them to the correlations in the full dataset.
4. Cluster Analysis:
Use clustering (e.g., K-Means or hierarchical clustering) on low-confidence samples to see if specific patterns or groups emerge.
5. Feature Importance for Low-Confidence Predictions:
Refit a model using only low-confidence samples and examine the feature importance. This helps identify what features drive uncertainty.

In [None]:
# # Predict labels (0 or 1) for the hidden set
# y_pred_hidden = final_model.predict(hidden_set)

# # Predict probabilities for the hidden set (for the positive class, 1)
# y_pred_prob_hidden = final_model.predict_proba(hidden_set)[:, 1]  # Get probabilities for class 1

# The hard labels and the positive-class probabilities are already stored as
# columns of hidden_set; expose them under shorter names.
y_pred_hidden = hidden_set['Loan_Status_Prediction']
y_pred_prob_hidden = hidden_set['Predicted_Probability']

# Display the two prediction columns under the heading (the heading used to be
# printed with nothing following it).
print("Updated Hidden Set with Predictions:")
print(hidden_set[['Loan_Status_Prediction', 'Predicted_Probability']].head())


In [None]:
# Work on an independent copy of the hidden set (features plus the stored
# prediction columns) so the analysis below does not modify `hidden_set`.
full_predictions = hidden_set.copy()
full_predictions.head()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Remove repeated names from the feature list (first occurrence kept) before
# it is used, so no feature is summarised, plotted or correlated twice.
continuous_features = list(dict.fromkeys(continuous_features))

# Filter low-confidence samples: predicted probability within [0.4, 0.6].
# `.copy()` makes this an independent frame, so columns added to it later
# (e.g. cluster labels) are written to the frame itself rather than to a
# temporary slice of `full_predictions`.
low_confidence = full_predictions[full_predictions['Predicted_Probability'].between(0.4, 0.6)].copy()

# 1. Basic Statistics
print("Basic Statistics for Low-Confidence Predictions:")
print(low_confidence.describe())

print("\nComparing Means with Full Dataset:")
print("Full Dataset Means:\n", train[continuous_features].mean())
print("Low Confidence Means:\n", low_confidence[continuous_features].mean())

# 2. Feature Distribution Comparison: one figure per feature, training data
# against the low-confidence hidden-set rows
for feature in continuous_features:
    plt.figure(figsize=(8, 5))
    sns.kdeplot(data=train, x=feature, label='Full Dataset', fill=True, alpha=0.5)
    sns.kdeplot(data=low_confidence, x=feature, label='Low Confidence', fill=True, alpha=0.5, color='red')
    plt.title(f"Feature Distribution: {feature}")
    plt.legend()
    plt.show()

# 3. Correlation Analysis
print("\nCorrelation Analysis for Low-Confidence Predictions:")
low_conf_corr = low_confidence[continuous_features].corr()
full_data_corr = train[continuous_features].corr()

# Heatmaps for correlations
plt.figure(figsize=(10, 8))
sns.heatmap(low_conf_corr, annot=True, cmap='coolwarm')
plt.title("Low-Confidence Correlation Matrix")
plt.show()

plt.figure(figsize=(10, 8))
sns.heatmap(full_data_corr, annot=True, cmap='coolwarm')
plt.title("Full Dataset Correlation Matrix")
plt.show()

# 4. Clustering Analysis: group the low-confidence rows into three K-Means
# clusters using the continuous features, and store each row's cluster label.
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(low_confidence[continuous_features])
low_confidence['Cluster'] = cluster_labels

# Number of rows assigned to each cluster
cluster_sizes = low_confidence['Cluster'].value_counts()
print("\nCluster Distribution for Low-Confidence Predictions:")
print(cluster_sizes)

# Visualize clusters (using two key features)
cluster_ax = sns.scatterplot(
    data=low_confidence,
    x="LoanAmount",
    y="Total_Income",
    hue="Cluster",
    palette="viridis",
)
cluster_ax.set_title("Clustering Low-Confidence Predictions")
plt.show()

# 5. Feature Importance for Low-Confidence Cases
# A LightGBM model is fitted on the low-confidence rows only, with the stored
# hard predictions as the target, and its feature importances are reported.
from lightgbm import LGBMClassifier

# De-duplicate the feature list, keeping the first occurrence of each name
continuous_features = list(dict.fromkeys(continuous_features))

# Inputs and target for the auxiliary model
y_low_conf = low_confidence['Loan_Status_Prediction']
X_low_conf = low_confidence[continuous_features].copy()

# Keep only the first of any identically named columns, then verify
unique_column_mask = ~X_low_conf.columns.duplicated()
X_low_conf = X_low_conf.loc[:, unique_column_mask]
assert not X_low_conf.columns.duplicated().any(), "There are still duplicate columns in X_low_conf"

# Fit a LightGBM model with default hyperparameters
low_conf_model = LGBMClassifier(random_state=42)
low_conf_model.fit(X_low_conf, y_low_conf)

# Importance table, most important feature first
low_conf_feature_importances = pd.DataFrame({
    'Feature': X_low_conf.columns,
    'Importance': low_conf_model.feature_importances_,
})
low_conf_feature_importances = low_conf_feature_importances.sort_values(by='Importance', ascending=False)

# Display the table
print(low_conf_feature_importances)

# Bar chart of the same table
plt.figure(figsize=(10, 6))
low_conf_ax = sns.barplot(data=low_conf_feature_importances, x='Importance', y='Feature')
low_conf_ax.set_title("Feature Importances for Low-Confidence Predictions")
plt.show()


##### Observations:

1. Low-Confidence Predictions:

    * These predictions are identified with probabilities between 0.4 and 0.6, representing ambiguous classifications.
    * The total count of low-confidence samples is 106, and these were analyzed separately for feature distributions, clustering, and feature importance.

2. Feature Importance:
    * Key features contributing to the prediction model include:
        * Loan_Amount_to_Loan_Term_Ratio
        * CoapplicantIncome
        * ApplicantIncome
        * LoanAmount_TotalIncome_Interaction
    * Features like LoanAmount^2 and ApplicantIncome^2 have minimal importance, suggesting possible redundancy.

3. Feature Distributions:
    * Comparing distributions for low-confidence predictions and the full dataset reveals noticeable differences in features such as ApplicantIncome, LoanAmount, and Total_Income.
    * This indicates that these features are likely influencing uncertainty in the model's decision-making.

4. Correlation Analysis:
    * The correlation matrix for low-confidence predictions shows weaker relationships between some features compared to the full dataset, which could explain inconsistencies in predictions.
    
5. Cluster Analysis:
    * Clustering low-confidence predictions using KMeans resulted in three distinct clusters:
        * Cluster 0: Majority of the samples
        * Cluster 2: Significant minority
        * Cluster 1: Outlier cluster with only one sample
    * This clustering provides insights into the variation within low-confidence predictions and can inform targeted re-training.


In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Scatter plot of the two features named below for the low-confidence rows,
# coloured by the predicted loan status.
scatter_x = 'Loan_Amount_to_Loan_Term_Ratio'
scatter_y = 'LoanAmount_TotalIncome_Interaction'

plt.figure(figsize=(10, 6))
scatter_ax = sns.scatterplot(
    data=low_confidence,
    x=scatter_x,
    y=scatter_y,
    hue='Loan_Status_Prediction',  # Colored by the prediction
    palette='viridis',
    alpha=0.8
)
scatter_ax.set_title("Interaction: Loan_Amount_to_Loan_Term_Ratio vs LoanAmount_TotalIncome_Interaction")
scatter_ax.set_xlabel("Loan_Amount_to_Loan_Term_Ratio")
scatter_ax.set_ylabel("LoanAmount_TotalIncome_Interaction")
scatter_ax.legend(title="Loan Status")
plt.show()


In [None]:
# Pairwise correlations, within the low-confidence rows, of four selected features
important_features = [
    'Loan_Amount_to_Loan_Term_Ratio',
    'LoanAmount_TotalIncome_Interaction',
    'CreditHistory_TotalIncome',
    'Total_Income',
]
correlation_matrix = low_confidence[important_features].corr()

# Annotated heatmap with coefficients shown to two decimals
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
heatmap_ax.set_title("Correlation Heatmap of Important Features")
plt.show()


In [None]:
# Pairwise scatter/density grid of three features for the low-confidence
# rows, coloured by the predicted loan status.
pairplot_vars = [
    'Loan_Amount_to_Loan_Term_Ratio',
    'LoanAmount_TotalIncome_Interaction',
    'Total_Income',
]
sns.pairplot(
    data=low_confidence,
    vars=pairplot_vars,
    hue='Loan_Status_Prediction',
    palette='viridis',
)
# y=1.02 places the title slightly above the grid
plt.suptitle("Pairplot for Important Features (Colored by Loan Status)", y=1.02)
plt.show()


In [None]:
from scipy.stats import ttest_ind

# Features compared between the two predicted classes
features_to_test = [
    'Loan_Amount_to_Loan_Term_Ratio',
    'LoanAmount_TotalIncome_Interaction',
    'Total_Income',
    'CreditHistory_TotalIncome',
]

# For each feature, compare the low-confidence rows predicted as 0 with those
# predicted as 1 using a t-test that does not assume equal variances.
predicted_status = low_confidence['Loan_Status_Prediction']
t_test_results = []
for feature in features_to_test:
    group_0 = low_confidence.loc[predicted_status == 0, feature]
    group_1 = low_confidence.loc[predicted_status == 1, feature]

    t_stat, p_value = ttest_ind(group_0, group_1, equal_var=False)
    result_row = dict(zip(('Feature', 't-statistic', 'p-value'), (feature, t_stat, p_value)))
    t_test_results.append(result_row)

# Collect the per-feature results into one table
import pandas as pd
t_test_results_df = pd.DataFrame(t_test_results)

# # Display results
# import ace_tools as tools; tools.display_dataframe_to_user(name="T-Test Results for Features", dataframe=t_test_results_df)
# co

In [None]:
# Display the table of t-statistics and p-values, one row per tested feature.
t_test_results_df

In [None]:
    # 'CreditHistory_TotalIncome',  # Removed
    # 'Income_Per_LoanTerm',        # Removed
    # 'ApplicantIncome_Squared',    # Removed
    # 'LoanAmount_TotalIncome_Interaction',  # Removed

### Simplified Dataset

In [None]:
# Simplify Features by Dropping Low Importance and high VIF Features
low_importance_features = [
    "Education_1",
    "Log_LoanAmount",
    "Married_1",
    # "CreditHistory_TotalIncome",
    "Income_Per_LoanTerm",
    "ApplicantIncome_Squared",
    "LoanAmount_TotalIncome_Interaction"

]


def _simplify_and_enrich(features):
    """Return a copy of ``features`` without the low-importance columns and
    with the three enriched columns present.

    Columns listed in ``low_importance_features`` are dropped when present.
    'Income_Credit_Interaction', 'LoanAmount_Sqrt' and
    'Dependents_Income_Ratio' are computed only when the frame does not
    already contain them. Recomputing them unconditionally would overwrite the
    versions built from raw values in the preprocessing cell with versions
    built from standardised columns; the square root of a standardised
    'LoanAmount' is NaN for every below-average loan.

    Parameters
    ----------
    features : DataFrame
        Feature frame; it is not modified.

    Returns
    -------
    DataFrame
        New frame with the simplified and enriched feature set.
    """
    simplified = features.drop(columns=low_importance_features, errors="ignore")

    # Interaction between ApplicantIncome and CreditHistory_TotalIncome
    if "Income_Credit_Interaction" not in simplified.columns:
        simplified["Income_Credit_Interaction"] = (
            simplified["ApplicantIncome"] * simplified["CreditHistory_TotalIncome"]
        )

    # Square Root Transformation for LoanAmount
    if "LoanAmount_Sqrt" not in simplified.columns:
        simplified["LoanAmount_Sqrt"] = np.sqrt(simplified["LoanAmount"])

    # Ratio of Dependents to Total Income (1e-6 guards against a zero denominator)
    if "Dependents_Income_Ratio" not in simplified.columns:
        simplified["Dependents_Income_Ratio"] = (
            simplified["Dependents"] / (simplified["Total_Income"] + 1e-6)
        )

    return simplified


# Apply the same simplification to every data set
X_train_simplified = _simplify_and_enrich(X_train)
X_val_simplified = _simplify_and_enrich(X_val)
X_test_simplified = _simplify_and_enrich(X_test)
hidden_set_simplified = _simplify_and_enrich(hidden_set)

# Ensure Columns are Aligned Across All Sets: every frame gets the training
# frame's columns in the same order; columns missing from a frame are filled
# with 0 and columns not in the training frame are discarded.
X_val_simplified = X_val_simplified.reindex(columns=X_train_simplified.columns, fill_value=0)
X_test_simplified = X_test_simplified.reindex(columns=X_train_simplified.columns, fill_value=0)
hidden_set_simplified = hidden_set_simplified.reindex(columns=X_train_simplified.columns, fill_value=0)

# Save the Simplified and Enriched Datasets
X_train_simplified.to_csv("data/X_Train_Simplified.csv", index=True)
X_val_simplified.to_csv("data/X_Val_Simplified.csv", index=True)
X_test_simplified.to_csv("data/X_Test_Simplified.csv", index=True)
hidden_set_simplified.to_csv("data/Hidden_Set_Simplified.csv", index=True)


In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def objective(trial):
    """Optuna objective: mean stratified 5-fold CV accuracy of an LGBMClassifier.

    Reads the notebook-level ``train`` DataFrame, using ``'Loan_Status'`` as the
    target and every other column as a feature.
    NOTE(review): assumes ``train`` is already fully numeric/encoded at this
    point in the notebook — confirm.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the LightGBM hyperparameters.

    Returns
    -------
    float
        Mean accuracy across the five validation folds.
    """
    # Define hyperparameters
    params = {
        'objective': 'binary',
        'metric': 'binary_error',  # Accuracy-focused metric
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
    }

    # The feature/target split is the same for every fold, so do it once.
    features = train.drop('Loan_Status', axis=1)
    target = train['Loan_Status']

    # Stratified Cross-Validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    for train_idx, val_idx in skf.split(features, target):
        # Fold-local names so the notebook-level X_train / y_train are not shadowed.
        X_fold_train, X_fold_val = features.iloc[train_idx], features.iloc[val_idx]
        y_fold_train, y_fold_val = target.iloc[train_idx], target.iloc[val_idx]

        model = LGBMClassifier(**params, random_state=42)
        # early_stopping's `verbose` is a bool: the original -1 is truthy and
        # therefore turned logging ON; False is what actually silences it.
        # NOTE(review): early stopping monitors the same fold that is scored
        # below, so the reported accuracy is somewhat optimistic.
        model.fit(
            X_fold_train,
            y_fold_train,
            eval_set=[(X_fold_val, y_fold_val)],
            eval_metric='binary_error',
            callbacks=[early_stopping(stopping_rounds=50, verbose=False)],
        )

        val_preds = model.predict(X_fold_val)
        cv_scores.append(accuracy_score(y_fold_val, val_preds))

    return np.mean(cv_scores)

# Run Optuna optimization
# TPE is Optuna's default sampler; seeding it makes the 50-trial search
# reproducible under Restart Kernel -> Run All.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Best parameters and accuracy
print("Best Parameters:", study.best_params)
print("Best Accuracy:", study.best_value)

In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from lightgbm import early_stopping
import numpy as np

# Working copy of the training split used for cross-validation below.
# NOTE: nothing is actually combined here — only X_train is copied, so the
# validation, test and hidden sets are not part of `combined_train`.
# NOTE(review): assumes X_train still contains the 'Loan_Status' column — confirm.
combined_train = X_train.copy()
# NOTE(review): this rebinds the notebook-level `y_train`; the rest of this cell
# reads the target from `combined_train` instead.
y_train = combined_train['Loan_Status']

def objective(trial):
    """Optuna objective: mean stratified 5-fold CV accuracy of an LGBMClassifier.

    Reads the notebook-level ``combined_train`` DataFrame, using
    ``'Loan_Status'`` as the target and every other column as a feature.
    This definition replaces any earlier ``objective`` in the kernel.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the LightGBM hyperparameters.

    Returns
    -------
    float
        Mean accuracy across the five validation folds.
    """
    # Define hyperparameters
    params = {
        'objective': 'binary',
        'metric': 'binary_error',  # Accuracy-focused metric
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
    }

    # The feature/target split is the same for every fold, so do it once.
    features = combined_train.drop('Loan_Status', axis=1)
    target = combined_train['Loan_Status']

    # Stratified Cross-Validation over combined_train only
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    for train_idx, val_idx in skf.split(features, target):
        # Each fold trains on one part of the data and evaluates on the rest.
        X_fold_train, X_fold_val = features.iloc[train_idx], features.iloc[val_idx]
        y_fold_train, y_fold_val = target.iloc[train_idx], target.iloc[val_idx]

        model = LGBMClassifier(**params, random_state=42)
        # early_stopping's `verbose` is a bool: the original -1 is truthy and
        # therefore turned logging ON; False is what actually silences it.
        # NOTE(review): early stopping monitors the same fold that is scored
        # below, so the reported accuracy is somewhat optimistic.
        model.fit(
            X_fold_train,
            y_fold_train,
            eval_set=[(X_fold_val, y_fold_val)],
            eval_metric='binary_error',
            callbacks=[early_stopping(stopping_rounds=50, verbose=False)],
        )

        val_preds = model.predict(X_fold_val)
        cv_scores.append(accuracy_score(y_fold_val, val_preds))

    return np.mean(cv_scores)

# Run Optuna optimization
# TPE is Optuna's default sampler; seeding it makes the 50-trial search
# reproducible under Restart Kernel -> Run All.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Best parameters and accuracy
print("Best Parameters:", study.best_params)
print("Best Accuracy:", study.best_value)


In [None]:
# Derived features, written into the training frame only.

# Example 1: Interaction between ApplicantIncome and CreditHistory_TotalIncome
X_train_simplified["Income_Credit_Interaction"] = X_train_simplified["ApplicantIncome"].mul(
    X_train_simplified["CreditHistory_TotalIncome"]
)

# Example 2: Square Root Transformation for LoanAmount
X_train_simplified["LoanAmount_Sqrt"] = np.sqrt(X_train_simplified["LoanAmount"])

# Example 3: Ratio of Dependents to Total Income
# (the 1e-6 term keeps the denominator away from zero)
X_train_simplified["Dependents_Income_Ratio"] = X_train_simplified["Dependents"].div(
    X_train_simplified["Total_Income"].add(1e-6)
)


##### Key Takeaways
High Precision and Recall for Class 1 (Loan Approved):

Precision: 94%
Recall: 100%
This indicates that the model is excellent at correctly identifying approved loans.
Lower Performance for Class 0 (Loan Not Approved):

Precision: 100%
Recall: 67%
While the model is very precise when predicting "not approved," it misses about 33% of the true "not approved" cases (recall of 67%).
Imbalanced Recall:

Recall imbalance suggests the model might still be skewed towards the majority class (approved loans). This is common with imbalanced datasets.


##### Misclassification Analysis

Steps for Misclassification Analysis
1. Extract Misclassified Cases
    * False Positives (Class 0 predicted as Class 1): Instances where Loan_Status=0 but the model predicted Loan_Status=1, taking Class 1 (loan approved) as the positive class.
    * False Negatives (Class 1 predicted as Class 0): Instances where Loan_Status=1 but the model predicted Loan_Status=0.

2. Analyze Feature Distributions
    * Compare the distributions of key features for correctly classified vs. misclassified instances:
    * ApplicantIncome, CoapplicantIncome
    * Credit_History
    * LoanAmount, Total_Income
    * Property_Area, Education, etc.

3. Visualize Differences
    * Use boxplots or KDE plots to compare distributions for features contributing to misclassification.
    
4. Identify Patterns: Check for
    * Feature ranges where the model struggles (e.g., specific income or loan amounts).
    * Feature interactions contributing to misclassification (e.g., high income but poor credit history).

5. Propose Adjustments
    * Based on the insights above, consider feature engineering, parameter tuning, or sampling strategies to address the misclassified patterns.


In [None]:
from sklearn.metrics import confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Get predictions and probabilities for the validation split
# (y_val_probs is the predicted probability of class 1)
y_val_probs = final_model.predict_proba(X_val)[:, 1]
y_val_pred = final_model.predict(X_val)

# Give features and labels the same 0..n-1 index so the boolean masks built
# from y_val select the matching rows of X_val.
X_val = X_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

# Class 1 (loan approved) is the positive class, as in the metrics reported above.

# False Negatives (Actual = 1, Predicted = 0): approved loans the model rejected
fn_indices = (y_val == 1) & (y_val_pred == 0)
false_negatives = X_val[fn_indices]

# False Positives (Actual = 0, Predicted = 1): rejected loans the model approved
fp_indices = (y_val == 0) & (y_val_pred == 1)
false_positives = X_val[fp_indices]

# Correctly Classified
correctly_classified = X_val[(y_val == y_val_pred)]

# Summary
print(f"False Negatives: {false_negatives.shape[0]} cases")
print(f"False Positives: {false_positives.shape[0]} cases")
print(f"Correctly Classified: {correctly_classified.shape[0]} cases")

# Feature distribution comparison: ApplicantIncome example
plt.figure(figsize=(12, 6))
sns.boxplot(data=pd.concat([
    pd.DataFrame({'Feature': false_negatives['ApplicantIncome'], 'Type': 'False Negatives'}),
    pd.DataFrame({'Feature': false_positives['ApplicantIncome'], 'Type': 'False Positives'}),
    pd.DataFrame({'Feature': correctly_classified['ApplicantIncome'], 'Type': 'Correctly Classified'})
]), x='Type', y='Feature')
plt.title("ApplicantIncome Distribution Across Classification Types")
# Label the axes with what is plotted instead of the placeholder column name.
plt.xlabel("Classification Type")
plt.ylabel("ApplicantIncome")
plt.show()


In [None]:
X_low_conf.head()