## Problem Definition 

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,f1_score,precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier  
#from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler,SMOTE
from collections import Counter
import warnings
warnings.filterwarnings("ignore")


from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler


## Data Insights 

In [2]:
df = pd.read_csv('Loan_Default.csv')
df.head()

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [None]:
df.info()

In [None]:
df.describe()

Find the number of null values for each column 

In [14]:
df.isnull().sum()


loan_limit                    3344
approv_in_adv                  908
loan_type                        0
loan_purpose                   134
credit_worthiness                0
open_credit                      0
business_or_commercial           0
loan_amount                      0
rate_of_interest             36439
interest_rate_spread         36639
upfront_charges              39642
term                            41
neg_ammortization              121
interest_only                    0
lump_sum_payment                 0
property_value               15098
construction_type                0
occupancy_type                   0
secured_by                       0
total_units                      0
income                        9150
credit_type                      0
credit_score                     0
co-applicant_credit_type         0
age                              0
submission_of_application      200
ltv                          15098
region                           0
security_type       

In [None]:
(df.isnull().sum()/len(df))*100

+ Given the above missing data information, I want to have a dual approach of column-specific imputation coupled with pipeline integration. For columns with substantial missing values, such as rate_of_interest, Interest_rate_spread, Upfront_charges, property_value, LTV, and dtir1, an in-depth analysis to determine the appropriate imputation method is better.

+ Then, I plan to systematize the chosen imputation methods within a ColumnTransformer, ensuring a consistent and automated application of these methods to both the training and testing sets.






In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set up the matplotlib figure with a specified figure size
plt.figure(figsize=(18, 5))

# Histogram for 'rate_of_interest'
plt.subplot(1, 3, 1)  # (rows, columns, panel number)
sns.histplot(df['rate_of_interest'].dropna(), kde=True)
plt.title('Distribution of Rate of Interest')

# Histogram for 'interest_rate_spread'
plt.subplot(1, 3, 2)
sns.histplot(df['interest_rate_spread'].dropna(), kde=True)
plt.title('Distribution of Interest Rate Spread')

# Histogram for 'upfront_charges'
plt.subplot(1, 3, 3)
sns.histplot(df['upfront_charges'].dropna(), kde=True)
plt.title('Distribution of Upfront Charges')

# Adjust the layout and display the plot
plt.tight_layout()
plt.show()



Explanation of above histograms: 
+ The "Rate of Interest" and "Interest Rate Spread" features, exhibiting normal distributions, are well-suited for mean imputation when addressing missing values. The mean is a reliable indicator of central tendency for symmetric distributions, offering a balanced central point that reflects the typical values of these features.

+ Conversely, "Upfront Charges" display a right-skewed distribution, suggesting that a significant number of very high values are pulling the mean to the right. Therefore, the median, which is not swayed by such outliers, is a better choice for imputation. Using the median ensures that the imputed values align more closely with the most common range of the data, preserving its original distributional characteristics.

In [None]:
print(df.columns)


In [None]:
Find the duplicates 

In [None]:
df.duplicated().value_counts()

In [None]:
df.drop_duplicates(inplace=True)

## age is object 

Credit Behavior by Age Group: Different age groups might exhibit distinct credit behaviors. For example, younger borrowers might have a different default risk compared to older ones due to factors like job stability, income levels, and financial obligations.

In [3]:
# Convert age ranges to ordinal values
# Convert all entries in 'age' to strings and handle NaN values
df['age'] = df['age'].astype(str)

df['age'] = df['age'].replace('nan', 'unknown')

# Now we can sort the unique age range strings and create the ordinal mapping
age_ranges = sorted(df['age'].unique())
ordinal_age_mapping = {age_range: index for index, age_range in enumerate(age_ranges)}
df['age_ordinal'] = df['age'].map(ordinal_age_mapping)



In [None]:
df.info()

In [4]:
df.columns = [col.lower() for col in df.columns]


In [5]:
# Identify categorical and numerical columns
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
numerical_columns = df.select_dtypes(exclude=['object']).columns.tolist()

# Confirm the lists of categorical and numerical columns
print("Categorical Columns:")
print(categorical_columns)
print("\nNumerical Columns:")
print(numerical_columns)


Categorical Columns:
['loan_limit', 'gender', 'approv_in_adv', 'loan_type', 'loan_purpose', 'credit_worthiness', 'open_credit', 'business_or_commercial', 'neg_ammortization', 'interest_only', 'lump_sum_payment', 'construction_type', 'occupancy_type', 'secured_by', 'total_units', 'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'region', 'security_type']

Numerical Columns:
['id', 'year', 'loan_amount', 'rate_of_interest', 'interest_rate_spread', 'upfront_charges', 'term', 'property_value', 'income', 'credit_score', 'ltv', 'status', 'dtir1', 'age_ordinal']


In [6]:
categorical_columns.remove('age')

In [None]:
# Confirm the lists of categorical and numerical columns
print("Categorical Columns:")
print(categorical_columns)
print("\nNumerical Columns:")
print(numerical_columns)


In [7]:
columns_to_drop = ['id', 'year', 'gender']
# Drop the columns
df = df.drop(columns=columns_to_drop)

In [None]:
df.info()

In [12]:
# Preprocess the 'age' column to extract representative values
df['age'] = df['age'].apply(lambda x: int(x.split('-')[0]) if '-' in x else (int(x) if x.isdigit() else None))

# Replace NaN values with the median of the valid age values
median_age = df['age'].median()
df['age'].fillna(median_age, inplace=True)


In [13]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression
from sklearn.metrics import classification_report, accuracy_score

# Preprocess the 'age' column to extract representative values
# Preprocess the 'age' column to handle non-integer values
#df['age'] = df['age'].apply(lambda x: int(x.split('-')[0]) if '-' in x else (int(x) if x.isdigit() else df['age'].median()))

# Convert age ranges to ordinal values
age_ranges = sorted(df['age'].unique())
ordinal_age_mapping = {age: index for index, age in enumerate(age_ranges)}
df['age_ordinal'] = df['age'].map(ordinal_age_mapping)


# Separate the features and the target variable
X = df.drop(columns=['status', 'age'])  # Replace 'status' with the actual target column name and drop original 'age' column
y = df['status']  # Replace 'status' with the actual target column name

# Identify categorical and numerical columns, excluding 'age' which we've already processed
# Identify categorical and numerical columns, excluding 'age' which we've already processed
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Remove 'age' from numerical_cols since it's already encoded as 'age_ordinal'
numerical_cols.remove('age_ordinal')


# Define the imputation and scaling for numerical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Use 'mean' for normally distributed columns
    ('scaler', StandardScaler())
])

# Define the imputation and encoding for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Define the encoding for the ordinal 'age' column

ordinal_transformer = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories=[age_ranges]))  # Use 'age_ranges' as the categories
])


# Create the preprocessing steps for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
    ('ord', ordinal_transformer, ['age_ordinal'])
])

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline with the preprocessor and Logistic Regression
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(random_state=42))  # Use Logistic Regression
])

# Fit the pipeline to the training data
model_pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
# Print classification report
print(classification_report(y_test, y_pred))

# Print accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


ValueError: Found unknown categories [0, 1, 2, 3, 4] in column 0 during fit

In [None]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Create a pipeline with the preprocessor and Logistic Regression
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(random_state=42))  # Use Logistic Regression
])

# Fit the pipeline to the training data
model_pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
# Print classification report
print(classification_report(y_test, y_pred))

# Print accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


In [None]:
df['age_midpoint'] 

## Data Insights- Visualizations

In [None]:
df.corr()
fig, ax = plt.subplots() 
fig.set_size_inches(15,8)
sns.heatmap(df.corr(), vmax =.8, square = True, annot = True )
plt.title('Confusion Matrix',fontsize=15);

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='loan_purpose', palette='deep')
plt.title('Loan Purpose Counts')
plt.xlabel('Loan Purpose')
plt.ylabel('Count')
plt.xticks(rotation=45)  

# Show the plot
plt.show()


In [None]:

plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='person_home_ownership', palette='deep')
plt.title('Applicant Home Ownership')
plt.xlabel('Home Ownership')
plt.ylabel('Count')
plt.xticks(rotation=45)  

# Show the plot
plt.show()


In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='cb_person_cred_hist_length', palette='deep')
plt.title('Applicant Credit History')
plt.xlabel('Credit History')
plt.ylabel('Count')
plt.xticks(rotation=45)  

# Show the plot
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='loan_intent', palette='deep')
plt.title('Applicant Loan Intent')
plt.xlabel('Loan Intent')
plt.ylabel('Count')
plt.xticks(rotation=45)  

# Show the plot
plt.show()

## Target values 

In [None]:
df["Status"].value_counts()

In [None]:
df["loan_status"].value_counts(normalize=True)

So the data is imbalance and we need to consider this for modelling steps. 

## Preprocessing 

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Assuming df is your dataframe and 'Status' is the target variable
X = df.drop(['ID', 'Status'], axis=1)
y = df['Status']

# Identify categorical columns that need to be encoded
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Define transformers
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define a pipeline that first preprocesses the data and then fits the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=0))
])

# Split the dataset into training and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Get feature importances
feature_importances = pipeline.named_steps['classifier'].feature_importances_
feature_names = pipeline.named_steps['preprocessor'].transformers_[1][1].named_steps['encoder'].get_feature_names(input_features=categorical_cols)

# Combine feature importances with their corresponding feature names
importances = dict(zip(feature_names, feature_importances))

# Sort the feature importances in descending order and print them
sorted_importances = sorted(importances.items(), key=lambda item: item[1], reverse=True)
for feature, importance in sorted_importances:
    print(f"{feature}: {importance}")


## Data Cleaning  
Due to ethical considerations and potential legal restrictions (like those from the Equal Credit Opportunity Act in the U.S.), using gender as a predictor in loan default models is generally discouraged and could be considered discriminatory.

In [None]:
df['age'] = df['age'].str.extract('(\d+)').astype(float)

In [None]:
print(df.columns)

In [None]:
columns_to_drop = ['id', 'year', 'gender']
# Drop the columns
df_dropped = df.drop(columns=columns_to_drop)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# This code assumes that 'age' is a numerical column in your DataFrame `df`.
# If 'age' is represented differently, you'll need to preprocess it into numerical form.

plt.figure(figsize=(10, 6))
sns.boxplot(x=df['age'])  # Make sure 'age' is the correct column name
plt.title('Boxplot of Person Age')
plt.xlabel('age')
plt.show()


In [None]:
df['loan_to_income_ratio'] = df['loan_amnt'] / df['person_income']

 Some data entry issues

In [None]:
 #Find numerical columns in the DataFrame
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Find categorical columns in the DataFrame
categorical_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

# Output the findings
print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

In [None]:
print("Columns in the DataFrame:", df.columns)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline



# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('loan_status', axis=1),  # Drop the target variable to create features
    df['loan_status'],               # Target variable
    random_state=0,                  # Ensures reproducibility
    test_size=0.2,                   # Proportion of dataset to include in test split
    stratify=df['loan_status'],      # Ensures train and test sets have similar class distributions
    shuffle=True                     # Shuffles the data before splitting
)

# Define categorical and numerical features
categorical_features = X_train.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Create the preprocessing pipelines for both numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline that combines the preprocessor with a RandomForest classifier
model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train the RandomForest model
model_pipeline.fit(X_train, y_train)




In [None]:
from sklearn.model_selection import GridSearchCV
# Define a grid of hyperparameters to search over
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
}

# Create a GridSearchCV object to perform hyperparameter tuning
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Output the best parameters found by GridSearchCV
print("Best hyperparameters:", grid_search.best_params_)

In [None]:
from sklearn.metrics import classification_report, f1_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Use the best hyperparameters to create a new pipeline
best_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42))
])

# Fit the pipeline with the best hyperparameters
best_pipeline.fit(X_train, y_train)

# Evaluate the model with the test data
y_pred = best_pipeline.predict(X_test)
y_proba = best_pipeline.predict_proba(X_test)[:, 1]

# Calculate evaluation metrics
print(classification_report(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))

# Generate ROC curve values
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

# Feature Importances - extraction from the pipeline
feature_importances = best_pipeline.named_steps['classifier'].feature_importances_
# Get feature names from the preprocessor
# Note: This will only work with versions of scikit-learn where OneHotEncoder has a get_feature_names method
# Retrieve the right transformer
categorical_transformer = best_pipeline.named_steps['preprocessor'].named_transformers_['cat']

# Get the feature names for categorical features
cat_feature_names = categorical_transformer.named_steps['onehot'].get_feature_names_out()

# Combine with numerical features to get all feature names
feature_names = np.concatenate((numerical_features, cat_feature_names))

# Adjust the indices to match the length of feature names
indices = np.argsort(feature_importances)[::-1]

# Plot the feature importances
# Sort importances and corresponding feature names
sorted_indices = np.argsort(feature_importances)
sorted_features = [feature_names[i] for i in sorted_indices]
sorted_importances = feature_importances[sorted_indices]

# Now create the bar plot with Seaborn
plt.figure(figsize=(10, 8))
sns.barplot(x=sorted_importances, y=sorted_features)

plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')

plt.show()



Debt Burden: The loan-to-income ratio reflects the debt burden on the borrower relative to their income. A higher ratio indicates that a significant portion of the borrower's income is needed to service debt, which could increase the risk of default.

Repayment Capacity: It directly measures the borrower's capacity to repay. If a large portion of their income is already allocated to loan repayments, any financial stress could lead to default.

In [None]:
print(sorted_features)

## logistic regression 

In [None]:
## Create the logistic regression pipeline
lr_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Define a set of hyperparameters to test
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear'],  # 'liblinear' works well with small datasets and 'l1' penalty
}

# Create a GridSearchCV object
grid_search_lr = GridSearchCV(lr_pipeline, param_grid, cv=5, scoring='roc_auc', verbose=1)

# Fit the GridSearchCV object to the training data
grid_search_lr.fit(X_train, y_train)

# Get the best parameters and the best score
print("Best hyperparameters:", grid_search.best_params_)
print("Best score (ROC AUC):", grid_search.best_score_)

In [None]:

# Evaluate the best model on the test set
best_model_lr = grid_search_lr.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Print out evaluation metrics
print(classification_report(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))


In [None]:
# Assuming that you have a trained model 'best_model_lr' and a list of feature names 'feature_names'
coefs = best_model_lr.named_steps['classifier'].coef_[0]
feature_importances = pd.Series(coefs, index=feature_names)

# Sort the features by their importance (absolute value)
sorted_features = feature_importances.abs().sort_values(ascending=False)

# Plot the feature importances using seaborn
plt.figure(figsize=(10, 8))
sns.barplot(x=sorted_features.values, y=sorted_features.index)
plt.title('Feature Importances from Logistic Regression')
plt.xlabel('Absolute Coefficient Value')
plt.ylabel('Feature')
plt.show()

## Decision Trees:
