In [1]:
# This notebook will be for Classification testing (Supervised Learning)
# Decision Tree, SGD Classifier/SVM - Logistic Regression
# Importing Libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [21]:
# Load the cleaned dataset from the location configured in the environment
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment
load_dotenv()

dataset_path = os.getenv('CLEANED_DATA_PATH')
if not dataset_path:
    # Fail fast with an actionable message: os.getenv returns None for an
    # unset variable, and handing None to pd.read_csv fails with an error
    # that says nothing about the missing configuration.
    raise ValueError(
        "CLEANED_DATA_PATH is not set; define it in the environment or in a .env file"
    )

cleaned_df = pd.read_csv(dataset_path)

In [3]:
# Probably need to scale the data first
# df_cleaned = pd.read_parquet('data/application_cleaned.parquet', engine = 'pyarrow')

In [None]:
# # continuous column names
# cont_col_list = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_LAST_PHONE_CHANGE', 'CNT_FAM_MEMBERS', 'REGION_POPULATION_RELATIVE']

In [17]:
# Names of every column stored with the generic ``object`` dtype; these are
# the columns handled as categorical features in the cells below.
object_frame = cleaned_df.select_dtypes(include='object')
categorical_cols = list(object_frame.columns)
categorical_cols

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE']

In [19]:
# Names of the int64/float64 columns, with the label column taken out so the
# list only contains features.
numeric_frame = cleaned_df.select_dtypes(include=['int64', 'float64'])
numerical_cols = list(numeric_frame.columns)
numerical_cols.remove('TARGET')  # list.remove raises ValueError if TARGET is absent
numerical_cols

['AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'FLAG_DOCUMENT_3',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR']

In [22]:
# Tally the missing entries column by column and print the totals
null_counts = cleaned_df.isna().sum(axis=0)
print(null_counts)

TARGET                             0
NAME_CONTRACT_TYPE                 0
CODE_GENDER                        0
FLAG_OWN_CAR                       0
FLAG_OWN_REALTY                    0
AMT_INCOME_TOTAL                   0
AMT_CREDIT                         0
AMT_ANNUITY                        0
NAME_TYPE_SUITE                    0
NAME_INCOME_TYPE                   0
NAME_EDUCATION_TYPE                0
NAME_FAMILY_STATUS                 0
NAME_HOUSING_TYPE                  0
REGION_POPULATION_RELATIVE         0
DAYS_BIRTH                         0
DAYS_EMPLOYED                      0
DAYS_REGISTRATION                  0
DAYS_ID_PUBLISH                    0
OCCUPATION_TYPE                40668
CNT_FAM_MEMBERS                    0
REGION_RATING_CLIENT_W_CITY        0
WEEKDAY_APPR_PROCESS_START         0
HOUR_APPR_PROCESS_START            0
REG_REGION_NOT_LIVE_REGION         0
REG_REGION_NOT_WORK_REGION         0
LIVE_REGION_NOT_WORK_REGION        0
REG_CITY_NOT_LIVE_CITY             0
L

In [23]:
# Fill gaps in the numeric columns with each column's median.
# NOTE(review): the imputer is fitted on the full frame, i.e. before the
# train/test split performed further down, so held-out rows contribute to the
# medians. Consider fitting on the training split only — confirm intent.
num_imputer = SimpleImputer(strategy='median')
numeric_values = cleaned_df[numerical_cols]
cleaned_df[numerical_cols] = num_imputer.fit(numeric_values).transform(numeric_values)

In [24]:
# Fill gaps in the categorical columns with each column's most frequent value.
# NOTE(review): like the numeric imputer, this is fitted on the full frame
# before the train/test split — consider fitting on the training split only.
cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_values = cleaned_df[categorical_cols]
cleaned_df[categorical_cols] = cat_imputer.fit(categorical_values).transform(categorical_values)

In [26]:
# Confirm the imputation left no gaps: sum the per-column missing counts
per_column_missing = cleaned_df.isna().sum()
missing_after = per_column_missing.sum()
print(f'Missing values after imputation: {missing_after}')

Missing values after imputation: 0


In [27]:
# 4. Encode categorical variables
# A single backslash is required here: '\\n' printed a literal "\n" instead
# of a blank line.
print('\nEncoding categorical variables...')
# Replace every categorical column with integer codes, in place, and keep the
# fitted encoder per column in `label_encoders` so the mapping stays available.
# NOTE(review): the integer codes carry no meaningful order; models that treat
# features as magnitudes (e.g. logistic regression) may be misled — consider
# one-hot encoding for those models.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    cleaned_df[col] = le.fit_transform(cleaned_df[col])
    label_encoders[col] = le

print(f'Encoded {len(categorical_cols)} categorical columns')

\nEncoding categorical variables...
Encoded 12 categorical columns


In [28]:
# 5. Feature scaling
# A single backslash is required here: '\\n' printed a literal "\n" instead
# of a blank line.
print('\nApplying feature scaling...')
# Standardise the numeric feature columns in place (the label column is not
# part of numerical_cols, so it is left untouched).
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split performed further down, so held-out rows influence the fitted
# statistics — consider fitting on the training split only.
scaler = StandardScaler()
cleaned_df[numerical_cols] = scaler.fit_transform(cleaned_df[numerical_cols])
print(f'Scaled {len(numerical_cols)} numerical columns')

\nApplying feature scaling...
Scaled 26 numerical columns


In [29]:
# Inspect the frame after imputation, encoding and scaling
cleaned_df

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,...,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,0,1,0,1,0.142579,-0.479294,-0.168296,6,7,...,0.250148,5.244357,0.204335,0.634651,-0.070797,-0.058834,-0.156237,-0.270413,-0.308566,-0.443266
1,0,0,0,0,0,0.426592,1.726847,0.591442,1,4,...,-0.170051,-0.276090,-0.165729,0.634651,-0.070797,-0.058834,-0.156237,-0.270413,-0.308566,-1.009608
2,0,1,1,1,1,-0.425448,-1.154882,-1.408310,6,7,...,-0.590251,-0.276090,-0.181451,-1.575669,-0.070797,-0.058834,-0.156237,-0.270413,-0.308566,-1.009608
3,0,0,0,0,1,-0.141435,-0.712904,0.176135,6,7,...,0.250148,-0.276090,-0.420904,0.634651,-0.070797,-0.058834,-0.156237,-0.270413,-0.308566,-0.443266
4,0,0,1,0,1,-0.198237,-0.214622,-0.364137,6,7,...,-0.590251,-0.276090,0.170473,-1.575669,-0.070797,-0.058834,-0.156237,-0.270413,-0.308566,-1.009608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305180,0,0,1,0,0,-0.046764,-0.857133,0.029099,6,7,...,-0.590251,-0.276090,-0.836923,-1.575669,-0.070797,-0.058834,-0.156237,-0.270413,-0.308566,-0.443266
305181,0,0,0,0,1,-0.406514,-0.820194,-1.045538,6,3,...,-0.590251,-0.276090,-1.167078,0.634651,-0.070797,-0.058834,-0.156237,-0.270413,-0.308566,-0.443266
305182,0,0,0,0,1,-0.065698,0.194973,0.196341,6,7,...,1.930946,-0.276090,1.141588,0.634651,12.767106,-0.058834,-0.156237,0.897782,-0.308566,-0.443266
305183,1,0,0,0,1,0.010039,-0.570063,-0.478843,6,1,...,-0.590251,-0.276090,-0.777664,0.634651,-0.070797,-0.058834,-0.156237,-0.270413,-0.308566,-1.009608


In [30]:
# Split the frame into the label vector (y) and the feature matrix (X)
y = cleaned_df['TARGET']
X = cleaned_df.drop(columns='TARGET')
# Remove the ID column when it exists; errors='ignore' leaves the columns
# unchanged when it does not.
X = X.drop(columns='SK_ID_CURR', errors='ignore')

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# OneHotEncoder = OneHotEncoder()
# OneHotEncoder.fit(X_train)
# X_train = OneHotEncoder.transform(X_train)

# ^This will include nulls. We will impute them. we can one hot encode all the categorical variables then use Bayesian ridge estimator. XGBoost is also viable, also requires one hot encoding, 
# however can overfit.

In [None]:
# Preview the training feature matrix produced by the split above
X_train

In [33]:
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, confusion_matrix
from sklearn.metrics import classification_report, roc_curve, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')
# Function to evaluate models
def _finalize_figure(fig, output_dir, filename, show_plots):
    """Save and/or display ``fig`` as requested, then close it to free memory."""
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(os.path.join(output_dir, filename))
    if show_plots:
        plt.show()
    plt.close(fig)


def _plot_evaluation_figures(y_test, y_pred, y_pred_proba, roc_auc, model_name,
                             output_dir, show_plots):
    """Draw the ROC curve, precision-recall curve and confusion matrix for one model."""
    # ROC curve, with the chance diagonal for reference
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.3f})')
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve - {model_name}')
    ax.legend(loc="lower right")
    _finalize_figure(fig, output_dir, f'{model_name}_roc_curve.png', show_plots)

    # Precision-recall curve; its area is reported in the legend
    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = auc(recall_curve, precision_curve)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(recall_curve, precision_curve, label=f'PR curve (area = {pr_auc:.3f})')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(f'Precision-Recall Curve - {model_name}')
    ax.legend(loc="lower left")
    _finalize_figure(fig, output_dir, f'{model_name}_pr_curve.png', show_plots)

    # Confusion matrix of the hard predictions
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Confusion Matrix - {model_name}')
    _finalize_figure(fig, output_dir, f'{model_name}_confusion_matrix.png', show_plots)


def evaluate_model(model, X_train, X_test, y_train, y_test, model_name,
                   output_dir=None, show_plots=False):
    """Fit ``model`` on the training data and score it on the test data.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : training features and labels passed to ``model.fit``.
    X_test, y_test : held-out features and labels used for every metric.
    model_name : str
        Label used in plot titles, file names and the returned dict.
    output_dir : str or None, optional
        Directory where the three diagnostic figures are written as PNG files
        (``<model_name>_roc_curve.png``, ``<model_name>_pr_curve.png``,
        ``<model_name>_confusion_matrix.png``). Created if missing. ``None``
        (default) writes nothing.
    show_plots : bool, optional
        Display each figure before closing it. Defaults to ``False``.

    With both defaults no figure is drawn at all; previously the figures were
    built and closed without being saved or shown.

    Returns
    -------
    dict with keys ``model_name``, ``roc_auc``, ``precision``, ``recall``,
    ``f1_score`` and ``classification_report`` (the text report).
    """
    # Train the model
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the positive-class score
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    # Threshold-free metric from the scores, thresholded metrics from y_pred
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Only pay for plotting when the figures will actually be kept or seen
    if output_dir is not None or show_plots:
        _plot_evaluation_figures(
            y_test, y_pred, y_pred_proba, roc_auc, model_name, output_dir, show_plots
        )

    # Return metrics
    return {
        'model_name': model_name,
        'roc_auc': roc_auc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'classification_report': report
    }

# Define models to evaluate
print("\nDefining models to evaluate...")
# XGBoost's documented heuristic for scale_pos_weight is
# sum(negative instances) / sum(positive instances). Derive it from the
# training labels instead of hard-coding a number that silently goes stale
# whenever the data or the split changes.
n_positive = int((y_train == 1).sum())
n_negative = int((y_train == 0).sum())
imbalance_ratio = n_negative / n_positive
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(scale_pos_weight=imbalance_ratio, random_state=42)  # Using the imbalance ratio
}

# Evaluate each model
print("\nEvaluating models...")
# One metrics dict per model, in the order the models were defined
results = []
for name in models:
    model = models[name]
    print(f"Evaluating {name}...")
    # Spaces are replaced so the label is safe to use in file names
    safe_name = name.replace(' ', '_')
    result = evaluate_model(
        model, X_train, X_test, y_train, y_test, safe_name
    )
    results.append(result)
    print(f"  ROC AUC: {result['roc_auc']:.4f}")
    print(f"  Precision: {result['precision']:.4f}")
    print(f"  Recall: {result['recall']:.4f}")
    print(f"  F1 Score: {result['f1_score']:.4f}")
    print(f"  Classification Report:\n{result['classification_report']}\n")

# Create results summary (the text report is dropped so only scalar metrics remain)
results_df = pd.DataFrame(results)
results_df = results_df.drop('classification_report', axis=1)
# results_df.to_csv('ml_results/model_comparison.csv', index=False)

# Plot model comparison: one panel per metric on a 2x2 grid
metrics = ['roc_auc', 'precision', 'recall', 'f1_score']
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, metric in zip(axes.ravel(), metrics):
    sns.barplot(x='model_name', y=metric, data=results_df, ax=ax)
    ax.set_title(f'Model Comparison - {metric.upper()}')
    ax.tick_params(axis='x', labelrotation=45)
# Lay out once, after every panel exists, rather than on each iteration
fig.tight_layout()
# plt.savefig('ml_results/model_comparison.png')
# Show the figure before closing it; closing it unseen discarded the chart
plt.show()
plt.close(fig)

# Try SMOTE for handling class imbalance
print("\nEvaluating models with SMOTE for handling class imbalance...")
# Pipeline: oversample with SMOTE, then fit an XGBoost classifier with no
# class weighting of its own.
# NOTE(review): SMOTE synthesises rows by interpolating between neighbours;
# on the integer-coded categorical columns that presumably yields fractional
# category codes — consider a categorical-aware sampler; confirm.
smote_steps = [
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(random_state=42)),
]
smote_pipeline = Pipeline(smote_steps)

# Evaluate SMOTE pipeline on the same split as the other models
smote_result = evaluate_model(
    smote_pipeline, X_train, X_test, y_train, y_test, 'XGBoost_with_SMOTE'
)
print(f"  ROC AUC: {smote_result['roc_auc']:.4f}")
print(f"  Precision: {smote_result['precision']:.4f}")
print(f"  Recall: {smote_result['recall']:.4f}")
print(f"  F1 Score: {smote_result['f1_score']:.4f}")
print(f"  Classification Report:\n{smote_result['classification_report']}\n")

# Try undersampling for handling class imbalance
print("\nEvaluating models with undersampling for handling class imbalance...")
# Pipeline: randomly undersample, then fit an XGBoost classifier with no
# class weighting of its own.
under_steps = [
    ('undersampler', RandomUnderSampler(random_state=42)),
    ('classifier', XGBClassifier(random_state=42)),
]
under_pipeline = Pipeline(under_steps)

# Evaluate undersampling pipeline on the same split as the other models
under_result = evaluate_model(
    under_pipeline, X_train, X_test, y_train, y_test, 'XGBoost_with_Undersampling'
)
print(f"  ROC AUC: {under_result['roc_auc']:.4f}")
print(f"  Precision: {under_result['precision']:.4f}")
print(f"  Recall: {under_result['recall']:.4f}")
print(f"  F1 Score: {under_result['f1_score']:.4f}")
print(f"  Classification Report:\n{under_result['classification_report']}\n")

# Add resampling results to comparison
results.extend([smote_result, under_result])
results_df = pd.DataFrame(results)
results_df = results_df.drop('classification_report', axis=1)
# results_df.to_csv('ml_results/model_comparison_with_resampling.csv', index=False)

# Plot final model comparison: one panel per metric on a 2x2 grid
metrics = ['roc_auc', 'precision', 'recall', 'f1_score']
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, metric in zip(axes.ravel(), metrics):
    sns.barplot(x='model_name', y=metric, data=results_df, ax=ax)
    ax.set_title(f'Model Comparison - {metric.upper()}')
    ax.tick_params(axis='x', labelrotation=45)
# Lay out once, after every panel exists, rather than on each iteration
fig.tight_layout()
# plt.savefig('ml_results/final_model_comparison.png')
# Show the figure before closing it; closing it unseen discarded the chart
plt.show()
plt.close(fig)

# Feature importance for the best model (assuming XGBoost performs well)
print("\nCalculating feature importance for XGBoost model...")
# Weight the positive class by negatives/positives measured on the training
# labels (XGBoost's documented heuristic) instead of a hard-coded constant.
n_positive = int((y_train == 1).sum())
n_negative = int((y_train == 0).sum())
xgb_model = XGBClassifier(scale_pos_weight=n_negative / n_positive, random_state=42)
xgb_model.fit(X_train, y_train)

# Pair every feature name with its importance score, highest first
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
# feature_importance.to_csv('ml_results/xgboost_feature_importance.csv', index=False)

# Plot the 20 highest-ranked features
top_features = feature_importance.head(20)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=top_features, ax=ax)
ax.set_title('Top 20 Features by Importance (XGBoost)')
fig.tight_layout()
# plt.savefig('ml_results/xgboost_feature_importance.png')
# Show the figure before closing it; closing it unseen discarded the chart
plt.show()
plt.close(fig)

# No results are written to disk by the code above as it stands, so the
# completion message must not claim that they were saved.
print("\nMachine learning evaluation completed.")

# Summary of recommended approaches
print("\nRecommended Machine Learning Approaches for Loan Default Prediction:")
print("1. Gradient Boosting and XGBoost models typically perform well for this type of problem")
print("2. Class imbalance handling techniques like SMOTE or class weights are essential")
print("3. Feature selection based on importance can improve model performance")
print("4. Evaluation should focus on metrics like ROC AUC, Precision, Recall, and F1 Score")
print("5. Threshold tuning can help balance precision and recall based on business requirements")



Defining models to evaluate...

Evaluating models...
Evaluating Logistic Regression...
  ROC AUC: 0.6716
  Precision: 0.1281
  Recall: 0.6288
  F1 Score: 0.2129
  Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.62      0.75     56093
           1       0.13      0.63      0.21      4944

    accuracy                           0.62     61037
   macro avg       0.54      0.63      0.48     61037
weighted avg       0.88      0.62      0.71     61037


Evaluating Decision Tree...
  ROC AUC: 0.5205
  Precision: 0.1162
  Recall: 0.1246
  F1 Score: 0.1202
  Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92     56093
           1       0.12      0.12      0.12      4944

    accuracy                           0.85     61037
   macro avg       0.52      0.52      0.52     61037
weighted avg       0.86      0.85      0.85     61037


Evaluating Random Forest...
  

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression built with no arguments, unlike the
# class_weight='balanced', max_iter=1000 instance evaluated in the cell above.
# NOTE(review): this rebinds `model`, the loop variable used by the
# evaluation cell — confirm the reuse of the name is intended.
model = LogisticRegression()

In [None]:
# Fit the logistic regression defined in the previous cell on the training split
model.fit(X_train, y_train)