In [None]:
# Import libraries
import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score, classification_report, confusion_matrix,
    precision_recall_curve, roc_curve, accuracy_score, 
    precision_score, recall_score, f1_score
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# ======================
# DATA PREPARATION
# ======================

# Load the raw loan records
df = pd.read_csv('loan_repayment_data.csv')

# Parse the date columns once; the format string expects month/day/year
for date_col in ('Funded_date', 'due_date', 'last_paid_date'):
    df[date_col] = pd.to_datetime(df[date_col], format='%m/%d/%Y')

# Feature engineering
# Whole days between the last payment and the due date (positive when the
# last payment came after the due date). The columns are already datetimes
# at this point, so no second pd.to_datetime pass is needed.
days_late = (df['last_paid_date'] - df['due_date']).dt.days
df['days_past_due'] = days_late
df['repayment_ratio'] = df['repaid_amount'] / df['to_repay']
df['interest_rate'] = df['interest_amount'] / df['loan_amount']
# 0 = Fully Repaid, 1 = Outstanding.
# Written as a pandas comparison rather than np.where: numpy is never
# imported in this notebook, so the np.where form raised NameError.
df['loan_status_binary'] = (df['loan_balance'] != 0).astype(int)

# Same quantity as days_past_due; both column names are kept so any later
# cell that refers to either one keeps working.
df['payment_delay'] = days_late

# Define target: default if repaid_amount < to_repay
df['default'] = (df['repaid_amount'] < df['to_repay']).astype(int)

In [None]:
# Column groups by type; each group gets its own preprocessing branch below
cat_cols = ['new_repeat']
num_cols = ['loan_duration', 'loan_amount', 'interest_amount']

# Model inputs (categorical column first, then the numeric ones) and label
features = cat_cols + num_cols
target = 'default'

X = df[features]
y = df[target]

# ======================
# PREPROCESSING PIPELINE
# ======================
# Numeric branch: fill gaps with the column median, then standardise
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' is set so unseen categories do not error
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its branch
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ]
)

# ======================
# MODEL TRAINING
# ======================
# Train-test split: 70/30, stratified on the target so both partitions keep
# the same class ratio; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# Candidate classifiers, keyed by the name used in the report and in the
# saved file name
models = {
    'LogisticRegression': LogisticRegression(class_weight='balanced', max_iter=1000),
    'DecisionTree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
    'RandomForest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42)
}

# One metrics dict per model, in training order
results = []

for name, model in models.items():
    # Progress message (emoji repaired from mis-decoded UTF-8)
    print(f"\n🚀 Training {name}...")

    # Build pipeline with SMOTE placed after preprocessing, so oversampling
    # operates on the imputed / scaled / encoded features.
    # NOTE(review): the same `preprocessor` object is passed to every
    # pipeline; it is refit on the same X_train each iteration. Consider a
    # per-model copy if the saved pipelines must be fully independent.
    pipeline = ImbPipeline([
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model)
    ])

    # Fit model on the training partition only
    pipeline.fit(X_train, y_train)

    # Hard labels for the threshold metrics; column 1 of predict_proba
    # (the score for class 1) for AUC
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    # Calculate held-out metrics
    metrics = {
        "Model": name,
        "AUC-ROC": roc_auc_score(y_test, y_pred_proba),
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred)
    }

    # Persist the whole fitted pipeline (preprocessing + classifier)
    joblib.dump(pipeline, f"{name}_model.pkl")
    print(f"✅ {name} saved as {name}_model.pkl")

    # Store results
    results.append(metrics)

# ======================
# RESULTS ANALYSIS
# ======================
# Collect the per-model metric dicts into a table, best AUC-ROC first.
# A stable sort (mergesort) is requested so that models with identical AUC
# keep their training order; with the default sort kind the relative order
# of tied rows, and therefore the reported "best" model, is not guaranteed.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(
    "AUC-ROC", ascending=False, kind="mergesort"
).reset_index(drop=True)

# Report header (emoji repaired from mis-decoded UTF-8)
print("\n📊 Model Performance Summary:")
print(results_df.round(3))

# The top row after sorting is the model with the highest AUC-ROC
best_model_name = results_df.iloc[0]['Model']
print(f"\n🌟 Best performing model: {best_model_name}")

# You can add visualization code here
# plt.figure(figsize=(10, 6))
# results_df.plot(x='Model', y='AUC-ROC', kind='bar')
# plt.title('Model Comparison: AUC-ROC Scores')
# plt.show()


🚀 Training LogisticRegression...
✅ LogisticRegression saved as LogisticRegression_model.pkl

🚀 Training DecisionTree...
✅ DecisionTree saved as DecisionTree_model.pkl

🚀 Training RandomForest...
✅ RandomForest saved as RandomForest_model.pkl

🚀 Training GradientBoosting...
✅ GradientBoosting saved as GradientBoosting_model.pkl

📊 Model Performance Summary:
                Model  AUC-ROC  Accuracy  Precision  Recall  F1-Score
0        DecisionTree     0.75     0.842      1.000     0.4     0.571
1        RandomForest     0.75     0.842      1.000     0.4     0.571
2    GradientBoosting     0.75     0.842      1.000     0.4     0.571
3  LogisticRegression     0.60     0.632      0.375     0.6     0.462

🌟 Best performing model: DecisionTree
