# Loan Default Prediction System

Predict whether a loan will fail to be repaid (the `repay_fail` target) using an XGBoost classifier.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import pickle
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation warnings;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Apply seaborn's 'whitegrid' theme to all plots drawn below.
sns.set_style('whitegrid')
print('Libraries imported successfully!')

## 2. Load Dataset

In [None]:
# Read the raw loan records into `df`. The file is decoded as Latin-1 and
# any decoding errors are ignored rather than raised.
df = pd.read_csv(
    'Loan_Default_data.csv',
    encoding='latin-1',
    encoding_errors='ignore',
)
print(f'Shape: {df.shape}')
# Last expression of the cell: shows the first rows in the notebook output.
df.head()

In [None]:
# Print a per-column summary of `df` (dtypes and non-null counts).
df.info()

## 3. Analyze Missing Values & Target

In [None]:
# Per-column null counts, computed once and reused for both the absolute
# count and the percentage of rows affected.
null_counts = df.isnull().sum()
missing = pd.DataFrame({
    'Column': df.columns,
    'Missing': null_counts,
    'Percent': (null_counts / len(df) * 100).round(2),
})
# Show only columns that actually have gaps, worst first.
missing.loc[missing['Missing'] > 0].sort_values('Percent', ascending=False)

In [None]:
# Class balance of the `repay_fail` target: absolute counts first,
# then the same breakdown as percentages.
target_counts = df['repay_fail'].value_counts()
target_percent = df['repay_fail'].value_counts(normalize=True) * 100
print('Target Distribution:')
print(target_counts)
print(target_percent)

## 4. Data Preprocessing

In [None]:
# Handle missing values
# Work on a copy so the raw `df` stays untouched.
df_filled = df.copy()

# Numeric columns: fill gaps with the column median.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df_filled[col]`: that chained in-place form
# operates on an intermediate object, is deprecated in pandas 2.x and does
# not modify `df_filled` when Copy-on-Write is enabled.
numeric_cols = df_filled.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if df_filled[col].isnull().any():
        df_filled[col] = df_filled[col].fillna(df_filled[col].median())

# Object (text) columns: fill gaps with the most frequent value, falling
# back to 'Unknown' when the column has no mode (i.e. it is entirely null).
categorical_cols = df_filled.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df_filled[col].isnull().any():
        modes = df_filled[col].mode()
        mode_val = modes.iloc[0] if len(modes) > 0 else 'Unknown'
        df_filled[col] = df_filled[col].fillna(mode_val)

print(f'Missing values after imputation: {df_filled.isnull().sum().sum()}')

In [None]:
# Convert percentages
df_converted = df_filled.copy()
revol_util_is_text = (
    'revol_util' in df_converted.columns
    and df_converted['revol_util'].dtype == 'object'
)
if revol_util_is_text:
    # Strip the \x93 / \x94 characters (presumably mis-decoded curly quotes)
    # and '%' signs, then parse to a number and rescale by 1/100.
    # Values that still fail to parse become NaN (errors='coerce').
    stripped = df_converted['revol_util'].str.replace(r'[\x93\x94%]', '', regex=True)
    df_converted['revol_util'] = pd.to_numeric(stripped, errors='coerce') / 100
    print('Percentage converted')


In [None]:
# Engineer date features
# Derives credit-history length and issue-date parts; runs only when both
# source date columns are present.
df_temporal = df_converted.copy()
if 'earliest_cr_line' in df_temporal.columns and 'issue_d' in df_temporal.columns:
    # Unparseable dates become NaT (errors='coerce') instead of raising.
    df_temporal['earliest_cr_line'] = pd.to_datetime(df_temporal['earliest_cr_line'], errors='coerce')
    df_temporal['issue_d'] = pd.to_datetime(df_temporal['issue_d'], errors='coerce')
    # Approximate months of credit history: day difference divided by 30.
    df_temporal['credit_history_months'] = (df_temporal['issue_d'] - df_temporal['earliest_cr_line']).dt.days / 30
    df_temporal['issue_year'] = df_temporal['issue_d'].dt.year
    df_temporal['issue_month'] = df_temporal['issue_d'].dt.month
    df_temporal['issue_quarter'] = df_temporal['issue_d'].dt.quarter
    # Fill gaps left by unparseable dates with the column median.
    # Assign the result back rather than using `fillna(..., inplace=True)` on
    # `df_temporal[col]`: the chained in-place form is deprecated in
    # pandas 2.x and leaves the frame unchanged under Copy-on-Write.
    for col in ['credit_history_months', 'issue_year', 'issue_month', 'issue_quarter']:
        if df_temporal[col].isnull().any():
            df_temporal[col] = df_temporal[col].fillna(df_temporal[col].median())
    print('Date features created')

In [None]:
# Drop date columns
# Raw date columns are removed from the feature frame; only those that are
# actually present are dropped, so absent columns are not an error.
date_cols = ['earliest_cr_line', 'issue_d', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d']
present_date_cols = [name for name in date_cols if name in df_temporal.columns]
df_no_dates = df_temporal.drop(columns=present_date_cols)
print(f'Remaining columns: {df_no_dates.shape[1]}')

In [None]:
# Encode categorical variables
df_encoded = df_no_dates.copy()
label_encoders = {}

# Integer-encode the two "ordinal" columns and keep the fitted encoders so
# the same mapping can be reused later.
# NOTE(review): LabelEncoder assigns codes by sorted string order, so the
# codes do not follow the natural ranking of the values (e.g. a value
# starting with '10' sorts before one starting with '2'). Left as-is to keep
# the saved encoders compatible; consider an explicit mapping.
ordinal_features = ['emp_length', 'term']
for col in ordinal_features:
    if col in df_encoded.columns and df_encoded[col].dtype == 'object':
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
        label_encoders[col] = le

# Columns the identifier-removal step drops by exact name. They must NOT be
# one-hot encoded here: get_dummies would replace e.g. 'loan_status' with
# 'loan_status_<value>' columns, which a later drop of 'loan_status' does not
# match, so they would stay in the feature matrix (for loan_status this is
# presumably direct target leakage). They are left untouched for that step.
non_feature_cols = ['id', 'member_id', 'zip_code', 'loan_status']

categorical_cols = df_encoded.select_dtypes(include=['object']).columns.tolist()
if 'repay_fail' in categorical_cols:
    categorical_cols.remove('repay_fail')
nominal_features = [
    c for c in categorical_cols
    if c not in ordinal_features and c not in non_feature_cols
]
if nominal_features:
    # drop_first=True removes one level per feature to avoid redundant columns.
    df_encoded = pd.get_dummies(df_encoded, columns=nominal_features, drop_first=True)
print(f'Encoded! Columns: {df_encoded.shape[1]}')

In [None]:
# Remove identifiers
# Drops the listed columns when present. Matching is by exact column label
# only; absent labels are skipped.
drop_cols = ['id', 'member_id', 'zip_code', 'loan_status']
present_drop_cols = [name for name in drop_cols if name in df_encoded.columns]
df_clean = df_encoded.drop(columns=present_drop_cols)
print(f'Final columns: {df_clean.shape[1]}')

In [None]:
# Split features and target
# X holds every column except the `repay_fail` label; y is the label itself.
X = df_clean.drop(columns='repay_fail')
y = df_clean['repay_fail']
# Column order is recorded so it can be saved with the preprocessing info.
feature_columns = list(X.columns)
print(f'Features: {X.shape}, Target: {y.shape}')

In [None]:
# Train-test split
# 80/20 split with a fixed seed; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f'Train: {X_train.shape}, Test: {X_test.shape}')

In [None]:
# Save preprocessing info
# Bundle the fitted label encoders and the training column order, then
# pickle the bundle to disk.
preprocessor_info = dict(
    label_encoders=label_encoders,
    feature_columns=feature_columns,
)
with open('preprocessor_info.pkl', 'wb') as artifact_file:
    pickle.dump(preprocessor_info, artifact_file)
print('Preprocessing artifacts saved')

## 5. Train XGBoost Model

In [None]:
# Hyperparameters for the binary classifier, collected in one place and
# passed to the estimator as keyword arguments.
xgb_params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
}
model = xgb.XGBClassifier(**xgb_params)
print('Model initialized')

In [None]:
import time

# Fit the classifier on the training partition and report elapsed time.
# perf_counter() is a monotonic, high-resolution clock intended for
# measuring durations; time.time() follows the wall clock and can jump if
# the system time is adjusted mid-run.
start = time.perf_counter()
model.fit(X_train, y_train)
print(f'Training complete in {time.perf_counter()-start:.2f}s')

In [None]:
# Persist the fitted model to disk as a pickle.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
print('Model saved to model.pkl')

## 6. Model Evaluation

In [None]:
# Score the held-out set: probability of the second class (column 1 of
# predict_proba) and the hard class labels.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)
print(f'Predictions generated for {len(y_test)} samples')

In [None]:
# Accuracy of the hard predictions on the held-out set, shown both as a
# fraction and as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)')

In [None]:
# Confusion matrix of actual vs. predicted labels, printed as raw counts
# and drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', square=True, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# Per-class precision/recall/F1 report; the two display names are applied
# to the labels in sorted order.
report_text = classification_report(
    y_test,
    y_pred,
    target_names=['Repaid', 'Defaulted'],
)
print('Classification Report:')
print(report_text)

In [None]:
# ROC analysis on the held-out set: compute the curve points and the area
# under the curve, then plot against the diagonal chance line.
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = roc_auc_score(y_test, y_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC (AUC = {roc_auc:.4f})', linewidth=2)
ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
ax.grid(alpha=0.3)
plt.show()
print(f'ROC-AUC Score: {roc_auc:.4f}')

## Summary

Model training and evaluation complete! Files saved:
- `model.pkl` - Trained XGBoost model
- `preprocessor_info.pkl` - Preprocessing artifacts