# Loan Default Prediction - Evaluation Set
**Team**: Renaissance Technologies  
**Model**: Stacking Ensemble (AUC: 0.8074)

## Pipeline
1. Load cleaned evaluation data
2. Apply feature engineering (same as training)
3. Load trained model
4. Generate predictions
5. Export results

## 1. Setup

In [1]:
import pandas as pd
import numpy as np
import pickle
import warnings
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings('ignore')

## 2. Load Cleaned Evaluation Data

In [2]:
# Load cleaned evaluation set
df = pd.read_csv('evaluation_set_cleaned.csv')

print(f"Evaluation set loaded: {len(df):,} rows × {len(df.columns)} columns")
print(f"Customer IDs: {df['customer_id'].nunique():,}")

Evaluation set loaded: 10,001 rows × 63 columns
Customer IDs: 10,001


## 3. Feature Engineering (Same as Training)

In [3]:
# === Financial Health Features ===
df['income_to_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
df['debt_to_income_pct'] = df['debt_to_income_ratio'] * 100
df['payment_burden'] = df['existing_monthly_debt'] / (df['monthly_income'] + 1)
df['available_income'] = df['monthly_income'] - df['existing_monthly_debt']
df['income_stability'] = (df['employment_length'] > 2).astype(int)

# === Credit Behavior Features ===
df['credit_score_category'] = pd.cut(df['credit_score'], 
                                      bins=[0, 580, 670, 740, 850],
                                      labels=['poor', 'fair', 'good', 'excellent'])
df['high_utilization'] = (df['credit_utilization'] > 0.7).astype(int)
df['recent_delinquency'] = (df['num_delinquencies_2yrs'] > 0).astype(int)
df['credit_age_years'] = df['oldest_credit_line_age'] / 12

# === Loan Characteristics ===
median_rate = 10.5  # Use training median
df['high_interest_flag'] = (df['interest_rate'] > median_rate).astype(int)
df['loan_to_income'] = df['loan_amount'] / (df['annual_income'] + 1)
df['is_mortgage'] = (df['loan_type'] == 'mortgage').astype(int)
df['is_credit_card'] = (df['loan_type'] == 'credit_card').astype(int)

# === Risk Aggregate ===
high_dti = (df['debt_to_income_ratio'] > 0.43).astype(int)
low_credit = (df['credit_score'] < 640).astype(int)
df['total_risk_score'] = (df['high_utilization'] + df['recent_delinquency'] + 
                          high_dti + low_credit)

print(f"Basic features created: {df.shape[1]} columns")

Basic features created: 77 columns


## 4. Target Encoding (Using Training Statistics)

In [4]:
# Load training data to get target encoding statistics
train_df = pd.read_csv('../main_dataset.csv')

# Target encode using training data statistics
encode_cols = ['loan_purpose', 'marketing_campaign', 'state']

for col in encode_cols:
    if col in df.columns and col in train_df.columns and 'default' in train_df.columns:
        # Calculate encoding from training data
        means = train_df.groupby(col)['default'].mean()
        global_mean = train_df['default'].mean()
        
        # Apply to evaluation set
        df[f'{col}_target_enc'] = df[col].map(means).fillna(global_mean)
        print(f" {col} -> {col}_target_enc (using training stats)")

print(f"\n Target encoding complete: {df.shape[1]} columns")

 loan_purpose -> loan_purpose_target_enc (using training stats)
 marketing_campaign -> marketing_campaign_target_enc (using training stats)
 state -> state_target_enc (using training stats)

 Target encoding complete: 80 columns


## 5. One-Hot Encoding (Same as Training)

In [5]:
# Exclude columns
exclude_cols = [
    'customer_id', 'application_id',
    'loan_officer_id', 'marketing_campaign', 
    'referral_code', 'previous_zip_code'
]

# Get categorical features
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_features = [c for c in categorical_features if c not in exclude_cols]

# One-hot encode
df_encoded = df.copy()
for feat in categorical_features:
    n_unique = df[feat].nunique()
    if n_unique <= 50:
        dummies = pd.get_dummies(df[feat], prefix=feat, drop_first=True, dtype=int)
        df_encoded = pd.concat([df_encoded, dummies], axis=1)

df_encoded = df_encoded.drop(columns=categorical_features)

print(f" One-hot encoding complete: {df_encoded.shape[1]} columns")

 One-hot encoding complete: 117 columns


## 6. Prepare Feature Matrix

In [6]:
# Load model features
import json
with open('../final_lgbm_optimized_features.json', 'r') as f:
    model_features = json.load(f)['features']

print(f"Model expects {len(model_features)} features")

# Select only model features (handle missing)
X_eval = df_encoded[['customer_id']].copy()
for feat in model_features:
    if feat in df_encoded.columns:
        X_eval[feat] = df_encoded[feat]
    else:
        X_eval[feat] = 0  # Missing feature = 0
        print(f"  Feature '{feat}' not in eval set, filling with 0")

# Separate customer_id
customer_ids = X_eval['customer_id'].values
X_eval = X_eval[model_features]

# Fill any remaining NaN
X_eval = X_eval.fillna(X_eval.median())

print(f"\n Feature matrix ready: {X_eval.shape}")
print(f"  Features: {X_eval.shape[1]}")
print(f"  Samples: {X_eval.shape[0]:,}")

Model expects 111 features
  Feature 'credit_score_category_poor' not in eval set, filling with 0

 Feature matrix ready: (10001, 111)
  Features: 111
  Samples: 10,001


## 7. Load Model & Predict

In [7]:
# Load trained model
with open('../final_lgbm_optimized.pkl', 'rb') as f:
    model = pickle.load(f)

print(f" Model loaded: {type(model).__name__}")

# Generate predictions
probabilities = model.predict_proba(X_eval)[:, 1]

# Use percentile-based threshold to match training default rate
training_default_rate = 0.051
threshold = np.percentile(probabilities, 100 * (1 - training_default_rate))
predictions = (probabilities >= threshold).astype(int)

print(f"\nPredictions generated")
print(f"  Training default rate: {training_default_rate*100:.2f}%")
print(f"  Optimal threshold (95th percentile): {threshold:.5f}")
print(f"  Predicted defaults: {predictions.sum():,} ({predictions.mean()*100:.2f}%)")
print(f"  Average probability: {probabilities.mean():.4f}")
print(f"  Probability range: [{probabilities.min():.4f}, {probabilities.max():.4f}]")

 Model loaded: StackingClassifier

Predictions generated
  Training default rate: 5.10%
  Optimal threshold (95th percentile): 0.21130
  Predicted defaults: 511 (5.11%)
  Average probability: 0.0520
  Probability range: [0.0055, 0.4111]


## 8. Export Results

In [8]:
# Create results dataframe
results = pd.DataFrame({
    'customer_id': customer_ids,
    'prob': probabilities.round(5),
    'default': predictions
})

# Export to CSV
results.to_csv('results.csv', index=False)

print("RESULTS EXPORTED")
print(f"File: results.csv")
print(f"Rows: {len(results):,}")
print(f"\nPreview:")
print(results.head(10).to_string(index=False))
print(f"\n Predictions complete!")


RESULTS EXPORTED
File: results.csv
Rows: 10,001

Preview:
 customer_id    prob  default
      100000 0.00815        0
      100001 0.01775        0
      100002 0.07280        0
      100003 0.04818        0
      100004 0.07923        0
      100005 0.02297        0
      100006 0.01000        0
      100007 0.02793        0
      100008 0.00730        0
      100009 0.02552        0

 Predictions complete!
