In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
train = pd.read_csv("../dataset/train.csv")
test = pd.read_csv("../dataset/test.csv")

In [16]:
def create_features(df):
    df = df.copy()
    
    
    df['income_to_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
    df['loan_exceeds_income'] = (df['loan_amount'] > df['annual_income']).astype(int)

    df['monthly_income'] = df['annual_income'] / 12
    
    df['high_dti'] = (df['debt_to_income_ratio'] > 0.15).astype(int)
    df['low_credit_score'] = (df['credit_score'] < 650).astype(int)
    df['high_interest_rate'] = (df['interest_rate'] > 13).astype(int)

    # Loan burden ratio (how much of annual income is the loan)
    df['loan_to_income_ratio'] = df['loan_amount'] / (df['annual_income'] + 1)  # +1 to avoid division by zero
    
    # Absolute debt burden (total debt in dollars)
    df['absolute_debt_burden'] = df['debt_to_income_ratio'] * df['annual_income']
    
    # Total debt including new loan
    df['total_debt_with_loan'] = df['absolute_debt_burden'] + df['loan_amount']
    
    # Credit score bins (capture non-linear effects)
    df['credit_score_bin'] = pd.cut(df['credit_score'], 
                                     bins=[0, 600, 650, 700, 750, 850], 
                                     labels=['poor', 'fair', 'good', 'very_good', 'excellent'])
    
    # DTI bins
    df['dti_bin'] = pd.cut(df['debt_to_income_ratio'], 
                           bins=[0, 0.1, 0.2, 0.3, 1.0], 
                           labels=['low', 'medium', 'high', 'very_high'])
    
    # Interest rate bins
    df['interest_rate_bin'] = pd.cut(df['interest_rate'], 
                                      bins=[0, 10, 12.5, 15, 30], 
                                      labels=['low', 'medium', 'high', 'very_high'])
    
    # High risk flag: low credit score + high DTI
    #df['high_risk_flag'] = ((df['credit_score'] < 650) & (df['debt_to_income_ratio'] > 0.2)).astype(int)
    
    # Low risk flag: high credit score + low DTI
    #df['low_risk_flag'] = ((df['credit_score'] > 700) & (df['debt_to_income_ratio'] < 0.15)).astype(int)
    
    return df

In [17]:
train_df = create_features(train)
test_df = create_features(test)

In [18]:
numerical_features = train_df.select_dtypes(include='number').columns.tolist()
numerical_features = [col for col in numerical_features if 'id' not in col.lower()]
categorical_features = train_df.select_dtypes(include='object').columns.to_list()

In [19]:
all_features = numerical_features + categorical_features
X = train_df[all_features].copy()
for col in categorical_features:
    X[col] = X[col].astype('category')
y = train_df['loan_paid_back'].copy()

In [20]:
# Label Encoder
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

In [21]:
params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': 6,
        'learning_rate': 0.05,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 3,
        'gamma': 0.1,
        'reg_alpha': 0.1,  # L1 regularization
        'reg_lambda': 1.0,  # L2 regularization
        'scale_pos_weight': 1,  # Adjust for class imbalance if needed
        'random_state': 42,
        'n_jobs': -1,
        'tree_method': 'hist'  # Faster training
}

In [22]:
print("\n" + "="*50)
print("CROSS-VALIDATION TRAINING")
print("="*50)

n_splits = 5

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

cv_scores = []
train_scores = []
models = []


CROSS-VALIDATION TRAINING


In [23]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\nFold {fold}/{n_splits}")
    print("-" * 30)
    
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    
    # Create DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
    
    # Train model
    evals = [(dtrain, 'train'), (dval, 'val')]
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=100
    )
    # Predictions
    train_pred = model.predict(dtrain)
    val_pred = model.predict(dval)
    
    # Scores
    train_auc = roc_auc_score(y_train_fold, train_pred)
    val_auc = roc_auc_score(y_val_fold, val_pred)
    
    train_scores.append(train_auc)
    cv_scores.append(val_auc)
    models.append(model)
    
    print(f"Train AUC: {train_auc:.6f}")
    print(f"Val AUC:   {val_auc:.6f}")
    print(f"Overfit:   {train_auc - val_auc:.6f}")


Fold 1/5
------------------------------
[0]	train-auc:0.90912	val-auc:0.90957
[100]	train-auc:0.91736	val-auc:0.91715
[200]	train-auc:0.92144	val-auc:0.91953
[300]	train-auc:0.92433	val-auc:0.92049
[400]	train-auc:0.92698	val-auc:0.92131
[500]	train-auc:0.92905	val-auc:0.92163
[600]	train-auc:0.93106	val-auc:0.92199
[700]	train-auc:0.93286	val-auc:0.92222
[800]	train-auc:0.93453	val-auc:0.92229
[900]	train-auc:0.93614	val-auc:0.92237
[958]	train-auc:0.93702	val-auc:0.92233
Train AUC: 0.937021
Val AUC:   0.922327
Overfit:   0.014694

Fold 2/5
------------------------------
[0]	train-auc:0.90923	val-auc:0.90904
[100]	train-auc:0.91751	val-auc:0.91641
[200]	train-auc:0.92129	val-auc:0.91848
[300]	train-auc:0.92440	val-auc:0.91982
[400]	train-auc:0.92696	val-auc:0.92058
[500]	train-auc:0.92925	val-auc:0.92117
[600]	train-auc:0.93109	val-auc:0.92147
[700]	train-auc:0.93285	val-auc:0.92165
[800]	train-auc:0.93446	val-auc:0.92179
[878]	train-auc:0.93562	val-auc:0.92183
Train AUC: 0.935644
Va

In [28]:
print("\n" + "="*50)
print("CROSS-VALIDATION RESULTS")
print("="*50)
print(f"Mean Train AUC: {np.mean(train_scores):.6f} ± {np.std(train_scores):.6f}")
print(f"Mean Val AUC:   {np.mean(cv_scores):.6f} ± {np.std(cv_scores):.6f}")
print(f"Mean Overfitting: {np.mean(train_scores) - np.mean(cv_scores):.6f}")


CROSS-VALIDATION RESULTS
Mean Train AUC: 0.935297 ± 0.001592
Mean Val AUC:   0.921075 ± 0.000929
Mean Overfitting: 0.014222


In [29]:
 # Get feature importance
importance_dict = model.get_score(importance_type='gain')

# Convert to DataFrame
importance_df = pd.DataFrame({
    'feature': list(importance_dict.keys()),
    'importance': list(importance_dict.values())
}).sort_values('importance', ascending=False).head(10)

In [26]:
importance_df

Unnamed: 0,feature,importance
17,employment_status,1347.876709
8,high_dti,175.567459
1,debt_to_income_ratio,64.207939
2,credit_score,40.222786
19,grade_subgrade,28.154476
9,low_credit_score,12.555935
12,absolute_debt_burden,7.828838
3,loan_amount,6.907624
0,annual_income,6.7287
7,monthly_income,6.645182


In [27]:
X_test = test_df[all_features].copy()

In [30]:
for col in categorical_features:
    X_test[col] = label_encoders[col].transform(X_test[col].astype(str))

In [31]:
dtest = xgb.DMatrix(X_test)

In [32]:
test_preds = np.zeros(len(X_test))

for model in models:   # 'models' list was created during training
    test_preds += model.predict(dtest)

test_preds /= len(models)     # average predictions

In [35]:
submission = pd.DataFrame({
    'id': test_df['id'],
    'loan_paid_back': test_preds
})
submission.to_csv('submission.csv', index=False)