# Title

## Step 0. Import packages and load data

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw project dataset. The path is relative to the notebook's
# working directory.
raw_csv_path = "data/dataproject2025.csv"
df_n = pd.read_csv(raw_csv_path)

In [3]:
# Count missing values per column in the raw frame (isnull is an alias of isna).
df_n.isnull().sum()

Unnamed: 0                 0
issue_d                    0
loan duration              0
annual_inc                 0
avg_cur_bal                0
bc_open_to_buy             0
bc_util                    0
delinq_2yrs                0
dti                        0
emp_length                 0
emp_title                  0
fico_range_high            0
funded_amnt                0
grade                      0
home_ownership             0
inq_last_6mths             0
int_rate                   0
mo_sin_old_rev_tl_op       0
mo_sin_rcnt_rev_tl_op      0
mo_sin_rcnt_tl             0
mort_acc                   0
mths_since_recent_bc       0
num_actv_bc_tl             0
num_bc_tl                  0
num_il_tl                  0
num_rev_accts              0
open_acc                   0
pub_rec                    0
pub_rec_bankruptcies       0
purpose                    0
revol_bal                  0
revol_util                 0
sub_grade                  0
target                     0
tax_liens     

In [4]:
# Working copy of the data: keep only rows with no missing value in any
# column. The arguments spell out pandas' defaults; the raw frame `df_n`
# is left untouched.
df = df_n.dropna(axis=0, how="any")

In [5]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,issue_d,loan duration,annual_inc,avg_cur_bal,bc_open_to_buy,bc_util,delinq_2yrs,dti,emp_length,...,purpose,revol_bal,revol_util,sub_grade,target,tax_liens,zip_code,Pct_afro_american,Predictions,Predicted probabilities
0,0,2013,0,39600.0,1379.0,21564.0,16.1,0.0,2.49,2 years,...,home_improvement,4136.0,16.1,B2,0,0.0,782,7.388592,0,0.053051
1,1,2013,0,55000.0,9570.0,16473.0,53.9,0.0,22.87,10+ years,...,debt_consolidation,36638.0,61.2,B2,0,0.0,481,9.745456,0,0.084507
2,2,2013,0,325000.0,53306.0,13901.0,67.1,0.0,18.55,5 years,...,debt_consolidation,29581.0,54.6,A3,0,0.0,945,7.542862,0,0.037206
3,3,2013,0,130000.0,36362.0,3567.0,93.0,0.0,13.03,10+ years,...,debt_consolidation,10805.0,67.0,B3,0,0.0,809,6.598132,0,0.061371
4,4,2013,1,73000.0,24161.0,4853.0,74.7,1.0,23.13,6 years,...,debt_consolidation,27003.0,82.8,D5,1,0.0,802,7.0589,1,0.345896


In [6]:
# Summarise the cleaned frame's schema: column names, per-column dtypes,
# and overall (rows, columns) shape.
column_names = df.columns.tolist()
print(f"Columns: {column_names}")
print("\nData types:", df.dtypes, sep="\n")
print(f"\nDataframe shape: {df.shape}")

Columns: ['Unnamed: 0', 'issue_d', 'loan duration', 'annual_inc', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'delinq_2yrs', 'dti', 'emp_length', 'emp_title', 'fico_range_high', 'funded_amnt', 'grade', 'home_ownership', 'inq_last_6mths', 'int_rate', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recent_bc', 'num_actv_bc_tl', 'num_bc_tl', 'num_il_tl', 'num_rev_accts', 'open_acc', 'pub_rec', 'pub_rec_bankruptcies', 'purpose', 'revol_bal', 'revol_util', 'sub_grade', 'target', 'tax_liens', 'zip_code', 'Pct_afro_american', 'Predictions', 'Predicted probabilities']

Data types:
Unnamed: 0                   int64
issue_d                      int64
loan duration                int64
annual_inc                 float64
avg_cur_bal                float64
bc_open_to_buy             float64
bc_util                    float64
delinq_2yrs                float64
dti                        float64
emp_length                  object
emp_title                  

In [7]:
# Distinct values found in the "Predicted probabilities" column.
df.loc[:, "Predicted probabilities"].unique()

array([0.0530511 , 0.0845068 , 0.03720555, ..., 0.08110127, 0.15007915,
       0.37907901])

## Step 1.

## Step 2. Train and evaluate an XGBoost baseline classifier

In [8]:
# Modelling setup: imports for the sklearn-pipeline approach, warning filters,
# feature/target selection, and a stratified train/test split.
import warnings

import lightgbm as lgb
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import DataConversionWarning
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Silence two sklearn warnings: the "valid feature names" message and
# DataConversionWarning. The filters are registered in the same order as before.
for filter_kwargs in (
    {'message': 'X does not have valid feature names'},
    {'category': DataConversionWarning},
):
    warnings.filterwarnings('ignore', **filter_kwargs)

# Columns excluded from the feature matrix: the 'Unnamed: 0' column, the
# target itself, and the two prediction columns already present in the data.
# All remaining columns are used as-is, with no manual encoding at this stage.
non_feature_columns = ['Unnamed: 0', 'target', 'Predictions', 'Predicted probabilities']
X = df.drop(columns=non_feature_columns)
y = df['target']

# Split before any fitting. The split is stratified on the target so both
# partitions keep the same class proportions; 30% of rows are held out.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
train_class_share = y_train.value_counts(normalize=True).round(3).to_dict()
print(f"Class distribution: {train_class_share}")

Training set: (760365, 35), Test set: (325871, 35)
Class distribution: {0: 0.79, 1: 0.21}


In [9]:
from sklearn.preprocessing import OneHotEncoder

# Partition the feature columns by dtype: object columns are handled as
# categorical, every other column as numerical.
categorical_features = list(X.select_dtypes(include=['object']).columns)
numerical_features = [
    column for column in X.columns if column not in categorical_features
]

print(f"Categorical features ({len(categorical_features)}): {categorical_features}")
print(f"Numerical features ({len(numerical_features)}): {len(numerical_features)} columns")

# Preprocessing: standardise numerical columns and one-hot encode categorical
# ones. Categories unseen during fit are ignored at transform time, and the
# encoder emits a dense array.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    categorical_features,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Full model: preprocessing followed by an XGBoost classifier with its
# default hyper-parameters.
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier()),
])

print("Pipeline created successfully!")

Categorical features (6): ['emp_length', 'emp_title', 'grade', 'home_ownership', 'purpose', 'sub_grade']
Numerical features (29): 29 columns
Pipeline created successfully!


In [10]:
# Fit the XGBoost pipeline on the training split, then score it on the
# held-out test split.
print("Training models using pipelines...")

print("Training XGBoost...")
xgb_pipeline.fit(X_train, y_train)

print("Training completed!")

# Column 1 of predict_proba holds the probability of the positive class (1);
# predict returns the hard class labels.
xgb_pred_proba = xgb_pipeline.predict_proba(X_test)[:, 1]
xgb_pred = xgb_pipeline.predict(X_test)

# Accuracy is computed from the hard labels, ROC AUC from the probabilities.
xgb_auc = roc_auc_score(y_test, xgb_pred_proba)
xgb_accuracy = accuracy_score(y_test, xgb_pred)

print(f"\nXGBoost - Accuracy: {xgb_accuracy:.4f}, AUC: {xgb_auc:.4f}")

Training models using pipelines...
Training XGBoost...


Training completed!

XGBoost - Accuracy: 0.7945, AUC: 0.7210


In [11]:
# Collect the headline metrics into a one-row summary table (one record per
# model), then print the per-class classification report.
summary_records = [
    {'Model': 'XGBoost Pipeline', 'Accuracy': xgb_accuracy, 'AUC Score': xgb_auc},
]
results_df = pd.DataFrame(summary_records)

print(results_df.round(4))

banner_rule = "=" * 70
print("\n" + banner_rule)
print("CLASSIFICATION REPORTS")
print(banner_rule)

print("\nXGBoost Classification Report:")
print(classification_report(y_test, xgb_pred))

              Model  Accuracy  AUC Score
0  XGBoost Pipeline    0.7945      0.721

CLASSIFICATION REPORTS

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.98      0.88    257277
           1       0.56      0.11      0.18     68594

    accuracy                           0.79    325871
   macro avg       0.68      0.54      0.53    325871
weighted avg       0.75      0.79      0.74    325871

