In [14]:
### Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, validation_curve
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt


In [15]:
### Load datasets

# Labelled rows: the split cell takes both the features and the target from this frame.
data_train = pd.read_csv(filepath_or_buffer="datasets/data_train.csv")
# Second CSV, loaded alongside the training data.
# NOTE(review): presumably the unlabelled competition hold-out set — confirm; the cells shown here never use it.
test_competition = pd.read_csv(filepath_or_buffer="datasets/test_competition.csv")

In [16]:
### Split data

# Pull out the label column, then keep every remaining column as a predictor.
y = data_train['default.payment.next.month']
X = data_train.drop('default.payment.next.month', axis=1)
# NOTE(review): the frame preview in this notebook lists an `ID` column among the
# predictors; it looks like a row identifier — confirm it is meant to be a model input.

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,4157,70000.0,1,2,2,27,0,0,0,0,...,65353.0,84184.0,47686.0,49276.0,4100.0,2400.0,2500.0,1800.0,2400.0,4800.0
1,10909,150000.0,2,1,2,30,0,0,2,0,...,2700.0,1500.0,18439.0,1381.0,4908.0,0.0,0.0,18439.0,1381.0,0.0
2,8003,490000.0,1,1,1,43,-1,-1,-1,-1,...,56573.0,81414.0,62063.0,14122.0,14187.0,56717.0,81414.0,62063.0,14122.0,25417.0
3,23852,260000.0,2,3,1,57,-1,2,-1,2,...,1668.0,446.0,123.0,35722.0,0.0,1668.0,0.0,490.0,35722.0,0.0
4,18890,310000.0,2,2,1,30,0,0,0,0,...,299162.0,305251.0,262368.0,254340.0,12401.0,13000.0,13208.0,10000.0,9506.0,10002.0
5,8576,40000.0,1,2,1,35,1,-2,-2,-2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,5663,200000.0,2,2,1,33,-1,-1,-1,-1,...,200.0,15845.0,15845.0,-39.0,5846.0,200.0,15845.0,0.0,0.0,0.0
7,3757,40000.0,1,1,2,26,0,0,0,0,...,21687.0,22327.0,22816.0,23436.0,2000.0,2000.0,1000.0,1000.0,1000.0,1500.0
8,24343,20000.0,1,2,1,42,0,0,0,0,...,16706.0,18214.0,18626.0,18196.0,1600.0,1600.0,2100.0,1000.0,0.0,1500.0
9,25876,200000.0,2,1,1,39,-1,-1,-1,-1,...,114770.0,70175.0,69180.0,44376.0,81596.0,114770.0,70175.0,69180.0,44376.0,37638.0


In [20]:
# Step 1: Base estimator — gradient boosting with a fixed seed so repeated fits match
model = GradientBoostingClassifier(random_state=42)

# Step 2: Candidate hyperparameter values for the randomized search
# NOTE(review): max_depth goes up to 29, which is unusually deep for boosted trees
# and will likely make the search slow — confirm this range is intended.
param_dist = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=range(1, 30),  # depths 1 through 29 inclusive
    min_samples_split=[2, 5, 10],
)

In [21]:
# Randomized hyperparameter search: 10 candidates drawn from `param_dist`, each
# scored by ROC AUC under 5-fold cross-validation (50 fits before the final refit).
# n_jobs=-1 runs those fits on all available CPU cores instead of one at a time.
# The candidates are fixed by random_state, so parallelism should not change
# which configuration wins.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=10,
    scoring='roc_auc',
    cv=5,
    random_state=42,
    n_jobs=-1,
)

random_search.fit(X_train, y_train)

# Estimator for the best-scoring candidate (available because the search refits
# the winner by default).
best_model = random_search.best_estimator_

In [24]:
# Show the winning hyperparameters. A bare expression is only displayed when it is
# the last statement of a cell, so print it explicitly here.
print(f"Best parameters: {random_search.best_params_}")

# Step 3: Evaluate the best model on validation data
# Column 1 of predict_proba is the probability of the second entry in
# best_model.classes_.
# NOTE(review): assumes the target is encoded 0/1 so this is the "default" class — confirm.
y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]

# Accuracy uses the hard class predictions; AUC uses the predicted probabilities.
accuracy = accuracy_score(y_val, best_model.predict(X_val))
auc_score = roc_auc_score(y_val, y_val_pred_proba)

print(f"Validation Accuracy: {accuracy}")
print(f"Validation AUC: {auc_score}")

Validation Accuracy: 0.8224
Validation AUC: 0.784436928074479
