In [30]:
### Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, validation_curve
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt


In [31]:
### Import data

# Read the training table and the competition table, in that order, from the
# relative "datasets/" folder. The paths are relative, so the notebook must be
# run from the directory that contains that folder.
data_train, test_competition = (
    pd.read_csv(csv_path)
    for csv_path in ("datasets/data_train.csv", "datasets/test_competition.csv")
)

In [32]:
### Split data

target_column = 'default.payment.next.month'

# 'ID' does not contribute to predictions, so it is removed before modelling.
# errors='ignore' turns the drop into a no-op when the column is absent.
data_train = data_train.drop(columns=['ID'], errors='ignore')

# The target column becomes the label vector; every other column is a feature.
y = data_train[target_column]
X = data_train.drop(columns=[target_column])

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [33]:
# Step 1: Base estimator for the search (seeded so repeated fits are reproducible)
model = GradientBoostingClassifier(random_state=42)

# Step 2: Candidate values that the randomized search samples from.
# NOTE(review): max_depth covers 1..29 (range end is exclusive); depths this
# large are unusual for boosted trees and make fits slow - confirm intended.
param_dist = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=range(1, 30),
    min_samples_split=[2, 5, 10],
)

In [34]:
# Randomized hyperparameter search: 10 sampled configurations, each scored by
# 5-fold cross-validated ROC AUC on the training split (50 fits in total).
# n_jobs=-1 runs those independent fits on all available CPU cores; because the
# estimator and the sampler are both seeded, the selected model should not change.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=10,
    scoring='roc_auc',
    cv=5,
    random_state=42,
    n_jobs=-1,
)

random_search.fit(X_train, y_train)

# Step 3: Evaluate the best model on validation data
best_model = random_search.best_estimator_

# Hard class labels feed the accuracy metric; the probability of the positive
# class (column 1 of predict_proba) feeds the AUC metric.
y_val_pred = best_model.predict(X_val)
y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]

# Calculate accuracy and AUC score
accuracy = accuracy_score(y_val, y_val_pred)
auc_score = roc_auc_score(y_val, y_val_pred_proba)

print(f"Validation Accuracy: {accuracy}")
print(f"Validation AUC: {auc_score}")

Validation Accuracy: 0.824
Validation AUC: 0.7831174023751236


In [35]:
# Number of highest-risk clients to export
TOP_N_CLIENTS = 1000

# Reuse the competition table already loaded as `test_competition` in the
# data-import cell instead of reading the same CSV a second time.
# Keep the client identifiers before dropping 'ID' from the feature matrix:
# without them the exported scores cannot be matched back to clients.
if 'ID' in test_competition.columns:
    client_ids = test_competition['ID'].to_numpy()
else:
    # No explicit identifier column: fall back to the row position in the file
    client_ids = np.arange(len(test_competition))

# The model was trained without 'ID', so the feature frame must not contain it
test_data = test_competition.drop(columns=['ID'], errors='ignore')

# Use the trained model to predict the probability of default (positive class)
test_predictions = best_model.predict_proba(test_data)[:, 1]

# Select the clients with the highest probability of default.
# argsort is ascending, so the last TOP_N_CLIENTS positions are the riskiest;
# the rows therefore stay in ascending score order (highest risk last).
top_1000_risk_indices = np.argsort(test_predictions)[-TOP_N_CLIENTS:]
submission = test_data.iloc[top_1000_risk_indices].copy()  # Avoid SettingWithCopyWarning
submission.insert(0, 'ID', client_ids[top_1000_risk_indices])
submission['default_risk_score'] = test_predictions[top_1000_risk_indices]

# Save the highest-risk clients (identifier + score) to a CSV file
submission[['ID', 'default_risk_score']].to_csv('top_1000_risk_clients.csv', index=False)