In [6]:
# %% 
### Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# %% 
### Import data
data_train = pd.read_csv("datasets/data_train.csv")

# %% 
### Feature Engineering
# Convertir les noms de colonnes en majuscules pour éviter les problèmes de casse
data_train.columns = data_train.columns.str.upper()

# Ajouter des caractéristiques utiles
data_train['DEBT_TO_CREDIT_RATIO'] = data_train['BILL_AMT1'] / data_train['LIMIT_BAL']
data_train['AVG_BILL_AMOUNT'] = data_train[['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']].mean(axis=1)
data_train['PAYMENT_RATIO'] = data_train[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']].mean(axis=1)

# %% 
### Split data
# Drop ID column as it doesn't contribute to predictions
if 'ID' in data_train.columns:
    data_train = data_train.drop(columns=['ID'])

# Separate features and target variable
X = data_train.drop(columns=['DEFAULT.PAYMENT.NEXT.MONTH'])
y = data_train['DEFAULT.PAYMENT.NEXT.MONTH']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# %% 
# Step 1: Initialize the Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)

# Step 2: Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'n_estimators': range(100, 500, 50),
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'max_depth': range(3, 8),  # Augmenter la profondeur pour éviter l'underfitting
    'min_samples_split': range(8, 15),  # Augmenter les splits pour éviter l'overfitting
}

# Validation croisée à 10 plis
random_search = RandomizedSearchCV(
    model, param_distributions=param_dist,
    n_iter=20, scoring='roc_auc', cv=10, random_state=42, n_jobs=-1
)

random_search.fit(X_train, y_train)

# %% 
# Step 3: Evaluate the best model on validation data
best_model = random_search.best_estimator_
y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]

# Calculate AUC
auc_score = roc_auc_score(y_val, y_val_pred_proba)
print(f"Validation AUC: {auc_score}")

# %% 
# Step 4: Adjust threshold for selecting top 1000 clients
# Test different thresholds
thresholds = [0.3, 0.4, 0.5, 0.6]
best_threshold = 0.5  # Default start with 0.5

for threshold in thresholds:
    predicted_classes = (y_val_pred_proba >= threshold).astype(int)
    auc_score_threshold = roc_auc_score(y_val, y_val_pred_proba)
    print(f"Threshold: {threshold} - AUC: {auc_score_threshold}")

# Select best threshold (default 0.5 for final submission)
final_threshold = 0.5

# %% 
# Step 5: Load the test dataset
test_data = pd.read_csv("datasets/test_competition.csv")
test_ids = test_data['ID']
test_data = test_data.drop(columns=['ID'])

# Predict probabilities for the test data
test_predictions = best_model.predict_proba(test_data)[:, 1]

# Select top 1000 clients with the highest probability of default
top_1000_risk_indices = np.argsort(test_predictions)[-1000:]
top_1000_risk_ids = test_ids.iloc[top_1000_risk_indices].values

# Create a DataFrame with the top 1000 IDs and their default risk scores
submission = pd.DataFrame({
    'ID': top_1000_risk_ids,
    'DEFAULT_RISK_SCORE': test_predictions[top_1000_risk_indices]
})

# Save the top 1000 high-risk clients to a CSV file
submission.to_csv('top_1000_risk_clients.csv', index=False)


Validation AUC: 0.7886911814239115
Threshold: 0.3 - AUC: 0.7886911814239115
Threshold: 0.4 - AUC: 0.7886911814239115
Threshold: 0.5 - AUC: 0.7886911814239115
Threshold: 0.6 - AUC: 0.7886911814239115


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- AVG_BILL_AMOUNT
- DEBT_TO_CREDIT_RATIO
- PAYMENT_RATIO


In [7]:
random_search.best_params_

{'n_estimators': 200,
 'min_samples_split': 14,
 'max_depth': 3,
 'learning_rate': 0.05}