In [16]:
# %% 
### Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# %% 
### Import data
data_train = pd.read_csv("datasets/data_train.csv")

# %% 
### Feature Engineering
# Convertir les noms de colonnes en majuscules pour éviter les problèmes de casse
data_train.columns = data_train.columns.str.upper()

# Ajouter des caractéristiques utiles pour l'entraînement
data_train['DEBT_TO_CREDIT_RATIO'] = data_train['BILL_AMT1'] / data_train['LIMIT_BAL']
data_train['AVG_BILL_AMOUNT'] = data_train[['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']].mean(axis=1)
data_train['PAYMENT_RATIO'] = data_train[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']].mean(axis=1)

data_train['AGE_ECART_AVG'] = data_train['AGE'] - data_train['AGE'].mean()

# %% 
### Split data
# Drop ID column as it doesn't contribute to predictions
if 'ID' in data_train.columns:
    data_train = data_train.drop(columns=['ID'])

# Separate features and target variable
X = data_train.drop(columns=['DEFAULT.PAYMENT.NEXT.MONTH', 'SEX'])
y = data_train['DEFAULT.PAYMENT.NEXT.MONTH']

# %% 
# Step 1: Initialize the Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)

# Step 2: Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'n_estimators': [200],
    'learning_rate': [0.4, 0.5, 0.6],
    'max_depth': range(3, 4),  # Augmenter la profondeur pour éviter l'underfitting
    'min_samples_split': range(13, 15),  # Augmenter les splits pour éviter l'overfitting
}

# {'n_estimators': 200,
#  'min_samples_split': 14,
#  'max_depth': 3,
#  'learning_rate': 0.05}

# Validation croisée à 10 plis
random_search = RandomizedSearchCV(
    model, param_distributions=param_dist,
    n_iter=6, scoring='roc_auc', cv=10, random_state=42, n_jobs=-1
)

random_search.fit(X, y)

# %% 
# Step 3: Evaluate the best model on validation data
best_model = random_search.best_estimator_
y_val_pred_proba = best_model.predict_proba(X)[:, 1]

# Calculate AUC
auc_score = roc_auc_score(y, y_val_pred_proba)
print(f"Validation AUC: {auc_score}")

# %% 
# Step 4: Adjust threshold for selecting top 1000 clients
# Test different thresholds
thresholds = [0.3, 0.4, 0.5, 0.6]
best_threshold = 0.5  # Default start with 0.5

for threshold in thresholds:
    predicted_classes = (y_val_pred_proba >= threshold).astype(int)
    auc_score_threshold = roc_auc_score(y, y_val_pred_proba)
    print(f"Threshold: {threshold} - AUC: {auc_score_threshold}")

# Select best threshold (default 0.5 for final submission)
final_threshold = 0.5

# %% 
# Step 5: Load the test dataset
test_data = pd.read_csv("datasets/test_competition.csv")
test_ids = test_data['ID']
test_data = test_data.drop(columns=['ID', 'SEX'])

# Ajouter les mêmes caractéristiques que celles utilisées pour l'entraînement
test_data['DEBT_TO_CREDIT_RATIO'] = test_data['BILL_AMT1'] / test_data['LIMIT_BAL']
test_data['AVG_BILL_AMOUNT'] = test_data[['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']].mean(axis=1)
test_data['PAYMENT_RATIO'] = test_data[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']].mean(axis=1)

# // Add prof's advice
test_data['AGE_ECART_AVG'] = test_data['AGE'] - test_data['AGE'].mean()


# Assurez-vous que l'ordre des colonnes dans le test_data correspond à celui du jeu d'entraînement
test_data = test_data[X.columns]

# Predict probabilities for the test data
test_predictions = best_model.predict_proba(test_data)[:, 1]

# Select top 1000 clients with the highest probability of default
top_1000_risk_indices = np.argsort(test_predictions)[-1000:]
top_1000_risk_ids = test_ids.iloc[top_1000_risk_indices].values

# Create a DataFrame with the top 1000 IDs and their default risk scores
submission = pd.DataFrame({
    'ID': top_1000_risk_ids,
    'DEFAULT_RISK_SCORE': test_predictions[top_1000_risk_indices]
})

# Save the top 1000 high-risk clients to a CSV file
submission.to_csv('top_1000_risk_clients.csv', index=False)


Validation AUC: 0.87023037342195
Threshold: 0.3 - AUC: 0.87023037342195
Threshold: 0.4 - AUC: 0.87023037342195
Threshold: 0.5 - AUC: 0.87023037342195
Threshold: 0.6 - AUC: 0.87023037342195


In [18]:
#### J'adore
random_search.best_estimator_.feature_importances_

array([0.02763054, 0.00856587, 0.00347718, 0.0163121 , 0.40079216,
       0.01881264, 0.00899051, 0.00535755, 0.00632446, 0.00296487,
       0.03339553, 0.02239456, 0.01970134, 0.01522992, 0.02301344,
       0.01524449, 0.01959028, 0.01823717, 0.02540456, 0.01277624,
       0.01834679, 0.02095691, 0.05257656, 0.06184255, 0.13280079,
       0.009261  ])

In [19]:
X.columns

Index(['LIMIT_BAL', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3',
       'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
       'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'DEBT_TO_CREDIT_RATIO',
       'AVG_BILL_AMOUNT', 'PAYMENT_RATIO', 'AGE_ECART_AVG'],
      dtype='object')

In [12]:
X.head()

X['AGE_ECART_AVG'] = X['AGE'] - X['AGE'].mean()

X.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEBT_TO_CREDIT_RATIO,AVG_BILL_AMOUNT,PAYMENT_RATIO,AGE_ECART_AVG
0,70000.0,1,2,2,27,0,0,0,0,0,...,4100.0,2400.0,2500.0,1800.0,2400.0,4800.0,0.904257,62597.0,0.0,-8.50012
1,150000.0,2,1,2,30,0,0,2,0,-1,...,4908.0,0.0,0.0,18439.0,1381.0,0.0,0.09922,8597.666667,0.0,-5.50012
2,490000.0,1,1,1,43,-1,-1,-1,-1,-1,...,14187.0,56717.0,81414.0,62063.0,14122.0,25417.0,0.05289,42379.166667,-1.0,7.49988
3,260000.0,2,3,1,57,-1,2,-1,2,-1,...,0.0,1668.0,0.0,490.0,35722.0,0.0,0.004415,6538.333333,0.0,21.49988
4,310000.0,2,2,1,30,0,0,0,0,0,...,12401.0,13000.0,13208.0,10000.0,9506.0,10002.0,0.957877,286806.833333,0.0,-5.50012
