In [70]:
import numpy as np
import pandas as pd
import torch
from comet_ml import Experiment
import xgboost as xgb
import warnings
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from format_data import *
from xgboost import DMatrix, train, XGBClassifier
from tqdm import tqdm
from pyDOE import lhs

# Plan d'expérience

### Importation et formattage des données

In [2]:
train_data = pd.read_csv("./data/GAN_train.csv")
train_data = format_data(train_data)

### Recherche par optimisation bayesienne des hyperparamètres

In [4]:
import optuna
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
import warnings

warnings.filterwarnings("ignore")

y = train_data["Label"]
X = train_data.drop(columns=["Label"])

class CustomLearningRate(xgb.callback.TrainingCallback):
    def __init__(self):
        super().__init__()

    def after_iteration(self, model, epoch, evals_log):
        if epoch < 25:
            model.set_param('eta', 0.1)
        elif epoch < 50:
            model.set_param('eta', 0.05)
        elif epoch < 150:
            model.set_param('eta', 0.01)
        elif epoch < 300:
            model.set_param('eta', 0.005)
        else:
            model.set_param('eta', 0.001)

        return False

# Instantiate the callback
custom_lr_callback = CustomLearningRate()

def objective(trial):

    params = {
        'objective': 'multi:softprob',
        'num_class': 3,
        'eval_metric': 'merror',
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 0.001, 1),
        'lambda': trial.suggest_loguniform('lambda', 0.001, 2.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'device': 'cuda'
    }
    
    scores = []

    for i in range(4):

        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.35, stratify=y)

        # Calculate class weights inside the loop
        class_weights = compute_class_weight('balanced', classes=[0, 1, 2], y=y_train)
        weights = y_train.map({0: class_weights[0], 1: class_weights[1], 2: class_weights[2]})

        model = XGBClassifier(**params)
        
        eval_set = [(X_valid, y_valid)]
        model.fit(X_train, y_train, 
                eval_set=eval_set, 
                early_stopping_rounds=500,
                verbose=False,
                callbacks=[custom_lr_callback])

        preds = model.predict(X_valid)
        score = accuracy_score(y_valid, preds)
        scores.append(score)

        # Report the accuracy score and check for pruning
        trial.report(score, step=len(scores))
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    avg_accuracy = sum(scores) / len(scores)
    return avg_accuracy


def save_intermediate_results(study, trial):
    """Callback to save the trials dataframe after each iteration."""
    df = study.trials_dataframe()
    df.to_csv("bayes/xgb_optimization_results.csv", index=False)

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
n_trials = 1000
study.optimize(objective, n_trials=n_trials, callbacks=[save_intermediate_results])

# Save the final results
final_results = study.trials_dataframe()
final_results.to_csv("bayes/xgb_optimization_results.csv", index=False)

# Print the best result
best_trial = study.best_trial
print("\nBest trial:")
print(f"  Iteration: {best_trial.number + 1}")
print(f"  Value (Accuracy): {best_trial.value:.4f}")
print("  Params:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

[I 2023-11-04 00:16:42,967] A new study created in memory with name: no-name-0189ea0b-2383-4d74-b1af-ae69550bf616


[I 2023-11-04 00:16:52,323] Trial 0 finished with value: 0.9115063543483677 and parameters: {'max_depth': 15, 'subsample': 0.5068072479409373, 'colsample_bytree': 0.9152388012425, 'colsample_bylevel': 0.896567710092731, 'alpha': 0.008413878147015875, 'lambda': 0.0020679094885720675, 'gamma': 3.665760778371201, 'min_child_weight': 134, 'booster': 'gbtree', 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.9115063543483677.
[I 2023-11-04 00:18:20,007] Trial 1 finished with value: 0.9639608771492648 and parameters: {'max_depth': 11, 'subsample': 0.8352193435551376, 'colsample_bytree': 0.8996635044721178, 'colsample_bylevel': 0.7873062217827902, 'alpha': 0.01504928060540415, 'lambda': 0.003257659488348449, 'gamma': 4.786933087977136, 'min_child_weight': 18, 'booster': 'dart', 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.9639608771492648.
[I 2023-11-04 00:19:49,132] Trial 2 finished with value: 0.9092636431597308 and parameters: {'max_depth': 20, 'subsample': 0.773293


Best trial:
  Iteration: 747
  Value (Accuracy): 0.9913
  Params:
    max_depth: 19
    subsample: 0.9881625777808394
    colsample_bytree: 0.5066637462932493
    colsample_bylevel: 0.8676419628745671
    alpha: 0.11995518785342191
    lambda: 0.022790142677397214
    gamma: 0.3419400437334943
    min_child_weight: 1
    booster: gbtree
    grow_policy: lossguide


# Aggrégation des prédictions pour 500 modèles différents avec les meilleurs hyperparamètres afin de réduire la variance des prédictions 

In [6]:
class CustomLearningRate(xgb.callback.TrainingCallback):
    def __init__(self):
        super().__init__()

    def after_iteration(self, model, epoch, evals_log):
        if epoch < 25:
            model.set_param('eta', 0.1)
        elif epoch < 50:
            model.set_param('eta', 0.05)
        elif epoch < 150:
            model.set_param('eta', 0.01)
        elif epoch < 300:
            model.set_param('eta', 0.005)
        else:
            model.set_param('eta', 0.001)

        return False

# Instantiate the callback
custom_lr_callback = CustomLearningRate()

In [109]:
X = train_data.drop(['Label'], axis=1)
y = train_data['Label']

warnings.filterwarnings("ignore")

params = {
    'objective': 'multi:softprob',
    'num_class': 3,
    'eval_metric': 'merror',    
    'max_depth': 19,
    'subsample': 0.9881625777808394,
    'colsample_bytree': 0.5066637462932493,
    'colsample_bylevel': 0.8676419628745671,
    'alpha': 0.11995518785342191,
    'lambda': 0.022790142677397214,
    'gamma': 0.3419400437334943,
    'min_child_weight': 1,
    'booster': 'gbtree',
    'grow_policy': 'lossguide',
    'device': 'cuda'
}

total_predictions = np.zeros((10320, 500))

# Initialize a dictionary to hold accumulated metrics
accumulated_metrics = {
    'Precision': {0: [], 1: [], 2: []},
    'Recall': {0: [], 1: [], 2: []},
    'F1': {0: [], 1: [], 2: []},
    'Accuracy': {0: [], 1: [], 2: []}
}

for i in range(500):
    # Split data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.15, stratify=y)

    # Calculate class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    weights = y_train.map(dict(zip(np.unique(y_train), class_weights)))

    model = XGBClassifier(**params)
    
    # Model training
    eval_set = [(X_valid, y_valid)]
    model.fit(X_train, y_train, 
              sample_weight=weights, 
              eval_set=eval_set, 
              early_stopping_rounds=500,
              verbose=False)

    # Prediction on validation set
    y_pred = model.predict(X_valid)

    # Generate a classification report for this fold
    report = classification_report(y_valid, y_pred, output_dict=True, zero_division=0)

    # Calculate per-class metrics and update accumulated metrics
    cm = confusion_matrix(y_valid, y_pred)
    for label in range(len(cm)):
        label_str = str(label)
        accumulated_metrics['Accuracy'][label].append(cm[label, label] / np.sum(cm[label]))
        accumulated_metrics['Precision'][label].append(report[label_str]['precision'])
        accumulated_metrics['Recall'][label].append(report[label_str]['recall'])
        accumulated_metrics['F1'][label].append(report[label_str]['f1-score'])

    # Prediction on test set
    y_test = model.predict(test_data)
    total_predictions[:, i] = y_test

# Calculate the average of the accumulated metrics
average_metrics = {measure: {} for measure in accumulated_metrics}
for measure, classes in accumulated_metrics.items():
    for class_label, values in classes.items():
        average_metrics[measure][class_label] = np.mean(values)

# Remap class labels to the desired names
class_label_names = {0: 'Normal', 1: 'Tropical Cyclone', 2: 'Atmospheric River'}

# Now create a new DataFrame that will contain the remapped labels and measures
final_metrics_df = pd.DataFrame()

# Populate the new DataFrame with the remapped labels and the average metrics
for measure, classes in average_metrics.items():
    for class_label, value in classes.items():
        final_metrics_df.at[class_label_names[class_label], measure] = value

# Reorder the DataFrame to have accuracy as the first row and performance measures as columns
reordered_measures = ['Accuracy', 'Precision', 'Recall', 'F1']
final_metrics_df = final_metrics_df[reordered_measures]

# Transpose the DataFrame to match the requested format with performance measures as rows
final_metrics_df = final_metrics_df.T

# Print the final DataFrame
final_metrics_df

Unnamed: 0,Normal,Tropical Cyclone,Atmospheric River
Accuracy,0.995801,0.99094,0.984895
Precision,0.995224,0.98335,0.990743
Recall,0.995801,0.99094,0.984895
F1,0.995511,0.987112,0.987802


In [110]:
# Convert predictions to integers if necessary
total_predictions_int = total_predictions.astype(int)

# Now apply the majority voting
y_test = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=1, arr=total_predictions_int)

In [111]:
labels, counts = np.unique(y_test, return_counts=True)

for label, count in zip(labels, counts):
    print(f"Label {label}: {count} occurrences")

Label 0: 7677 occurrences
Label 1: 697 occurrences
Label 2: 1946 occurrences


In [159]:
df = pd.DataFrame({
    'SNo': range(1, len(y_test) + 1),
    'Label': y_test.astype(int)
})

df.to_csv("xgb_final.csv", index=False)