In [1]:
import numpy as np
import pandas as pd
import torch
from comet_ml import Experiment
import xgboost as xgb
import warnings
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, log_loss, balanced_accuracy_score
from format_data import *
from xgboost import DMatrix, train
from tqdm import tqdm
from pyDOE import lhs

# Plan d'expérience

### Importation et formattage des données

In [10]:
train_data = pd.read_csv("./data/GAN_train.csv")
train_data

Unnamed: 0,TMQ,U850,V850,UBOT,VBOT,QREFHT,PS,PSL,T200,T500,PRECT,TS,TREFHT,Z1000,Z200,ZBOT,Label,LOCATION,MONTH
0,33.233349,-1.323338,-2.876757,-1.104712,-3.018848,0.015944,101478.914100,101478.95310,214.135406,262.189911,0.003701,300.198181,298.744324,129.496017,12231.834960,65.871857,0,3,4
1,42.230705,-5.575889,1.676216,-8.121222,-2.047163,0.018948,101354.992200,101355.01560,217.118073,265.872925,21.771541,302.329193,301.508942,120.022858,12367.765630,66.578079,0,3,7
2,28.851704,6.540233,-2.425864,3.151410,-4.542639,0.010225,101815.445300,101815.44530,220.517487,255.087524,27.640550,291.032959,289.223969,152.949829,12040.492190,63.534153,2,0,10
3,33.602306,30.092083,-13.667763,14.197247,-18.031784,0.011341,100489.109400,100489.10940,219.173523,259.330139,0.497996,290.632599,292.086822,64.319603,12044.448240,64.319603,2,0,8
4,18.035347,19.857960,-2.524551,13.810221,-3.531109,0.008733,101102.593800,101102.59380,217.271362,259.712280,12.884099,290.607849,290.395599,93.650154,12002.601560,63.753540,0,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22926,62.326530,-5.594485,-0.313011,-4.709764,-4.878560,0.019827,100325.828100,100325.82810,221.689377,269.748932,24.083189,300.653839,299.989166,66.302177,12487.194340,66.302177,1,5,8
22927,21.466675,-1.014480,2.437929,-5.984308,2.638489,0.010786,102170.273400,102170.27340,219.099594,259.991547,10.295630,296.831757,294.150940,185.347992,12167.036130,64.601463,0,0,2
22928,55.052807,11.844543,-1.656594,8.818943,2.171454,0.020023,100979.710900,100979.71090,219.387283,268.568939,40.373258,302.809631,301.243988,86.892281,12444.893550,66.533775,0,4,9
22929,31.006016,-3.160620,-1.718535,-0.268632,-4.899251,0.015006,100900.312500,100900.31250,220.337585,267.260590,0.000000,299.039764,297.987000,78.818901,12417.227540,65.663666,0,5,9


### Recherche par optimisation bayesienne des hyperparamètres

In [5]:
import optuna
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
import warnings

warnings.filterwarnings("ignore")

y = train_data["Label"]
X = train_data.drop(columns=["Label"])

class CustomLearningRate(xgb.callback.TrainingCallback):
    def __init__(self):
        super().__init__()

    def after_iteration(self, model, epoch, evals_log):
        if epoch < 25:
            model.set_param('eta', 0.1)
        elif epoch < 50:
            model.set_param('eta', 0.05)
        elif epoch < 150:
            model.set_param('eta', 0.01)
        elif epoch < 300:
            model.set_param('eta', 0.005)
        else:
            model.set_param('eta', 0.001)

        return False

# Instantiate the callback
custom_lr_callback = CustomLearningRate()

def objective(trial):

    params = {
        'objective': 'multi:softprob',
        'num_class': 3,
        'eval_metric': 'merror',
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 0.001, 1),
        'lambda': trial.suggest_loguniform('lambda', 0.001, 2.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'device': 'cuda'
    }
    
    scores = []

    for i in range(4):

        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, stratify=y)

        # Calculate class weights inside the loop
        class_weights = compute_class_weight('balanced', classes=[0, 1, 2], y=y_train)
        weights = y_train.map({0: class_weights[0], 1: class_weights[1], 2: class_weights[2]})

        model = XGBClassifier(**params)
        
        eval_set = [(X_valid, y_valid)]
        model.fit(X_train, y_train, 
                eval_set=eval_set, 
                early_stopping_rounds=500,
                verbose=False,
                callbacks=[custom_lr_callback])

        preds = model.predict(X_valid)
        score = accuracy_score(y_valid, preds)
        scores.append(score)

        # Report the accuracy score and check for pruning
        trial.report(score, step=len(scores))
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    avg_accuracy = sum(scores) / len(scores)
    return avg_accuracy


def save_intermediate_results(study, trial):
    """Callback to save the trials dataframe after each iteration."""
    df = study.trials_dataframe()
    df.to_csv("xgb_bayesian_results.csv", index=False)

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
n_trials = 300
study.optimize(objective, n_trials=n_trials, callbacks=[save_intermediate_results])

# Save the final results
final_results = study.trials_dataframe()
final_results.to_csv("xgb_bayesian_results.csv", index=False)

# Print the best result
best_trial = study.best_trial
print("\nBest trial:")
print(f"  Iteration: {best_trial.number + 1}")
print(f"  Value (Accuracy): {best_trial.value:.4f}")
print("  Params:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

[I 2023-11-02 16:25:33,835] A new study created in memory with name: no-name-5f45fc82-c9f7-4cc7-8cb5-a8cb07a059f9
[I 2023-11-02 16:26:48,369] Trial 0 finished with value: 0.8674781976744186 and parameters: {'max_depth': 6, 'subsample': 0.9917190598023169, 'colsample_bytree': 0.8684108970263503, 'colsample_bylevel': 0.6142567983940939, 'alpha': 0.05644072622110936, 'lambda': 0.014936039534876301, 'gamma': 0.02540877522974605, 'min_child_weight': 233, 'booster': 'dart', 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.8674781976744186.
[I 2023-11-02 16:26:55,022] Trial 1 finished with value: 0.8698764534883722 and parameters: {'max_depth': 10, 'subsample': 0.7460680694571806, 'colsample_bytree': 0.574610471522514, 'colsample_bylevel': 0.7523817691122445, 'alpha': 0.0906510490081258, 'lambda': 0.47942736479721376, 'gamma': 1.6288886126515894, 'min_child_weight': 183, 'booster': 'gbtree', 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.8698764534883722.
[I 2023-11-02 1


Best trial:
  Iteration: 493
  Value (Accuracy): 0.9854
  Params:
    max_depth: 14
    subsample: 0.6895526061828922
    colsample_bytree: 0.6178958267070939
    colsample_bylevel: 0.8320749511534301
    alpha: 0.0026219405648692895
    lambda: 0.0010142833428147427
    gamma: 0.000331778291456572
    min_child_weight: 1
    booster: gbtree
    grow_policy: lossguide


# Entraînement du modèle final

In [8]:
train_data = pd.read_csv("./data/GAN_train.csv")
y = train_data["Label"]
X = train_data.drop(columns=["Label"])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)

In [9]:
train_data

Unnamed: 0,TMQ,U850,V850,UBOT,VBOT,QREFHT,PS,PSL,T200,T500,PRECT,TS,TREFHT,Z1000,Z200,ZBOT,Label,LOCATION,MONTH
0,33.233349,-1.323338,-2.876757,-1.104712,-3.018848,0.015944,101478.914100,101478.95310,214.135406,262.189911,0.003701,300.198181,298.744324,129.496017,12231.834960,65.871857,0,3,4
1,42.230705,-5.575889,1.676216,-8.121222,-2.047163,0.018948,101354.992200,101355.01560,217.118073,265.872925,21.771541,302.329193,301.508942,120.022858,12367.765630,66.578079,0,3,7
2,28.851704,6.540233,-2.425864,3.151410,-4.542639,0.010225,101815.445300,101815.44530,220.517487,255.087524,27.640550,291.032959,289.223969,152.949829,12040.492190,63.534153,2,0,10
3,33.602306,30.092083,-13.667763,14.197247,-18.031784,0.011341,100489.109400,100489.10940,219.173523,259.330139,0.497996,290.632599,292.086822,64.319603,12044.448240,64.319603,2,0,8
4,18.035347,19.857960,-2.524551,13.810221,-3.531109,0.008733,101102.593800,101102.59380,217.271362,259.712280,12.884099,290.607849,290.395599,93.650154,12002.601560,63.753540,0,0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22926,62.326530,-5.594485,-0.313011,-4.709764,-4.878560,0.019827,100325.828100,100325.82810,221.689377,269.748932,24.083189,300.653839,299.989166,66.302177,12487.194340,66.302177,1,5,8
22927,21.466675,-1.014480,2.437929,-5.984308,2.638489,0.010786,102170.273400,102170.27340,219.099594,259.991547,10.295630,296.831757,294.150940,185.347992,12167.036130,64.601463,0,0,2
22928,55.052807,11.844543,-1.656594,8.818943,2.171454,0.020023,100979.710900,100979.71090,219.387283,268.568939,40.373258,302.809631,301.243988,86.892281,12444.893550,66.533775,0,4,9
22929,31.006016,-3.160620,-1.718535,-0.268632,-4.899251,0.015006,100900.312500,100900.31250,220.337585,267.260590,0.000000,299.039764,297.987000,78.818901,12417.227540,65.663666,0,5,9


In [41]:
best = pd.read_csv("bayesian_results.csv", index_col=0)
params = best[best["value"] == best["value"].max()]

# Assigning each hyperparameter to a variable
alpha = params["params_alpha"].values[0]
booster = params["params_booster"].values[0]
colsample_bylevel = params["params_colsample_bylevel"].values[0]
colsample_bytree = params["params_colsample_bytree"].values[0]
eta = params["params_eta"].values[0]
gamma = params["params_gamma"].values[0]
grow_policy = params["params_grow_policy"].values[0]
lambda_param = params["params_lambda"].values[0]
max_depth = params["params_max_depth"].values[0]
min_child_weight = params["params_min_child_weight"].values[0]
scale_pos_weight = params["params_scale_pos_weight"].values[0]
subsample = params["params_subsample"].values[0]

params = {
        'objective': 'multi:softprob',
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'eta': eta,
        'max_depth': max_depth,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'colsample_bylevel': colsample_bylevel,
        'alpha': alpha,
        'lambda': lambda_param,
        'gamma': gamma,
        'min_child_weight': min_child_weight,
        'booster': booster,
        'grow_policy': grow_policy,
        'scale_pos_weight': scale_pos_weight,
        'tree_method': 'gpu_hist',
        'gpu_id': 0,
        'seed': 42
    }

In [42]:
# Calculate class weights for this fold
class_weights = compute_class_weight('balanced', classes=[0, 1, 2], y=y_train)
weights = y_train.map({0: class_weights[0], 1: class_weights[1], 2: class_weights[2]})

train_dmatrix = DMatrix(X_train, label=y_train, weight=weights)
val_dmatrix = DMatrix(X_val, label=y_val)

watchlist = [(train_dmatrix, 'train'), (val_dmatrix, 'validation')]

xgb_model = train(
    params,
    train_dmatrix,
    num_boost_round=1_000_000,
    evals=watchlist,
    early_stopping_rounds=20,
    verbose_eval=True
)

[0]	train-mlogloss:0.93289	validation-mlogloss:0.94410
[1]	train-mlogloss:0.80274	validation-mlogloss:0.82483
[2]	train-mlogloss:0.69087	validation-mlogloss:0.71549
[3]	train-mlogloss:0.59888	validation-mlogloss:0.62619
[4]	train-mlogloss:0.52203	validation-mlogloss:0.55188
[5]	train-mlogloss:0.45724	validation-mlogloss:0.49039
[6]	train-mlogloss:0.40145	validation-mlogloss:0.43615
[7]	train-mlogloss:0.35302	validation-mlogloss:0.38841
[8]	train-mlogloss:0.31123	validation-mlogloss:0.34725
[9]	train-mlogloss:0.27511	validation-mlogloss:0.31189
[10]	train-mlogloss:0.24431	validation-mlogloss:0.28185
[11]	train-mlogloss:0.21636	validation-mlogloss:0.25295
[12]	train-mlogloss:0.19227	validation-mlogloss:0.22795
[13]	train-mlogloss:0.17094	validation-mlogloss:0.20582
[14]	train-mlogloss:0.15197	validation-mlogloss:0.18605
[15]	train-mlogloss:0.13567	validation-mlogloss:0.16934
[16]	train-mlogloss:0.12095	validation-mlogloss:0.15393
[17]	train-mlogloss:0.10833	validation-mlogloss:0.14104
[1

In [47]:
class_weights = compute_class_weight('balanced', classes=[0, 1, 2], y=y)
weights = y.map({0: class_weights[0], 1: class_weights[1], 2: class_weights[2]})
total_train = DMatrix(X, label=y, weight=weights)

watchlist = [(total_train, 'train')]

xgb_model2 = train(
    params,
    total_train,
    num_boost_round=xgb_model.best_iteration,
    evals=watchlist,
    verbose_eval=True
)

[0]	train-mlogloss:0.93384
[1]	train-mlogloss:0.80317
[2]	train-mlogloss:0.69150
[3]	train-mlogloss:0.59880
[4]	train-mlogloss:0.52122
[5]	train-mlogloss:0.45636
[6]	train-mlogloss:0.40031
[7]	train-mlogloss:0.35206
[8]	train-mlogloss:0.31034
[9]	train-mlogloss:0.27404
[10]	train-mlogloss:0.24304
[11]	train-mlogloss:0.21498
[12]	train-mlogloss:0.19035
[13]	train-mlogloss:0.16876
[14]	train-mlogloss:0.14981
[15]	train-mlogloss:0.13358
[16]	train-mlogloss:0.11948
[17]	train-mlogloss:0.10689
[18]	train-mlogloss:0.09561
[19]	train-mlogloss:0.08554
[20]	train-mlogloss:0.07659
[21]	train-mlogloss:0.06855
[22]	train-mlogloss:0.06133
[23]	train-mlogloss:0.05505
[24]	train-mlogloss:0.04925
[25]	train-mlogloss:0.04414
[26]	train-mlogloss:0.03996
[27]	train-mlogloss:0.03600
[28]	train-mlogloss:0.03246
[29]	train-mlogloss:0.02945
[30]	train-mlogloss:0.02671
[31]	train-mlogloss:0.02411
[32]	train-mlogloss:0.02211
[33]	train-mlogloss:0.02017
[34]	train-mlogloss:0.01849
[35]	train-mlogloss:0.01679
[3

# Calcul des prédictions sur le jeu de données test

In [108]:
from sklearn.decomposition import PCA
import pandas as pd

test_data = pd.read_csv("./data/test.csv", index_col=0)
test_data = format_data(test_data, is_test=True)

# List of categorical columns
categorical_columns = ['YEAR', 'MONTH', 'DAY', 'SOUTHERN_HEMISPHERE', 
                       'LOC_0', 'LOC_1', 'LOC_2', 'LOC_3', 'LOC_4', 'LOC_5',
                       'SEASON_Fall', 'SEASON_Spring', 'SEASON_Summer', 'SEASON_Winter',
                       'Label']

# Separate continuous and categorical data
continuous_data = test_data.drop(columns=categorical_columns)
categorical_data = test_data[categorical_columns]

# Apply whitening to continuous_data (for demonstration, using a placeholder function)
original_index = continuous_data.index

# Apply PCA
pca = PCA(whiten=True)
continuous_data_whitened = pca.fit_transform(continuous_data)

# Convert back to DataFrame while assigning the original index
continuous_data_whitened = pd.DataFrame(continuous_data_whitened, columns=continuous_data.columns, index=original_index)

# Combine the whitened continuous data dataframe with the categorical data dataframe
combined_data_test = pd.concat([continuous_data_whitened, categorical_data], axis=1)

combined_data_test.head()

KeyError: "['SEASON_Spring', 'SEASON_Winter', 'Label'] not found in axis"

In [118]:
from sklearn.decomposition import PCA
import pandas as pd

test_data = pd.read_csv("./data/test.csv", index_col=0)
test_data = format_data(test_data, is_test=True)
test_data["TMQ"]

SNo
1        25.907482
2        25.907482
3        27.019733
4        27.019733
5        26.516499
           ...    
10316    57.411018
10317    57.277252
10318    57.277252
10319    54.855862
10320    54.855862
Name: TMQ, Length: 10320, dtype: float64

In [119]:
combined_data["TMQ"]

KeyError: 'TMQ'

In [96]:
import pandas as pd

# Assuming train_data has been previously loaded
test_data = pd.read_csv("./data/test.csv")

# Format the test_data
test_data = format_data(test_data, is_test=True)

# List of columns from train_data excluding the label
columns_without_label = [col for col in combined_data.columns if col != "Label"]

# Check and add missing columns to test_data and set their values to 0
missing_columns = ['SEASON_Spring', 'SEASON_Winter']

for col in missing_columns:
    if col not in test_data.columns:
        test_data[col] = 0

# Reorder the columns of test_data to match the order in train_data
test_data = test_data[columns_without_label]

test_data



Unnamed: 0,U850,V850,UBOT,VBOT,QREFHT,PS,PSL,T200,T500,PRECT,...,LOC_0,LOC_1,LOC_2,LOC_3,LOC_4,LOC_5,SEASON_Fall,SEASON_Spring,SEASON_Summer,SEASON_Winter
0,6.662070,-17.510447,-7.432653,-3.936030,0.010624,101532.5391,101532.5391,213.092209,256.032043,0.012060,...,1,0,0,0,0,0,1,0,0,0
1,6.662070,-17.510447,-7.432653,-3.936030,0.010624,101532.5391,101532.5391,213.092209,256.032043,0.012060,...,1,0,0,0,0,0,1,0,0,0
2,4.951319,-17.341263,-7.286631,-3.150316,0.010890,101513.0234,101513.0234,213.161011,255.616837,0.000068,...,1,0,0,0,0,0,1,0,0,0
3,4.951319,-17.341263,-7.286631,-3.150316,0.010890,101513.0234,101513.0234,213.161011,255.616837,0.000068,...,1,0,0,0,0,0,1,0,0,0
4,5.362008,-17.227922,-7.257047,-2.907396,0.010821,101505.1484,101505.1484,213.188248,255.498810,0.004068,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10315,3.704696,17.998688,-8.723821,14.131296,0.020041,100295.9375,100295.9375,222.533188,267.842407,1.126800,...,0,0,0,1,0,0,1,0,0,0
10316,5.944778,18.429197,-8.424341,15.523009,0.020222,100210.2891,100210.2891,221.928162,268.028778,2.026800,...,0,0,0,1,0,0,1,0,0,0
10317,5.944778,18.429197,-8.424341,15.523009,0.020222,100210.2891,100210.2891,221.928162,268.028778,2.026800,...,0,0,0,1,0,0,1,0,0,0
10318,8.170049,19.165062,-6.791406,16.326492,0.020324,100116.5234,100116.5234,220.904862,267.992645,1.904400,...,0,0,0,1,0,0,1,0,0,0


In [97]:
dtest = xgb.DMatrix(test_data)
y_test = xgb_model.predict(dtest)
y_test = [np.argmax(value) for value in y_test]

In [98]:
labels, counts = np.unique(y_test, return_counts=True)

for label, count in zip(labels, counts):
    print(f"Label {label}: {count} occurrences")

Label 1: 3363 occurrences
Label 2: 6957 occurrences


In [52]:
df = pd.DataFrame({
    'SNo': range(1, len(y_test) + 1),
    'Label': y_test
})

df.to_csv("xgboost2_prev.csv", index=False)