In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import gc
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import ParameterSampler

# Load the train and test splits from parquet.
train_data = pd.read_parquet('../../../data/CIC_2018/train_dataset_treated.parquet')
test_data = pd.read_parquet('../../../data/CIC_2018/test_dataset_treated.parquet')

# Features: every column except the target, cast to float32.
train_X = train_data.drop(columns=['Label']).astype(np.float32)
test_X = test_data.drop(columns=['Label']).astype(np.float32)

# Encode the target as integer class codes. The label -> code mapping is
# derived from the training labels only and then reused for the test labels,
# so a given label always maps to the same integer in both splits. Encoding
# each split independently would silently shift the codes whenever the two
# splits do not contain exactly the same set of labels.
train_labels = train_data['Label'].astype('category')
label_categories = train_labels.cat.categories
train_y = np.asarray(train_labels.cat.codes, dtype=np.int64)
test_y = np.asarray(
    pd.Categorical(test_data['Label'], categories=label_categories).codes,
    dtype=np.int64,
)

# pandas encodes values that are not in `categories` (and missing values)
# as -1; fail loudly instead of scoring against a bogus class.
if (test_y < 0).any():
    unseen_labels = sorted(
        set(map(str, test_data['Label'].unique()))
        - set(map(str, label_categories))
    )
    raise ValueError(
        "Test labels contain values that are missing or absent from the "
        f"training labels: {unseen_labels}"
    )

# Number of distinct target classes observed in the training labels.
num_classes = len(np.unique(train_y))

# Candidate values for each tuned XGBoost hyperparameter.
# `scale_pos_weight` is deliberately not searched: it re-weights the positive
# class of a binary objective and is not used by the multi-class softmax
# objective configured below, so sampling it only spent random-search draws
# on combinations that train identical models. Handling class imbalance in a
# multi-class setting would need per-row `sample_weight` passed to `fit`.
param_dist = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_child_weight': [1, 3],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8],
    'gamma': [0, 0.1],
}

# Draw 10 combinations from the space above; the fixed seed makes the draw
# repeatable between runs.
param_sampler = ParameterSampler(param_dist, n_iter=10, random_state=42)

# Best candidate seen so far (weighted F1, its parameters, the fitted model).
best_f1_score = 0
best_params = None
best_model = None

# NOTE(review): candidates are ranked by their F1 on the test split, so the
# winning score is an optimistic estimate. Ranking on a validation split
# carved out of the training data would keep the test split untouched.
for params in param_sampler:
    print(f"Training with parameters: {params}")
    # Every key of `params` is an XGBClassifier keyword argument, so the
    # sampled combination is forwarded as-is.
    model = xgb.XGBClassifier(
        use_label_encoder=False,
        objective='multi:softmax',
        num_class=num_classes,
        eval_metric='mlogloss',
        **params,
    )

    # No early stopping is configured; eval_set only makes the model record
    # the per-round mlogloss on the test split.
    model.fit(train_X, train_y, eval_set=[(test_X, test_y)], verbose=False)

    predictions = model.predict(test_X)

    f1 = f1_score(test_y, predictions, average='weighted')
    print(f"F1 Score: {f1}")

    # Always keep the first candidate so a model is selected even when every
    # F1 score is 0; afterwards only a strictly better score replaces it.
    if best_model is None or f1 > best_f1_score:
        best_f1_score = f1
        best_params = params
        best_model = model

    # Drop the loop's reference so a non-winning model can be reclaimed
    # before the next fit; the current winner stays alive via `best_model`.
    del model
    gc.collect()

# Summarise the outcome of the hyperparameter search.
print(f"Best parameters: {best_params}")
print(f"Best F1 score: {best_f1_score}")

# Explicit None check: `best_model` is either None (no candidate was kept)
# or a fitted estimator, and an estimator's truthiness is not a reliable
# stand-in for "was a model selected".
if best_model is not None:
    # Per-class precision/recall/F1 of the selected model on the test split.
    predictions = best_model.predict(test_X)
    print(classification_report(test_y, predictions))

gc.collect()