In [5]:
import numpy as np

import pandas as pd

In [8]:
# Load the CSV into a DataFrame; the path is relative to the working directory.
data_csv = '../Data/Model_Data.csv'
df = pd.read_csv(data_csv)

In [None]:
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, classification_report
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
import joblib

# Separate the target column of `df` from the remaining feature columns.
target_column = 'label'
y = df[target_column]
X = df.drop(columns=target_column)

# Hold out 20% of the rows as a test set. Stratifying on the target keeps the
# class proportions equal in both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Metrics recorded for every candidate during cross-validation, keyed by the
# name under which the search reports them (and by which `refit` selects one).
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
}

# Define the pipeline: resample -> standardize -> classify.
# imblearn's Pipeline applies the resampling step only while fitting, so data
# passed to predict() is scaled and classified but never resampled.
# NOTE(review): SMOTEENN is neighbour-based and, with this step order, it sees
# the features before they are standardized — confirm that is intended.
estimator = Pipeline([
    # Seeded so the synthetic samples generated by SMOTE are repeatable.
    ('smoteenn', SMOTEENN(random_state=42)),
    ('scaler', StandardScaler()),  # Standardization
    ('classification', XGBClassifier(
        # eval_metric is only consulted when an evaluation set is supplied to
        # fit(); none is passed in this script, so it does not steer training.
        eval_metric='auc',
        use_label_encoder=False,
        random_state=42,  # Seeded so row/column subsampling is repeatable
    )),
])

# Search space for the randomized search. Keys follow the `<step>__<param>`
# convention, so each entry targets one parameter of one pipeline step.

# Resampling step: candidate values for SMOTEENN's sampling_strategy.
resampler_space = {
    'smoteenn__sampling_strategy': [0.1, 0.3, 0.5, 0.7, 1.0],
}

# Classification step: XGBoost hyper-parameters.
xgb_space = {
    'classification__max_depth': [5, 7, 9, 15],  # Limit max_depth
    'classification__n_estimators': [200, 300, 500],  # Keep n_estimators within a sensible range
    'classification__learning_rate': [0.001, 0.005, 0.01],  # Retained learning-rate values
    'classification__colsample_bytree': [0.7, 0.8, 1.0],
    'classification__subsample': [0.8, 1.0],
    'classification__min_child_weight': [1, 2, 3],  # Reasonable min_child_weight values
    'classification__scale_pos_weight': [5, 10, 20, 30, 50, 100],  # Tuned scale_pos_weight values
}

param_grid = {**resampler_space, **xgb_space}

# Randomized search with cross-validation: samples n_iter parameter settings
# from param_grid and scores each one on 3 stratified, shuffled folds.
random_search = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=param_grid,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    scoring=scoring,
    refit='f1',  # Pick the best candidate by F1 and refit it on the full training set
    n_iter=50,  # Number of parameter settings sampled
    n_jobs=-1,  # Use all available CPUs
    random_state=42,  # Seed the candidate sampling so the search is repeatable
    verbose=1
)

# Run the search on the training partition and keep the pipeline that was
# refit with the winning parameter setting.
best_model = random_search.fit(X_train, y_train).best_estimator_

# Score the held-out test partition with that pipeline.
y_pred = best_model.predict(X_test)

# Print the per-class precision / recall / F1 summary under a heading.
report = classification_report(y_test, y_pred)
print("\nClassification Report:", report, sep="\n")

# Persist the fitted pipeline so it can be reloaded without re-running the search.
joblib.dump(best_model, 'best_xgb_model_tuned.pkl')
