In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from hyperopt import fmin, tpe, hp, Trials
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
import hyperopt

In [2]:
data = pd.read_csv("../Data/CleanedData.csv")

In [3]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X = tfidf_vectorizer.fit_transform(data['transformed text'])
y = data['subject'].values

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
oversampler = SMOTE(random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [6]:
xg_clf = xgb.XGBClassifier(objective='multi:softprob', num_class=6)

In [7]:
search_space = {
    'n_estimators': hyperopt.hp.choice('n_estimators', [100, 200, 300, 400, 500]),
    'max_depth': hyperopt.hp.choice('max_depth', [3, 4, 5, 6, 7]),
    'learning_rate': hyperopt.hp.choice('learning_rate', [0.01, 0.05, 0.1]),
    'reg_alpha': hyperopt.hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hyperopt.hp.uniform('reg_lambda', 0, 1)
}

In [8]:
def objective(params):
    xg_clf = xgb.XGBClassifier(**params)
    xg_clf.fit(X_train, y_train)

    y_pred = xg_clf.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')

    return -f1

In [9]:
best_params = hyperopt.fmin(fn=objective, space=search_space, algo=hyperopt.tpe.suggest, max_evals=50)

100%|████| 50/50 [11:14:17<00:00, 809.15s/trial, best loss: -0.7723709195044903]


In [15]:
print('Best parameters:', best_params)


Best parameters: {'learning_rate': 2, 'max_depth': 0, 'n_estimators': 3, 'reg_alpha': 0.7896796965233772, 'reg_lambda': 0.6670024028587416}


In [16]:
# Train the XGBoost classifier with the best parameters
xg_clf = xgb.XGBClassifier(learning_rate=0.1,max_depth =6,n_estimators=400,reg_alpha=0.7896796965233772,reg_lambda=0.6670024028587416)
xg_clf.fit(X_train, y_train)

# Evaluate the XGBoost classifier on the test set
y_pred = xg_clf.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')

print('F1 score on test set:', f1)

F1 score on test set: 0.7469063790095996
