In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from hyperopt import fmin, tpe, hp, Trials
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
import hyperopt
import time
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('../Data/CleanedEcommerce.csv')

In [3]:
df = df.dropna()

In [4]:
text = 'desc'
label = 'label'
num_classes = df[label].nunique()

In [5]:
label_encoder = LabelEncoder()

df[label] = label_encoder.fit_transform(df[label])

In [6]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X = tfidf_vectorizer.fit_transform(df[text]).toarray()
y = df[label].values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.4, random_state=42)

In [8]:
oversampler = SMOTE(random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [9]:
xg_clf = xgb.XGBClassifier(objective='multi:softprob', num_class=num_classes, verbose = 4, n_jobs=-1)

In [10]:
start = time.time()
xg_clf.fit(X_train, y_train)
end = time.time()
print("Time taken for Tuning", (end-start)/60, "Mins")

y_pred = xg_clf.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
cr = classification_report(y_test,y_pred)
print("F1 SCore:", f1)
print("Classification Report :\n", cr)

Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Time taken for Tuning 3.5330618699391683 Mins
F1 SCore: 0.9284842023567392
Classification Report :/n               precision    recall  f1-score   support

           0       0.89      0.93      0.91       489
           1       0.95      0.97      0.96       449
           2       0.93      0.89      0.91       410
           3       0.94      0.92      0.93       877

    accuracy                           0.93      2225
   macro avg       0.93      0.93      0.93      2225
weighted avg       0.93      0.93      0.93      2225



In [40]:
search_space = {
    'n_estimators': hyperopt.hp.choice('n_estimators', [10,15,20,30,35,40,50]),
    'max_depth': hyperopt.hp.choice('max_depth', [10,20,30,40,50,60,70]),
    'learning_rate': hyperopt.hp.choice('learning_rate', [0.01, 0.05, 0.1]),
    'reg_alpha': hyperopt.hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hyperopt.hp.uniform('reg_lambda', 0, 1)
}

In [41]:
def objective(params):
    xg_clf = xgb.XGBClassifier(**params,n_jobs=-1)
    xg_clf.fit(X_train, y_train)

    y_pred = xg_clf.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='weighted')
    if f1<0.97:
        return -f1
    else:
        return 1.00

In [42]:
start = time.time()
best_params = hyperopt.fmin(
    fn=objective, space=search_space, algo=hyperopt.tpe.suggest, max_evals=30)
end = time.time()
print("Time taken for Tuning", (end-start)/3600, "Hours")

100%|█████| 30/30 [4:14:09<00:00, 508.33s/trial, best loss: -0.9312937345264315]
Time taken for Tuning 4.236089823378457 Hours


In [43]:
print('Best parameters:', best_params)


Best parameters: {'learning_rate': 2, 'max_depth': 6, 'n_estimators': 6, 'reg_alpha': 0.004474355059321249, 'reg_lambda': 0.9357284038747243}


In [45]:
# Train the XGBoost classifier with the best parameters
xg_clf = xgb.XGBClassifier(
    learning_rate=0.1,
    max_depth =60,
    n_estimators=40,
    reg_alpha=0.004474355059321249,
    reg_lambda=0.9357284038,
    n_jobs=-1
)
xg_clf.fit(X_train, y_train)

# Evaluate the XGBoost classifier on the test set
y_pred = xg_clf.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
clr_report = classification_report(y_test,y_pred)
print('F1 score on test set:', f1)
print('Classification Report on test set:\n', clr_report)


F1 score on test set: 0.9276117454095436
Classification Report on test set:
               precision    recall  f1-score   support

           0       0.91      0.92      0.92       489
           1       0.96      0.96      0.96       449
           2       0.92      0.90      0.91       410
           3       0.92      0.92      0.92       877

    accuracy                           0.93      2225
   macro avg       0.93      0.93      0.93      2225
weighted avg       0.93      0.93      0.93      2225

