In [1]:
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('../Data/CleanedEcommerce.csv')

In [3]:
text = 'desc'
label = 'label'
num_classes = df[label].nunique()

In [4]:
df.head()

Unnamed: 0,label,desc
0,Household,paper plane design frame wall hang motiv offic...
1,Household,saf frame paint wood 30 inch x 10 inch special...
2,Household,saf textur modern art print frame paint synthe...
3,Household,saf flower print frame paint synthet 13 5 inch...
4,Household,incred gift india wooden happi birthday uniqu ...


In [5]:
df = df.dropna()

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf= TfidfVectorizer(max_features=5000)

In [7]:
label_encoder = LabelEncoder()

df[label] = label_encoder.fit_transform(df[label])

In [8]:
x= tfidf.fit_transform(df[text])
y= df[label].values

In [9]:

X_train, X_test, y_train, y_test= train_test_split(x, y, test_size=0.2, random_state = 42, stratify = y)

X_val, X_test, y_val, y_test= train_test_split(X_test, y_test, test_size=0.4, random_state = 42)

# HyperParameter Tuning

In [29]:
space = {
    'n_estimators': hp.quniform('n_estimators', 10, 500, 10),  # Number of trees
    'max_depth': hp.quniform('max_depth', 2, 500, 1),  # Maximum depth of the trees
    'min_samples_split': hp.quniform('min_samples_split', 2, 50, 1),  # Minimum samples required to split a node
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),  # Minimum samples required in a leaf node
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2', None]),  # Number of features to consider for the best split
    'bootstrap': hp.choice('bootstrap', [True, False]),  # Whether to bootstrap samples when building trees
}

In [34]:
# Objective function to optimize
def objective(params):
    clf = RandomForestClassifier(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        min_samples_split=int(params['min_samples_split']),
        min_samples_leaf=int(params['min_samples_leaf']),
        max_features = params['max_features'],
        bootstrap = params['bootstrap'],
        random_state=42,
        n_jobs=-1
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    f1 = f1_score(y_val, y_pred, average = 'weighted')
    if f1 < 0.96:
        return {'loss':1- f1, 'status': STATUS_OK}
    else:
        return {'loss':1.0, 'status': STATUS_OK}

In [35]:
# Perform hyperparameter tuning
trials = Trials()

# Measure execution time
start_time = time.time()

best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=70, trials=trials)

end_time = time.time()
execution_time = end_time - start_time

# Print best hyperparameters
print("Best hyperparameters:")
print(best)

# Calculate and print F1 score of the best parameters
best_params = {
    'n_estimators': int(best['n_estimators']),
    'max_depth': int(best['max_depth']),
    'min_samples_split': int(best['min_samples_split']),
    'min_samples_leaf': int(best['min_samples_leaf']),
    'random_state': 42,
    'n_jobs':-1
}
clf = RandomForestClassifier(**best_params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
f1_from_best_loss = -(trials.best_trial['result']['loss']-1)
best_f1 = f1_score(y_test, y_pred, average = 'weighted')
class_report = classification_report(y_test,y_pred)
print("F1 score from Loss:", f1_from_best_loss )
print("F1 score of the best parameters:", best_f1)
print("Classification Report:\n", class_report)
# Print execution time
print("Execution time:", execution_time, "seconds")

100%|████████| 70/70 [22:42<00:00, 19.46s/trial, best loss: 0.06730767651755931]
Best hyperparameters:
{'bootstrap': 1, 'max_depth': 494.0, 'max_features': 1, 'min_samples_leaf': 1.0, 'min_samples_split': 12.0, 'n_estimators': 410.0}
F1 score from Loss: 0.9326923234824407
F1 score of the best parameters: 0.9260730785832957
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93       489
           1       0.95      0.96      0.96       449
           2       0.93      0.86      0.90       410
           3       0.90      0.94      0.92       877

    accuracy                           0.93      2225
   macro avg       0.93      0.92      0.93      2225
weighted avg       0.93      0.93      0.93      2225

Execution time: 1362.177012205124 seconds
