In [9]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import gensim
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../Data/CleanedEcommerce.csv')

In [3]:
text = 'desc'
label = 'label'
num_classes = df[label].nunique()

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf= TfidfVectorizer(max_features=5000)

In [5]:
df = df.dropna()

In [6]:
label_encoder = LabelEncoder()

df[label] = label_encoder.fit_transform(df[label])

In [7]:
X = tfidf.fit_transform(df[text]).toarray()
y = df[label].values

In [8]:
y.shape

(27801,)

In [10]:

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state = 42, stratify=y)

X_val, X_test, y_val, y_test= train_test_split(X_test, y_test, test_size=0.4, random_state = 42)

In [11]:
y_test.shape

(2225,)

In [12]:
X_test.shape

(2225, 5000)

In [13]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_test, y_test = smote.fit_resample(X_test, y_test)


# HyperParameter Tuning using HyperOpt

In [14]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, classification_report
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval


In [1]:
space = {
    'max_depth': hp.quniform('max_depth', 1, 200, 1),
    'min_samples_split': hp.quniform('min_samples_split', 1, 500, 2),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 500, 1),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', None, 0.5, 0.75,1,2,3,4,5]),
    'criterion': hp.choice('criterion', ['gini', 'entropy'])
}


NameError: name 'hp' is not defined

In [2]:
def objective(params):
    # Define hyperparameters
    max_depth = int(params['max_depth'])
    min_samples_split = int(params['min_samples_split'])
    min_samples_leaf = int(params['min_samples_leaf'])
    max_features = params['max_features']
    criterion = params['criterion']
    
    # Create a Decision Tree Classifier with the specified hyperparameters
    clf = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split,
                                 min_samples_leaf=min_samples_leaf, max_features=max_features,
                                 criterion=criterion, random_state=42)
    
    # Train the classifier
    clf.fit(X_resampled, y_resampled)
    
    # Make predictions on the test set
    y_pred = clf.predict(X_val)
    
    # Calculate F1 score
    f1 = f1_score(y_val, y_pred, average='weighted')
    
    return {'loss': -f1, 'status': STATUS_OK}

In [36]:
import time
trials = Trials()
start  =time.time()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,  # Number of optimization iterations
            trials=trials,
            rstate=np.random.default_rng(42))

# Get the best hyperparameters and the corresponding loss
best_hyperparams = space_eval(space, best)
best_loss = -trials.best_trial['result']['loss']
end = time.time()
print("The time of execution of the above program is:", (end - start)/3600, "Hours")
# Print the best hyperparameters and corresponding loss
print("Best Hyperparameters:")
print(best_hyperparams)
print("Best Weighted F1 Score (Loss):", best_loss)

100%|██████| 100/100 [14:23<00:00,  8.63s/trial, best loss: -0.8596600099844761]
The time of execution of the above program is: 0.2398080016507043 Hours
Best Hyperparameters:
{'criterion': 'entropy', 'max_depth': 166.0, 'max_features': None, 'min_samples_leaf': 2.0, 'min_samples_split': 64.0}
Best Weighted F1 Score (Loss): 0.8596600099844761


In [37]:
#Train with Best HyperParameters Obtained
decision_tree_classifier = DecisionTreeClassifier(
    max_depth = 166, 
    max_features= None,
    min_samples_leaf=2,
    min_samples_split=64,
    criterion='entropy'

)
decision_tree_classifier.fit(X_resampled, y_resampled)
y_pred = decision_tree_classifier.predict(X_test)
f1_weighted = f1_score(y_test, y_pred, average='weighted')
class_report= classification_report(y_test, y_pred)
print(f"Weighted F1 Score: {f1_weighted:.2f}")
print(f"Classification Report:\n {class_report}")

Weighted F1 Score: 0.88
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90       877
           1       0.95      0.93      0.94       877
           2       0.88      0.85      0.86       877
           3       0.81      0.84      0.83       877

    accuracy                           0.88      3508
   macro avg       0.88      0.88      0.88      3508
weighted avg       0.88      0.88      0.88      3508

