In [1]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import f1_score, classification_report,accuracy_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder



In [2]:
df = pd.read_csv('../Data/CleanedEcommerce.csv')

In [3]:
df.head()

Unnamed: 0,label,desc
0,Household,paper plane design frame wall hang motiv offic...
1,Household,saf frame paint wood 30 inch x 10 inch special...
2,Household,saf textur modern art print frame paint synthe...
3,Household,saf flower print frame paint synthet 13 5 inch...
4,Household,incred gift india wooden happi birthday uniqu ...


In [4]:
text = 'desc'
label = 'label'
num_classes = df[label].nunique()

In [5]:
df.shape

(27802, 2)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)

In [7]:
df = df.dropna()

In [8]:
label_encoder = LabelEncoder()

df[label] = label_encoder.fit_transform(df[label])

In [9]:
X= tfidf.fit_transform(df[text]).toarray()
y= df[label].values

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=42, stratify = y)
X_val,X_test,y_val,y_test= train_test_split(X_test,y_test,test_size=0.4,random_state=42)


In [12]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)


In [14]:
from collections import Counter

label_distribution = Counter(y)

print(label_distribution)

Counter({3: 10564, 0: 6255, 1: 5674, 2: 5308})


In [19]:
def objective(params):
    if params['classifier']['type'] == 'MultinomialNB':
        classifier = MultinomialNB(alpha=params['classifier']['alpha'])
    elif params['classifier']['type'] == 'GaussianNB':
        classifier = GaussianNB(var_smoothing=params['classifier']['var_smoothing'])
    else:
        classifier = BernoulliNB(alpha=params['classifier']['alpha'], binarize=params['classifier']['binarize'])
    
   # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = classifier.predict(X_val)
    
    # Calculate F1 score
    f1 = f1_score(y_val, y_pred, average='weighted')
    
    return {'loss': -f1, 'status': STATUS_OK}


In [20]:
space = {
    'classifier': hp.choice('classifier', [
        {
            'type': 'MultinomialNB',
            'alpha': hp.loguniform('alpha_mnb', 1e-6, 2),
        },
        {
            'type': 'GaussianNB',
            'var_smoothing': hp.loguniform('var_smoothing', 1e-9, 1e-1),
        },
        {
            'type': 'BernoulliNB',
            'alpha': hp.loguniform('alpha_bnb', 1e-5, 1),
            'binarize': hp.uniform('binarize_bnb', 0.0, 1.0),
        }
    ])
}


In [21]:
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, rstate=np.random.default_rng(42))

print("Best Hyperparameters:")
print(best)

100%|██████| 100/100 [01:08<00:00,  1.45trial/s, best loss: -0.9316053826853495]
Best Hyperparameters:
{'alpha_mnb': 1.158728145418749, 'classifier': 0}


In [18]:
mnb = (alpha = 1.23443343650026,binarize = 0.12838447657183444)
bnb.fit(X_train,y_train)
y_pred = bnb.predict(X_test)
acc = accuracy_score(y_test,y_pred)
f1= f1_score(y_test,y_pred, average='weighted')
class_report = classification_report(y_test,y_pred)
print('Accuracy :', acc)
print('F1 Score :',f1)
print(class_report)

Accuracy : 0.9303370786516854
F1 Score : 0.9303328087539924
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       489
           1       0.94      0.97      0.95       449
           2       0.89      0.91      0.90       410
           3       0.94      0.92      0.93       877

    accuracy                           0.93      2225
   macro avg       0.93      0.93      0.93      2225
weighted avg       0.93      0.93      0.93      2225



## Multinomial Naive Bayes gives the best accuracy, so we will proceed with it

# Using Word2Vec vectors for training

In [26]:
df = pd.read_csv('word2vecData.csv')


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43125 entries, 0 to 43124
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  43125 non-null  int64 
 1   vec      43125 non-null  object
dtypes: int64(1), object(1)
memory usage: 674.0+ KB


In [31]:
import ast

# Convert the string representation to a list
df['vec'] = df['vec'].apply(ast.literal_eval)

# Verify the data type of the column
print(type(df['vec'].iloc[0]))

<class 'list'>


In [32]:
X = df['vec'].to_list()
y = df['subject'].to_list()

In [33]:
X[0]

[0.017181913731461863,
 0.054051674018471926,
 0.02501760364252295,
 0.1145188498631709,
 -0.07854115222133486,
 0.013064093508962857,
 0.03582854190115201,
 -0.06757971122439972,
 0.07362481025652697,
 0.06868105958410575,
 -0.04545136360125353,
 -0.07671412236272952,
 -0.0677699288405941,
 0.027317779885847018,
 -0.07981907041732875,
 0.07597549352268715,
 0.0445443708344368,
 0.0843538618357168,
 -0.014792016670528778,
 -0.10628513831876765,
 0.006775034349516961,
 0.018582618842690676,
 0.07434383489317813,
 -0.03250578971905897,
 0.025516833289194914,
 0.015695469527594787,
 -0.06637474641961566,
 0.05962376136564265,
 0.06179611292262535,
 -0.01286162899038886,
 0.005549328475348694,
 0.011451300928148172,
 -0.053575073931850285,
 -0.01066235903292726,
 0.017091136867717162,
 0.004068493169579802,
 0.01690352703891905,
 0.029450562040684587,
 0.033840976865951625,
 0.06556245836160951,
 0.09580701892658816,
 -0.0669507387667726,
 0.11927528165828037,
 0.04103657350701801,
 -0.020

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, classification_report

mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# Make predictions on the test data
y_pred = mnb.predict(X_test)

# Calculate weighted F1 score
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the classification report, including precision, recall, and F1 score for each class
class_report = classification_report(y_test, y_pred)

print("Weighted F1 Score:",f1)
print("Classification Report:\n", class_report)

Weighted F1 Score: 0.7303612414248749
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.64      0.65      3597
           1       0.72      0.78      0.75      3496
           2       0.94      0.89      0.92      3520
           3       0.55      0.62      0.58      3443
           4       0.65      0.52      0.58      3494
           5       0.86      0.95      0.90      3522

    accuracy                           0.73     21072
   macro avg       0.73      0.73      0.73     21072
weighted avg       0.73      0.73      0.73     21072

