# Imports

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix

# from gensim.models import KeyedVectors
# import gensim.downloader as api
# from huggingface_hub import hf_hub_download

# NLP
import string, re, nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
import spacy
from spellchecker import SpellChecker

# !python -m spacy download en_core_web_sm

# Data

In [None]:
# The objective of the project is to classify a product into the four categories 
# Electronics, Household, Books and Clothing & Accessories, 
# based on its description available in the e-commerce platform.

In [None]:
# Source : https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification
# Code : https://www.kaggle.com/code/sugataghosh/e-commerce-text-classification-tf-idf-word2vec#Text-Normalization 

data = pd.read_csv('data/ecommerceDataset.csv', names = ['category','description'], header = None)
print(data['category'].value_counts())
print(data.shape)
data.head()

# Data Cleaning + TFIDF

In [None]:
# Missing values and duplicate observations
print(pd.Series({"Number of observations with missing values": len(data) - len(data.dropna()),
                 "Number of duplicate observations": data.duplicated().sum()}).to_string())

In [None]:
data.dropna(inplace = True) # Dropping observations with missing values
data.drop_duplicates(inplace = True) # Dropping duplicate observations
data.reset_index(drop = True, inplace = True) # Resetting index

# Manual encoding of labels
label_dict = {'Electronics': 0, 'Household': 1, 'Books': 2, 'Clothing & Accessories': 3}
data = data.replace({'category': label_dict})

In [None]:
data.head()

In [None]:
# Feature-target split
X, y = data.drop('category', axis = 1), data['category']

# Train-test split (from complete data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 40)

# Validation-test split (from test data)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 40)

#### In natural language processing, **text normalization** is the process of transforming text into a single canonical form. We consider a number of text normalization processes. At the end of the section, we combine selected processes into one single function and apply it on the product descriptions.

**Stemming and Lemmatization**
-  Stemming is the process of reducing the words to their root form or stem. It reduces related words to the same stem even if the stem is not a dictionary word. For example, the words introducing, introduced, introduction reduce to a common word introduce. However, the process often produces stems that are not actual words.
  
- Lemmatization offers a more sophisticated approach by utilizing a corpus to match root forms of the words. Unlike stemming, it uses the context in which a word is being used.

In [None]:
# Converting to lowercase
def convert_to_lowercase(text):
    return text.lower()

# Removing whitespaces
def remove_whitespace(text):
    return text.strip()

# Remove Punctuation
def remove_punctuation(text):
    punct_str = string.punctuation # contains a predefined set of punctuation characters.
    punct_str = punct_str.replace("'", "") # discarding apostrophe from the string
    clean_text = "".join(char for char in text if char not in punct_str)
    return clean_text

# Remove stopwords
def remove_stopwords(text):
    regexp = RegexpTokenizer("[\w']+")
    
    stops = stopwords.words("english") # stopwords
    clean_text = " ".join([word for word in regexp.tokenize(text) if word not in stops])
    return clean_text

# Lemmatization
spacy_lemmatizer = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])
def text_lemmatizer(text):
    text_spacy = " ".join([token.lemma_ for token in spacy_lemmatizer(text)])
    return text_spacy

def text_normalizer(text):
    text = convert_to_lowercase(text)
    text = remove_whitespace(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = text_lemmatizer(text) 
    return text

In [None]:
X_train['description'] = X_train['description'].apply(text_normalizer)
X_val['description'] = X_val['description'].apply(text_normalizer)
X_test['description'] = X_test['description'].apply(text_normalizer)

#### TF-IDF (short for term frequency-inverse document frequency), is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

- Corpus contains multiple documents, document contains multiple terms
- Corpus = Collection of Sentences
- Document = Sentence
- Term = Word

- **Term frequency (TF) is the relative frequency of a term within a given document. It is obtained as the number of times a word appears in a text, divided by the total number of words appearing in the text.**
  
- **Inverse document frequency (IDF) measures how common or rare a word is across all documents. It is the logarithmically scaled inverse fraction of the documents that contain the word, obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that ratio.**

#### **Term Frequency (TF) Formula:**  

$$
TF(t, d) = \frac{f_t}{N}
$$

where:  
- $ f_t $ = Number of times term $ t $ appears in document $ d $  
- $ N $ = Total number of terms in document $ d $  

---

#### **Inverse Document Frequency (IDF) Formula:**  

$$
IDF(t) = \log \left(\frac{N_d}{N_t + 1} \right)
$$

where:  
- $ N_d $ = Total number of documents in the corpus  
- $ N_t $ = Number of documents that contain term $ t $  
- The "+1" in the denominator is used to prevent division by zero (i.e., smoothing).  

---

#### **TF-IDF Formula:**  

$$
TF\text{-}IDF(t, d) = TF(t, d) \times IDF(t)
$$

This helps in weighting terms based on their importance in a document relative to the entire corpus.


In [None]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

In [None]:
# TF-IDF vectorization
TfidfVec = TfidfVectorizer()
X_train_tfidf = TfidfVec.fit_transform(X_train["description"])
X_val_tfidf = TfidfVec.transform(X_val["description"])
X_test_tfidf = TfidfVec.transform(X_test["description"])

- ngram_range : tuple (min_n, max_n), default=(1, 1)
- The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used. 
- For example an ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams.

#### Model

In [None]:
model = RandomForestClassifier()
model.fit(X_train_tfidf, y_train)
y_test_pred, y_val_pred = model.predict(X_test_tfidf), model.predict(X_val_tfidf)

In [None]:
y_test =  y_test.values

In [None]:
# 1. Compute Macro F1-score
f1_macro = f1_score(y_test, y_test_pred, average='macro')
print(f"Macro F1-score: {f1_macro:.4f}")

# 2. Classification Report
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

# 3. Compute Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(conf_matrix)

# LSTM

In [24]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anand.thirwani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
data = pd.read_csv('data/ecommerceDataset.csv', names = ['category','description'], header = None)
print(data['category'].value_counts())
print(data.shape)

data.dropna(inplace = True) # Dropping observations with missing values
data.drop_duplicates(inplace = True) # Dropping duplicate observations
data.reset_index(drop = True, inplace = True) # Resetting index

# Manual encoding of labels
label_dict = {'Electronics': 0, 'Household': 1, 'Books': 2, 'Clothing & Accessories': 3}
data = data.replace({'category': label_dict})

data.head()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64
(50425, 2)


  data = data.replace({'category': label_dict})


Unnamed: 0,category,description
0,1,Paper Plane Design Framed Wall Hanging Motivat...
1,1,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,1,SAF 'UV Textured Modern Art Print Framed' Pain...
3,1,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,1,Incredible Gifts India Wooden Happy Birthday U...


In [26]:
# Feature-target split
X, y = data['description'].values, data['category'].values

# Train-test split (from complete data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 40)

# Validation-test split (from test data)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 40)

In [27]:
# Hyperparameters
vocab_size = 5000  # Maximum vocabulary size
max_length = 100  # Maximum sentence length
embedding_dim = 128  # Word embedding dimensions

# Tokenization
# Builds the vocabulary based on X_train. Assigns an index (number) to each word
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>") # tensorflow tokeniser
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
# This replaces words with their assigned indices (numbers).
# if X_train = ["I love this movie", "This movie is amazing"], then X_train_seq = [[3, 4, 1, 2],  [1, 2, 5, 6]]
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform input size
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_length, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

In [29]:
# Define LSTM Model
inputs = layers.Input(shape=(max_length,))
embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)(inputs)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(embedding)
x = layers.Dropout(0.5)(x)
x = layers.Bidirectional(layers.LSTM(32, return_sequences=False))(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(4, activation='softmax')(x)

# Compile Model
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary()




In [None]:
# Train the model
early_stopping = EarlyStopping(monitor='val_loss',  # Monitor validation loss
                               patience=3,          # Stop after 3 epochs of no improvement
                               restore_best_weights=True,  # Restore best weights
                               verbose=1)
history = model.fit(X_train_pad, y_train, epochs=20, batch_size=4, validation_data=(X_val_pad, y_val), callbacks=[early_stopping])

In [None]:
# Plot loss
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training & Validation Loss')
plt.show()

In [None]:
# Since softmax is used, the output will be a probability distribution across the 4 classes.
y_test_pred = model.predict(X_test_pad)
print(y_test_pred)

# To get the predicted class, take the argmax (index of the highest probability):
y_test_pred = np.argmax(y_test_pred, axis=1)
print(y_test_pred)

In [None]:
# 1. Compute Macro F1-score
f1_macro = f1_score(y_test, y_test_pred, average='macro')
print(f"Macro F1-score: {f1_macro:.4f}")

# 2. Classification Report
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

# 3. Compute Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(conf_matrix)

# BERT

In [11]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print("Is MPS available?:", tf.config.list_physical_devices("GPU"))

TensorFlow version: 2.16.2
Is MPS available?: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [12]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [13]:
X_train

array(['Practical Approach To Acupuncture: 1 About the Author The author graduated in allopathy [MBBS] in 1954. She went on to acquire a postgraduate diploma in gynecology and obstetrics, and went into General practice in Mumbai, that kept her busy for 30 years. She realized that allopathy had its limit, creating a deadlockin the treatment of certain disorders.',
       'Nice Goods Leatherette Office Arm Chair (Brown) This Chair Has Adjustable Seat Height, Wheels, Armrest, Swivel. Ideal For Home & Office Use.',
       'Ekan Fashionable Fedora Hat for Girls, Boys Fedora Hat, Hats for Men Stylish Casual, Fedora Hat for Women Red Color 30Gram (Pack of 1) Fedora Hats For Men And Women Stylish(Ekan) Stylish Hat with elegant designs and hues designed with combination of great quality and fashion. Complete your look with this Red fedora hat from Ekan. Made from high quality material, this hat will redefine your casual look and make you look all the more stylish',
       ...,
       "The Chall

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=128)


In [15]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(32)

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
trainable_params = sum([tf.keras.backend.count_params(p) for p in model.trainable_variables])
print(f"Total Trainable Parameters: {trainable_params}")

Total Trainable Parameters: 109485316


In [17]:
# Freeze all BERT layers
model.bert.trainable = False

In [18]:
trainable_params = sum([tf.keras.backend.count_params(p) for p in model.trainable_variables])
print(f"Total Trainable Parameters: {trainable_params}")

Total Trainable Parameters: 3076


In [19]:
model.fit(train_dataset, epochs=2)

Epoch 1/2
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported
Epoch 2/2


<tf_keras.src.callbacks.History at 0x355c2a620>

In [21]:
# Get raw predictions
predictions = model.predict(test_dataset).logits  # Extract logits
# Convert logits to class probabilities
probabilities = tf.nn.softmax(predictions, axis=1).numpy()
print(probabilities)

[[0.15812528 0.00661241 0.64668757 0.18857472]
 [0.1581253  0.00661241 0.6466877  0.1885747 ]
 [0.1581253  0.00661241 0.6466877  0.1885747 ]
 ...
 [0.1581253  0.00661241 0.6466876  0.1885747 ]
 [0.1581253  0.00661241 0.6466876  0.1885747 ]
 [0.1581253  0.00661241 0.6466877  0.1885747 ]]


In [22]:
# To get the predicted class, take the argmax (index of the highest probability).
y_test_pred = np.argmax(probabilities, axis=1)
print(y_test_pred)

[2 2 2 ... 2 2 2]


In [23]:
# 1. Compute Macro F1-score
f1_macro = f1_score(y_test, y_test_pred, average='macro')
print(f"Macro F1-score: {f1_macro:.4f}")

# 2. Classification Report
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

# 3. Compute Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(conf_matrix)

Macro F1-score: 0.0940
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       525
           1       0.00      0.00      0.00      1056
           2       0.23      1.00      0.38       644
           3       0.00      0.00      0.00       556

    accuracy                           0.23      2781
   macro avg       0.06      0.25      0.09      2781
weighted avg       0.05      0.23      0.09      2781

Confusion Matrix:
[[   0    0  525    0]
 [   0    0 1056    0]
 [   0    0  644    0]
 [   0    0  556    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# TFIDF
# Word 2 Vec
# BiLSTM
# Conv 1D
# BERT