# Imports

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix

# from gensim.models import KeyedVectors
# import gensim.downloader as api
# from huggingface_hub import hf_hub_download

# NLP
import string, re, nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
import spacy
from spellchecker import SpellChecker

# !python -m spacy download en_core_web_sm

# Data

In [None]:
# The objective of the project is to classify each product into one of four categories
# (Electronics, Household, Books, and Clothing & Accessories)
# based on its description on the e-commerce platform.

In [None]:
# Source : https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification
# Code : https://www.kaggle.com/code/sugataghosh/e-commerce-text-classification-tf-idf-word2vec#Text-Normalization 

# The file is read with header=None, so its first line is treated as data and
# the two column names are supplied explicitly.
column_names = ['category', 'description']
data = pd.read_csv('data/ecommerceDataset.csv', header=None, names=column_names)

# Class counts and overall size, followed by a preview of the first rows.
category_counts = data['category'].value_counts()
print(category_counts)
print(data.shape)
data.head()

# Data Cleaning

In [None]:
# Count rows that contain at least one missing value, and rows that repeat an earlier row
n_rows_with_missing = len(data) - len(data.dropna())
n_duplicate_rows = data.duplicated().sum()

quality_summary = pd.Series({
    "Number of observations with missing values": n_rows_with_missing,
    "Number of duplicate observations": n_duplicate_rows,
})
print(quality_summary.to_string())

In [None]:
# Drop rows with missing values, drop exact duplicate rows, then renumber the
# index from 0. Chained (non-inplace) calls rebind `data` to a fresh frame.
data = (
    data
    .dropna()
    .drop_duplicates()
    .reset_index(drop = True)
)

# Manual encoding of labels
label_dict = {'Electronics': 0, 'Household': 1, 'Books': 2, 'Clothing & Accessories': 3}

# `replace` leaves values that are not keys of `label_dict` untouched, so
# re-running this cell on already-encoded labels is a no-op.
# The explicit cast pins `category` to an integer dtype instead of relying on
# `replace` to downcast the object column implicitly (that implicit downcast is
# deprecated since pandas 2.2). It also fails loudly if a category string that
# is missing from `label_dict` is still present.
data = data.replace({'category': label_dict}).astype({'category': 'int64'})

In [None]:
# Preview the cleaned frame; `category` now holds the codes from `label_dict`.
data.head()

In [None]:
# Feature-target split
X, y = data.drop('category', axis = 1), data['category']

# One seed shared by both splits so the partition is reproducible.
SPLIT_SEED = 40

# Train / hold-out split: 80% / 20% of the complete data. Stratifying on the
# label keeps the class proportions the same in every subset, so the per-class
# and macro-averaged metrics are computed on representative samples.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size = 0.2, random_state = SPLIT_SEED, stratify = y
)

# Validation / test split: the hold-out set is halved (10% / 10% of the
# complete data), again stratified on the label.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size = 0.5, random_state = SPLIT_SEED, stratify = y_holdout
)

#### In natural language processing, **text normalization** is the process of transforming text into a single canonical form. We consider a number of text normalization processes. At the end of the section, we combine selected processes into one single function and apply it on the product descriptions.

**Stemming and Lemmatization**
-  Stemming is the process of reducing words to their root form, or stem, typically by stripping suffixes. Related words are reduced to the same stem even if that stem is not a dictionary word. For example, the words introducing, introduced, and introduction may be reduced to a common stem such as introduc (the exact result depends on the stemming algorithm), which is not an actual word.
  
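- Lemmatization is a more sophisticated approach: it uses a vocabulary and morphological analysis to map each word to its dictionary form, the lemma. Unlike stemming, it takes into account the context in which a word is used (for example, its part of speech), and its output is an actual word.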

In [None]:
# Converting to lowercase
def convert_to_lowercase(text):
    """Return a copy of ``text`` with all cased characters lower-cased."""
    lowered = text.lower()
    return lowered

# Removing whitespaces
def remove_whitespace(text):
    """Trim leading and trailing whitespace from ``text``.

    Whitespace between words is left untouched.
    """
    trimmed = text.strip()
    return trimmed

# Remove Punctuation
def remove_punctuation(text):
    """Delete ASCII punctuation from ``text``, keeping apostrophes.

    Characters are removed, not replaced by a space, so ``"high-quality"``
    becomes ``"highquality"``.
    """
    # Every character of string.punctuation except the apostrophe, presumably
    # kept so that contractions such as "don't" stay intact for later steps.
    doomed_chars = string.punctuation.replace("'", "")
    deletion_table = str.maketrans("", "", doomed_chars)
    return text.translate(deletion_table)

# Remove stopwords
_STOPWORD_CACHE = {}  # language name -> frozenset of stop words, filled on first use


def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is split into tokens made of word characters and apostrophes.
    Tokens found in NLTK's English stop-word list are discarded and the
    remaining tokens are joined with single spaces.

    Requires the NLTK ``stopwords`` corpus to be available locally.
    """
    # Raw string: "\w" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    regexp = RegexpTokenizer(r"[\w']+")

    # Load the stop-word list once and keep it as a set, so membership tests
    # are O(1) instead of a list scan (and a corpus read) for every call.
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    stops = _STOPWORD_CACHE["english"]

    clean_text = " ".join(word for word in regexp.tokenize(text) if word not in stops)
    return clean_text

# Lemmatization
# Only token lemmas are read below, so the parser and NER components are disabled.
spacy_lemmatizer = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])
def text_lemmatizer(text):
    """Replace every token of ``text`` by its spaCy lemma.

    Returns the lemmas joined by single spaces.
    """
    doc = spacy_lemmatizer(text)
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)

def text_normalizer(text):
    """Run the full normalization pipeline on one description.

    Steps, applied in this order: lower-casing, trimming of surrounding
    whitespace, punctuation removal, stop-word removal, lemmatization.
    """
    pipeline = (
        convert_to_lowercase,
        remove_whitespace,
        remove_punctuation,
        remove_stopwords,
        text_lemmatizer,
    )
    for step in pipeline:
        text = step(text)
    return text

In [None]:
# Normalize the descriptions of every split. `assign` returns a new frame with
# the column replaced instead of writing into the existing frame.
X_train = X_train.assign(description = X_train['description'].apply(text_normalizer))
X_val = X_val.assign(description = X_val['description'].apply(text_normalizer))
X_test = X_test.assign(description = X_test['description'].apply(text_normalizer))

# TF-IDF

#### TF-IDF (short for term frequency-inverse document frequency), is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

- Corpus contains multiple documents, document contains multiple terms
- Corpus = Collection of Sentences
- Document = Sentence
- Term = Word

- **Term frequency (TF) is the relative frequency of a term within a given document. It is obtained as the number of times a word appears in a text, divided by the total number of words appearing in the text.**
  
- **Inverse document frequency (IDF) measures how common or rare a word is across all documents. It is the logarithmically scaled inverse fraction of the documents that contain the word, obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that ratio.**

#### **Term Frequency (TF) Formula:**  

$$
TF(t, d) = \frac{f_t}{N}
$$

where:  
- $f_t$ = Number of times term $t$ appears in document $d$  
- $N$ = Total number of terms in document $d$  

---

#### **Inverse Document Frequency (IDF) Formula:**  

$$
IDF(t) = \log \left(\frac{N_d}{N_t + 1} \right)
$$

where:  
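- $N_d$ = Total number of documents in the corpus  
- $N_t$ = Number of documents that contain term $t$  
- The "+1" in the denominator is used to prevent division by zero (i.e., smoothing). Note that scikit-learn's `TfidfVectorizer`, used below, applies a slightly different smoothed variant by default, $\ln\left(\frac{1 + N_d}{1 + N_t}\right) + 1$, uses raw term counts for TF, and L2-normalizes each document vector.  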

---

#### **TF-IDF Formula:**  

$$
TF\text{-}IDF(t, d) = TF(t, d) \times IDF(t)
$$

This helps in weighting terms based on their importance in a document relative to the entire corpus.


In [None]:
# (rows, columns) of each split, printed in train, validation, test order
for split in (X_train, X_val, X_test):
    print(split.shape)

In [None]:
# TF-IDF vectorization
# The vectorizer is fitted on the training descriptions only; the validation
# and test descriptions are transformed with that fitted vectorizer.
TfidfVec = TfidfVectorizer()
X_train_tfidf = TfidfVec.fit_transform(X_train["description"])
X_val_tfidf, X_test_tfidf = (
    TfidfVec.transform(split["description"]) for split in (X_val, X_test)
)

- ngram_range : tuple (min_n, max_n), default=(1, 1)
- The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used. 
- For example an ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams.

#### Model

In [None]:
# A random forest draws bootstrap samples and feature subsets at random, so
# without a fixed `random_state` every run yields slightly different scores.
# `n_jobs = -1` only parallelizes the trees across cores; results are unchanged.
model = RandomForestClassifier(random_state = 40, n_jobs = -1)
model.fit(X_train_tfidf, y_train)

# Predictions for the validation and test splits
y_val_pred = model.predict(X_val_tfidf)
y_test_pred = model.predict(X_test_tfidf)

In [None]:
# Convert the test labels to a NumPy array. `np.asarray` also accepts an array,
# so re-running this cell is safe (`.values` fails on the second run because
# an ndarray has no such attribute).
y_test = np.asarray(y_test)

In [None]:
# 1. Macro-averaged F1 on the test split
f1_macro = f1_score(y_test, y_test_pred, average='macro')
print(f"Macro F1-score: {f1_macro:.4f}")

# 2. Per-class precision / recall / F1 summary
test_report = classification_report(y_test, y_test_pred)
print("Classification Report:", test_report, sep="\n")

# 3. Confusion matrix (rows: true labels, columns: predicted labels)
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# LSTM

In [None]:
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import nltk

nltk.download('punkt')
from nltk.tokenize import word_tokenize


In [3]:
import numpy as np

In [4]:
import keras

2025-02-15 20:21:28.242617: W external/local_xla/xla/tsl/lib/monitoring/collection_registry.cc:88] Trying to register 2 metrics with the same name: /tensorflow/api/ops_eager_execution. The old value will be erased in order to register a new one. Please check if you link the metric more than once, or if the name is already used by other metrics.
2025-02-15 20:21:28.242645: W external/local_xla/xla/tsl/lib/monitoring/collection_registry.cc:88] Trying to register 2 metrics with the same name: /tensorflow/api/enable_control_flow_v2. The old value will be erased in order to register a new one. Please check if you link the metric more than once, or if the name is already used by other metrics.
2025-02-15 20:21:28.242655: W external/local_xla/xla/tsl/lib/monitoring/collection_registry.cc:88] Trying to register 2 metrics with the same name: /tensorflow/api/tf_function. The old value will be erased in order to register a new one. Please check if you link the metric more than once, or if the nam

TypeError: unhashable type: 'list'