In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
_input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for _input_path in _input_paths:
    print(_input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ecommerce-text-classification/ecommerceDataset.csv


In [2]:
# Python Imports
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

%matplotlib inline

In [3]:
# Load the dataset
# Column names are supplied explicitly; the first line of the file is read as data.
DATASET_PATH = "/kaggle/input/ecommerce-text-classification/ecommerceDataset.csv"
df = pd.read_csv(DATASET_PATH, names=["label", "text"])

In [4]:
# Size check of the raw frame: (rows, columns).
df.shape

(50425, 2)

In [5]:
# Column dtypes and non-null counts; a count below the row total flags missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   50425 non-null  object
 1   text    50424 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB


In [6]:
# Preview the first rows of the raw label/text pairs.
df.head()

Unnamed: 0,label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [7]:
# Count the missing values in each column.
df.isnull().sum()

label    0
text     1
dtype: int64

In [8]:
# Drop every row that has a missing value in any column. This mutates `df`
# in place, and the index is not reset afterwards.
df.dropna(axis=0, inplace=True)

In [9]:
# Preview the frame again after dropping rows with missing values.
df.head()

Unnamed: 0,label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [10]:
# Number of rows per class, to check how balanced the labels are.
df.label.value_counts()

label
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8670
Name: count, dtype: int64

In [11]:
# The classes are imbalanced (Household has more than twice as many rows as Clothing & Accessories). Undersampling or SMOTE could address this; neither is applied for now.

In [12]:
# Text preprocessing logic

lemmatizer = WordNetLemmatizer()

# Build the English stop-word collection once, as a set. Calling
# stopwords.words("english") inside the token loop re-created the word list
# for every single token and then searched it linearly.
STOP_WORDS = frozenset(stopwords.words("english"))


def text_preprocess(text):
    """Normalise raw text into a space-joined string of lemmatized tokens.

    Steps: lower-case the text, replace every character that is not an ASCII
    letter with a space, split on whitespace, drop English stop words and
    one-character tokens, then lemmatize the remaining tokens.

    Parameters
    ----------
    text : str
        Raw text to clean. Must be a string; missing values have to be
        removed before calling this function.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (an empty string when
        no token survives the filters).
    """
    # Lower-case, then keep letters only (digits, punctuation and newlines
    # all become spaces).
    letters_only = re.sub(r"[^a-zA-Z]", " ", text.lower())

    # str.split() with no argument already collapses runs of whitespace, so a
    # separate "squeeze spaces" substitution is not needed before it.
    tokens = letters_only.split()

    # The stop-word test runs on the raw token, before lemmatization.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in STOP_WORDS and len(word) != 1
    ]

    return " ".join(tokens)

In [13]:
df["text_cleaned"] = df["text"].apply(text_preprocess)

In [14]:
# Compare the raw `text` column with the new `text_cleaned` column.
df.head()

Unnamed: 0,label,text,text_cleaned
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,paper plane design framed wall hanging motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",saf floral framed painting wood inch inch spec...
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,saf uv textured modern art print framed painti...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",saf flower print framed painting synthetic inc...
4,Household,Incredible Gifts India Wooden Happy Birthday U...,incredible gift india wooden happy birthday un...


In [15]:
# Encode the string class names in `label` as integer ids.
# NOTE: this overwrites df["label"], so re-running this cell on its own would
# refit the encoder on the integer ids instead of the original class names.
encoder = LabelEncoder().fit(df["label"])
df["label"] = encoder.transform(df["label"])

In [27]:
# Train a Word2Vec model on the cleaned text.
# NOTE(review): the embeddings are fit on every row of df["text_cleaned"],
# i.e. before the train/test split further down, so the rows that end up in
# the test set also contribute to the embeddings.

from gensim.models import Word2Vec

# One list of tokens per document.
tokenized_corpus = list(map(str.split, df["text_cleaned"]))

w2v_params = dict(
    vector_size=200,  # dimensionality of each word vector
    window=5,
    min_count=5,
    workers=4,  # Number of CPU cores to use
)
w2v_model = Word2Vec(sentences=tokenized_corpus, **w2v_params)

In [31]:
# w2v_model.wv.index_to_key

In [32]:
# Show the epoch count on the model; the Word2Vec call above does not pass `epochs`.
w2v_model.epochs

5

In [40]:
# Sanity-check the embeddings: words most similar to "boy", with their similarity scores.
w2v_model.wv.similar_by_word("boy")

[('girl', 0.7145849466323853),
 ('womens', 0.6800793409347534),
 ('fort', 0.6632500886917114),
 ('jumpsuit', 0.6595824360847473),
 ('dungree', 0.6512340307235718),
 ('qube', 0.6473724842071533),
 ('jony', 0.6444801688194275),
 ('sweatshirt', 0.6358873844146729),
 ('romper', 0.634795069694519),
 ('checkered', 0.6336818933486938)]

In [51]:
def calculate_avg_word2vec(text, word2vec_model):
    """Return the mean embedding of the in-vocabulary tokens of `text`.

    Parameters
    ----------
    text : iterable of str, or str
        The tokens of one document. A plain string is also accepted and is
        split on whitespace first.
    word2vec_model
        Object exposing `wv` (supports `word in wv` and `wv[word]`) and
        `vector_size`, such as the Word2Vec model trained in this notebook.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in
        `word2vec_model.wv`. If no token is found, a zero vector of length
        `word2vec_model.vector_size` is returned.
    """
    # Iterating a raw string would yield single characters, which silently
    # produces a meaningless (usually all-zero) vector; tokenize it instead.
    tokens = text.split() if isinstance(text, str) else text

    keyed_vectors = word2vec_model.wv
    word_vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]

    # No known token: fall back to a zero vector so callers always get a
    # fixed-length feature row.
    if not word_vectors:
        return np.zeros(word2vec_model.vector_size)

    return np.mean(word_vectors, axis=0)

In [52]:
## Start Model Training

# Separate the dataset into the independent variable (cleaned text) and the
# dependent variable (encoded label).
X, y = df["text_cleaned"], df["label"]

In [53]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [54]:
import tqdm

# One averaged Word2Vec vector per training document, with a progress bar.
X_train_w2v = [
    calculate_avg_word2vec(document.split(), w2v_model)
    for document in tqdm.tqdm(X_train.values)
]


100%|██████████| 40339/40339 [00:07<00:00, 5554.27it/s]


In [56]:
# Stack the per-document vectors into a single 2-D feature matrix.
X_train_w2v = np.asarray(X_train_w2v)

In [61]:
# One averaged Word2Vec vector per test document, with a progress bar.
X_test_w2v = [
    calculate_avg_word2vec(document.split(), w2v_model)
    for document in tqdm.tqdm(X_test.values)
]

100%|██████████| 10085/10085 [00:01<00:00, 5608.19it/s]


In [62]:
# Stack the per-document vectors into a single 2-D feature matrix.
X_test_w2v = np.asarray(X_test_w2v)

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Build a logistic-regression classifier and fit it on the averaged Word2Vec
# features; fit() returns the estimator itself, so both steps are chained.
# max_iter is set to 1000 explicitly.
classifier = LogisticRegression(max_iter=1000).fit(X_train_w2v, y_train)

# Score the held-out set and print per-class precision / recall / F1.
y_pred = classifier.predict(X_test_w2v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94      2378
           1       0.95      0.95      0.95      1750
           2       0.92      0.89      0.90      2082
           3       0.91      0.94      0.93      3875

    accuracy                           0.93     10085
   macro avg       0.93      0.93      0.93     10085
weighted avg       0.93      0.93      0.93     10085



In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 1. Initialize a classifier
# A random forest is stochastic; without a fixed seed the metrics below change
# on every run. Use the same seed as the train/test split so the report is
# repeatable for a given set of input features.
classifier = RandomForestClassifier(random_state=42)

# 2. Train the classifier on the Word2Vec features
# Note: y_train is the target variable (labels) from the original split.
# This rebinds `classifier`, replacing the model fitted in the previous cell.
classifier.fit(X_train_w2v, y_train)

# 3. Predict and evaluate
y_pred = classifier.predict(X_test_w2v)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2378
           1       0.98      0.97      0.97      1750
           2       0.97      0.96      0.97      2082
           3       0.96      0.98      0.97      3875

    accuracy                           0.97     10085
   macro avg       0.98      0.97      0.97     10085
weighted avg       0.97      0.97      0.97     10085



In [65]:
# PREDICTION

In [87]:
new_text = """
This book is excellent and has a very engaging story. Great purchase!
"""

# Integer class id -> original class name, in the encoder's class order.
idx_class_mapping = dict(enumerate(encoder.classes_))

# Apply the same cleaning and vector averaging used for the training data.
preprocessed_text = text_preprocess(new_text)
new_text_tokens = preprocessed_text.split()

# Wrap the single document vector in a list so the classifier receives a
# 2-D array with one row.
text_vector = np.array([calculate_avg_word2vec(new_text_tokens, w2v_model)])

# Uses whichever model was most recently assigned to `classifier`.
res = classifier.predict(text_vector)[0]

print("Predicted Result = ", idx_class_mapping[res])

Predicted Result =  Books


In [84]:
# Display the id -> class-name lookup built from the label encoder.
idx_class_mapping

{0: 'Books', 1: 'Clothing & Accessories', 2: 'Electronics', 3: 'Household'}