In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

import spacy

In [2]:
#Function to remove stopwords, punctuation and lemmatize
nlp= spacy.load("en_core_web_sm")

def preprocess(text):
    doc= nlp(text)
    filtered_tokens= []
    
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [3]:
ecommerce_data= pd.read_csv("ecommerceDataset.csv", names=['category', 'text'])
ecommerce_data[:5]

Unnamed: 0,category,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [4]:
ecommerce_data.shape

(50425, 2)

In [5]:
ecommerce_data.category.value_counts()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [6]:
ecommerce_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  50425 non-null  object
 1   text      50424 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB


In [7]:
ecommerce_data["category_label"]= ecommerce_data.category.map({
    "Household": 0,
    "Books": 1,
    "Electronics": 2,
    "Clothing & Accessories": 3
})

In [8]:
ecommerce_data[:5]

Unnamed: 0,category,text,category_label
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


In [9]:
#Checking for missing values
ecommerce_data.isnull().sum()

category          0
text              1
category_label    0
dtype: int64

In [10]:
ecommerce_data[ecommerce_data.text.isnull()]

Unnamed: 0,category,text,category_label
39330,Clothing & Accessories,,3


In [11]:
#Replacing missing missing value with zero
ecommerce_data["text"]= ecommerce_data.text.fillna(0)

In [12]:
#Dropping row with missing value
ecommerce_data= ecommerce_data[ecommerce_data.text != 0]

In [13]:
ecommerce_data.shape

(50424, 3)

In [14]:
ecommerce_data[:5]

Unnamed: 0,category,text,category_label
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


In [15]:
X_train, X_test, y_train, y_test= train_test_split(
    ecommerce_data.text,
    ecommerce_data.category_label, 
    test_size=0.2, 
    random_state=2022, 
    stratify=ecommerce_data.category_label) 

In [16]:
y_train.value_counts()

category_label
0    15450
1     9456
2     8497
3     6936
Name: count, dtype: int64

In [17]:
y_test.value_counts()

category_label
0    3863
1    2364
2    2124
3    1734
Name: count, dtype: int64

In [18]:
X_train

6291     TCLPVC Economy Bamboo Blind Chick Window Close...
10681    Primeway® Metaltex Rap Box 3 Grater with Conta...
14771    IFB 20 L Convection Microwave Oven (20SC2, Met...
31385    ahhaaaa Boy's Blended Waistcoat, Shirt and Tro...
28380    Animals and the Shaping of Modern Medicine: On...
                               ...                        
812      Cloth Fusion Fruton Cooling Gel Memory Foam Ma...
44771    Hk Villa WS-858 Wireless Bluetooth Microphone ...
23146    A Dance with Dragons (HBO Tie-in Edition): A S...
13391    Philips HL1632 500-Watt 3 Jar Juicer Mixer Gri...
5237     Windsong Chimes & Bells Windchimes With Hangin...
Name: text, Length: 40339, dtype: object

In [19]:
cv= CountVectorizer()
X_train_cv= cv.fit_transform(X_train)
X_test_cv= cv.fit_transform(X_test)

In [20]:
X_train.shape

(40339,)

In [21]:
X_train_cv

<40339x74168 sparse matrix of type '<class 'numpy.int64'>'
	with 2858568 stored elements in Compressed Sparse Row format>

In [24]:
X_train_cv.shape

(40339, 74168)

In [25]:
print(cv.vocabulary_)



In [26]:
cv.get_feature_names_out()[34108]

'seagate'

In [27]:
cv.get_feature_names_out()

array(['00', '000', '0000', ..., '③supports', '④supports', 'ヒッジー'],
      dtype=object)

# TF-IDF VECTORIZER

In [28]:
clf= Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)
y_pred= clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.98      0.94      3863
           1       0.98      0.92      0.95      2364
           2       0.97      0.90      0.93      2124
           3       0.98      0.93      0.95      1734

    accuracy                           0.94     10085
   macro avg       0.96      0.93      0.94     10085
weighted avg       0.94      0.94      0.94     10085



In [29]:
clf2= Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('Model', KNeighborsClassifier())
])

clf2.fit(X_train, y_train)
y_pred2= clf2.predict(X_test)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      3863
           1       0.97      0.96      0.96      2364
           2       0.96      0.94      0.95      2124
           3       0.97      0.97      0.97      1734

    accuracy                           0.96     10085
   macro avg       0.96      0.96      0.96     10085
weighted avg       0.96      0.96      0.96     10085



In [30]:
y_test[:5]

41536    2
9671     0
49630    2
33658    3
19317    1
Name: category_label, dtype: int64

In [31]:
y_pred2[:5]

array([2, 0, 2, 3, 1], dtype=int64)

In [32]:
clf3= Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('Model', RandomForestClassifier())
])

clf3.fit(X_train, y_train)
y_pred3= clf3.predict(X_test)
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97      3863
           1       0.98      0.98      0.98      2364
           2       0.98      0.94      0.96      2124
           3       0.98      0.97      0.98      1734

    accuracy                           0.97     10085
   macro avg       0.97      0.97      0.97     10085
weighted avg       0.97      0.97      0.97     10085



In [33]:
y_pred3[:5]

array([2, 0, 2, 3, 1], dtype=int64)

In [34]:
y_test[:5]

41536    2
9671     0
49630    2
33658    3
19317    1
Name: category_label, dtype: int64

# USING PREPROCESSED TEXT

In [35]:
#Defining a function to preprocess text

nlp= spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc= nlp(text)
    filtered_tokens= []
    
    for token in doc:
        if token.is_stop or token.is_punct:
            continue 
        filtered_tokens.append(token.lemma_)
        
    return " ".join(filtered_tokens)

In [36]:
#Defining a function to preprocess text

nlp= spacy.load("en_core_web_sm")

#def preprocess_text(text):
    text= re.sub('[^A-Za-z ]', '', str(text))
    text= text.lower()
    
    doc= nlp(text)
    filtered_tokens= []
    
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

IndentationError: unexpected indent (572923396.py, line 6)

In [37]:
ecommerce_data["processed_text"]= ecommerce_data.text.apply(preprocess_text)

In [38]:
ecommerce_data.text

0        Paper Plane Design Framed Wall Hanging Motivat...
1        SAF 'Floral' Framed Painting (Wood, 30 inch x ...
2        SAF 'UV Textured Modern Art Print Framed' Pain...
3        SAF Flower Print Framed Painting (Synthetic, 1...
4        Incredible Gifts India Wooden Happy Birthday U...
                               ...                        
50420    Strontium MicroSD Class 10 8GB Memory Card (Bl...
50421    CrossBeats Wave Waterproof Bluetooth Wireless ...
50422    Karbonn Titanium Wind W4 (White) Karbonn Titan...
50423    Samsung Guru FM Plus (SM-B110E/D, Black) Colou...
50424                     Micromax Canvas Win W121 (White)
Name: text, Length: 50424, dtype: object

In [44]:
ecommerce_data[:5]

Unnamed: 0,category,text,category_label,processed_text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0,SAF Floral Framed Painting Wood 30 inch x 10 i...
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0,saf uv texture Modern Art Print Framed Paintin...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0,SAF Flower Print Framed Painting Synthetic 13....
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0,incredible Gifts India Wooden Happy Birthday U...


In [39]:
X_train, X_test, y_train, y_test= train_test_split(
    ecommerce_data.processed_text,
    ecommerce_data.category_label, 
    test_size=0.2, 
    random_state=2022, 
    stratify=ecommerce_data.category_label) 

# TF_IDF VECTORIZER

In [40]:
clf= Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)
y_pred= clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      3863
           1       0.98      0.92      0.95      2364
           2       0.96      0.91      0.94      2124
           3       0.98      0.95      0.96      1734

    accuracy                           0.95     10085
   macro avg       0.96      0.94      0.95     10085
weighted avg       0.95      0.95      0.95     10085



In [41]:
clf2= Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('Model', KNeighborsClassifier())
])

clf2.fit(X_train, y_train)
y_pred2= clf2.predict(X_test)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      3863
           1       0.89      0.98      0.94      2364
           2       0.98      0.92      0.95      2124
           3       0.99      0.96      0.98      1734

    accuracy                           0.96     10085
   macro avg       0.96      0.95      0.96     10085
weighted avg       0.96      0.96      0.96     10085



In [42]:
clf3= Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('Model', RandomForestClassifier())
])

clf3.fit(X_train, y_train)
y_pred3= clf3.predict(X_test)
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      3863
           1       0.98      0.97      0.98      2364
           2       0.98      0.95      0.97      2124
           3       0.98      0.97      0.98      1734

    accuracy                           0.97     10085
   macro avg       0.98      0.97      0.97     10085
weighted avg       0.97      0.97      0.97     10085



In [43]:
#ecommerce_data["cleaned_text"]= ecommerce_data.text.apply(preprocess_text)
ecommerce_data

Unnamed: 0,category,text,category_label,processed_text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0,SAF Floral Framed Painting Wood 30 inch x 10 i...
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0,saf uv texture Modern Art Print Framed Paintin...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0,SAF Flower Print Framed Painting Synthetic 13....
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0,incredible Gifts India Wooden Happy Birthday U...
...,...,...,...,...
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...,2,Strontium MicroSD Class 10 8 GB Memory Card Bl...
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...,2,CrossBeats Wave Waterproof Bluetooth Wireless ...
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...,2,Karbonn Titanium Wind W4 White Karbonn Titaniu...
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou...",2,Samsung Guru FM plus SM b110e D Black Colour B...
