In [32]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

import spacy

In [2]:
ecommerce_data= pd.read_csv("ecommerceDataset.csv", names=['category', 'text'])
ecommerce_data[:5]

Unnamed: 0,category,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [3]:
ecommerce_data.shape

(50425, 2)

In [4]:
ecommerce_data.category.value_counts()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [5]:
ecommerce_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  50425 non-null  object
 1   text      50424 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB


In [6]:
ecommerce_data["category_label"]= ecommerce_data.category.map({
    "Household": 0,
    "Books": 1,
    "Electronics": 2,
    "Clothing & Accessories": 3
})

In [7]:
ecommerce_data[:5]

Unnamed: 0,category,text,category_label
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


In [8]:
#Checking for missing values
ecommerce_data.isnull().sum()

category          0
text              1
category_label    0
dtype: int64

In [9]:
ecommerce_data[ecommerce_data.text.isnull()]

Unnamed: 0,category,text,category_label
39330,Clothing & Accessories,,3


In [10]:
#Dropping row with missing value
ecommerce_data.dropna(inplace=True)

In [11]:
ecommerce_data.shape

(50424, 3)

In [12]:
ecommerce_data[:5]

Unnamed: 0,category,text,category_label
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


In [13]:
X_train, X_test, y_train, y_test= train_test_split(
    ecommerce_data.text,
    ecommerce_data.category_label, 
    test_size=0.2, 
    random_state=2022, 
    stratify=ecommerce_data.category_label) 

In [14]:
y_train.value_counts()

category_label
0    15450
1     9456
2     8497
3     6936
Name: count, dtype: int64

In [15]:
y_test.value_counts()

category_label
0    3863
1    2364
2    2124
3    1734
Name: count, dtype: int64

In [16]:
X_train.head(3)

6291     TCLPVC Economy Bamboo Blind Chick Window Close...
10681    Primeway® Metaltex Rap Box 3 Grater with Conta...
14771    IFB 20 L Convection Microwave Oven (20SC2, Met...
Name: text, dtype: object

In [17]:
cv= CountVectorizer()
X_train_cv= cv.fit_transform(X_train)
X_test_cv= cv.fit_transform(X_test)

In [18]:
X_train.shape

(40339,)

In [19]:
X_train_cv

<40339x74168 sparse matrix of type '<class 'numpy.int64'>'
	with 2858568 stored elements in Compressed Sparse Row format>

In [20]:
X_train_cv.shape

(40339, 74168)

In [21]:
#print(cv.vocabulary_)

In [22]:
cv.get_feature_names_out()[34108]

'seagate'

In [23]:
cv.get_feature_names_out()

array(['00', '000', '0000', ..., '③supports', '④supports', 'ヒッジー'],
      dtype=object)

# TF-IDF VECTORIZER

In [24]:
clf= Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)
y_pred= clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.98      0.94      3863
           1       0.98      0.92      0.95      2364
           2       0.97      0.90      0.93      2124
           3       0.98      0.93      0.95      1734

    accuracy                           0.94     10085
   macro avg       0.96      0.93      0.94     10085
weighted avg       0.94      0.94      0.94     10085



In [25]:
clf2= Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('Model', KNeighborsClassifier())
])

clf2.fit(X_train, y_train)
y_pred2= clf2.predict(X_test)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      3863
           1       0.97      0.96      0.96      2364
           2       0.96      0.94      0.95      2124
           3       0.97      0.97      0.97      1734

    accuracy                           0.96     10085
   macro avg       0.96      0.96      0.96     10085
weighted avg       0.96      0.96      0.96     10085



In [26]:
y_test[:5]

41536    2
9671     0
49630    2
33658    3
19317    1
Name: category_label, dtype: int64

In [27]:
y_pred2[:5]

array([2, 0, 2, 3, 1], dtype=int64)

In [28]:
clf3= Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('Model', RandomForestClassifier())
])

clf3.fit(X_train, y_train)
y_pred3= clf3.predict(X_test)
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97      3863
           1       0.98      0.98      0.98      2364
           2       0.98      0.94      0.96      2124
           3       0.98      0.97      0.98      1734

    accuracy                           0.97     10085
   macro avg       0.97      0.97      0.97     10085
weighted avg       0.97      0.97      0.97     10085



In [29]:
y_pred3[:5]

array([2, 0, 2, 3, 1], dtype=int64)

In [30]:
y_test[:5]

41536    2
9671     0
49630    2
33658    3
19317    1
Name: category_label, dtype: int64

In [33]:
un_processed_text_models= pd.DataFrame({
    'models': ['MultinomialNB','KNN' ,'Random Forest'],
    'Accuracy':[accuracy_score(y_test, y_pred),
               accuracy_score(y_test, y_pred2),
               accuracy_score(y_test, y_pred3)]
})
un_processed_text_models= un_processed_text_models.sort_values(by='Accuracy', ascending=False)
un_processed_text_models

Unnamed: 0,models,Accuracy
2,Random Forest,0.970749
1,KNN,0.962519
0,MultinomialNB,0.941497


# USING PREPROCESSED TEXT

In [34]:
#Defining a function to preprocess text

nlp= spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc= nlp(text)
    filtered_tokens= []
    
    for token in doc:
        if token.is_stop or token.is_punct:
            continue 
        filtered_tokens.append(token.lemma_)
        
    return " ".join(filtered_tokens)

In [35]:
ecommerce_data["processed_text"]= ecommerce_data.text.apply(preprocess_text)

In [36]:
ecommerce_data.text[0]

'Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and some for eternal blis

In [37]:
ecommerce_data.processed_text[0]

'Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints 8.7 x 8.7 inch set 4 Painting synthetic frame uv texture print give multi effect attract special series painting make wall beautiful give royal touch painting ready hang proud possess unique painting niche apart use modern efficient print technology print ink precision epson roland hp printer innovative hd printing technique result durable spectacular look print high lifetime print solely notch 100 ink achieve brilliant true colour high level uv resistance print retain beautiful colour year add colour style live space digitally print painting pleasure eternal bliss.so bring home elegant print lushe rich color make sheer elegance friend family.it treasure forever lucky recipient liven place intriguing painting high definition hd graphic digital print home office room'

In [38]:
ecommerce_data[:3]

Unnamed: 0,category,text,category_label,processed_text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0,SAF Floral Framed Painting Wood 30 inch x 10 i...
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0,saf uv texture Modern Art Print Framed Paintin...


In [39]:
X_train, X_test, y_train, y_test= train_test_split(
    ecommerce_data.processed_text,
    ecommerce_data.category_label, 
    test_size=0.2, 
    random_state=2022, 
    stratify=ecommerce_data.category_label) 

# TF_IDF VECTORIZER

In [42]:
clf= Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)
y_pred_nb= clf.predict(X_test)
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      3863
           1       0.98      0.92      0.95      2364
           2       0.96      0.91      0.94      2124
           3       0.98      0.95      0.96      1734

    accuracy                           0.95     10085
   macro avg       0.96      0.94      0.95     10085
weighted avg       0.95      0.95      0.95     10085



In [46]:
clf2= Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('Model', KNeighborsClassifier())
])

clf2.fit(X_train, y_train)
y_pred_knn= clf2.predict(X_test)
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96      3863
           1       0.89      0.98      0.94      2364
           2       0.98      0.92      0.95      2124
           3       0.99      0.96      0.98      1734

    accuracy                           0.96     10085
   macro avg       0.96      0.95      0.96     10085
weighted avg       0.96      0.96      0.96     10085



In [47]:
clf3= Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('Model', RandomForestClassifier())
])

clf3.fit(X_train, y_train)
y_pred_rf= clf3.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      3863
           1       0.98      0.98      0.98      2364
           2       0.98      0.95      0.97      2124
           3       0.98      0.98      0.98      1734

    accuracy                           0.97     10085
   macro avg       0.98      0.97      0.97     10085
weighted avg       0.97      0.97      0.97     10085



In [48]:
processed_text_models= pd.DataFrame({
    'models': ['MultinomialNB','KNN' ,'Random Forest'],
    'Accuracy':[accuracy_score(y_test, y_pred_nb),
               accuracy_score(y_test, y_pred_knn),
               accuracy_score(y_test, y_pred_rf)]
})
processed_text_models= processed_text_models.sort_values(by='Accuracy', ascending=False)
processed_text_models

Unnamed: 0,models,Accuracy
2,Random Forest,0.973723
1,KNN,0.955875
0,MultinomialNB,0.946158
