In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
import re

import spacy

In [2]:
ecommerce_data= pd.read_csv("ecommerceDataset.csv", names=['category', 'text'])
ecommerce_data[:5]

Unnamed: 0,category,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [3]:
ecommerce_data.shape

(50425, 2)

In [4]:
ecommerce_data.category.value_counts()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [5]:
ecommerce_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  50425 non-null  object
 1   text      50424 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB


In [6]:
ecommerce_data["category_label"]= ecommerce_data.category.map({
    "Household": 0,
    "Books": 1,
    "Electronics": 2,
    "Clothing & Accessories": 3
})

In [7]:
ecommerce_data[:5]

Unnamed: 0,category,text,category_label
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


In [8]:
#Checking for missing values
ecommerce_data.isnull().sum()

category          0
text              1
category_label    0
dtype: int64

In [9]:
ecommerce_data[ecommerce_data.text.isnull()]

Unnamed: 0,category,text,category_label
39330,Clothing & Accessories,,3


In [10]:
#Replacing missing missing value with zero
ecommerce_data["text"]= ecommerce_data.text.fillna(0)

In [11]:
#Dropping row with missing value
ecommerce_data= ecommerce_data[ecommerce_data.text != 0]

In [12]:
ecommerce_data.shape

(50424, 3)

In [13]:
ecommerce_data[:5]

Unnamed: 0,category,text,category_label
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


In [14]:
ecommerce_data.text[4]

'Incredible Gifts India Wooden Happy Birthday Unique Personalized Gift (5 X 4 Inch) Size:4 x 5   Made Of Natural Imported Wood, Which Is Quite Solid With Light Particle Pattern & Is Soft Pale To Blond Colour. Your Uploaded Photo Will Look Amazing And Beautiful After Laser Engraving On It. This Is One Of The Most Popular Unique Gifts In Our Store. We Offer This In Multiple Sizes, Some Can Be Used As Table Top And The Big Sizes Can Be Used As Wall Hanging Which Just Blends With Your Home Decaration. You Just Need To Upload A Picture And Add Your Own Text And We Will Do The Rest For You. We Will Email You The Preview Before Making The Final Product. Do You Want The Best Moment Of Your Life To Be Engraved On A Wooden Plaque That Lasts For A Longer Time And Stays Close To You Forever? Then You Are At The Right Place. We Present To You Various Sizes Personalized Engraved Wooden Plaques Made With Birch Wood. Let Your Memories Be Engraved On Wooden Plaques And Stay With Your Forever.'

In [15]:
nlp= spacy.load("en_core_web_lg")

In [16]:
doc= nlp("Paper Plane Design Framed Wall Hanging Motivation")
doc[0].vector.shape

(300,)

In [17]:
doc[1].vector[:5]

array([ 0.033539,  1.0758  ,  4.2654  ,  2.4083  , -1.4626  ],
      dtype=float32)

In [18]:
doc[1].vector.shape

(300,)

In [19]:
ecommerce_data["vector"]= ecommerce_data["text"].apply(lambda x: nlp(x).vector)

In [20]:
ecommerce_data[:5]

Unnamed: 0,category,text,category_label,vector
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0,"[-2.306548, 0.30390584, -2.51085, 0.06632433, ..."
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0,"[-2.6766863, -1.1272602, -1.1009779, 0.4870543..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0,"[-2.414381, 0.7888623, -2.8213925, 0.8304517, ..."
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0,"[-2.3844197, 0.7978983, -2.7950675, 0.74301887..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0,"[-0.5195761, -0.46975613, -1.4031031, -0.38848..."


In [21]:
ecommerce_data.vector[:5]

0    [-2.306548, 0.30390584, -2.51085, 0.06632433, ...
1    [-2.6766863, -1.1272602, -1.1009779, 0.4870543...
2    [-2.414381, 0.7888623, -2.8213925, 0.8304517, ...
3    [-2.3844197, 0.7978983, -2.7950675, 0.74301887...
4    [-0.5195761, -0.46975613, -1.4031031, -0.38848...
Name: vector, dtype: object

In [22]:
X = np.array([np.array(x) for x in ecommerce_data.vector.values])
y = ecommerce_data.category_label

In [23]:
X[:5]

array([[-2.306548  ,  0.30390584, -2.51085   , ..., -1.4262657 ,
        -3.3489084 ,  0.57573175],
       [-2.6766863 , -1.1272602 , -1.1009779 , ..., -1.6334178 ,
        -2.4390118 , -0.24040593],
       [-2.414381  ,  0.7888623 , -2.8213925 , ..., -2.1887448 ,
        -3.0587976 ,  0.657897  ],
       [-2.3844197 ,  0.7978983 , -2.7950675 , ..., -2.13979   ,
        -2.9163933 ,  0.48472232],
       [-0.5195761 , -0.46975613, -1.4031031 , ..., -0.5076162 ,
        -2.1032832 , -0.826409  ]], dtype=float32)

In [24]:
y[:5]

0    0
1    0
2    0
3    0
4    0
Name: category_label, dtype: int64

In [25]:
#Applying SMOTE to handle the imbalance dataset

X_sm, y_sm= SMOTE().fit_resample(X, y)

In [26]:
X_sm.shape

(77252, 300)

In [27]:
y_sm.value_counts()

category_label
0    19313
1    19313
3    19313
2    19313
Name: count, dtype: int64

In [28]:
X_train, X_test, y_train, y_test= train_test_split(
    X_sm, 
    y_sm,
    test_size= 0.25,
    random_state= 2022)

In [29]:
y_test.value_counts()

category_label
1    4973
0    4845
2    4813
3    4682
Name: count, dtype: int64

In [30]:
y_train.value_counts()

category_label
3    14631
2    14500
0    14468
1    14340
Name: count, dtype: int64

In [36]:
X_train[0]

array([-1.6228391 , -0.8795158 , -0.60475844,  0.09957702,  4.5345936 ,
        0.49701482,  0.5316818 ,  2.804304  ,  0.3641683 , -0.42082235,
        2.8456032 ,  1.3637278 , -3.202679  ,  0.86895186,  0.23375197,
        0.69083005,  0.9995085 ,  0.36960497, -1.0316478 , -1.0361857 ,
        0.49809745, -0.12028522, -0.09276063,  0.6998448 ,  0.9360567 ,
       -0.10150979, -1.8054459 , -0.93224007,  0.28848827,  0.6398769 ,
        0.05864611, -0.39690572,  0.67430156, -0.6484703 , -2.6157746 ,
       -0.8849752 , -0.15731286,  0.25606146, -1.3431283 , -1.1228722 ,
        1.4037353 , -0.09437145, -1.0761565 ,  0.67094934, -0.42892984,
        0.27302143, -0.67013556, -0.6753916 ,  0.23651104, -0.23073682,
       -1.5751532 ,  1.431477  , -0.17946924, -2.61857   ,  0.08347544,
        0.1525487 , -1.9377465 , -0.19389702,  0.5035086 , -0.9523302 ,
       -0.12221701, -0.6814095 ,  0.6373349 ,  0.48782822,  2.3400123 ,
        0.9520956 , -2.9526992 , -3.9583101 ,  2.0084848 ,  0.76

In [37]:
X_test[3]

array([-1.6129826e+00,  3.5974401e-01, -1.5279915e+00,  9.8656183e-01,
        3.9608943e+00, -3.9014500e-01,  9.0684134e-01,  3.2379608e+00,
       -1.5596658e+00, -5.9574264e-01,  5.4132471e+00,  1.1384041e+00,
       -4.3412108e+00,  1.6103960e+00,  1.2325590e+00,  1.0987558e+00,
        1.7252977e+00,  5.1287615e-01, -7.6056671e-01, -1.8450770e+00,
        2.3989406e-01,  1.9439369e-01, -4.9562806e-01, -1.5488654e-01,
       -6.1692488e-01, -1.4273847e+00, -2.3916268e+00, -1.6482418e+00,
       -8.1044984e-01,  1.4379320e+00,  3.9494255e-01, -1.0965869e+00,
       -6.0291708e-01, -1.2063323e+00, -1.6223649e+00, -6.1752617e-01,
        6.5509790e-01,  1.5297965e+00,  1.6739434e+00,  8.4059441e-01,
        1.2178181e+00, -1.0812914e+00, -8.3276212e-02,  4.0604895e-01,
       -1.9287207e+00,  2.1380622e+00,  1.4113308e+00, -1.2917514e+00,
       -9.4705057e-01, -2.1855947e-02, -5.2126064e-03,  1.1391947e+00,
       -4.6402788e-01, -3.2397373e+00, -1.2060776e+00,  6.2647069e-01,
      

In [33]:
clf= Pipeline([
    ("scaler", MinMaxScaler()),
    ("model", MultinomialNB())
])

clf.fit(X_train, y_train)
y_pred= clf.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.70      0.57      0.63      5955
           1       0.80      0.91      0.85      4330
           2       0.64      0.64      0.64      4813
           3       0.75      0.84      0.79      4215

    accuracy                           0.72     19313
   macro avg       0.72      0.74      0.73     19313
weighted avg       0.72      0.72      0.72     19313



In [34]:
clf2= Pipeline([
    ("scaler", MinMaxScaler()),
    ("model", KNeighborsClassifier(n_neighbors=5, metric="euclidean"))
])

clf2.fit(X_train, y_train)
y_pred2= clf2.predict(X_test)
print(classification_report(y_pred2, y_test))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89      4320
           1       0.96      0.98      0.97      4874
           2       0.96      0.91      0.94      5105
           3       0.99      0.92      0.95      5014

    accuracy                           0.94     19313
   macro avg       0.94      0.94      0.94     19313
weighted avg       0.94      0.94      0.94     19313



In [38]:
clf4= Pipeline([
    ("scaler", MinMaxScaler()),
    ("model", KNeighborsClassifier())
])

clf4.fit(X_train, y_train)
y_pred4= clf4.predict(X_test)
print(classification_report(y_pred4, y_test))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89      4320
           1       0.96      0.98      0.97      4874
           2       0.96      0.91      0.94      5105
           3       0.99      0.92      0.95      5014

    accuracy                           0.94     19313
   macro avg       0.94      0.94      0.94     19313
weighted avg       0.94      0.94      0.94     19313



In [35]:
clf3= Pipeline([
    ("scaler", MinMaxScaler()),
    ("model", RandomForestClassifier())
])

clf3.fit(X_train, y_train)
y_pred3= clf3.predict(X_test)
print(classification_report(y_pred3, y_test))

              precision    recall  f1-score   support

           0       0.97      0.94      0.96      5011
           1       0.97      0.99      0.98      4909
           2       0.97      0.98      0.98      4726
           3       0.98      0.98      0.98      4667

    accuracy                           0.97     19313
   macro avg       0.97      0.97      0.97     19313
weighted avg       0.97      0.97      0.97     19313

