In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import re

import spacy

In [3]:
ecommerce_data= pd.read_csv("ecommerceDataset.csv", names=['category', 'text'])
ecommerce_data[:5]

Unnamed: 0,category,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [4]:
ecommerce_data.shape

(50425, 2)

In [5]:
ecommerce_data.category.value_counts()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [6]:
ecommerce_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  50425 non-null  object
 1   text      50424 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB


In [7]:
ecommerce_data["category_label"]= ecommerce_data.category.map({
    "Household": 0,
    "Books": 1,
    "Electronics": 2,
    "Clothing & Accessories": 3
})

In [8]:
ecommerce_data[:5]

Unnamed: 0,category,text,category_label
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


In [9]:
#Checking for missing values
ecommerce_data.isnull().sum()

category          0
text              1
category_label    0
dtype: int64

In [10]:
ecommerce_data[ecommerce_data.text.isnull()]

Unnamed: 0,category,text,category_label
39330,Clothing & Accessories,,3


In [11]:
#Replacing missing missing value with zero
ecommerce_data["text"]= ecommerce_data.text.fillna(0)

In [12]:
#Dropping row with missing value
ecommerce_data= ecommerce_data[ecommerce_data.text != 0]

In [13]:
ecommerce_data.shape

(50424, 3)

In [14]:
ecommerce_data[:5]

Unnamed: 0,category,text,category_label
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


In [15]:
nlp= spacy.load("en_core_web_lg")

In [16]:
doc= nlp("Paper Plane Design Framed Wall Hanging Motivation")
doc[0].vector.shape

(300,)

In [17]:
doc[1].vector[:5]

array([ 0.033539,  1.0758  ,  4.2654  ,  2.4083  , -1.4626  ],
      dtype=float32)

In [18]:
doc[1].vector.shape

(300,)

In [19]:
ecommerce_data["vector"]= ecommerce_data["text"].apply(lambda x: nlp(x).vector)

In [20]:
ecommerce_data[:5]

Unnamed: 0,category,text,category_label,vector
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0,"[-2.306548, 0.30390584, -2.51085, 0.06632433, ..."
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0,"[-2.6766863, -1.1272602, -1.1009779, 0.4870543..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0,"[-2.414381, 0.7888623, -2.8213925, 0.8304517, ..."
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0,"[-2.3844197, 0.7978983, -2.7950675, 0.74301887..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0,"[-0.5195761, -0.46975613, -1.4031031, -0.38848..."


In [34]:
X_train, X_test, y_train, y_test= train_test_split(
    ecommerce_data.vector.values, 
    ecommerce_data.category_label,
    test_size= 0.25,
    random_state= 2022,
    stratify= ecommerce_data.category_label)

In [35]:
y_test.value_counts()

category_label
0    4828
1    2955
2    2655
3    2168
Name: count, dtype: int64

In [36]:
y_train.value_counts()

category_label
0    14485
1     8865
2     7966
3     6502
Name: count, dtype: int64

In [37]:
X_train[:5]

array([array([-1.15011239e+00,  7.32444406e-01, -3.21626210e+00, -9.53763425e-02,
               2.33316731e+00,  5.15002072e-01,  9.24209297e-01,  3.34508252e+00,
              -1.88041162e+00, -5.84636219e-02,  5.62545061e+00,  1.72163761e+00,
              -4.25379467e+00,  2.36286569e+00,  1.65008187e+00,  9.76917922e-01,
               2.04230213e+00, -8.72460604e-01, -1.67010641e+00, -2.71721768e+00,
               1.29622388e+00,  5.05230874e-02, -2.65119344e-01, -1.40498841e+00,
              -1.75763822e+00, -1.14565444e+00, -1.73212051e+00, -2.72036195e+00,
              -9.04822469e-01,  1.54686284e+00,  1.84197092e+00, -1.11679888e+00,
              -1.59374416e-01, -1.18308365e+00, -6.59013450e-01,  1.41197348e+00,
               9.44045603e-01,  1.36637437e+00,  3.13103652e+00,  1.08294189e+00,
               1.39737046e+00, -2.37096772e-01, -7.15362370e-01, -5.41786611e-01,
              -1.93079221e+00,  2.54355574e+00,  1.48725760e+00, -2.59833598e+00,
              -5

In [39]:
X_test[:3]

array([array([-1.0987113 , -0.3239043 , -0.8740661 ,  0.4303316 ,  4.2347546 ,
               0.28422326,  0.6539228 ,  3.5913353 , -0.9548351 , -0.74147093,
               5.411868  ,  1.4595894 , -4.2286024 ,  1.119557  ,  0.5897979 ,
               1.0923961 ,  2.0544386 ,  0.57087123, -2.021661  , -2.0252602 ,
               0.11863516,  0.25833765, -2.084481  ,  1.1346924 ,  0.12073611,
              -1.9574301 , -2.729089  , -0.42588097, -1.037963  ,  1.8718283 ,
               0.23750816, -0.6944425 , -0.20043199, -1.7068294 , -1.952275  ,
              -0.36888006,  0.47267073,  0.11054955,  0.6899071 ,  0.07541832,
               1.2740715 ,  0.46832478, -0.9906793 ,  0.4949778 , -1.6020947 ,
               1.0993625 ,  0.9571466 , -1.4419674 , -0.48554188, -0.9740905 ,
              -0.8489973 ,  1.9120287 , -0.6826305 , -3.9441025 , -0.46755615,
               1.8996851 , -1.5900506 ,  0.8503717 ,  1.1709796 , -1.7771786 ,
               0.02624725,  0.2943733 , -1.2222358 ,

In [41]:
#Converting X_train to 2 dimentional array to make it suitable for sklearn model training
X_train_stacked= np.stack(X_train)
X_train_stacked

array([[-1.1501124 ,  0.7324444 , -3.216262  , ..., -1.3298172 ,
        -2.8925438 ,  0.3502695 ],
       [-1.9359345 , -0.18888246, -0.8203244 , ..., -1.6595446 ,
        -2.108214  ,  0.38784555],
       [-1.3366245 ,  0.6276342 , -2.9608674 , ..., -0.86737233,
        -2.326306  ,  0.89895755],
       ...,
       [-2.9003465 , -0.42902943, -0.41330007, ..., -2.252416  ,
        -2.9399526 ,  0.00471165],
       [-1.8940018 , -0.8107042 , -1.4286046 , ..., -2.0909934 ,
        -0.9805882 , -0.34726417],
       [-2.0616357 , -0.5582499 , -2.012985  , ..., -1.2399536 ,
        -2.2518113 ,  0.48242924]], dtype=float32)

In [42]:
#Converting X_test to 2 dimentional array to make it suitable for sklearn model training
X_test_stacked= np.stack(X_test)
X_test_stacked

array([[-1.0987113 , -0.3239043 , -0.8740661 , ..., -2.4616654 ,
        -2.6854649 ,  0.49346226],
       [-1.3981475 , -1.6054499 , -0.46969843, ..., -2.310577  ,
        -0.57279843,  0.35338292],
       [-0.37394238, -4.804106  ,  1.3103735 , ..., -2.936149  ,
         0.17135048, -0.45164755],
       ...,
       [-1.9795557 ,  1.3175062 , -1.1124188 , ..., -0.33096367,
        -2.4543965 ,  0.77800167],
       [-2.6978967 , -2.0113392 , -1.8792033 , ..., -1.5019835 ,
        -1.4402246 , -0.19145222],
       [-1.8333997 , -0.20488927, -1.092316  , ..., -2.5824306 ,
        -2.6676002 ,  0.5403807 ]], dtype=float32)

In [45]:
X_test_stacked.shape

(12606, 300)

In [49]:
clf= Pipeline([
    ("scaler", MinMaxScaler()),
    ("model", MultinomialNB())
])

clf.fit(X_train_stacked, y_train)
y_pred= clf.predict(X_test_stacked)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.95      0.48      0.64      9637
           1       0.60      0.96      0.74      1857
           2       0.21      0.59      0.31       947
           3       0.07      0.96      0.14       165

    accuracy                           0.56     12606
   macro avg       0.46      0.74      0.45     12606
weighted avg       0.83      0.56      0.62     12606



In [50]:
scaler= MinMaxScaler()
X_train_scaled= scaler.fit_transform(X_train_stacked)
X_test_scaled= scaler.fit_transform(X_test_stacked)

In [52]:
nbc= MultinomialNB()
nbc.fit(X_train_scaled, y_train)
y_pred= nbc.predict(X_test_scaled)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.93      0.50      0.65      9084
           1       0.68      0.95      0.79      2100
           2       0.26      0.57      0.36      1217
           3       0.09      0.96      0.17       205

    accuracy                           0.59     12606
   macro avg       0.49      0.75      0.49     12606
weighted avg       0.81      0.59      0.64     12606



In [56]:
clf2= Pipeline([
    ("scaler", MinMaxScaler()),
    ("model", KNeighborsClassifier(n_neighbors=5, metric="euclidean"))
])

clf2.fit(X_train_stacked, y_train)
y_pred2= clf2.predict(X_test_stacked)
print(classification_report(y_pred2, y_test))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      5013
           1       0.92      0.97      0.94      2788
           2       0.90      0.90      0.90      2643
           3       0.91      0.92      0.92      2162

    accuracy                           0.92     12606
   macro avg       0.91      0.92      0.92     12606
weighted avg       0.92      0.92      0.92     12606



In [54]:
clf3= Pipeline([
    ("scaler", MinMaxScaler()),
    ("model", RandomForestClassifier())
])

clf3.fit(X_train_stacked, y_train)
y_pred3= clf3.predict(X_test_stacked)
print(classification_report(y_pred3, y_test))

              precision    recall  f1-score   support

           0       0.98      0.92      0.95      5106
           1       0.97      0.98      0.98      2928
           2       0.92      0.97      0.95      2514
           3       0.93      0.98      0.95      2058

    accuracy                           0.96     12606
   macro avg       0.95      0.96      0.96     12606
weighted avg       0.96      0.96      0.96     12606

