In [28]:
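# One-time environment setup: uncomment and run once in a fresh environment.
# %pip install spacy
# %pip install -U imbalanced-learn
# !python -m spacy download en_core_web_sm   # the model this notebook loads with spacy.load(); replaces the old "en" shortcut
# !python -m spacy download en_core_web_lg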

In [22]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

import spacy

In [2]:
# Peek at the first raw line of the CSV to see how the file is laid out
# (the first line is already a data record, not a header).
path = "C:/Users/Open user/Downloads/archive/ecommerceDataset.csv"
# The context manager closes the file deterministically instead of relying on
# garbage collection to release the handle.
with open(path) as csv_file:
    first_line = csv_file.readline()
first_line

'Household,"Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and some for e

In [3]:
# Load the dataset. Because `names=` is supplied, pandas treats the first line
# of the file as data rather than as a header.
# NOTE(review): this reads from a different folder than `path` defined earlier
# (Documents vs Downloads/archive) - confirm both point at the same file.
ecommerce_data = pd.read_csv(
    "C:/Users/Open user/Documents/ecommerceDataset.csv",
    names=["category", "text"],
)
ecommerce_data.head(3)

Unnamed: 0,category,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...


In [4]:
# (rows, columns) of the loaded frame
ecommerce_data.shape

(50425, 2)

In [5]:
# Column dtypes and non-null counts; a non-null count below the row count
# flags missing values in that column.
ecommerce_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  50425 non-null  object
 1   text      50424 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB


In [6]:
# Count missing values per column
ecommerce_data.isna().sum()

category    0
text        1
dtype: int64

In [7]:
# Show the row(s) whose text is missing
ecommerce_data[ecommerce_data["text"].isna()]

Unnamed: 0,category,text
39330,Clothing & Accessories,


In [8]:
# Drop rows that contain a missing value. The result is reassigned rather than
# using inplace=True, so the step is an ordinary expression that can be chained
# and does not mutate the frame object in place.
ecommerce_data = ecommerce_data.dropna()

In [9]:
# Re-check the per-column null counts to confirm the missing row has been dropped
ecommerce_data.isna().sum()

category    0
text        0
dtype: int64

In [10]:
# Class distribution of the target column
ecommerce_data["category"].value_counts()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8670
Name: count, dtype: int64

In [11]:
# Encode the target: map each category name to an integer class id
category_to_label = {
    "Household": 0,
    "Books": 1,
    "Electronics": 2,
    "Clothing & Accessories": 3,
}
ecommerce_data["category_label"] = ecommerce_data["category"].map(category_to_label)

In [12]:
# Sanity check: list the distinct encoded label values
ecommerce_data["category_label"].unique()

array([0, 1, 3, 2], dtype=int64)

In [13]:
# Preview the first five rows, now including the encoded label
ecommerce_data.head()

Unnamed: 0,category,text,category_label
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


In [14]:
# Stratified 75/25 train/test split on the raw text
labels = ecommerce_data["category_label"]
X_train, X_test, y_train, y_test = train_test_split(
    ecommerce_data["text"],
    labels,
    test_size=0.25,
    random_state=2022,
    stratify=labels,
)

In [15]:
# First five training texts
X_train.head()

43762    DOMO nHance CM230B Anaglyph Passive Cyan and M...
30292    Bihar Diaries: The True Story of How Bihar's M...
8537     Generic 12-Watt Round LED Surface Panel Light ...
43713    REES52 12v 1.3Ah SMF/VRLA Battery Batteries ar...
36105    Royal Export Women's Georgette Long Party Wear...
Name: text, dtype: object

In [16]:
# First five training labels (same rows as the texts previewed above)
y_train.head()

43762    2
30292    1
8537     0
43713    2
36105    3
Name: category_label, dtype: int64

In [17]:
# Class counts in the training split (the split was stratified on the label)
y_train.value_counts()

category_label
0    14485
1     8865
2     7966
3     6502
Name: count, dtype: int64

In [None]:
# Naive Bayes

In [18]:
# Raw-text baseline: unigram + bigram counts -> multinomial Naive Bayes
nb = Pipeline(steps=[
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("Multi NB", MultinomialNB()),
])
y_pred = nb.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96      4828
           1       0.98      0.94      0.96      2955
           2       0.96      0.95      0.96      2655
           3       0.97      0.98      0.97      2168

    accuracy                           0.96     12606
   macro avg       0.97      0.96      0.96     12606
weighted avg       0.96      0.96      0.96     12606



# Random Forest

In [19]:
# Raw-text model 2: unigram + bigram counts -> random forest.
# random_state is pinned (same seed as the train/test split) so the forest, and
# therefore the reported metrics, are reproducible across kernel restarts.
rf = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 2))),
    ("Model", RandomForestClassifier(random_state=2022)),
])
rf.fit(X_train, y_train)
y_pred2 = rf.predict(X_test)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96      4828
           1       0.97      0.98      0.97      2955
           2       0.98      0.92      0.95      2655
           3       0.99      0.96      0.97      2168

    accuracy                           0.96     12606
   macro avg       0.97      0.96      0.97     12606
weighted avg       0.97      0.96      0.96     12606



In [29]:
# Accuracy of the raw-text models, best first
raw_text_scores = {
    'MultinomialNB': accuracy_score(y_test, y_pred),
    'Random Forest': accuracy_score(y_test, y_pred2),
}
un_processed_text_models = pd.DataFrame(
    list(raw_text_scores.items()),
    columns=['models', 'Accuracy'],
).sort_values(by='Accuracy', ascending=False)
un_processed_text_models

Unnamed: 0,models,Accuracy
1,Random Forest,0.964779
0,MultinomialNB,0.963271


# Using the preprocessed text

In [23]:
#Function to remove stopwords, punctuation and lemmatize
# Only token lemmas and the stop-word / punctuation flags are read from the
# parsed documents in this notebook, so the dependency parser and the named
# entity recognizer are disabled to cut per-document processing cost.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess_text(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    The text is run through the module-level spaCy pipeline `nlp`; the lemmas
    of the tokens that are neither stop words nor punctuation are joined with
    single spaces.
    """
    kept_lemmas = (
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    )
    return " ".join(kept_lemmas)

In [24]:
# Clean and lemmatize every product description. preprocess_text is called once
# per row, so this cell can take a while on the full dataset.
ecommerce_data["preprocessed_text"] = ecommerce_data["text"].map(preprocess_text)

In [25]:
# Same stratified 75/25 split (same seed), now on the preprocessed text.
# This rebinds X_train / X_test / y_train / y_test from the raw-text split, so
# cells below this point operate on the preprocessed features.
labels = ecommerce_data["category_label"]
X_train, X_test, y_train, y_test = train_test_split(
    ecommerce_data["preprocessed_text"],
    labels,
    test_size=0.25,
    random_state=2022,
    stratify=labels,
)

In [26]:
# Compare raw and preprocessed text side by side for the first three rows
ecommerce_data.head(3)

Unnamed: 0,category,text,category_label,preprocessed_text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0,SAF Floral Framed Painting Wood 30 inch x 10 i...
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0,saf uv texture Modern Art Print Framed Paintin...


In [27]:
# Preprocessed-text model 1: unigram counts -> multinomial Naive Bayes.
# NOTE(review): the raw-text Naive Bayes run used ngram_range=(1,2) while this
# one uses the default unigrams, so the two runs differ in n-gram range as well
# as in preprocessing - confirm that is intended.
nb = Pipeline(steps=[
    ("vectorizer", CountVectorizer()),
    ("Multi NB", MultinomialNB()),
])
y_pred_nb = nb.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      4828
           1       0.97      0.93      0.95      2955
           2       0.94      0.94      0.94      2655
           3       0.95      0.98      0.97      2168

    accuracy                           0.95     12606
   macro avg       0.95      0.95      0.95     12606
weighted avg       0.95      0.95      0.95     12606



In [28]:
# Preprocessed-text model 2: unigram counts -> random forest.
# random_state is pinned (same seed as the train/test split) so the forest, and
# therefore the reported metrics, are reproducible across kernel restarts.
rf2 = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("Model", RandomForestClassifier(random_state=2022)),
])
rf2.fit(X_train, y_train)
y_pred_rf = rf2.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      4828
           1       0.98      0.98      0.98      2955
           2       0.98      0.95      0.96      2655
           3       0.99      0.98      0.98      2168

    accuracy                           0.97     12606
   macro avg       0.98      0.97      0.97     12606
weighted avg       0.97      0.97      0.97     12606



In [31]:
# Accuracy of the preprocessed-text models, best first
processed_text_scores = {
    'MultinomialNB': accuracy_score(y_test, y_pred_nb),
    'Random Forest': accuracy_score(y_test, y_pred_rf),
}
processed_text_models = pd.DataFrame(
    list(processed_text_scores.items()),
    columns=['models', 'Accuracy'],
).sort_values(by='Accuracy', ascending=False)
processed_text_models

Unnamed: 0,models,Accuracy
1,Random Forest,0.972553
0,MultinomialNB,0.951531
