In [6]:
import pandas as pd
import spacy
# Read the dataset from a CSV file located relative to the working directory.
df = pd.read_csv("Ecommerce_data.csv")
# Report (rows, columns) before previewing the first rows.
print(df.shape)
df.head()

(24000, 2)


Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [2]:
# Count how many rows carry each distinct value of the `label` column.
df.label.value_counts()

label
Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: count, dtype: int64

In [3]:
# Encode the category names as integers in a new `label_new` column.
# Only 'Electronics' maps to 1; the other three categories all map to 0.
# NOTE(review): this collapses four categories into two classes - confirm that
# a one-vs-rest target is intended rather than one id per category.
df['label_new'] = df['label'].map({
    'Household': 0,
    'Electronics': 1,
    'Clothing & Accessories': 0,
    'Books': 0,
})

In [None]:
# NOTE(review): bare display of the whole frame in a cell with no execution
# count - looks like a leftover scratch cell; confirm whether it is still needed.
df

In [4]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,Text,label,label_new
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,0
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,0


In [32]:
# Load the "en_core_web_sm" spaCy pipeline once; preprocess() reuses it on every call.
nlp = spacy.load("en_core_web_sm")


def preprocess(text):
    """Return `text` reduced to the lemmas of its informative tokens.

    The text is run through the module-level `nlp` pipeline. Tokens flagged as
    punctuation or as stop words are dropped; the lemma of every remaining
    token is kept, and the lemmas are joined with single spaces.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_punct or token.is_stop)
    ]
    return " ".join(lemmas)

In [33]:
# Run preprocess() on every entry of `Text`, one call per row, and store the
# cleaned strings in a new `text_new` column.
df['text_new'] = df['Text'].map(preprocess)
df.head()

Unnamed: 0,Text,label,text_new
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,Urban Ladder Eisner low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",Household,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,IO Crest SY PCI40010 PCI raid Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,Indira Designer Women Art Mysore Silk Saree Bl...


In [35]:
# Slice out the first entry of the preprocessed column (returned as a one-element Series).
df.text_new[:1]

0    Urban Ladder Eisner low Study Office Computer ...
Name: text_new, dtype: object

In [36]:
# Take the raw text of the row labelled 0 as a single sample for a spot check.
df_t = df.loc[0, 'Text']

In [37]:
# Run the preprocessing function on the single sample text and print the full result.
preprocess_txt = preprocess(df_t)
print(preprocess_txt)

Urban Ladder Eisner low Study Office Computer Chair(Black study simple Eisner study chair firm foam cushion make long hour desk comfortable flexible mesh design air circulation support lean curved arm provide ergonomic forearm support adjust height gas lift find comfortable position nylon castor easy space chrome leg refer image dimension detail assembly require UL team time delivery indoor use


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [40]:
# Encode the category names as integers in `label_new`: 'Electronics' -> 1,
# every other category -> 0.
# This repeats the mapping cell that appears earlier in the notebook; on a
# top-to-bottom run it reassigns the same values.
# NOTE(review): four categories are collapsed into two classes - confirm that
# a one-vs-rest target is intended.
df['label_new'] = df['label'].map({
    'Household': 0,
    'Electronics': 1,
    'Clothing & Accessories': 0,
    'Books': 0,
})

In [43]:
# Hold out 20% of the rows for evaluation. The split is stratified on
# `label_new` and uses a fixed seed so the same partition is produced each run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text_new'],
    df['label_new'],
    test_size=0.2,
    stratify=df['label_new'],
    random_state=2000,
)

In [54]:
# TF-IDF features feeding a k-nearest-neighbours classifier, both constructed
# with their default arguments.
clf = Pipeline(steps=[
    ('vectorizer_tdf', TfidfVectorizer()),
    ('KNeighborsClassifier', KNeighborsClassifier()),
])

# fit() returns the fitted pipeline, so training and prediction are chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Model: KNeighborsClassifier | TfidfVectorizer ")
print(classification_report(y_test, y_pred))

Model: KNeighborsClassifier | TfidfVectorizer 
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3600
           1       0.97      0.97      0.97      1200

    accuracy                           0.99      4800
   macro avg       0.98      0.98      0.98      4800
weighted avg       0.99      0.99      0.99      4800



In [55]:
# TF-IDF features feeding a multinomial naive Bayes classifier, both
# constructed with their default arguments.
clf = Pipeline(steps=[
    ('vectorizer_tdf', TfidfVectorizer()),
    ('MultinomialNB', MultinomialNB()),
])

# fit() returns the fitted pipeline, so training and prediction are chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("Model: MultinomialNB | TfidfVectorizer ")
print(classification_report(y_test, y_pred))

Model: MultinomialNB | TfidfVectorizer 
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3600
           1       0.98      0.95      0.97      1200

    accuracy                           0.98      4800
   macro avg       0.98      0.97      0.98      4800
weighted avg       0.98      0.98      0.98      4800



In [56]:
# TF-IDF features feeding a random forest.
# The forest is built with randomness (bootstrap samples, feature sub-sampling),
# so it is seeded with the same value as the train/test split; without a
# `random_state` the report below changes from run to run.
clf = Pipeline(
    [
        ('vectorizer_tdf', TfidfVectorizer()),
        ('RandomForestClassifier', RandomForestClassifier(random_state=2000)),
    ]
)

clf.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test)
print("Model: RandomForestClassifier | TfidfVectorizer ")
print(classification_report(y_test, y_pred))

Model: RandomForestClassifier | TfidfVectorizer 
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3600
           1       0.99      0.98      0.98      1200

    accuracy                           0.99      4800
   macro avg       0.99      0.99      0.99      4800
weighted avg       0.99      0.99      0.99      4800



In [50]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words (unigram counts) features feeding a random forest.
# The forest is seeded with the same value as the train/test split so the
# report is reproducible; without a `random_state` it changes from run to run.
clf = Pipeline(
    [
        ('vectorizer_bow', CountVectorizer(ngram_range=(1, 1))),
        ('RandomForestClassifier', RandomForestClassifier(random_state=2000)),
    ]
)

clf.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test)

# Header line in the same format as the other model cells, so this report can
# be told apart from the TF-IDF random-forest report.
print("Model: RandomForestClassifier | CountVectorizer ")
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3600
           1       0.99      0.98      0.98      1200

    accuracy                           0.99      4800
   macro avg       0.99      0.99      0.99      4800
weighted avg       0.99      0.99      0.99      4800



In [51]:
# Preview the first rows of the frame, including the derived columns.
df.head()

Unnamed: 0,Text,label,text_new,label_new
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,Urban Ladder Eisner low Study Office Computer ...,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,contrast live Wooden Decorative Box Painted Bo...,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,IO Crest SY PCI40010 PCI raid Host Controller ...,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...,0
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,Indira Designer Women Art Mysore Silk Saree Bl...,0
