<a href="https://colab.research.google.com/github/Aayush121202/NLP/blob/main/TF_IDF_ecommerce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np

In [13]:
df= pd.read_csv('Ecommerce_data.csv')
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [14]:
df.shape

(24000, 2)

In [15]:
df.label.value_counts()

Household                 6000
Electronics               6000
Clothing & Accessories    6000
Books                     6000
Name: label, dtype: int64

In [16]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label_num'] = label_encoder.fit_transform(df['label'])

df.head()

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,3
1,"Contrast living Wooden Decorative Box,Painted ...",Household,3
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,1
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,1


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.Text,
    df.label_num,
    test_size=0.2,
    random_state=2023,
    stratify= df.label_num # equal number from all classes taken
)


In [18]:
X_train.shape, X_test.shape

((19200,), (4800,))

In [20]:
y_train.value_counts()

2    4800
1    4800
0    4800
3    4800
Name: label_num, dtype: int64

In [21]:
y_test.value_counts()

3    1200
1    1200
2    1200
0    1200
Name: label_num, dtype: int64

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object
clf = Pipeline([
     ('tf_idf_vectorizer', TfidfVectorizer()),
     ('KNN', KNeighborsClassifier())
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      1200
           1       0.97      0.98      0.97      1200
           2       0.96      0.96      0.96      1200
           3       0.96      0.96      0.96      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



We are getting very good precision and accuracy.

In [27]:
X_test[:10]

17094    Crompton Jupiter 1200mm 220-240V,50Hz, Ceiling...
4797     Ahhaaaa Kids Waistcoat, Shirt, Tie and Trouser...
431      Gone: Jack Caffery series 5 Review "Lacerating...
1361     Fressia Fabrics Women'S Cotton Saree Blouse (B...
16922    The Museum of Innocence Book Description The M...
22634    Surya Warmth Heat Convector Portable Instant h...
7325     FEELBLUE Men's Cotton Bathrobe (Yellow) Cover ...
2532     Canon Pixma G2012 All-in-One Ink Tank Colour P...
19403    Philips SHE1405BK/94 In-Ear Headphone Headset ...
16649    Electric Portable Mini Dehumidifier for Damp, ...
Name: Text, dtype: object

In [28]:
y_test[:10]

17094    3
4797     1
431      2
1361     1
16922    0
22634    3
7325     3
2532     2
19403    2
16649    3
Name: label_num, dtype: int64

In [29]:
y_pred[:10]

array([3, 1, 0, 1, 0, 3, 3, 2, 2, 3])

Only one class predicted wrong, others all are correct.

In [32]:
from sklearn.naive_bayes import MultinomialNB


#1. create a pipeline object
clf = Pipeline([
     ('tf_idf_vectorizer', TfidfVectorizer()),
     ('Multi NB', MultinomialNB())
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.98      0.93      0.95      1200
           1       0.97      0.99      0.98      1200
           2       0.97      0.96      0.96      1200
           3       0.92      0.97      0.94      1200

    accuracy                           0.96      4800
   macro avg       0.96      0.96      0.96      4800
weighted avg       0.96      0.96      0.96      4800



In [33]:
from sklearn.ensemble import RandomForestClassifier


#1. create a pipeline object
clf = Pipeline([
     ('tf_idf_vectorizer', TfidfVectorizer()),
     ('random_forest', RandomForestClassifier())
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1200
           1       0.98      0.98      0.98      1200
           2       0.98      0.97      0.97      1200
           3       0.96      0.95      0.96      1200

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



Highest accuracy using Random Forest Classifier.

Now, using preprocessing.

In [34]:
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)

In [36]:
df['preprocessed_text']=df['Text'].apply(preprocess)


In [37]:
df.head()

Unnamed: 0,Text,label,label_num,preprocessed_text
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,3,Urban Ladder Eisner low Study Office Computer ...
1,"Contrast living Wooden Decorative Box,Painted ...",Household,3,contrast live Wooden Decorative Box Painted Bo...
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,2,IO Crest SY PCI40010 PCI raid Host Controller ...
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,1,ISAKAA Baby Socks bear 8 Years- Pack 4 6 8 12 ...
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,1,Indira Designer Women Art Mysore Silk Saree Bl...


In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.preprocessed_text,
    df.label_num,
    test_size=0.2,
    random_state=2023,
    stratify= df.label_num # equal number from all classes taken
)

In [39]:
from sklearn.ensemble import RandomForestClassifier


#1. create a pipeline object
clf = Pipeline([
     ('tf_idf_vectorizer', TfidfVectorizer()),
     ('random_forest', RandomForestClassifier())
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1200
           1       0.98      0.99      0.98      1200
           2       0.98      0.97      0.98      1200
           3       0.97      0.97      0.97      1200

    accuracy                           0.98      4800
   macro avg       0.98      0.98      0.98      4800
weighted avg       0.98      0.98      0.98      4800



Accuracy, precision and f1 score improved after preprocessing.