In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_csv('data/ecommerceDataset.csv', header = None)

In [3]:
df.head()

Unnamed: 0,0,1
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [4]:
df.columns = ['category', 'text']

In [5]:
df.head()

Unnamed: 0,category,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [6]:
df.category.value_counts()

category
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [7]:
df['label'] = df['category'].map({
    'Household' : 0,
    'Books' : 1,
    'Electronics' : 2,
    'Clothing & Accessories' : 3,
})

In [8]:
df.head()

Unnamed: 0,category,text,label
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


In [9]:
def dataframe_summary(dataframe):
    """
    Generates a summary DataFrame containing information about null values, number of unique values,
    and duplicated rows for each column in the input DataFrame.

    Parameters:
    dataframe (pandas DataFrame): The DataFrame to be summarized.

    Returns:
    pandas DataFrame: A summary DataFrame containing information about null values, number of unique values,
    and duplicated rows for each column in the input DataFrame.
    """

    null_counts = dataframe.isnull().sum()
    unique_counts = dataframe.nunique()
    duplicated_counts = dataframe.duplicated().sum()

    summary_df = pd.DataFrame({
        'Null Values': null_counts,
        'Unique Values': unique_counts,
        'Duplicated Rows': duplicated_counts
    })

    return summary_df

dataframe_summary(df)

Unnamed: 0,Null Values,Unique Values,Duplicated Rows
category,0,4,22622
text,1,27802,22622
label,0,4,22622


In [10]:
def duplicated_rows(dataframe):
    """
    Finds duplicated rows in the input DataFrame and returns a DataFrame containing them.

    Parameters:
    dataframe (pandas DataFrame): The DataFrame to search for duplicated rows.

    Returns:
    pandas DataFrame: A DataFrame containing duplicated rows from the input DataFrame.
    """

    duplicated_rows = dataframe[dataframe.duplicated()]
    return duplicated_rows

duplicated_rows(df)

Unnamed: 0,category,text,label
7,Household,Pitaara Box Romantic Venice Canvas Painting 6m...,0
11,Household,Paper Plane Design Starry Night Vangoh Wall Ar...,0
12,Household,Pitaara Box Romantic Venice Canvas Painting 6m...,0
16,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
20,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
...,...,...,...
50420,Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...,2
50421,Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...,2
50422,Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...,2
50423,Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou...",2


In [11]:
df = df.dropna()

In [12]:
df.head()

Unnamed: 0,category,text,label
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,0
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
4,Household,Incredible Gifts India Wooden Happy Birthday U...,0


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.25, random_state=42)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf_knn = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('knn', KNeighborsClassifier())
    ])

In [15]:
clf_knn.fit(X_train, y_train)

y_pred = clf_knn.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      4839
           1       0.97      0.95      0.96      2942
           2       0.95      0.95      0.95      2623
           3       0.97      0.97      0.97      2202

    accuracy                           0.96     12606
   macro avg       0.96      0.96      0.96     12606
weighted avg       0.96      0.96      0.96     12606



In [16]:
from sklearn.naive_bayes import MultinomialNB

clf_nb = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('nb', MultinomialNB())
    ])

In [17]:
clf_nb.fit(X_train, y_train)

y_pred = clf_nb.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.98      0.93      4839
           1       0.97      0.92      0.94      2942
           2       0.96      0.90      0.93      2623
           3       0.98      0.93      0.95      2202

    accuracy                           0.94     12606
   macro avg       0.95      0.93      0.94     12606
weighted avg       0.94      0.94      0.94     12606



In [18]:
from sklearn.ensemble import RandomForestClassifier

clf_rf = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('rf', RandomForestClassifier(n_estimators=100))
    ])

In [19]:
clf_rf.fit(X_train, y_train)

y_pred = clf_rf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.96      4839
           1       0.98      0.97      0.97      2942
           2       0.98      0.94      0.96      2623
           3       0.98      0.97      0.98      2202

    accuracy                           0.97     12606
   macro avg       0.97      0.97      0.97     12606
weighted avg       0.97      0.97      0.97     12606



In [20]:
import spacy

nlp = spacy.load("en_core_web_trf")

def preprocessing(text):
    doc = nlp(text)
    filtered_text = []
    
    for token in doc:
        if token.is_stop and token.is_punct and token.is_space:
            continue
        filtered_text.append(token.lemma_)
        
    return ''.join(filtered_text)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [21]:
df['preprocessed_text'] = df['text'].apply(preprocessing)

In [None]:
df.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed_text'], df['label'], test_size=0.25, random_state=42)

In [None]:
clf_knn.fit(X_train, y_train)

y_pred = clf_knn.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
clf_nb.fit(X_train, y_train)

y_pred = clf_nb.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
clf_rf.fit(X_train, y_train)

y_pred = clf_rf.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
clf_rf.fit(X_train, y_train)

y_pred = clf_rf.predict(X_test)

print(classification_report(y_test, y_pred))