Dataset: https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification

In [1]:
import pandas as pd

df= pd.read_csv("ecommerceDataset.csv", names=["category", "description"], header=None)
print(df.shape)
df.head(3)

(50425, 2)


Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...


In [2]:
df.dropna(inplace=True)
df.shape

(50424, 2)

In [3]:
df.category.unique()

array(['Household', 'Books', 'Clothing & Accessories', 'Electronics'],
      dtype=object)

In [4]:
df.category.replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.category.replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)


In [5]:
df.category.unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [6]:
df['category'] = '__label__' + df['category'].astype(str)
df.head(5)

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [7]:
df['category_description'] = df['category'] + ' ' + df['description']
df.head(3)

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...


In [8]:
import re

text = "  VIKI's | Bookcase/Bookshelf (3-Shelf/Shelve, White) | ? . hi"
text = re.sub(r'[^\w\s\']',' ', text)
text = re.sub(' +', ' ', text)
text.strip().lower()

"viki's bookcase bookshelf 3 shelf shelve white hi"

In [9]:
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower() 

In [10]:
df['category_description'] = df['category_description'].map(preprocess)
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


In [11]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [12]:
train.shape, test.shape

((40339, 3), (10085, 3))

In [15]:
train.to_csv("ecommerce.train", columns=["category_description"], index=False, header=False)
test.to_csv("ecommerce.test", columns=["category_description"], index=False, header=False)

In [16]:
import fasttext

model = fasttext.train_supervised(input="ecommerce.train")
model.test("ecommerce.test")

(10081, 0.9687530998908839, 0.9687530998908839)

In [17]:
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

(('__label__electronics',), array([0.99441987]))

In [18]:
model.predict("ockey men's cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric")


(('__label__clothing_accessories',), array([1.00001001]))

In [19]:
model.get_word_vector("sony")

array([-0.05928512,  0.08704174, -0.5860384 ,  0.0934303 , -0.08360924,
       -0.21244097,  0.3342601 ,  0.004716  , -0.19015004, -0.23530415,
        0.2867055 ,  0.00465215,  0.10817937, -0.06184506,  0.11601364,
        0.03530942, -0.1259692 ,  0.130656  ,  0.3585445 , -0.04751709,
        0.18952788,  0.00088067, -0.10851778,  0.13309403,  0.01920568,
        0.13868093, -0.14410962,  0.10870233,  0.10911786,  0.05133952,
       -0.04807989,  0.0563388 ,  0.23451917,  0.31605917,  0.17203937,
        0.15611047,  0.04184502, -0.19942096, -0.09053914, -0.03030651,
        0.19275236, -0.14600998, -0.04322247, -0.09681991, -0.27137253,
       -0.14790128, -0.1649573 ,  0.16422665, -0.0541787 , -0.07945853,
       -0.13754864,  0.10464923,  0.10291084, -0.07899626, -0.12886034,
       -0.20645498,  0.1366134 , -0.04700276, -0.00715541,  0.02868391,
       -0.01816451,  0.08205365, -0.22783493, -0.24764355, -0.1657109 ,
       -0.02606304,  0.22809716,  0.07688185, -0.0151119 ,  0.31

In [21]:
model.get_nearest_neighbors("pant")

[(0.9975021481513977, 'mens'),
 (0.9971955418586731, 'hanes'),
 (0.9964262247085571, 'slim'),
 (0.9961525201797485, "boy's"),
 (0.996063768863678, 'fashionable'),
 (0.9957043528556824, 'polka'),
 (0.9954155087471008, 'sweatshirt'),
 (0.9952267408370972, 'skirt'),
 (0.9940529465675354, 'breast'),
 (0.993574857711792, 'womens')]