In [78]:
import pandas as pd

df= pd.read_csv("ecommerceDataset.csv", names=["category", "description"], header=None)

df.head(3)

Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...


In [79]:
# df1 = pd.read_csv("ecommerceDataset.csv", names=["category", "description"], header=None)
# df1.head(3)

In [80]:
df.isna().sum()

category       0
description    1
dtype: int64

In [81]:
df = df.dropna()

In [82]:
df.isna().sum()

category       0
description    0
dtype: int64

In [83]:
df.category.replace("Clothing & Accessories", "Clothing_Accessories", inplace=True)

In [84]:
df.category.unique()

array(['Household', 'Books', 'Clothing_Accessories', 'Electronics'],
      dtype=object)

In [85]:
df['category'] = '__label__' + df['category'].astype(str)
df.head(5)

Unnamed: 0,category,description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [86]:
df['category_description'] = df['category'] + ' ' + df['description']
df

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__Household SAF Flower Print Framed Pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__Household Incredible Gifts India Wood...
...,...,...,...
50420,__label__Electronics,Strontium MicroSD Class 10 8GB Memory Card (Bl...,__label__Electronics Strontium MicroSD Class 1...
50421,__label__Electronics,CrossBeats Wave Waterproof Bluetooth Wireless ...,__label__Electronics CrossBeats Wave Waterproo...
50422,__label__Electronics,Karbonn Titanium Wind W4 (White) Karbonn Titan...,__label__Electronics Karbonn Titanium Wind W4 ...
50423,__label__Electronics,"Samsung Guru FM Plus (SM-B110E/D, Black) Colou...",__label__Electronics Samsung Guru FM Plus (SM-...


In [87]:
df[df['category']=='Clothing & Accessories']

Unnamed: 0,category,description,category_description


In [88]:
import re
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower() 

In [89]:
df['category_description'] = df['category_description'].map(preprocess)

In [90]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2)

In [91]:
train.to_csv("ecommerce.train", columns=["category_description"], index=False, header=False)
test.to_csv("ecommerce.test", columns=["category_description"], index=False, header=False)

In [92]:
import fasttext
model = fasttext.train_supervised(input="ecommerce.train")
model.test("ecommerce.test")

(10084, 0.9698532328441095, 0.9698532328441095)

## DIFFERENT unsupervised

In [119]:
train.to_csv("ecommerce_train.txt", columns=["category_description"], index=False, header=False)
test.to_csv("ecommerce_test.txt", columns=["category_description"], index=False, header=False)

In [129]:
mdel = fasttext.train_unsupervised("ecommerce_train.txt")
mdel.test("ecommerce_test.txt")

ValueError: Model needs to be supervised for prediction!

## CONTINUE MAIN PROGRAM

In [93]:
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

(('__label__electronics',), array([0.98804116]))

In [124]:
model.get_nearest_neighbors("intel")

[(0.9952906966209412, 'adapater'),
 (0.9952906966209412, '14890783'),
 (0.9949653744697571, 'voidspecification'),
 (0.9949653744697571, 'substancesmicro'),
 (0.9949653744697571, 'sirius'),
 (0.9949653744697571, 'deviod'),
 (0.9949653744697571, 'fragrancesit'),
 (0.9949653744697571, 'wipesa'),
 (0.9941093325614929, 'visbility'),
 (0.9940900802612305, 'tasker')]

In [125]:
mdel.get_nearest_neighbors("intel")

[(0.9957075119018555, 'adaper'),
 (0.9954760670661926, 'cz48'),
 (0.9954760670661926, 'bulkiest'),
 (0.9953689575195312, 'visbility'),
 (0.9952864646911621, 'impressionable'),
 (0.9950974583625793, 'improvhome'),
 (0.9949110150337219, 'cemented'),
 (0.994904637336731, 'wipesa'),
 (0.994904637336731, 'substancesmicro'),
 (0.994904637336731, 'fragrancesit')]

In [95]:
model.get_nearest_neighbors("sony")

[(0.9993656873703003, 'buetooth'),
 (0.9993656873703003, 'uspto'),
 (0.9993656873703003, 'camkixif'),
 (0.9993656873703003, '3o'),
 (0.9993656873703003, '86127507'),
 (0.9993593692779541, 'internatonal'),
 (0.9992615580558777, 'cmyk'),
 (0.999249279499054, 'bitconnectors2'),
 (0.999249279499054, '400mbps3'),
 (0.999249279499054, 'connection1')]

In [126]:
mdel.get_nearest_neighbors("sony")

[(0.9993314743041992, "vehicles'"),
 (0.9992965459823608, '2120g'),
 (0.9992965459823608, '6lbpackage'),
 (0.9992965459823608, '3440g'),
 (0.9992886185646057, 'ditail'),
 (0.9992886185646057, 'desprition'),
 (0.9992886185646057, 'pr0duct'),
 (0.9991365075111389, '3hcce'),
 (0.9991365075111389, '2550mah'),
 (0.9991331100463867, 'payers')]

In [96]:

model.get_nearest_neighbors("banglore")

[(0.0, 'to'),
 (0.0, 'and'),
 (0.0, 'a'),
 (0.0, 'with'),
 (0.0, 'for'),
 (0.0, 'is'),
 (0.0, '</s>'),
 (0.0, 'crazyink'),
 (0.0, 'maintaing'),
 (0.0, 'offswitch')]

In [108]:
model.predict("shirts pants skirts clothes ")

(('__label__clothing_accessories',), array([0.65490621]))

In [127]:
mdel.predict("shirts pants skirts clothes ")

(('__label__clothing_accessories',), array([0.7084834]))

In [113]:
model.predict("speakers")

(('__label__books',), array([0.99681747]))

In [128]:
mdel.predict("speakers")

(('__label__books',), array([0.99652874]))