In [None]:
# Install the fasttext
pip install fasttext

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import fasttext
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split

<b> Dataset 

In [None]:
# Read the Dataset
dataset = pd.read_csv('ecommerceDataset.csv' , names = ['Category' , 'Description'] , header = None)

# Display the Data 
dataset.head(2)

Unnamed: 0,Category,Description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."


In [None]:
# Display the Shape of the Dataset
print('The Shape of the Dataset is : {}'.format(dataset.shape))

In [None]:
# Find out the Missing Values in the Dataset
nan_values = dataset.isnull().sum()
print(nan_values)

In [None]:
# Drop Nan Values in the Dataset
dataset.dropna(inplace = True)

# Display the Shape of the Dataset
print('The Shape of the Dataset is : {}'.format(dataset.shape))

In [None]:
# Find out the Duplicate Row 
dupRow = dataset.duplicated().sum()
print('Total Duplicated Row in the Dataset is : {}'.format(dupRow))

In [None]:
# Remove the Duplicated Row
dataset.drop_duplicates(inplace = True)

# Display the Shape of the Dataset
print('The Shape of the Dataset is : {}'.format(dataset.shape))

In [None]:
# Find out the Count of Each Class
countValues = dataset['Category'].value_counts()
print(countValues)

In [None]:
# Replace the Class Label (Clothing & Accessories)
dataset['Category'].replace('Clothing & Accessories' , 'Clothing_Accessories' , inplace = True)

# Now Find out the Count of Each Class
countValues = dataset['Category'].value_counts()
print(countValues)

<b>Text Pre-processing

In [None]:
# Import the Spacy Pre-trained English Model
nlp = spacy.load('en_core_web_sm')

In [None]:
def text_preprocessing(text):
    doc = nlp(text)
    filterTokens = []
    for token in doc:
        if token.like_num or token.is_stop or token.is_punct:
            continue
        filterTokens.append(token.lemma_.lower())
    return(' '.join(filterTokens))

In [None]:
# Apply the text Preprocessing Function on the Description Function
dataset['cleanDescription'] = dataset['Description'].map(text_preprocessing)

In [None]:
def label_preprocessing(label):
    return('__label__' + label)

In [None]:
# Apply the text Preprocessing Function on the Description Function
dataset['labelCategory'] = dataset['Category'].map(label_preprocessing)

In [34]:
# Now Add the labelCategory or clean Description
dataset['categoryDescription'] = dataset['labelCategory'] + ' ' + dataset['cleanDescription']

# Display the Dataset
dataset.head(2)

Unnamed: 0,Category,Description,cleanDescription,labelCategory,categoryDescription
0,Household,Paper Plane Design Framed Wall Hanging Motivat...,paper plane design framed wall hanging motivat...,__label__Household,__label__Household paper plane design framed w...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",saf floral framed painting wood inch x inch sp...,__label__Household,__label__Household saf floral framed painting ...


<b>Split Data for (Training & Testing)

In [35]:
# Split the Data into Train and Test
train , test = train_test_split(dataset , test_size = 0.25)

# Display the Shape of the Training & Testing Data
print('The Shape of the Train Dataset is : {}'.format(train.shape))
print('The Shape of the Test  Dataset is : {}'.format(test.shape))

The Shape of the Train Dataset is : (20851, 5)
The Shape of the Test  Dataset is : (6951, 5)


<b>FastText

In [36]:
# Now Get the CategoryDescription Column and store in txt file
train.to_csv('trainEcommerce.txt' , columns = ['categoryDescription'] , index = False , header = None)
test.to_csv('testEcommerce.txt'   , columns = ['categoryDescription'] , index = False , header = None) 

In [37]:
# Now Train the FastText Model
model = fasttext.train_supervised(input = 'trainEcommerce.txt')
model.test('testEcommerce.txt')

(6613, 0.9519128988356268, 0.9519128988356268)

<b>Let's Predict some Result

In [38]:
model.predict('wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3')

(('__label__Electronics',), array([0.99856251]))

In [40]:
model.predict("ockey men's cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric")

(('__label__Clothing_Accessories',), array([0.99977881]))

<b>Some Important Function of FastText 

In [None]:
model.get_nearest_neighbors('sony')

In [None]:
# Get and Display the Shape of the Word Vector
print('The Shape of the Word Vector is : {}'.format(model.get_word_vector('sony').shape))

In [None]:
# Get the Relationship of Words
model.get_analogies('electronic' , 'phone' , 'tv')