# Text Classification for Product Categorization

## Importing Packages

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [2]:
# Read the grocery product listing from the CSV file (path is relative to the
# working directory) and preview its first five rows.
Grocery = pd.read_csv(filepath_or_buffer='GroceryDataset.csv')
Grocery.head(n=5)

Unnamed: 0,Sub Category,Price,Discount,Rating,Title,Currency,Feature,Product Description
0,Bakery & Desserts,$56.99,No Discount,Rated 4.3 out of 5 stars based on 265 reviews.,"David’s Cookies Mile High Peanut Butter Cake, ...",$,"""10"""" Peanut Butter Cake\nCertified Kosher OU-...",A cake the dessert epicure will die for!Our To...
1,Bakery & Desserts,$159.99,No Discount,Rated 5 out of 5 stars based on 1 reviews.,"The Cake Bake Shop 8"" Round Carrot Cake (16-22...",$,Spiced Carrot Cake with Cream Cheese Frosting ...,"Due to the perishable nature of this item, ord..."
2,Bakery & Desserts,$44.99,No Discount,Rated 4.1 out of 5 stars based on 441 reviews.,"St Michel Madeleine, Classic French Sponge Cak...",$,100 count\nIndividually wrapped\nMade in and I...,Moist and buttery sponge cakes with the tradit...
3,Bakery & Desserts,$39.99,No Discount,Rated 4.7 out of 5 stars based on 9459 reviews.,"David's Cookies Butter Pecan Meltaways 32 oz, ...",$,Butter Pecan Meltaways\n32 oz 2-Pack\nNo Prese...,These delectable butter pecan meltaways are th...
4,Bakery & Desserts,$59.99,No Discount,Rated 4.5 out of 5 stars based on 758 reviews.,"David’s Cookies Premier Chocolate Cake, 7.2 lb...",$,"""10"" Four Layer Chocolate Cake\nCertified Kosh...",A cake the dessert epicure will die for!To the...


## Cleaning data

In [3]:
# Collect the index labels of the rows whose 'Product Description' is missing.
missing_description = Grocery['Product Description'].isnull()
nan_index = Grocery.loc[missing_description].index

In [4]:
# Remove the rows listed in nan_index from Grocery, modifying the frame in place.
# errors='ignore' keeps this cell re-runnable: once the rows are gone, running it
# again would otherwise raise KeyError because those labels no longer exist.
Grocery.drop(nan_index, inplace=True, errors='ignore')

In [5]:
# Count the distinct, non-missing values in the 'Sub Category' column.
Grocery['Sub Category'].nunique(dropna=True)

19

## Applying TF-IDF

In [7]:
# TF-IDF vectorizer configuration:
#   stop_words='english' - use the built-in English stop-word list
#   min_df=2             - ignore terms found in fewer than 2 documents
#   max_df=0.95          - ignore terms found in more than 95% of the documents
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

In [8]:
# Fit the vectorizer on the product descriptions and build the document-term
# matrix from them in a single step.
descriptions = Grocery['Product Description']
dtm = tfidf.fit_transform(descriptions)

## Non-negative Matrix Factorization (NMF)

In [12]:
# Use one NMF component (topic) per distinct product sub-category. The count is
# derived from the data (19 for the current dataset) instead of being hard-coded,
# so it stays correct if the dataset changes.
# random_state is fixed so the factorization is reproducible between runs.
nmf_model = NMF(n_components=Grocery['Sub Category'].nunique(), random_state=42)

In [13]:
# Fit the NMF model on the TF-IDF document-term matrix.
nmf_model.fit(X=dtm)

In [14]:
# Print the highest-weighted vocabulary terms of every NMF topic.
# N_TOP_WORDS controls both the heading text and how many terms are listed.
N_TOP_WORDS = 15
# The vocabulary is the same for every topic, so fetch it once up front rather
# than calling get_feature_names_out() again for each printed word.
feature_names = tfidf.get_feature_names_out()
for index, topic in enumerate(nmf_model.components_):
    # argsort() is ascending, so the last N_TOP_WORDS positions hold the
    # heaviest terms, ordered from weakest to strongest.
    top_indices = topic.argsort()[-N_TOP_WORDS:]
    print(f'THE TOP {N_TOP_WORDS} WORDS FOR TOPIC #{index}')
    print([feature_names[i] for i in top_indices])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['days', 'contact', 'help', 'need', 'ship', 'friday', 'event', 'calendar', 'checkout', 'arrival', 'date', 'order', 'arrive', 'orders', 'flowers']


THE TOP 15 WORDS FOR TOPIC #1
['chips', '64', '32', 'qty', 'count2', '40', 'mints', '75', '42', 'bags', 'cheese', '15', '12', 'count1', 'oz']


THE TOP 15 WORDS FOR TOPIC #2
['tender', 'company', 'black', 'lobster', 'fed', 'chicken', 'meat', 'aged', 'raised', 'angus', 'filet', 'burgers', 'steaks', 'beef', 'steak']


THE TOP 15 WORDS FOR TOPIC #3
['certifieddark', 'counttotal', 'arabica', 'beankosher', 'chocolatetotal', 'bag2', 'packtotal', 'lb', 'coffeewhole', '32', '10', 'total', 'lbs', 'weight', 'net']


THE TOP 15 WORDS FOR TOPIC #4
['chip', 'cake', 'butter', 'covered', 'size', 'caramel', 'bar', 'belgian', 'cookies', 'peanut', 'bars', 'dark', 'candy', 'milk', 'chocolate']


THE TOP 15 WORDS FOR TOPIC #5
['certified', 'roasted', 'beans', 'medium', 'dark', 'dispose', 'starbucks', 'blend', '100', 'arabica', 'or

## Add the Topic Column

In [16]:
# Project the document-term matrix onto the fitted NMF components; the result
# holds the per-topic weights of every document.
topic_results = nmf_model.transform(X=dtm)

In [17]:
# Index of the highest-weighted topic for each document. topic_results is
# expected to be 2-D (documents x topics), so the last axis is axis 1.
topic_results.argmax(axis=-1)

# Label each product with the topic that carries the largest weight in its row
# of topic_results.
dominant_topic = topic_results.argmax(axis=1)
Grocery['Topic'] = dominant_topic

# Preview the first ten rows, now including the 'Topic' column.
Grocery.head(n=10)

Unnamed: 0,Sub Category,Price,Discount,Rating,Title,Currency,Feature,Product Description,Topic
0,Bakery & Desserts,$56.99,No Discount,Rated 4.3 out of 5 stars based on 265 reviews.,"David’s Cookies Mile High Peanut Butter Cake, ...",$,"""10"""" Peanut Butter Cake\nCertified Kosher OU-...",A cake the dessert epicure will die for!Our To...,4
1,Bakery & Desserts,$159.99,No Discount,Rated 5 out of 5 stars based on 1 reviews.,"The Cake Bake Shop 8"" Round Carrot Cake (16-22...",$,Spiced Carrot Cake with Cream Cheese Frosting ...,"Due to the perishable nature of this item, ord...",0
2,Bakery & Desserts,$44.99,No Discount,Rated 4.1 out of 5 stars based on 441 reviews.,"St Michel Madeleine, Classic French Sponge Cak...",$,100 count\nIndividually wrapped\nMade in and I...,Moist and buttery sponge cakes with the tradit...,4
3,Bakery & Desserts,$39.99,No Discount,Rated 4.7 out of 5 stars based on 9459 reviews.,"David's Cookies Butter Pecan Meltaways 32 oz, ...",$,Butter Pecan Meltaways\n32 oz 2-Pack\nNo Prese...,These delectable butter pecan meltaways are th...,13
4,Bakery & Desserts,$59.99,No Discount,Rated 4.5 out of 5 stars based on 758 reviews.,"David’s Cookies Premier Chocolate Cake, 7.2 lb...",$,"""10"" Four Layer Chocolate Cake\nCertified Kosh...",A cake the dessert epicure will die for!To the...,4
5,Bakery & Desserts,$59.99,No Discount,Rated 4.4 out of 5 stars based on 369 reviews.,David's Cookies Mango & Strawberry Cheesecake ...,$,2-count\nStrawberry Cheesecake\nMango Cheeseca...,Strawberry Cheesecake: There's only one way to...,14
6,Bakery & Desserts,$74.99,No Discount,Rated 4.7 out of 5 stars based on 2241 reviews.,"La Grande Galette French Butter Cookies, 1.3 l...",$,"1.3 lb, 6-count\nBaked in, and Imported from, ...",Once upon a time in the French coastal town of...,4
7,Bakery & Desserts,$59.99,No Discount,Rated 4.4 out of 5 stars based on 232 reviews.,David's Cookies No Sugar Added Cheesecake & Ma...,$,2-count\nNo Sugar Added\nKosher OU-Dairy,Creamy Dreamy:This smooth creamy cheesecake ha...,4
8,Bakery & Desserts,$29.99,No Discount,Rated 4.4 out of 5 stars based on 1679 reviews.,David's Cookies Brownie and Cookie Combo Pack,$,6 Rocky Road Brownies\n12 Chocoloate Chunk Coo...,Due to the perishable nature of this product o...,4
9,Bakery & Desserts,$159.99,No Discount,Rated 5 out of 5 stars based on 2 reviews.,"The Cake Bake Shop 8"" Round Chocolate Cake (16...",$,3 Layers of French Valrhona Chocolate Cake M...,"Due to the perishable nature of this item, ord...",4


In [19]:
# Topic assigned to every product in the 'Bakery & Desserts' sub-category.
# A single .loc call selects the rows and the column together instead of
# chaining two separate [] lookups.
is_bakery = Grocery['Sub Category'] == 'Bakery & Desserts'
Grocery.loc[is_bakery, 'Topic']

0      4
1      0
2      4
3     13
4      4
5     14
6      4
7      4
8      4
9      4
10     4
11     4
12     4
13     4
14     4
15     4
16     5
17    13
18     4
19     4
20     4
21     4
22     4
23     4
24     4
25     4
26     4
27     4
28     4
29     4
30     4
31     0
32    14
Name: Topic, dtype: int64