In [95]:
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
%matplotlib inline

In [96]:
raw_data = pd.read_json('/Users/andrewlevinson/programs/parsons/spring-2019/machine-learning-class/machine-learning/assignment-3/food_enforcement_US_bacteria.json', orient='records')

In [97]:
print(raw_data.shape)
print(raw_data.head(20).product_description)

(6104, 26)
0     Native Catch Salmon Bacon, Traditional flavor....
1     Delish Fruit Burst 10 oz.(283 g)  Enjoy by: 8/...
2     Garden Pure Holiday Fruit Bowl 4 lbs (1.81kg) ...
3     Fresh Fruit Product is labeled in part:  "***F...
4     Ready Pac¿ Gourmet Fruit Bowl, 64oz , UPC 7774...
5     River Ranch brand Diced Grn Cabbage w/Color, 4...
6     Sweet and Sour Stir Fry packaged under the fol...
7     Onion Slab Cut.  Products are labeled in part:...
8     MIX w/yellow onions used as an ingredient.  Pr...
9     Fresh n Easy brand Shred Grn Cabbage w/Color, ...
10    River Ranch brand Three-Color Cole Slaw Mix, 1...
11    MIX w/yellow onions used as an ingredient.  Pr...
12    River Ranch brand Spring Mix, 3 lb bag, UPC: n...
13    River Ranch brand Shredded Red Cabbage, 4 x 5 ...
14    Hy Vee brand Italian Blend, 10 oz bag, UPC:  7...
15    Onions Julienne Sliced.  Products are labeled ...
16    Fresh n Easy brand Garden with Romaine, 4 x 5 ...
17    Cross Valley brand Shredded Ice

### Bag of words feature representation for descriptive text

In [98]:
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer()

# corpus = raw_data['product_description']
# X = vectorizer.fit_transform(corpus)
# X.shape

### Additional Transformations

In [109]:
## remove numbers
raw_data['noNum'] = raw_data['product_description'].apply(lambda x: x.translate(str.maketrans('','','1234567890')))

In [110]:
print(raw_data['noNum'].head(20))

0     Native Catch Salmon Bacon, Traditional flavor....
1     Delish Fruit Burst  oz.( g)  Enjoy by: // Dist...
2     Garden Pure Holiday Fruit Bowl  lbs (.kg) Best...
3     Fresh Fruit Product is labeled in part:  "***F...
4     Ready Pac¿ Gourmet Fruit Bowl, oz , UPC , Use ...
5     River Ranch brand Diced Grn Cabbage w/Color,  ...
6     Sweet and Sour Stir Fry packaged under the fol...
7     Onion Slab Cut.  Products are labeled in part:...
8     MIX w/yellow onions used as an ingredient.  Pr...
9     Fresh n Easy brand Shred Grn Cabbage w/Color, ...
10    River Ranch brand Three-Color Cole Slaw Mix,  ...
11    MIX w/yellow onions used as an ingredient.  Pr...
12    River Ranch brand Spring Mix,  lb bag, UPC: n/...
13    River Ranch brand Shredded Red Cabbage,  x  lb...
14    Hy Vee brand Italian Blend,  oz bag, UPC:  -; ...
15    Onions Julienne Sliced.  Products are labeled ...
16    Fresh n Easy brand Garden with Romaine,  x  lb...
17    Cross Valley brand Shredded Icebreg (/"), 

### tfidf

In [111]:
vectorizer = TfidfVectorizer(stop_words='english')
corpus = raw_data['noNum']
X = vectorizer.fit_transform(corpus)
X.shape

(6104, 5956)

### fit Kmeans

In [137]:
#fit Kmeans with k clusters
from sklearn.cluster import KMeans
number_of_clusters = 18
km = KMeans(n_clusters=number_of_clusters, init='k-means++', n_init=10)
km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=18, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

### labels are named with integers and stored in the `labels_` attribute

In [138]:
km.labels_

array([ 8, 15, 15, ..., 15, 15, 15], dtype=int32)

### take a look at one of the cluster results

In [144]:
# You should look at all the clusters, but let's look at cluster "15" (I chose this arbitrarily; you should look at all of them)

for i in range(0,len(km.labels_)):
    if km.labels_[i] == 3:
        print(raw_data['noNum'][i])
        print('**********')

Pierre Jamwich Crustless Peanut Butter & Jelly Sandwich, .oz., AdvancePierre Foods, Inc.,  Princeton Glendale Road, Cincinnati, OH .
**********
Sunland Salted Organic Peanuts,  lb UPC ---            
**********
Sunland Valencia Raw Peanuts,  lb UPC ---;  lb UPC ;  lb UPC ---;  lb UPC ---.
**********
Albertsons Brand Peanut Butter Cup Ice Cream, . Quarts (.L), commonly called a  fl oz container.     	Peanut Butter Cup Ice Cream,  (Vanilla ice cream with peanut butter cup pieces, fudge and peanut butter swirls); . QTS (.L), (UPC Code     for Albertsons Peanut Butter Cup)    Packaged in paper cardboard round oblong container. Labeled with a.	Buena Park (IMS Plant Number  -). Labeled as "Distributed by SUPERVALU INC. Eden Prairie, MN ".    	-  RA (plant code,  hour time, operator initial)    	MFG // EXP // LOTE A (manufacturing month, day, year , expiration month, day, year, lot number)    	Code date is injected on the bottom of container in black ink  	UPC Code:        for Albertsons Pean

In [140]:
# colors = ['b', 'g', 'c']
# markers = ['o', 'v', 's']
# print(km.cluster_centers_)
# centers = np.array(km.cluster_centers_)

# plt.plot()
# plt.title('k means centroids')

# for i, l in enumerate(km.labels_):
#     plt.plot(corpus[i], color=colors[l], marker=markers[l],ls='None')
#     plt.xlim([0, 10])
#     plt.ylim([0, 10])

# plt.scatter(centers[:,0], centers[:,1], marker="x", color='r')
# plt.show()

In [141]:
# inspired by tutorial https://pythonprogramminglanguage.com/kmeans-text-clustering/
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :15]:
        print(' %s' % terms[ind]),
    print

print("\n")
print("Prediction")

# add word to predict cluster to see if word makes sense within k means clusters
Y = vectorizer.transform(["chocolate"])
prediction = km.predict(Y)
print(prediction)

Top terms per cluster:
Cluster 0:
 mfg
 code
 wt
 net
 upc
 oz
 ham
 cheese
 chicken
 sausage
 turkey
 bun
 wheat
 bologna
 sub
Cluster 1:
 plastic
 oz
 packaged
 clear
 upc
 refrigerated
 lb
 packed
 product
 tub
 sold
 herring
 net
 cheese
 bags
Cluster 2:
 net
 wt
 oz
 upc
 organic
 lbs
 distributed
 brand
 lb
 kg
 wawa
 frozen
 mix
 packaged
 product
Cluster 3:
 butter
 peanut
 sunland
 upc
 creamy
 crunchy
 organic
 oz
 valencia
 natural
 lb
 peanuts
 roasted
 almond
 portales
Cluster 4:
 shipper
 sealed
 totaling
 gc
 attached
 total
 lbs
 units
 bueno
 cardboard
 package
 number
 box
 nm
 label
Cluster 5:
 reser
 cases
 carton
 formula
 fine
 beaverton
 packaged
 foods
 salad
 lb
 distributed
 potato
 po
 upc
 oz
Cluster 6:
 finest
 traditions
 lb
 salad
 code
 product
 potato
 dip
 pasta
 spread
 oz
 taco
 lbs
 creamy
 cole
Cluster 7:
 splendid
 creams
 pint
 jeni
 ice
 frozen
 llc
 individually
 bucket
 plastic
 case
 packed
 container
 size
 gallons
Cluster 8:
 smoked
 salmon