## KMeans classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [58]:
df = pd.read_csv('../data/recipes.csv').drop(columns=['Unnamed: 0'])
print(df.shape)
df['count'] = 1
df.head(3)

(761, 17)


Unnamed: 0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,sustainable,lowFodmap,title,readyInMinutes,summary,cuisines,dishTypes,occasions,instructions,simplifiedIngredients,simplifiedInstructions,count
0,1,0,0,0,0,0,0,0,Gingerbread Mummies,45,Gingerbread Mummies might be just the dessert ...,[],['dessert'],['christmas'],"In a bowl of an electric mixer, beat the butte...","['unsalted butter', 'wheat flour', 'sugar', 'm...","['In a bowl of an electric mixer, beat the but...",1
1,0,0,0,0,0,0,0,0,Neiman Marcus Oatmeal Chocolate Chip Cookies,45,Neiman Marcus Oatmeal Chocolate Chip Cookies m...,[],['dessert'],[],Preheat oven to 375 degrees F.\nIn a large bow...,"['butter', 'sugar', 'golden brown sugar', 'egg...","['Preheat oven to 375 degrees F.', 'In a large...",1
2,1,0,0,0,0,0,0,0,Strawberry Tart,45,Strawberry Tart takes about <b>approximately 4...,[],['dessert'],"[""mother's day""]",Preheat oven to 350 degrees F.\nSift flour ont...,"['wheat flour', 'unsalted butter', 'water', 'e...","['Preheat oven to 350 degrees F.', 'Sift flour...",1


In [3]:
to_drop = [
    'cuisines', 
    'occasions',
    'simplifiedIngredients',
    'simplifiedInstructions'
]

In [4]:
df = df.drop(columns=to_drop)

---
### Model using summary

In [5]:
sum_df = df['summary']

In [6]:
sum_df

0      Gingerbread Mummies might be just the dessert ...
1      Neiman Marcus Oatmeal Chocolate Chip Cookies m...
2      Strawberry Tart takes about <b>approximately 4...
3      Vegan Strawberry Shortcake served with Vegan W...
4      Need a <b>lacto ovo vegetarian dessert</b>? Au...
                             ...                        
756    Lime S’more Tartlets might be just the hor d'o...
757    Beef Cottage Pie is a <b>gluten free</b> recip...
758    Easy Chicken Cordon Bleu takes about <b>around...
759    The recipe Hummus and Za'atar is ready <b>in a...
760    Dulce De Leche Crème Brûlée is a <b>gluten fre...
Name: summary, Length: 761, dtype: object

In [7]:
cv = CountVectorizer(stop_words='english')
cv.fit(sum_df)
X = cv.transform(sum_df)

In [8]:
len(cv.get_feature_names())

5385

In [11]:
'chicken' in cv.get_feature_names()

True

In [25]:
pd.DataFrame.sparse.from_spmatrix(X, columns=cv.get_feature_names())['chicken'].sort_values(ascending=False).head(135)

87     20
426    12
370    11
736    11
366    10
       ..
476     1
92      1
267     1
336     1
210     0
Name: chicken, Length: 135, dtype: Sparse[int64, 0]

In [26]:
km = KMeans(n_clusters=4, random_state=42)
km.fit(X)

KMeans(n_clusters=4, random_state=42)

In [27]:
preds = km.predict(X)

In [28]:
df['pred_by_summary'] = preds

In [29]:
df.groupby(by='pred_by_summary').sum()

Unnamed: 0_level_0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,sustainable,lowFodmap,readyInMinutes,count
pred_by_summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,6,0,9,8,1,0,0,3,585,13
1,275,70,299,221,69,0,0,23,31596,611
2,5,1,51,39,6,0,0,2,4092,81
3,17,4,13,10,1,0,0,3,2669,56


---
### Model using instructions

In [30]:
ins_df = df['instructions']

ins_df = ins_df.fillna('')

In [31]:
ins_df

0      In a bowl of an electric mixer, beat the butte...
1      Preheat oven to 375 degrees F.\nIn a large bow...
2      Preheat oven to 350 degrees F.\nSift flour ont...
3      Preheat your oven to 400 degrees F.\nAdd the s...
4      Beat butter in large bowl in an electric mixer...
                             ...                        
756    <ol><li>Prepare the graham crust: Preheat the ...
757    Preheat oven to 375 degrees.\nIn a large skill...
758    Pre-heat oven into 180C.\nGently pound the chi...
759    Rinse the chickpeas and soak for 8 hours or ov...
760    Oven: 325F\nPlace six ramekins in a water bath...
Name: instructions, Length: 761, dtype: object

In [32]:
cv = CountVectorizer(stop_words='english')
cv.fit(ins_df)
X = cv.transform(ins_df)

In [33]:
X

<761x3904 sparse matrix of type '<class 'numpy.int64'>'
	with 45838 stored elements in Compressed Sparse Row format>

In [34]:
len(cv.get_feature_names())

3904

In [37]:
km = KMeans(n_clusters=4, random_state=42)
km.fit(X)

KMeans(n_clusters=4, random_state=42)

In [38]:
preds = km.predict(X)

In [39]:
df['pred_by_instructions'] = preds

In [40]:
df.groupby(by='pred_by_instructions').sum()

Unnamed: 0_level_0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,sustainable,lowFodmap,readyInMinutes,count,pred_by_summary
pred_by_instructions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,178,44,227,176,39,0,0,20,22317,444,555
1,42,7,43,28,14,0,0,4,6445,118,155
2,5,0,3,5,2,0,0,1,585,10,12
3,78,24,99,69,22,0,0,6,9595,189,219


---
### Model using summary and instructions

In [41]:
com_df = df['summary'] + df['instructions'].fillna('')

In [42]:
com_df

0      Gingerbread Mummies might be just the dessert ...
1      Neiman Marcus Oatmeal Chocolate Chip Cookies m...
2      Strawberry Tart takes about <b>approximately 4...
3      Vegan Strawberry Shortcake served with Vegan W...
4      Need a <b>lacto ovo vegetarian dessert</b>? Au...
                             ...                        
756    Lime S’more Tartlets might be just the hor d'o...
757    Beef Cottage Pie is a <b>gluten free</b> recip...
758    Easy Chicken Cordon Bleu takes about <b>around...
759    The recipe Hummus and Za'atar is ready <b>in a...
760    Dulce De Leche Crème Brûlée is a <b>gluten fre...
Length: 761, dtype: object

In [43]:
type(com_df.iloc[0])

str

In [44]:
cv = CountVectorizer()
cv.fit(com_df)
X = cv.transform(com_df)

In [45]:
len(cv.get_feature_names())

8105

In [50]:
# Check how many occurances of chicken there are in the recipe ingredients
pd.DataFrame.sparse.from_spmatrix(X, columns=cv.get_feature_names())['chicken'].sort_values(ascending=False).head(160)

87     24
370    18
222    17
732    16
183    15
       ..
252     1
642     1
88      1
246     0
212     0
Name: chicken, Length: 160, dtype: Sparse[int64, 0]

In [52]:
# someone really likes chicken
df.iloc[87]

vegetarian                                                              0
vegan                                                                   0
glutenFree                                                              1
dairyFree                                                               1
veryHealthy                                                             0
cheap                                                                   0
sustainable                                                             0
lowFodmap                                                               0
title                                                   Three-Cup Chicken
readyInMinutes                                                         45
summary                 Three-Cup Chicken might be just the main cours...
dishTypes                 ['lunch', 'main course', 'main dish', 'dinner']
instructions            <ol><li>Marinate chicken with soy, sesame oil ...
count                                 

In [60]:
df['simplifiedIngredients'].iloc[87]

"['fresh basil', 'garlic', 'ginger', 'shaoxing wine', 'sesame oil', 'soy sauce', 'spring onions', 'sugar', 'whole chicken']"

In [39]:
km = KMeans(n_clusters=4, random_state=42)
km.fit(X)

KMeans(n_clusters=4, random_state=42)

In [40]:
preds = km.predict(X)

In [41]:
df['pred_by_summary'] = preds

In [42]:
df.groupby(by='pred_by_summary').sum()

Unnamed: 0_level_0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,sustainable,lowFodmap,readyInMinutes,count,pred_by_instructions
pred_by_summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,12,0,9,9,5,0,0,2,1785,32,41
1,189,54,216,168,35,0,0,21,21330,414,492
2,38,10,82,55,20,0,0,3,7257,148,174
3,64,11,65,46,17,0,0,5,8570,167,154
