## KMeans classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
df = pd.read_csv('../data/recipes.csv').drop(columns=['Unnamed: 0'])
print(df.shape)
df['count'] = 1
df.head(3)

(761, 17)


Unnamed: 0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,sustainable,lowFodmap,title,readyInMinutes,summary,cuisines,dishTypes,occasions,instructions,simplifiedIngredients,simplifiedInstructions,count
0,1,0,0,0,0,0,0,0,Gingerbread Mummies,45,Gingerbread Mummies might be just the dessert ...,[],['dessert'],['christmas'],"In a bowl of an electric mixer, beat the butte...","['unsalted butter', 'wheat flour', 'sugar', 'm...","['In a bowl of an electric mixer, beat the but...",1
1,0,0,0,0,0,0,0,0,Neiman Marcus Oatmeal Chocolate Chip Cookies,45,Neiman Marcus Oatmeal Chocolate Chip Cookies m...,[],['dessert'],[],Preheat oven to 375 degrees F.\r\nIn a large b...,"['butter', 'sugar', 'golden brown sugar', 'egg...","['Preheat oven to 375 degrees F.', 'In a large...",1
2,1,0,0,0,0,0,0,0,Strawberry Tart,45,Strawberry Tart takes about <b>approximately 4...,[],['dessert'],"[""mother's day""]",Preheat oven to 350 degrees F.\r\nSift flour o...,"['wheat flour', 'unsalted butter', 'water', 'e...","['Preheat oven to 350 degrees F.', 'Sift flour...",1


In [3]:
to_drop = [
    'cuisines', 
    'occasions',
    'simplifiedIngredients',
    'simplifiedInstructions'
]

In [4]:
df = df.drop(columns=to_drop)

---
### Model using summary

In [5]:
sum_df = df['summary']

In [6]:
sum_df

0      Gingerbread Mummies might be just the dessert ...
1      Neiman Marcus Oatmeal Chocolate Chip Cookies m...
2      Strawberry Tart takes about <b>approximately 4...
3      Vegan Strawberry Shortcake served with Vegan W...
4      Need a <b>lacto ovo vegetarian dessert</b>? Au...
                             ...                        
756    Lime S’more Tartlets might be just the hor d'o...
757    Beef Cottage Pie is a <b>gluten free</b> recip...
758    Easy Chicken Cordon Bleu takes about <b>around...
759    The recipe Hummus and Za'atar is ready <b>in a...
760    Dulce De Leche Crème Brûlée is a <b>gluten fre...
Name: summary, Length: 761, dtype: object

In [7]:
cv = CountVectorizer(stop_words='english')
cv.fit(sum_df)
X = cv.transform(sum_df)

In [8]:
len(cv.get_feature_names())

5385

In [9]:
km = KMeans(n_clusters=4, random_state=42)
km.fit(X)

KMeans(n_clusters=4, random_state=42)

In [10]:
preds = km.predict(X)

In [11]:
df['pred_by_summary'] = preds

In [12]:
df.groupby(by='pred_by_summary').sum()

Unnamed: 0_level_0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,sustainable,lowFodmap,readyInMinutes,count
pred_by_summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,280,68,297,220,62,0,0,25,32165,634
1,3,0,6,6,1,0,0,3,345,8
2,5,1,48,32,5,0,0,2,3377,74
3,15,6,21,20,9,0,0,1,3055,45


---
### Model using instructions

In [13]:
ins_df = df['instructions']

ins_df = ins_df.fillna('')

In [15]:
ins_df

0      In a bowl of an electric mixer, beat the butte...
1      Preheat oven to 375 degrees F.\r\nIn a large b...
2      Preheat oven to 350 degrees F.\r\nSift flour o...
3      Preheat your oven to 400 degrees F.\r\nAdd the...
4      Beat butter in large bowl in an electric mixer...
                             ...                        
756    <ol><li>Prepare the graham crust: Preheat the ...
757    Preheat oven to 375 degrees.\r\nIn a large ski...
758    Pre-heat oven into 180C.\r\nGently pound the c...
759    Rinse the chickpeas and soak for 8 hours or ov...
760    Oven: 325F\r\nPlace six ramekins in a water ba...
Name: instructions, Length: 761, dtype: object

In [16]:
cv = CountVectorizer(stop_words='english')
cv.fit(ins_df)
X = cv.transform(ins_df)

In [17]:
X

<761x3904 sparse matrix of type '<class 'numpy.int64'>'
	with 45838 stored elements in Compressed Sparse Row format>

In [18]:
len(cv.get_feature_names())

3904

In [19]:
km = KMeans(n_clusters=4, random_state=42)
km.fit(X)

KMeans(n_clusters=4, random_state=42)

In [20]:
preds = km.predict(X)

In [21]:
df['pred_by_instructions'] = preds

In [22]:
df.groupby(by='pred_by_instructions').sum()

Unnamed: 0_level_0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,sustainable,lowFodmap,readyInMinutes,count,pred_by_summary
pred_by_instructions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,42,7,43,28,13,0,0,4,6115,115,46
1,178,44,227,176,39,0,0,20,22317,444,182
2,78,24,99,69,22,0,0,6,9595,189,57
3,5,0,3,5,3,0,0,1,915,13,6


---
### Model using summary and instructions

In [34]:
com_df = df['summary'] + df['instructions'].fillna('')

In [35]:
com_df

0      Gingerbread Mummies might be just the dessert ...
1      Neiman Marcus Oatmeal Chocolate Chip Cookies m...
2      Strawberry Tart takes about <b>approximately 4...
3      Vegan Strawberry Shortcake served with Vegan W...
4      Need a <b>lacto ovo vegetarian dessert</b>? Au...
                             ...                        
756    Lime S’more Tartlets might be just the hor d'o...
757    Beef Cottage Pie is a <b>gluten free</b> recip...
758    Easy Chicken Cordon Bleu takes about <b>around...
759    The recipe Hummus and Za'atar is ready <b>in a...
760    Dulce De Leche Crème Brûlée is a <b>gluten fre...
Length: 761, dtype: object

In [36]:
type(com_df.iloc[0])

str

In [37]:
cv = CountVectorizer()
cv.fit(com_df)
X = cv.transform(com_df)

In [38]:
len(cv.get_feature_names())

8105

In [39]:
km = KMeans(n_clusters=4, random_state=42)
km.fit(X)

KMeans(n_clusters=4, random_state=42)

In [40]:
preds = km.predict(X)

In [41]:
df['pred_by_summary'] = preds

In [42]:
df.groupby(by='pred_by_summary').sum()

Unnamed: 0_level_0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,sustainable,lowFodmap,readyInMinutes,count,pred_by_instructions
pred_by_summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,12,0,9,9,5,0,0,2,1785,32,41
1,189,54,216,168,35,0,0,21,21330,414,492
2,38,10,82,55,20,0,0,3,7257,148,174
3,64,11,65,46,17,0,0,5,8570,167,154
