# Continuation of Allrecipes.com "core" ingredient project by Paige McKenzie

Follow-up to [original post](https://p-mckenzie.github.io/2018/10/01/ingredient-analysis/), contains supporting code for [new post](https://...).

Data can be acquired using the associated `scraper.py` file and must then be processed using the `Analysis.ipynb` notebook.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re

from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the scraped recipes and re-key them by a numeric recipe id.

df = pd.read_csv('scraped_data.csv', index_col=0)
# the first run of digits found in each link is used as the integer recipe key
df['recipe_key'] = [int(re.findall(r"\d+", link)[0]) for link in df['link']]
df = df.set_index('recipe_key')
# only keep unique recipes (duplicated() flags every repeat after the first occurrence)
df = df.loc[~df.index.duplicated()]

In [3]:
# import results of Analysis.ipynb
ingredients = pd.read_csv('results.csv', index_col=0)
# drop rows with no original ingredient text; take an explicit copy so the
# .loc assignment below acts on an independent frame
ingredients = ingredients[ingredients['original'].notna()].copy()

# keep only core ingredients that occur more than 15 times
# (value_counts counts ingredient rows in results.csv)
popularity = ingredients['core'].value_counts()
popularity = popularity[popularity>15]

# every other core ingredient is lumped into a single 'OTHER' bucket
ingredients.loc[~ingredients['core'].isin(popularity.index), 'core'] = 'OTHER'
del popularity

# unstack so each recipe has dummy variables for popular ingredient
# the text before the first "_" of each row label is parsed as the integer recipe key
recipe_keys = ingredients.index.str.split("_").str[0].astype(int)
unstacked = (
    pd.get_dummies(ingredients.set_index(recipe_keys)['core'])
    .groupby(level=0).sum()           # per-recipe count of each core ingredient
    .reindex(df.index, fill_value=0)  # align rows to df; recipes with no ingredient rows get all zeros
    .gt(0).astype('int64')            # collapse counts to 0/1 flags (vectorized; replaces deprecated applymap)
)
del ingredients, recipe_keys
unstacked.head()

Unnamed: 0_level_0,OTHER,allspice,almond,apple,applesauce,asparagus,avocado,bacon,banana,basil,...,walnut,warm,water,whiskey,wine,yeast,yogurt,yolk,zest,zucchini
recipe_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100182,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10025,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10033,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100386,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10049,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### Clustering

Cluster recipes based on their ingredients, first with k-means and then with k-modes.

In [4]:
from sklearn.cluster import KMeans

# Build the 5-cluster k-means model with a fixed seed, then fit it on the
# 0/1 ingredient matrix (fit returns the estimator itself).
kmeans = KMeans(random_state=0, n_clusters=5)
kmeans = kmeans.fit(unstacked)

In [27]:
# The overall column means do not depend on the cluster, so compute them once
# instead of once per cluster centre.
column_means = unstacked.mean()
# For each k-means centre, list the columns whose centre value exceeds the overall column mean.
[unstacked.columns[center>column_means] for center in kmeans.cluster_centers_]

[Index(['allspice', 'bay', 'bean', 'beaten', 'beef', 'breast', 'broccoli',
        'bun', 'cabbage', 'carrot', 'celery', 'chestnut', 'chicken', 'chilies',
        'chive', 'chop', 'cilantro', 'coconut', 'cold', 'cornstarch',
        'cucumber', 'fillet', 'firm', 'floret', 'garlic', 'ginger', 'grass',
        'ground', 'honey', 'ketchup', 'kosher', 'lettuce', 'lime', 'meat',
        'mint', 'molasses', 'mushroom', 'mustard', 'noodle', 'oil', 'onion',
        'paprika', 'paste', 'pea', 'peanut', 'pepper', 'peppercorn',
        'pineapple', 'rib', 'rice', 'roast', 'roll', 'root', 'salmon', 'sauce',
        'sesame', 'shallot', 'sherry', 'shrimp', 'skewer', 'smoke', 'spaghetti',
        'steak', 'sugar', 'tenderloin', 'vegetable', 'vinegar', 'water',
        'whiskey', 'wine'],
       dtype='object'),
 Index(['OTHER', 'almond', 'apple', 'asparagus', 'banana', 'beer', 'blueberry',
        'cajun', 'cardamom', 'cherry', 'chip', 'chive', 'chocolate', 'clove',
        'coconut', 'cold', 'crack

In [6]:
from kmodes.kmodes import KModes

In [20]:
# random_state is pinned so reruns reproduce the same clusters, matching the
# seeded k-means fit earlier in the notebook.
km = KModes(n_clusters=12, init='Huang', n_init=5, verbose=0, random_state=0)

# cluster label assigned to each row of the ingredient matrix
clusters = km.fit_predict(unstacked)

In [21]:
# For every k-modes centroid, collect the columns where the centroid is non-zero.
cluster_ingredients = []
for centroid in km.cluster_centroids_:
    cluster_ingredients.append(unstacked.columns[centroid.astype(bool)])
cluster_ingredients

[Index(['butter', 'cinnamon', 'flour', 'salt', 'sugar'], dtype='object'),
 Index(['oil', 'pepper'], dtype='object'),
 Index(['butter', 'chicken', 'pepper'], dtype='object'),
 Index(['garlic', 'oil', 'onion', 'pepper', 'sauce'], dtype='object'),
 Index(['OTHER', 'garlic', 'lemon', 'oil', 'parsley', 'pepper', 'salt'], dtype='object'),
 Index([], dtype='object'),
 Index(['cheese', 'onion', 'pepper'], dtype='object'),
 Index(['bread', 'butter', 'cheese', 'pepper', 'salt'], dtype='object'),
 Index(['garlic', 'oil', 'onion', 'pepper', 'salt'], dtype='object'),
 Index(['butter', 'egg', 'extract', 'flour', 'salt', 'sugar'], dtype='object'),
 Index(['garlic', 'oil', 'onion', 'pepper'], dtype='object'),
 Index(['butter', 'chip', 'sugar'], dtype='object')]

In [None]:
def calc_lift(a, b, df):
    """Return the lift between columns ``a`` and ``b`` of ``df``.

    A row counts as containing a column when its value there is greater than
    zero. Lift is ``N * n_ab / (n_a * n_b)``, where ``N`` is the number of
    rows, ``n_a`` and ``n_b`` are the rows containing each column, and
    ``n_ab`` is the rows containing both. The result is symmetric in ``a``
    and ``b``.

    Parameters
    ----------
    a, b : column labels of ``df``
    df : pandas.DataFrame of per-row indicator/count columns

    Returns
    -------
    float
        The lift, or ``nan`` when either column has no positive rows
        (including an empty frame), since lift is undefined there. This
        replaces the ZeroDivisionError the division would otherwise raise.
    """
    has_a = df[a] > 0
    has_b = df[b] > 0
    num_a = int(has_a.sum())
    num_b = int(has_b.sum())
    if num_a == 0 or num_b == 0:
        # denominator would be zero: lift is undefined
        return float('nan')
    num_a_b = int((has_a & has_b).sum())
    total_size = len(df)
    return total_size*float(num_a_b)/float(num_a*num_b)

In [None]:
# Pairwise lift for a random sample of at most 100 ingredient columns.
# The seed keeps the sampled columns identical across reruns, and the cap on n
# avoids an error when the matrix has fewer than 100 columns. Sorting before a
# uniform random sample has no effect on the draw, so it is not done here.
ingredient_counts = unstacked.sum()
demos = ingredient_counts.sample(n=min(100, len(ingredient_counts)), random_state=0).index

# Row i holds zeros up to and including the diagonal, then the lift of
# demos[i] against each later column, so only the strict upper triangle is computed.
result = pd.DataFrame([[0]*(i+1)+[calc_lift(a,demos[i], unstacked) for a in demos[i+1:]] for i in range(len(demos))],
                     index=demos, columns=demos)

# Lift is symmetric, so adding the transpose fills the lower triangle; the diagonal stays 0.
result+result.T

In [None]:
from sklearn.model_selection import train_test_split

# Target: True where the recipe's 'Bread Recipes' value is positive
# (presumably a category flag/count from the scraper; confirm against scraper.py).
is_bread = df['Bread Recipes']>0

# Hold out 33% of the recipes for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    unstacked,
    is_bread,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the training split only.
log = LogisticRegression()
log.fit(X=X_train, y=y_train)

In [None]:
# Label each fitted coefficient with its ingredient column, then rank from
# most positive to most negative.
coefficients = pd.Series(log.coef_[0], index=X_train.columns)
coefficients.sort_values(ascending=False)