In [1]:
# imports.  you're prolly gonna use all these, right?
import pdb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

import re
import string

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
# Pulling in recipe lists
df = pd.read_csv('../data/clean_recipes.csv', sep=';')
kgl1_df = pd.read_json('../data/kgl_ingredient_train.json')
kgl2_df = pd.read_json('../data/kgl_ingredient_test.json')

In [3]:
df = df.rename(columns={'Ingredients':'ingredients'})
df.head(3)

Unnamed: 0,Recipe Name,Review Count,Recipe Photo,Author,Prepare Time,Cook Time,Total Time,ingredients,Directions,RecipeID
0,Golden Crescent Rolls Recipe,304,https://images.media-allrecipes.com/userphotos...,Mike A.,25 m,15 m,3 h 10 m,"yeast,water,white sugar,salt,egg,butter,flour,...","Dissolve yeast in warm water.**Stir in sugar, ...",7000
1,Poppy Seed Bread with Glaze Recipe,137,https://images.media-allrecipes.com/userphotos...,Christina Jun,15 m,1 h,1 h 20 m,"flour,salt,baking powder,poppy,butter,vegetabl...",'Preheat oven to 350 degrees F (175 degrees C)...,7001
2,Applesauce Bread I Recipe,124,https://images.media-allrecipes.com/userphotos...,GAF55,10 m,1 h 20 m,1 h 30 m,"flour,egg,white sugar,vegetable oil,applesauce...",Preheat oven to 350 degrees F (175 degrees C)....,7003


In [4]:
# df ingredients is a single string, splitting into a list of strings
df['ingredients'] = df['ingredients'].apply(lambda x: x.split(','))

In [5]:
l1 = df['ingredients']
l2 = kgl1_df['ingredients']
l3 = kgl2_df['ingredients']

ingr_series = l1.append([l2, l3], ignore_index=True)

In [6]:
ingr_series[:10]

0    [yeast, water, white sugar, salt, egg, butter,...
1    [flour, salt, baking powder, poppy, butter, ve...
2    [flour, egg, white sugar, vegetable oil, apple...
3    [flour, baking powder, baking soda, salt, cinn...
4    [oat, buttermilk, vegetable oil, egg, brown su...
5    [shortening, white sugar, salt, milk, egg, lem...
6    [shortening, water, brown sugar, yeast, water,...
7    [water, cottage cheese, vegetable oil, egg, fl...
8    [margarine, white sugar, egg, banana, coffee, ...
9    [cornmeal, milk, egg, bell pepper, onion, garl...
Name: ingredients, dtype: object

In [7]:
def scrub_ingredients(ingr_list):
    
    chtble = str.maketrans('', '', string.punctuation + '0123456789')
    
    # drop special characters and multiple spaces, then convert to lowercase
    clean_list = []
    for ingr_string in ingr_list:
        ingr_string = re.sub(r'\W\s+', ' ', ingr_string.strip().translate(chtble))
        clean_list.append(ingr_string.lower())

    # convert to lowercase & drop known measurement words
    dropwords = ['ounce', 'ounces', 'oz', 'lb', ' pound', 'the', 'a', 'an'
                 'package', 'packages', 'inches', 'and', 'for', 'as', 'is']

    outlist = []

    for i in clean_list:
        words = i.split()
        words = [word for word in words if word not in dropwords]
        i = ' '.join(words)
        outlist.append(i)
    
    return outlist

In [8]:
ingr_s = ingr_series.apply(scrub_ingredients)
ingr_s[:10]

0    [yeast, water, white sugar, salt, egg, butter,...
1    [flour, salt, baking powder, poppy, butter, ve...
2    [flour, egg, white sugar, vegetable oil, apple...
3    [flour, baking powder, baking soda, salt, cinn...
4    [oat, buttermilk, vegetable oil, egg, brown su...
5    [shortening, white sugar, salt, milk, egg, lem...
6    [shortening, water, brown sugar, yeast, water,...
7    [water, cottage cheese, vegetable oil, egg, fl...
8    [margarine, white sugar, egg, banana, coffee, ...
9    [cornmeal, milk, egg, bell pepper, onion, garl...
Name: ingredients, dtype: object

In [9]:
def ingredient_count(dfcolumn):

    # Create dictionary with word count
    d = dict()

    for l in dfcolumn:
        for i in l:
            if i in d:
                d[i] = d[i] + 1
            else:
                d[i] = 1

    return pd.DataFrame(d.items(), columns=['ingredient', 'count'])

In [10]:
# create df of unique ingredients and their number of occurances in recipe dfs
ingr_count_df = ingredient_count(ingr_s).sort_values('count', ascending=False).reset_index(drop=True)
ingr_count_df.head(15)

Unnamed: 0,ingredient,count
0,salt,27050
1,water,11899
2,garlic,11287
3,butter,10534
4,onions,10008
5,olive oil,9889
6,sugar,9319
7,garlic cloves,7772
8,pepper,7080
9,vegetable oil,6874


In [12]:
# make a list of all ingredients with count < 100
drop_list = list(ingr_count_df[ingr_count_df['count'] < 100]['ingredient'])

In [13]:
modeling_df = ingr_s.apply(pd.Series)
modeling_df = modeling_df.rename(columns = lambda x: 'ingr_' + str(x))
modeling_df.head(10)

Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_55,ingr_56,ingr_57,ingr_58,ingr_59,ingr_60,ingr_61,ingr_62,ingr_63,ingr_64
0,yeast,water,white sugar,salt,egg,butter,flour,butter,,,...,,,,,,,,,,
1,flour,salt,baking powder,poppy,butter,vegetable oil,egg,milk,white sugar,vanilla,...,,,,,,,,,,
2,flour,egg,white sugar,vegetable oil,applesauce,raisin,cinnamon,baking soda,baking powder,sour cream,...,,,,,,,,,,
3,flour,baking powder,baking soda,salt,cinnamon,nutmeg,brown sugar,oat,apple,walnut,...,,,,,,,,,,
4,oat,buttermilk,vegetable oil,egg,brown sugar,flour,baking powder,baking soda,salt,,...,,,,,,,,,,
5,shortening,white sugar,salt,milk,egg,lemon,yeast,flour,fruit,,...,,,,,,,,,,
6,shortening,water,brown sugar,yeast,water,white sugar,salt,bread,whole wheat,,...,,,,,,,,,,
7,water,cottage cheese,vegetable oil,egg,flour,white sugar,baking soda,salt,yeast,,...,,,,,,,,,,
8,margarine,white sugar,egg,banana,coffee,water,vanilla,flour,salt,baking powder,...,,,,,,,,,,
9,cornmeal,milk,egg,bell pepper,onion,garlic,salt,baking soda,white sugar,corn,...,,,,,,,,,,


In [18]:
ingr_dict = {drop_list[i]: 'remove_this' for i in range(len(drop_list))}

In [16]:
# modeling_df_backup = modeling_df.copy()

In [49]:
# RESET!
modeling_df = modeling_df_backup.copy()
modeling_df.head(3)

Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_55,ingr_56,ingr_57,ingr_58,ingr_59,ingr_60,ingr_61,ingr_62,ingr_63,ingr_64
0,yeast,water,white sugar,salt,egg,butter,flour,butter,,,...,,,,,,,,,,
1,flour,salt,baking powder,poppy,butter,vegetable oil,egg,milk,white sugar,vanilla,...,,,,,,,,,,
2,flour,egg,white sugar,vegetable oil,applesauce,raisin,cinnamon,baking soda,baking powder,sour cream,...,,,,,,,,,,


In [50]:
# remove all recipes using ingredients from that list
modeling_df = modeling_df.stack().replace(ingr_dict).unstack()
modeling_df.head(3)

Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_55,ingr_56,ingr_57,ingr_58,ingr_59,ingr_60,ingr_61,ingr_62,ingr_63,ingr_64
0,yeast,water,white sugar,salt,egg,butter,flour,butter,,,...,,,,,,,,,,
1,flour,salt,baking powder,remove_this,butter,vegetable oil,egg,milk,white sugar,vanilla,...,,,,,,,,,,
2,flour,egg,white sugar,vegetable oil,applesauce,raisin,cinnamon,baking soda,baking powder,sour cream,...,,,,,,,,,,


In [51]:
# anthr_backup = modeling_df.copy()

In [88]:
modeling_df = anthr_backup.copy()

In [89]:
modeling_df = modeling_df[(modeling_df.iloc[:, 1:] != 'remove_this').all(axis=1)]
modeling_df.head()

Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_55,ingr_56,ingr_57,ingr_58,ingr_59,ingr_60,ingr_61,ingr_62,ingr_63,ingr_64
0,yeast,water,white sugar,salt,egg,butter,flour,butter,,,...,,,,,,,,,,
2,flour,egg,white sugar,vegetable oil,applesauce,raisin,cinnamon,baking soda,baking powder,sour cream,...,,,,,,,,,,
3,flour,baking powder,baking soda,salt,cinnamon,nutmeg,brown sugar,oat,apple,walnut,...,,,,,,,,,,
4,oat,buttermilk,vegetable oil,egg,brown sugar,flour,baking powder,baking soda,salt,,...,,,,,,,,,,
5,shortening,white sugar,salt,milk,egg,lemon,yeast,flour,fruit,,...,,,,,,,,,,


In [94]:
modeling_df = modeling_df.replace(np.nan, '', regex=True)
modeling_df.head()

Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_23,ingr_24,ingr_25,ingr_26,ingr_27,ingr_28,ingr_29,ingr_30,ingr_31,ingr_32
0,yeast,water,white sugar,salt,egg,butter,flour,butter,,,...,,,,,,,,,,
2,flour,egg,white sugar,vegetable oil,applesauce,raisin,cinnamon,baking soda,baking powder,sour cream,...,,,,,,,,,,
3,flour,baking powder,baking soda,salt,cinnamon,nutmeg,brown sugar,oat,apple,walnut,...,,,,,,,,,,
4,oat,buttermilk,vegetable oil,egg,brown sugar,flour,baking powder,baking soda,salt,,...,,,,,,,,,,
5,shortening,white sugar,salt,milk,egg,lemon,yeast,flour,fruit,,...,,,,,,,,,,


In [115]:
modeling_df = modeling_df.loc[(modeling_df['ingr_6'] != ''), :]
modeling_df.head(3)

Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_23,ingr_24,ingr_25,ingr_26,ingr_27,ingr_28,ingr_29,ingr_30,ingr_31,ingr_32
0,yeast,water,white sugar,salt,egg,butter,flour,butter,,,...,,,,,,,,,,
2,flour,egg,white sugar,vegetable oil,applesauce,raisin,cinnamon,baking soda,baking powder,sour cream,...,,,,,,,,,,
3,flour,baking powder,baking soda,salt,cinnamon,nutmeg,brown sugar,oat,apple,walnut,...,,,,,,,,,,


In [119]:
derplist = [1,2,3,4]
test = 1
test2 = 5
test2 not in derplist

True

In [120]:
# get all unique ingr names for sequencing
# Create dictionary with word count
modeling_ingr_list = []

for c in modeling_df.columns:
    for i in modeling_df[c]:
        if i not in modeling_ingr_list:
            modeling_ingr_list.append(i)

modeling_ingr_list[:5]

['yeast', 'flour', 'oat', 'shortening', 'water']

In [121]:
len(modeling_ingr_list)

814

In [36]:
# run model training
# pkl model
# deploy model

40594

In [122]:
ingr_int = {ingr:i for i, ingr in enumerate(modeling_ingr_list)}
int_ingr = {i:ingr for i, ingr in enumerate(modeling_ingr_list)}

In [125]:
import json
with open('ingr_int.json', 'w') as fp:
    json.dump(ingr_int, fp)

with open('int_ingr.json', 'w') as fp:
    json.dump(int_ingr, fp)

In [126]:
# convert ingredients to numeric representations
encoded_df = modeling_df.applymap(ingr_int.get)
print(encoded_df.shape)
encoded_df.head()

(17082, 33)


Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_23,ingr_24,ingr_25,ingr_26,ingr_27,ingr_28,ingr_29,ingr_30,ingr_31,ingr_32
0,0,4,18,56,10,26,1,26,813,813,...,813,813,813,813,813,813,813,813,813,813
2,1,10,18,19,7,27,97,9,293,49,...,813,813,813,813,813,813,813,813,813,813
3,1,293,9,56,97,53,23,2,17,36,...,813,813,813,813,813,813,813,813,813,813
4,2,51,19,10,23,1,293,9,56,813,...,813,813,813,813,813,813,813,813,813,813
5,3,18,56,8,10,44,0,1,50,813,...,813,813,813,813,813,813,813,813,813,813


In [150]:
(1 == 5 or 1 == 2)

False

In [151]:
def create_rec_sequences(recipe_row):

    # take length of row & create length 5 lists that can be added to sequences[]
    
    counter = 0

    r_list = []
    next_ingr = []
    
    for counter in range(0, len(recipe_row), 3):
        if (recipe_row[counter + 5] == 813 or (counter >= 26)):
            return r_list, next_ingr
        r_list.append(recipe_row[counter:counter+5])
        next_ingr.append(recipe_row[counter + 5])

In [131]:
encoded_df = encoded_df.reset_index(drop=True)
encoded_df.head(3)

Unnamed: 0,ingr_0,ingr_1,ingr_2,ingr_3,ingr_4,ingr_5,ingr_6,ingr_7,ingr_8,ingr_9,...,ingr_23,ingr_24,ingr_25,ingr_26,ingr_27,ingr_28,ingr_29,ingr_30,ingr_31,ingr_32
0,0,4,18,56,10,26,1,26,813,813,...,813,813,813,813,813,813,813,813,813,813
1,1,10,18,19,7,27,97,9,293,49,...,813,813,813,813,813,813,813,813,813,813
2,1,293,9,56,97,53,23,2,17,36,...,813,813,813,813,813,813,813,813,813,813


In [152]:
sequences = []
next_ingr = []

for row in range(0, len(encoded_df)):
    
    ingr_list = encoded_df.iloc[row]

    seq_to_add, next_to_add = create_rec_sequences(ingr_list)

    for seq in seq_to_add:
        sequences.append(seq)
    for nxt in next_to_add:
        next_ingr.append(nxt)

for i in range(0, 10):
    print(sequences[i], '\n', next_ingr[i])


ingr_0     0
ingr_1     4
ingr_2    18
ingr_3    56
ingr_4    10
Name: 0, dtype: int64 
 26
ingr_0     1
ingr_1    10
ingr_2    18
ingr_3    19
ingr_4     7
Name: 1, dtype: int64 
 27
ingr_3    19
ingr_4     7
ingr_5    27
ingr_6    97
ingr_7     9
Name: 1, dtype: int64 
 293
ingr_0      1
ingr_1    293
ingr_2      9
ingr_3     56
ingr_4     97
Name: 2, dtype: int64 
 53
ingr_3    56
ingr_4    97
ingr_5    53
ingr_6    23
ingr_7     2
Name: 2, dtype: int64 
 17
ingr_6     23
ingr_7      2
ingr_8     17
ingr_9     36
ingr_10    27
Name: 2, dtype: int64 
 10
ingr_0     2
ingr_1    51
ingr_2    19
ingr_3    10
ingr_4    23
Name: 3, dtype: int64 
 1
ingr_3     10
ingr_4     23
ingr_5      1
ingr_6    293
ingr_7      9
Name: 3, dtype: int64 
 56
ingr_0     3
ingr_1    18
ingr_2    56
ingr_3     8
ingr_4    10
Name: 4, dtype: int64 
 44
ingr_3     8
ingr_4    10
ingr_5    44
ingr_6     0
ingr_7     1
Name: 4, dtype: int64 
 50


In [153]:
print('\n sequences:', len(sequences), '\n', 'next_ingredients:', len(next_ingr))


 sequences: 36578 
 next_ingredients: 36578


In [154]:
max_pred_len = 10

In [155]:
x = np.zeros((len(sequences), max_pred_len, len(modeling_ingr_list)), dtype=np.bool)
y = np.zeros((len(sequences), len(modeling_ingr_list)), dtype=np.bool)

for i, sequence in enumerate(sequences):
    for t, ingr in enumerate(sequence):
        x[i,t,ingr] = 1
    
    y[i, next_ingr[i]] = 1

In [156]:
print(x.shape)
print(y.shape)

(36578, 10, 814)
(36578, 814)


In [158]:
model = Sequential()
model.add(LSTM(128, input_shape=(max_pred_len, len(modeling_ingr_list))))
model.add(Dense(len(modeling_ingr_list), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='nadam')

In [159]:
model.fit(x, y,
          batch_size=128,
          epochs=5)

Train on 36578 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1eac7e42988>

In [160]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [177]:
input_ingr = ['salt', 'flour', 'sugar', 'chocolate']

pred_next = []

for diversity in [0.2, 0.5, 1.0, 1.2]:

    start_ingr = [ingr_int[x] for x in input_ingr]
    
    for i in range(10):
        x_pred = np.zeros((1, max_pred_len, len(modeling_ingr_list)))
        for t, ingr in enumerate(start_ingr):
            x_pred[0, t, ingr] = 1

        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_ingredient = int_ingr[next_index]

        pred_next.append(next_ingredient)


print(set(pred_next) - set(input_ingr))

{'yellow corn meal', 'vanilla', 'evaporated milk', 'cold water', 'shortening', 'onion', 'powdered sugar', 'egg', 'cinnamon', 'steak', 'orange juice', 'white sugar', 'butter', 'pecan', 'clove', 'active dry yeast', 'vegetable oil', 'milk'}


In [180]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               482816    
_________________________________________________________________
dense_1 (Dense)              (None, 814)               105006    
Total params: 587,822
Trainable params: 587,822
Non-trainable params: 0
_________________________________________________________________


In [181]:
model.save('baseline_pred.h5')

In [182]:
len(ingr_int)

814