In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
#import keras
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten
from keras.preprocessing.sequence import pad_sequences
import ast
import tensorflow as tf
from functools import reduce
import pickle
from tqdm import tqdm_notebook as tqdm

Using TensorFlow backend.


In [2]:
# Load the raw recipes. Each entry of the 'ingredients' column is a string that
# ast.literal_eval turns into a list of ingredient names.
recipes_path = 'data/RAW_recipes.csv'
orig_recipes = list(pd.read_csv(recipes_path)['ingredients'].apply(ast.literal_eval))

# Flatten every recipe into one long list (duplicates kept) so that occurrences
# can be counted per ingredient.
ingredients_with_dup = [ingr for recipe in orig_recipes for ingr in recipe]

df = pd.Series(data=ingredients_with_dup)
ingr_counts = df.value_counts()
# total number of ingredient mentions, number of distinct ingredients
print(df.size, ingr_counts.size)

# Not read again in the visible cells; kept because later cells may rely on it.
tot_sum = ingr_counts.sum()

# Only ingredients mentioned more than this many times enter the vocabulary.
MIN_INGREDIENT_COUNT = 100

# value_counts() sorts by frequency, so position 0 is the most common ingredient.
ingredients = list(ingr_counts[ingr_counts > MIN_INGREDIENT_COUNT].index)
ingredients_set = set(ingredients)
# tqdm wraps the list (not the enumerate object) so the bar knows its total.
ingredients_dict = {ingr: i for i, ingr in enumerate(tqdm(ingredients))}


# Optional: persist the vocabulary. pickle needs a binary file handle ('wb').
# with open('ingredients.pkl', 'wb') as f:
#     pickle.dump(ingredients, f)
#
# with open('ingredients_set.pkl', 'wb') as f:
#     pickle.dump(ingredients_set, f)


2096582 14942


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [3]:
def recipe_to_idx(recipe):
    """
    Translate a recipe (iterable of ingredient names) into a list of integer
    ids by looking each name up in the module-level ingredients_dict.
    A name that is missing from ingredients_dict raises KeyError.
    """
    return [ingredients_dict[name] for name in recipe]

def idx_to_recipe(idx_list):
    """
    Inverse of recipe_to_idx: map each id in idx_list to the ingredient name
    stored at that position of the module-level ingredients list.
    """
    return list(map(ingredients.__getitem__, idx_list))

def filter_recipes(recipes, vocabulary=None):
    """
    Keep only the non-empty recipes whose ingredients are all known.

    Parameters
    ----------
    recipes : iterable of sequences of ingredient names
    vocabulary : container of allowed ingredient names, optional
        Defaults to the module-level ingredients_set.

    Returns
    -------
    list
        The recipes (same objects, original order) that contain at least one
        ingredient and no ingredient outside the vocabulary.
    """
    if vocabulary is None:
        vocabulary = ingredients_set
    return [
        recipe
        for recipe in recipes
        if len(recipe) > 0 and all(word in vocabulary for word in recipe)
    ]

def create_labels(recipes_indices):
    """
    Pick one ingredient id at random from every recipe to serve as its label.

    The chosen id is removed from the recipe IN PLACE, so after the call each
    list in recipes_indices is one element shorter and no longer contains the
    entry that became its label. An empty recipe raises ValueError (from
    random.randint on an empty range).

    Returns the list of chosen ids, one per recipe, in input order.
    """
    # randint is inclusive on both ends, hence the "- 1".
    return [
        ingredient_ids.pop(random.randint(0, len(ingredient_ids) - 1))
        for ingredient_ids in recipes_indices
    ]

def pad_X(recipes_indices):
    """
    Right-pad every id list to the length of the longest one and return the
    result of pad_sequences (padding='post').

    NOTE(review): no explicit pad value is passed, so pad_sequences uses its
    default fill (0 per the Keras docs), while ingredients_dict also assigns
    id 0 to a real ingredient. Padding therefore looks identical to that
    ingredient - consider reserving a dedicated padding id.
    """
    target_len = max(map(len, recipes_indices))
    return pad_sequences(recipes_indices, maxlen=target_len, padding='post')

def create_onehots(y_indices, no_classes=None):
    """
    Build one-hot label rows.

    Parameters
    ----------
    y_indices : sequence of int
        Class index of each example.
    no_classes : int, optional
        Width of every one-hot row. Defaults to len(ingredients).

    Returns
    -------
    numpy.ndarray of shape (len(y_indices), no_classes), dtype float64
        Row i is all zeros except for a 1.0 at column y_indices[i].
        An index outside the valid range raises IndexError.
    """
    if no_classes is None:
        no_classes = len(ingredients)
    # Explicit integer dtype so that an empty input is still a valid index array.
    hot_columns = np.asarray(y_indices, dtype=int)
    y = np.zeros((hot_columns.size, no_classes))
    # One vectorised assignment instead of building a temporary vector per row.
    y[np.arange(hot_columns.size), hot_columns] = 1.0
    return y
    
# create_labels() draws from the `random` module; seeding it here makes the
# generated (X, y) pair identical on every run of this cell.
LABEL_SEED = 42
random.seed(LABEL_SEED)

print(len(orig_recipes))
filtered_recipes = filter_recipes(orig_recipes)
print(len(filtered_recipes))

# The index lists are rebuilt on every run because create_labels() removes the
# chosen label from each of them in place.
recipes_indices = list(map(recipe_to_idx, tqdm(filtered_recipes)))

# Order matters: labels are popped first, so X is padded from the recipes
# WITHOUT their label ingredient.
y_indices = create_labels(recipes_indices)
X = pad_X(recipes_indices)
y = create_onehots(y_indices)

print(X.shape)
print(y.shape)

231637
118996


HBox(children=(IntProgress(value=0, max=118996), HTML(value='')))


(118996, 34)
(118996, 1826)


In [4]:
#Train Neural network
# Dimensions shared by the model definition below: the padded recipe width
# (second axis of X) and the vocabulary size.
input_length = X.shape[1]
no_classes = len(ingredients)
print(input_length)
def get_compiled_model():
    """
    Build and compile the ingredient-suggestion network.

    Architecture: an Embedding layer turning each of the `input_length` ids
    into a 100-dimensional vector, a Flatten layer, and a softmax Dense layer
    with one output per ingredient. Both `no_classes` and `input_length` are
    read from module scope.

    Returns the compiled Sequential model (adam optimizer, categorical
    cross-entropy loss, accuracy metric).
    """
    net = Sequential()
    net.add(Embedding(input_dim=no_classes,
                      output_dim=100,
                      input_length=input_length))
    net.add(Flatten())
    net.add(Dense(no_classes, activation='softmax'))

    net.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
    return net

model = get_compiled_model()
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()
model.fit(x=X, y=y, validation_split=0.2, epochs=15, batch_size=1024)

34
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 34, 100)           182600    
_________________________________________________________________
flatten_1 (Flatten)          (None, 3400)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1826)              6210226   
Total params: 6,392,826
Trainable params: 6,392,826
Non-trainable params: 0
_________________________________________________________________
None

Train on 95196 samples, validate on 23800 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
 7168/95196 [=>............................] - ETA: 52s - loss: 2.5271 - ac

KeyboardInterrupt: 

In [33]:
# NOTE(review): X_test is not assigned in any visible cell of this notebook
# (train_test_split is imported but never called, and this cell's execution
# count is out of sequence), so this line raises NameError on a fresh
# Restart & Run All - TODO confirm where X_test is meant to come from.
predictions = model.predict(X_test)

array([6.2584877e-07, 6.2584877e-07, 6.8545341e-07, ..., 6.2584877e-07,
       6.2584877e-07, 6.2584877e-07], dtype=float32)

In [309]:
def predict_ingredients(recipe, k):
    """
    Suggest the k most likely additional ingredients for a recipe.

    Parameters
    ----------
    recipe : list of str
        Ingredient names; each must be a key of ingredients_dict, otherwise
        recipe_to_idx raises KeyError.
    k : int
        Number of suggestions wanted. Values above the vocabulary size are
        clamped; k <= 0 yields an empty list.

    Returns
    -------
    list of (str, float)
        (ingredient name, model score) pairs, highest score first.
    """
    # Convert the word recipe to the neural network input format.
    indices = recipe_to_idx(recipe)

    # The input is padded to the same width that pad_X used for the training
    # data. pad_sequences only accepts a list of sequences, hence the
    # single-element list here and the [0] on the prediction below.
    # NOTE(review): a recipe longer than this width is presumably truncated by
    # pad_sequences rather than rejected - confirm against the Keras docs.
    longest_recipe_size = max(len(known_recipe) for known_recipe in recipes_indices)
    padded = pad_sequences([indices], maxlen=longest_recipe_size, padding='post')

    suggested_ingr = model.predict(padded)[0]

    # Clamp k BEFORE slicing: a negative k would otherwise slice from the wrong
    # end, and "[-0:]" would select every index instead of none.
    k = max(0, min(k, len(suggested_ingr)))

    # A single argsort gives both the ranking and (through indexing) the scores,
    # so names and scores cannot get out of step.
    best_ingr_idx_list = np.argsort(suggested_ingr)[::-1][:k]
    scores = suggested_ingr[best_ingr_idx_list]

    # Convert the neural network output back to ingredient names.
    return list(zip(idx_to_recipe(best_ingr_idx_list), scores))
  



# Example query. The commented-out lists are alternative inputs to try; only
# the active `pizza_recipe` assignment is sent to the model.
#pizza_recipe = ['pizza crust', 'sausage', 'egg', 'milk', 'salt and pepper', 'cheese']
pizza_recipe = ['lettuce']
#pizza_recipe = ['soft silken tofu', 'avocado', 'chunky salsa', 'fresh cilantro', 'lime juice', 'hot sauce', 'salt and pepper']
#our_recipe = ['pasta', 'tomato']
print(predict_ingredients(recipe=pizza_recipe, k=10))

[('beer', 0.012032799), ('sour cream', 0.011091416), ('green grape', 0.010717132), ('brown sugar', 0.01036949), ('cinnamon', 0.009476901), ('vodka', 0.008648412), ('tequila', 0.008561748), ('banana', 0.007880853), ('sugar', 0.007564536), ('milk', 0.007272173)]
