**TODO:**
* Stemming for all ingredients in recipes
* Remove adjectives from ingredients in recipes.
    If this fails, fall back to a standardized ingredient list from an external website
* Create a techniques vector: write down 20 techniques and check whether each one is present in the recipe's steps string



In [64]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, LSTM, GRU
from keras.preprocessing.sequence import pad_sequences
import ast
import tensorflow as tf
from functools import reduce
import pickle
from tqdm import tqdm_notebook as tqdm


In [38]:
#Read files
recipes_path = 'data/RAW_recipes.csv'
orig_recipes = pd.read_csv(recipes_path)
# Only the 'ingredients' column is used; each cell is parsed with ast.literal_eval
# so every recipe becomes a Python list of ingredient names.
orig_recipes = orig_recipes['ingredients']
orig_recipes = list(orig_recipes.apply(ast.literal_eval))

# Flatten all recipes into one list (duplicates kept) so occurrences can be counted.
ingredients_with_dup = []
for recipe in orig_recipes:
    ingredients_with_dup += recipe

df = pd.Series(data=ingredients_with_dup)
ingr_counts = df.value_counts()
tot_sum = ingr_counts.sum()

# Vocabulary = ingredients that occur more than MIN_INGREDIENT_COUNT times overall.
MIN_INGREDIENT_COUNT = 100
ingredients = list(ingr_counts[ingr_counts > MIN_INGREDIENT_COUNT].index)
ingredients_set = set(ingredients)
# Wrap the list itself (not the enumerate object) so tqdm can read its length and
# display real progress instead of an indeterminate bar. Ids start at 0.
ingredients_dict = {ingr:i for i,ingr in enumerate(tqdm(ingredients))}


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [39]:
def recipe_to_idx(recipe):
    """
    Converts a recipe (iterable of ingredient names) into the list of their
    integer ids, looked up in ingredients_dict.
    A name that is not a key of ingredients_dict raises KeyError.
    """
    return [ingredients_dict[name] for name in recipe]

def idx_to_recipe(idx_list):
    """
    Converts a sequence of ingredient ids back into ingredient names by
    indexing the ingredients list; the output keeps the input order.
    """
    names = []
    for idx in idx_list:
        names.append(ingredients[idx])
    return names

def filter_recipes(recipes):
    """
    Keeps only the recipes that are non-empty and whose ingredients are all
    members of ingredients_set; every other recipe is dropped.
    The kept recipes are returned in their original order (same list objects).
    """
    return [
        recipe
        for recipe in recipes
        if all(word in ingredients_set for word in recipe) and len(recipe) > 0
    ]

def create_labels(recipes_indices):
    """
    Picks one random ingredient id out of every recipe to serve as its label.

    The chosen id is REMOVED from the recipe in place, so after the call each
    inner list of recipes_indices is one element shorter and holds only the
    remaining (input) ingredients.
    Returns the list of chosen ids, one per recipe, in recipe order.
    An empty recipe makes random.randint raise ValueError.
    """
    return [
        ingredient_ids.pop(random.randint(0, len(ingredient_ids) - 1))
        for ingredient_ids in recipes_indices
    ]

def pad_X(recipes_indices):
    """
    Pads every index list at the end ('post') up to the length of the longest
    list, so that all rows have the same size; returns what pad_sequences returns.

    NOTE(review): pad_sequences fills with 0 unless told otherwise, and id 0 is
    also assigned to a real ingredient in ingredients_dict -- confirm whether
    padding and that ingredient are meant to be indistinguishable.
    """
    max_len = max(len(recipe) for recipe in recipes_indices)
    return pad_sequences(recipes_indices, maxlen=max_len, padding='post')

def create_onehots(y_indices):
    """
    Builds the one-hot label matrix: one row per label, one column per entry of
    the ingredients list, with 1.0 at the label's column and 0.0 elsewhere.
    """
    onehots = np.zeros((len(y_indices), len(ingredients)))
    # Iterating a 2-D array yields row views, so writing into `row` fills `onehots`.
    for row, hot_idx in zip(onehots, y_indices):
        row[hot_idx] = 1.0
    return onehots
    
# Report how many recipes exist before and after the vocabulary filter.
print(len(orig_recipes))
filtered_recipes = filter_recipes(orig_recipes)
print(len(filtered_recipes))

# Encode every surviving recipe as a fresh list of ingredient ids.
recipes_indices = [recipe_to_idx(recipe) for recipe in tqdm(filtered_recipes)]

# Order matters: create_labels pops the label out of each recipe in place,
# so X must be padded from the lists only after the labels were removed.
y_indices = create_labels(recipes_indices)
X = pad_X(recipes_indices)
y = create_onehots(y_indices)

print(X.shape)
print(y.shape)

231637
118996


HBox(children=(IntProgress(value=0, max=118996), HTML(value='')))


(118996, 34)
(118996, 1826)


In [10]:
#Train Neural network
# Number of output classes = size of the ingredient vocabulary (also used as
# the embedding's input_dim below).
no_classes = len(ingredients)
# Width of one padded input row; the network is built for exactly this length.
input_length = X.shape[1]
def get_compiled_model():
    """
    Builds and compiles the ingredient-suggestion network.

    Layers, in order: an Embedding (no_classes ids -> 100-dim vectors, for
    input_length positions), Flatten, a 900-unit ReLU Dense layer and a
    softmax Dense layer with one output per ingredient class.
    Compiled with the adam optimizer, categorical cross-entropy loss and the
    accuracy metric. Reads the module-level no_classes and input_length.
    """
    #model using embeddings
    model = Sequential()
    model.add(Embedding(input_dim=no_classes,
                        output_dim=100,
                        input_length=input_length))
    model.add(Flatten())
    model.add(Dense(900, activation='relu'))
    model.add(Dense(no_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

model = get_compiled_model()
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that emitted a stray "None" after the table).
model.summary()
# validation_split=0.2 holds out 20% of the samples for validation.
model.fit(x=X, y=y, validation_split=0.2, epochs=15, batch_size=1024)

34
Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 34, 100)           182600    
_________________________________________________________________
flatten_3 (Flatten)          (None, 3400)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 900)               3060900   
_________________________________________________________________
dense_8 (Dense)              (None, 1826)              1645226   
Total params: 4,888,726
Trainable params: 4,888,726
Non-trainable params: 0
_________________________________________________________________
None
Train on 95196 samples, validate on 23800 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x1a4188cc90>

In [43]:
def predict_ingredients(recipe, k):
    """
    returns k best suggestions in text

    recipe: list of ingredient names; every name must be a key of
            ingredients_dict, otherwise recipe_to_idx raises KeyError.
    k:      number of suggestions wanted; clamped to the number of classes.
            A non-positive k yields an empty list.

    Returns a list of (ingredient name, score) tuples ordered from the highest
    score to the lowest, where score is the model's output for that class.
    """
    # Negative or zero k would turn the slices below into "everything" or
    # "all but the last", so answer explicitly instead.
    if k <= 0:
        return []

    #convert word recipe to neural network input format
    indices = recipe_to_idx(recipe)

    #pad_sequences only takes a list of sequences, so the single recipe is wrapped
    #in a list and the first (only) prediction is picked below.
    #Pad to input_length, the width the model was built with, instead of
    #rescanning every training recipe on each call.
    padded = pad_sequences([indices], maxlen=input_length, padding='post')

    #predict
    suggested_ingr = model.predict(padded)[0]

    #get k "best" ingredients: argsort is ascending, so take the last k and reverse
    k = min(k, len(suggested_ingr))
    best_ingr_idx_list = suggested_ingr.argsort()[-k:][::-1]
    #read the scores through the same indices so names and scores stay paired
    scores = suggested_ingr[best_ingr_idx_list]

    #convert neural network format to word recipe
    return list(zip(idx_to_recipe(best_ingr_idx_list), scores))
  
#salad_recipe = ['lettuce', 'tomatoes', 'onion']
# Example query: print the 10 highest-scoring suggestions for a small recipe.
meat_recipe = ['avocado', 'ground beef', 'sour cream']
print(predict_ingredients(meat_recipe, 10))

[('cheddar cheese', 0.08336744), ('onion', 0.074759744), ('salt and pepper', 0.059468783), ('salt', 0.05335347), ('white onion', 0.043249637), ('tomatoes', 0.039837014), ('blue cheese', 0.034071136), ('lemon juice', 0.027857946), ('green onions', 0.023794126), ('onion soup mix', 0.022429418)]
