# Recipe Generation using Recurrent Neural Network (RNN)

## Importing dependencies

In [79]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import platform
import time
import pathlib
import os
import json

print('Python version:', platform.python_version())
print('Tensorflow version:', tf.__version__)
print('Keras version:', tf.keras.__version__)

Python version: 3.7.6
Tensorflow version: 2.1.0
Keras version: 2.2.4-tf


## Exploring datasets

- 🤷 [Recipe Ingredients Dataset](https://www.kaggle.com/kaggle/recipe-ingredients-dataset/home) _(doesn't have ingredients proportions)_
- 🤷 [Recipe1M+](http://pic2recipe.csail.mit.edu/) _(requires registration to download)_
- 🤷 [Epicurious - Recipes with Rating and Nutrition](https://www.kaggle.com/hugodarwood/epirecipes?select=full_format_recipes.json) _(~20k recipes only, it would be nice to find more)_
- 👍🏻 [**Recipe box**](https://eightportions.com/datasets/Recipes/) _(~125,000 recipes with ingredients proportions)_

## Loading the dataset

In [80]:
# Create cache folder.
cache_dir = './tmp'
pathlib.Path(cache_dir).mkdir(exist_ok=True)

In [81]:
# Download and unpack the dataset.
dataset_file_name = 'recipes_raw.zip'
dataset_file_origin = 'https://storage.googleapis.com/recipe-box/recipes_raw.zip'

dataset_file_path = tf.keras.utils.get_file(
    fname=dataset_file_name,
    origin=dataset_file_origin,
    cache_dir=cache_dir,
    extract=True,
    archive_format='zip'
)

print(dataset_file_path)

./tmp/datasets/recipes_raw.zip


In [82]:
!ls -la ./tmp/datasets/

total 521128
drwxr-xr-x  7 trekhleb  staff       224 May 13 18:10 [34m.[m[m
drwxr-xr-x  3 trekhleb  staff        96 May 13 18:10 [34m..[m[m
-rw-r--r--  1 trekhleb  staff     20437 May 14 16:56 LICENSE
-rw-r--r--  1 trekhleb  staff  53355492 May 13 18:10 recipes_raw.zip
-rw-r--r--  1 trekhleb  staff  49784325 May 14 16:56 recipes_raw_nosource_ar.json
-rw-r--r--  1 trekhleb  staff  61133971 May 14 16:56 recipes_raw_nosource_epi.json
-rw-r--r--  1 trekhleb  staff  93702755 May 14 16:56 recipes_raw_nosource_fn.json


In [83]:
def load_dataset(silent=False):
    dataset_file_names = [
        'recipes_raw_nosource_ar.json',
        'recipes_raw_nosource_epi.json',
        'recipes_raw_nosource_fn.json',
    ]
    
    dataset = []

    for dataset_file_name in dataset_file_names:
        dataset_file_path = f'{cache_dir}/datasets/{dataset_file_name}'

        with open(dataset_file_path) as dataset_file:
            json_data_dict = json.load(dataset_file)
            json_data_list = list(json_data_dict.values())
            dict_keys = [key for key in json_data_list[0]]
            dict_keys.sort()
            dataset += json_data_list

            if silent == False:
                print(dataset_file_path)
                print('===========================================')
                print('Number of examples: ', len(json_data_list), '\n')
                print('Example object keys:\n', dict_keys, '\n')
                print('Example object:\n', json_data_list[0], '\n')
                print('Required keys:\n')
                print('  title: ', json_data_list[0]['title'], '\n')
                print('  ingredients: ', json_data_list[0]['ingredients'], '\n')
                print('  instructions: ', json_data_list[0]['instructions'])
                print('\n\n')
    
    return dataset  

In [84]:
dataset_raw = load_dataset()  

./tmp/datasets/recipes_raw_nosource_ar.json
Number of examples:  39802 

Example object keys:
 ['ingredients', 'instructions', 'picture_link', 'title'] 

Example object:
 {'title': 'Slow Cooker Chicken and Dumplings', 'ingredients': ['4 skinless, boneless chicken breast halves ADVERTISEMENT', '2 tablespoons butter ADVERTISEMENT', '2 (10.75 ounce) cans condensed cream of chicken soup ADVERTISEMENT', '1 onion, finely diced ADVERTISEMENT', '2 (10 ounce) packages refrigerated biscuit dough, torn into pieces ADVERTISEMENT', 'ADVERTISEMENT'], 'instructions': 'Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.\nCover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.\n', 'picture_link': '55lznCYBbs2mT8BTx6BTkLhynGHzM.S'} 

Required keys:

  title:  Slow Cooker Chicken and Dumplings 

  ingredients:  ['4 skinless, boneless chicke

In [85]:
print('Total number of examples: ', len(dataset_raw))

Total number of examples:  125164


## Preprocessing the dataset

In [86]:
def recipe_validate(recipe):
    required_keys = ['title', 'ingredients', 'instructions']
    
    if not recipe:
        return False
    
    for required_key in required_keys:
        if not recipe[required_key]:
            return False
        
        if type(recipe[required_key]) == list and len(recipe[required_key]) == 0:
            return False
    
    return True

In [123]:
def recipe_to_string(recipe):
    noize_string = 'ADVERTISEMENT'
    
    stop_sign = '★'
    
    title = recipe['title']
    ingredients = recipe['ingredients']
    instructions = recipe['instructions'].split('\n')
    
    ingredients_string = ''
    for ingredient in ingredients:
        ingredient = ingredient.replace(noize_string, '')
        if ingredient:
            ingredients_string += f'\n• {ingredient}'
    
    instructions_string = ''
    for instruction in instructions:
        instruction = instruction.replace(noize_string, '')
        if instruction:
            instructions_string += f'\n▪︎ {instruction}'
    
    return f'[TITLE] \n{title} \n\n[INGREDIENTS] {ingredients_string} \n\n[INSTRUCTIONS] {instructions_string}\n\n{stop_sign}\n\n'

In [124]:
dataset_filtered = [recipe for recipe in dataset_raw if recipe_validate(recipe)]

print('Dataset size BEFORE filtering', len(dataset_raw))
print('Dataset size AFTER filtering', len(dataset_filtered))
print('Number of invalide recipes', len(dataset_raw) - len(dataset_filtered))

Dataset size BEFORE filtering 125164
Dataset size AFTER filtering 122938
Number of invalide recipes 2226


In [125]:
dataset_stringified = [recipe_to_string(recipe) for recipe in dataset_filtered]

print('Dataset size: ', len(dataset_stringified))

Dataset size:  122938


In [126]:
for recipe_string in dataset_stringified[:5]:
    print(recipe_string)

[TITLE] 
Slow Cooker Chicken and Dumplings 

[INGREDIENTS] 
• 4 skinless, boneless chicken breast halves 
• 2 tablespoons butter 
• 2 (10.75 ounce) cans condensed cream of chicken soup 
• 1 onion, finely diced 
• 2 (10 ounce) packages refrigerated biscuit dough, torn into pieces  

[INSTRUCTIONS] 
▪︎ Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.
▪︎ Cover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.

★


[TITLE] 
Awesome Slow Cooker Pot Roast 

[INGREDIENTS] 
• 2 (10.75 ounce) cans condensed cream of mushroom soup 
• 1 (1 ounce) package dry onion soup mix 
• 1 1/4 cups water 
• 5 1/2 pounds pot roast  

[INSTRUCTIONS] 
▪︎ In a slow cooker, mix cream of mushroom soup, dry onion soup mix and water. Place pot roast in slow cooker and coat with soup mixture.
▪︎ Cook on High setting for 3 to 4 hours, or on Low settin

In [127]:
dataset = tf.data.Dataset.from_tensor_slices(dataset_stringified)

print(dataset)

<TensorSliceDataset shapes: (), types: tf.string>


In [128]:
for recipe in dataset.take(1):
    print(recipe.numpy().decode())

[TITLE] 
Slow Cooker Chicken and Dumplings 

[INGREDIENTS] 
• 4 skinless, boneless chicken breast halves 
• 2 tablespoons butter 
• 2 (10.75 ounce) cans condensed cream of chicken soup 
• 1 onion, finely diced 
• 2 (10 ounce) packages refrigerated biscuit dough, torn into pieces  

[INSTRUCTIONS] 
▪︎ Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.
▪︎ Cover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.

★




## Creating vocabulary

In [129]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=True
)

In [29]:
tokenizer.fit_on_texts(dataset_stringified)

In [31]:
tokenizer.get_config()

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': True,
 'oov_token': None,
 'document_count': 1,
 'word_counts': '{"p": 3, "l": 8, "a": 9, "c": 12, "e": 22, " ": 51, "t": 17, "h": 13, "i": 14, "k": 5, "n": 18, ",": 6, "b": 4, "u": 10, "r": 14, "s": 8, "o": 28, "d": 5, "w": 5, "f": 3, "g": 6, "v": 3, ".": 4, "\\n": 2, "5": 1, "6": 1, "3": 1, "0": 1, "m": 1}',
 'word_docs': '{" ": 1, "d": 1, ",": 1, "3": 1, "s": 1, "t": 1, "e": 1, "o": 1, ".": 1, "h": 1, "w": 1, "b": 1, "c": 1, "v": 1, "6": 1, "p": 1, "f": 1, "r": 1, "\\n": 1, "5": 1, "u": 1, "i": 1, "l": 1, "m": 1, "k": 1, "0": 1, "g": 1, "n": 1, "a": 1}',
 'index_docs': '{"1": 1, "17": 1, "14": 1, "27": 1, "13": 1, "5": 1, "3": 1, "2": 1, "20": 1, "8": 1, "18": 1, "19": 1, "9": 1, "23": 1, "26": 1, "21": 1, "22": 1, "7": 1, "24": 1, "25": 1, "10": 1, "6": 1, "12": 1, "29": 1, "16": 1, "28": 1, "15": 1, "4": 1, "11": 1}',
 'index_word': '{"1": " ", "2": "o", "3": "e", 

In [32]:
tokenizer.fit_on_texts([dataset_raw[1]['instructions']])

In [33]:
tokenizer.get_config()

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': True,
 'oov_token': None,
 'document_count': 2,
 'word_counts': '{"p": 8, "l": 12, "a": 17, "c": 18, "e": 30, " ": 93, "t": 29, "h": 19, "i": 24, "k": 8, "n": 28, ",": 9, "b": 4, "u": 17, "r": 27, "s": 19, "o": 57, "d": 8, "w": 10, "f": 6, "g": 9, "v": 3, ".": 7, "\\n": 4, "5": 1, "6": 1, "3": 2, "0": 1, "m": 7, "x": 3, "y": 1, "4": 1, "8": 1, "9": 1}',
 'word_docs': '{" ": 2, "d": 2, ",": 2, "3": 2, "s": 2, "t": 2, "e": 2, "o": 2, ".": 2, "h": 2, "w": 2, "b": 1, "c": 2, "v": 1, "6": 1, "p": 2, "f": 2, "r": 2, "\\n": 2, "5": 1, "u": 2, "i": 2, "l": 2, "m": 2, "k": 2, "0": 1, "g": 2, "n": 2, "a": 2, "y": 1, "4": 1, "8": 1, "x": 1, "9": 1}',
 'index_docs': '{"1": 2, "17": 2, "14": 2, "27": 2, "13": 2, "5": 2, "3": 2, "2": 2, "20": 2, "8": 2, "18": 2, "19": 2, "9": 2, "23": 1, "26": 1, "21": 2, "22": 2, "7": 2, "24": 2, "25": 1, "10": 2, "6": 2, "12": 2, "29": 1, "16": 2, 

In [38]:
tokenizer.word_index['e']

3