**Mounting Google Drive for importing the Data Files which will be used in the Tokenization**

**Downloading, Installing & Importing Required Libraries**

In [1]:
!pip install regex numpy pandas torch tqdm matplotlib transformers



In [2]:
import re
import os
import math
import torch
import random
import numpy as np
import pandas as pd
from tqdm import trange
import torch.nn.functional as F
import matplotlib.pyplot as plt
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import  AutoTokenizer,AutoModelWithLMHead

  from .autonotebook import tqdm as notebook_tqdm


**Importing Recipe Ingredient Tables**

In [3]:
# Load the RecipeDB ingredient-phrase CSV; later cells read its 'recipe_no' and 'ingredient' columns.
table = pd.read_csv("/home/hiren20066/BTP_Dev/Dataset/RecipeDB_ingredient_phrase.csv")

**Fetching the "recipe_no" and "ingredient" columns**

In [4]:
# Keep only the two columns needed downstream; .copy() makes the result independent of `table`.
recipe_ingredient_table = table[['recipe_no', 'ingredient']].copy()

**Observation: Same ingredient is used more than once in the same recipe, for example "water" is used more than once in the recipe "2610.0"**

**Removing Duplicate rows**

In [5]:
# One row per (recipe_no, ingredient) pair, keeping the first occurrence,
# and without rows whose ingredient is missing.
recipe_ingredient_table_unique = (
    recipe_ingredient_table
    .drop_duplicates(keep='first')
    .dropna(subset=['ingredient'])
)

**Table that maps each recipe number to its ingredients. `result` is a dictionary that maps each recipe number to its list of ingredients.**

In [6]:
# Group the ingredient column by recipe and collect each group into a plain list.
ingredients_by_recipe = recipe_ingredient_table_unique.groupby('recipe_no')['ingredient']
result = {recipe_no: list(ingredients) for recipe_no, ingredients in ingredients_by_recipe}

# Parallel lists: recipe numbers, their ingredient lists, and the length of each list.
keys = list(result)
values = list(result.values())
recipe_size = list(map(len, values))

**final_df1 contains recipe_no, ingredients and recipe_size**

In [7]:
# One row per recipe: its number, its ingredient list and how many ingredients it has.
recipe_rows = list(zip(keys, values, recipe_size))
df1 = pd.DataFrame(recipe_rows, columns=['recipe_no', 'ingredients', 'recipe_size'])
final_df1 = df1.sort_values(by='recipe_size')

# Recipes that list exactly one ingredient, their ids, and their original rows in `table`.
has_single_ingredient = final_df1['recipe_size'] == 1
recipe_size_1 = final_df1[has_single_ingredient]
recipe_id_size_one_list = recipe_size_1['recipe_no'].tolist()
in_single_ingredient_recipe = table['recipe_no'].isin(recipe_id_size_one_list)
recipe_size_1_cooking_procedure = table[in_single_ingredient_recipe]

**Removing recipes from the "recipe_ingredient_table_unique table" with size equal to 1**

In [8]:
# Drop every row belonging to a recipe listed in recipe_id_size_one_list (recipes with a single ingredient).
# Note: this rebinds recipe_ingredient_table_unique to the filtered frame.
recipe_ingredient_table_unique = recipe_ingredient_table_unique[~recipe_ingredient_table_unique['recipe_no'].isin(recipe_id_size_one_list)]

**Finding count of each ingredient across the recipes**

In [9]:
# Occurrences of each ingredient in the de-duplicated table (one row per recipe/ingredient pair),
# most frequent first.
df_count = recipe_ingredient_table_unique['ingredient'].value_counts()

# Same information as a two-column frame: 'ingredient' and its 'Recipe_Count'.
recipe_ingredient_table_count = (
    df_count
    .rename_axis('ingredient')
    .reset_index(name='Recipe_Count')
)

**Evaluating the PMF(Probability Mass Function) and CDF(Cumulative Distribution Function) values for each ingredient**

In [10]:
## ingredients_count is the total number of unique ingredients across all the recipes
ingredients_count = recipe_ingredient_table_count.shape[0]
## recipe_count_list contains the recipe count of every ingredient
recipe_count_list = recipe_ingredient_table_count['Recipe_Count'].tolist()
## recipe_count_list_unique contains the distinct recipe counts, in order of first appearance
recipe_count_list_unique = recipe_ingredient_table_count['Recipe_Count'].unique()

## pmf_list_unique: for each distinct recipe count, the fraction of ingredients having that count.
## A single value_counts pass replaces calling list.count once per distinct value;
## reindex puts the tallies back in the order of recipe_count_list_unique.
ingredients_per_recipe_count = (
    recipe_ingredient_table_count['Recipe_Count']
    .value_counts()
    .reindex(recipe_count_list_unique)
)
pmf_list_unique = (ingredients_per_recipe_count / ingredients_count).tolist()

## cdf_list_unique: running total of the PMF values, accumulated in the same order
cdf = 0
cdf_list_unique = []
for pmf in pmf_list_unique:
    cdf = cdf + pmf
    cdf_list_unique.append(cdf)

## df: one row per distinct recipe count with its PMF and CDF
data = {'Recipe_Count': recipe_count_list_unique ,'Pmf': pmf_list_unique, 'Cdf': cdf_list_unique}
df = pd.DataFrame(data)

## df1: every ingredient annotated with the PMF/CDF of its recipe count.
## NOTE: this rebinds the name df1; takeRandomInput reads both df and df1 as globals.
df1 = pd.merge(recipe_ingredient_table_count, df, how='inner', on = 'Recipe_Count')

**Creating Input Function that will perform the following tasks:**

**1. Taking random n(number of ingredients to select) and fetching same number of ingredients based on random cdf values selected.**

**2. If the randomly selected cdf value corresponds to more than one ingredient, we pick one of those ingredients at random.**

**3. Removing Duplicate Ingredients.**

**4. Converting the list of ingredients into a single string in the format expected by our GPT2 model.**

In [11]:
def takeRandomInput():
  """Build a random ingredient prompt string such as 'a,b,c;'.

  Reads two module-level DataFrames: `df` (its 'Cdf' column supplies the
  candidate CDF values) and `df1` (its 'Cdf' and 'ingredient' columns map a
  CDF value to the ingredients sharing it).

  Steps:
    1. Pick how many ingredients to draw (2 to 8).
    2. For each draw, pick a CDF value at random, then pick at random one of
       the ingredients in `df1` that has exactly that CDF value.
    3. Drop repeated ingredients, keeping the first occurrence.
    4. Join the ingredients with ',' in reverse draw order and end with ';'.

  Returns:
    str: comma-separated ingredients terminated by ';'.
  """
  cdfValues=df['Cdf'].tolist()
  ingredientsChoices=[2,3,4,5,6,7,8]
  randomNumberOfIngredients=random.choice(ingredientsChoices)

  # Hoisted out of the loop: the column does not change between draws.
  cdfColumn=df1['Cdf']
  inputIngredientsList=list()
  for _ in range(randomNumberOfIngredients):
    currentRandomCdf=random.choice(cdfValues)
    # Boolean mask instead of a Python loop over every row; row order is preserved,
    # so random.choice sees the same candidate list as a row-by-row scan would build.
    currentCdfIngredientsList=df1.loc[cdfColumn==currentRandomCdf,'ingredient'].tolist()
    inputIngredientsList.append(random.choice(currentCdfIngredientsList))

  # Order-preserving de-duplication (dict keys keep insertion order).
  uniqueIngredients=list(dict.fromkeys(inputIngredientsList))

  # The string is assembled last-drawn-first, then closed with ';'.
  return ",".join(str(ingredient) for ingredient in reversed(uniqueIngredients))+";"

In [12]:
# Quick check: draw one random ingredient string and display it as the cell output.
takeRandomInput()

'hash brown,basil,double cream,brown lentil;'

**Building Model Pre-Requisites**

In [13]:
def set_seed(seed):
    """Seed every random number generator this notebook draws from.

    Args:
        seed: integer seed applied to Python's `random`, NumPy and PyTorch.

    Python's `random` is included because takeRandomInput picks ingredients
    with random.choice; without it the ingredient selection would differ on
    every run even after calling set_seed.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

In [14]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Mask out unlikely tokens in a 1-D logits vector using top-k and/or nucleus (top-p) rules.

    The tensor is modified in place and also returned.

    Args:
        logits: 1-D tensor of logits (one entry per vocabulary item).
        top_k: when > 0, only the `top_k` highest logits survive.
        top_p: when > 0.0, tokens are kept in descending order until their
            cumulative probability first exceeds `top_p`; the token that
            crosses the threshold is kept, everything after it is masked.
            Nucleus filtering is described in Holtzman et al.
            (http://arxiv.org/abs/1904.09751).
        filter_value: value written over masked positions.

    Returns:
        The same `logits` tensor with masked entries set to `filter_value`.

    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    assert logits.dim() == 1  # only a single, un-batched distribution is supported
    vocab_size = logits.size(-1)
    top_k = min(top_k, vocab_size)  # never request more entries than exist

    if top_k > 0:
        # Anything strictly below the k-th largest logit is masked.
        best_values, _ = torch.topk(logits, top_k)
        kth_largest = best_values[..., -1, None]
        logits[logits < kth_largest] = filter_value

    if top_p > 0.0:
        ranked_logits, ranked_ids = torch.sort(logits, descending=True)
        running_mass = F.softmax(ranked_logits, dim=-1).cumsum(dim=-1)
        beyond_threshold = running_mass > top_p

        # Shift the mask one step right: the first token past the threshold is
        # kept, and the top-ranked token is never masked.
        drop_ranked = torch.zeros_like(beyond_threshold)
        drop_ranked[..., 1:] = beyond_threshold[..., :-1]

        logits[ranked_ids[drop_ranked]] = filter_value

    return logits

In [15]:
def sample_sequence(model, length, context, tokenizer, num_samples=1, temperature=1, top_k=0, top_p=0.0, device = 'gpu'):
    """Autoregressively sample up to `length` tokens after `context`.

    Args:
        model: callable invoked as model(input_ids=...); element 0 of its
            output is indexed as [0, -1, :] to obtain next-token logits
            (presumably (batch, seq, vocab) logits -- confirm against the model).
        length: maximum number of tokens to generate.
        context: sequence of token ids used as the prompt.
        tokenizer: object providing convert_tokens_to_ids.
        num_samples: number of copies of the prompt placed in the batch.
            NOTE(review): only row 0 of the logits is sampled and a (1, 1)
            token is concatenated, so values other than 1 look unsupported.
        temperature: divisor applied to the logits before filtering.
        top_k, top_p: forwarded to top_k_top_p_filtering.
        device: torch device (or device string) for the token tensor. The
            legacy default 'gpu' is not a device string torch accepts, so it
            is mapped to CUDA when available and to CPU otherwise.

    Returns:
        Tensor holding the prompt followed by the generated token ids.
    """
    if device == 'gpu':
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Stop on either spelling of the end marker: the rest of this notebook uses
    # "<RECIPE_END>", while this function historically looked up "<END_RECIPE>".
    end_tokens = set(tokenizer.convert_tokens_to_ids(["<END_RECIPE>", "<RECIPE_END>"]))

    context = torch.tensor(context, dtype=torch.long, device=device)
    context = context.unsqueeze(0).repeat(num_samples, 1)
    generated = context
    with torch.no_grad():
        for _ in trange(length):
            inputs = {'input_ids': generated}
            # The full sequence is re-fed each step; cached hidden states are not used.
            outputs = model(**inputs)
            next_token_logits = outputs[0][0, -1, :] / temperature
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
            # The end token itself is kept in `generated` before stopping.
            if next_token.item() in end_tokens:
                print('breaking----->>')
                break
    return generated

In [16]:
# Seed the generators handled by set_seed (see its definition) before any generation runs.
set_seed(20)

**Defining the Method that will generate the Novel recipe by providing the list of Input Ingredients to Trained GPT2 Model**

In [17]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU, and report the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [18]:
_RATATOUILE_MODEL_DIR = '/home/hiren20066/BTP_Dev/Dataset_Project/project_model'
# (model_dir, device) -> (tokenizer, model); filled lazily so weights are read from disk only once.
_ratatouile_cache = {}


def _load_ratatouile(model_dir, run_device):
  """Return the cached (tokenizer, model) for model_dir on run_device, loading them on first use."""
  cache_key = (model_dir, str(run_device))
  if cache_key not in _ratatouile_cache:
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    model.to(run_device)
    model.eval()
    _ratatouile_cache[cache_key] = (tokenizer, model)
  return _ratatouile_cache[cache_key]


def startRatatouileModel(ingredientsList, model_dir=_RATATOUILE_MODEL_DIR):
  """Generate one recipe from an ingredient string with the fine-tuned GPT-2 model.

  Args:
    ingredientsList: ingredients separated by ',' and terminated by ';'
      (the format produced by takeRandomInput).
    model_dir: directory holding the saved tokenizer and model.

  Returns:
    (text, prepared_input): the decoded generated tokens (prompt excluded)
    and the special-token prompt that was fed to the model.

  The tokenizer and model are loaded once and reused on later calls, and the
  model runs on CUDA when available, otherwise on the CPU.
  """
  run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  tokenizer, model = _load_ratatouile(model_dir, run_device)

  # ',' separates ingredients and ';' closes the input section of the prompt.
  prepared_input = '<RECIPE_START><INPUT_START> ' + ingredientsList.replace(',', ' <NEXT_INPUT> ').replace(';', ' <INPUT_END>')
  context_tokens = tokenizer.encode(prepared_input)

  out = sample_sequence(
    model=model,
    context=context_tokens,
    tokenizer=tokenizer,
    length=768,
    temperature=1,
    top_k=30,
    top_p=1,
    device=run_device
  )
  # Keep only the newly generated ids, dropping the prompt.
  out = out[0, len(context_tokens):].tolist()
  text = tokenizer.decode(out, clean_up_tokenization_spaces=True)
  if "<RECIPE_END>" not in text:
    print(text)
    print("Failed to generate, recipe's too long")
  return text, prepared_input

**Defining the Final Dataframe that will contain the generated Novel Recipes**

**Defining the variable that sets how many novel recipes to generate in the loop. The cell below sets it to 2500; change it according to your need.**

In [19]:
import time
import pandas as pd

In [20]:
def generate_and_save_recipes(number_of_recipes, output_path):
    """Generate `number_of_recipes` recipes and write them to `output_path` as CSV.

    For each recipe: draw random ingredients (takeRandomInput), generate text
    (startRatatouileModel), format it (process_recipe), then slice out the
    title, ingredient phrases and instructions using the section headers that
    process_recipe inserts. Progress and a time estimate are printed per recipe.

    Args:
        number_of_recipes: how many recipes to generate.
        output_path: destination CSV path (written once, after the loop).

    The CSV has the columns 'Random Ingredients', 'Recipe Title',
    'Ingredient Phrases' and 'Recipe Instructions'.
    """
    columns = ['Random Ingredients', 'Recipe Title', 'Ingredient Phrases', 'Recipe Instructions']
    # Rows are collected in a list and turned into a DataFrame once at the end;
    # concatenating onto a DataFrame inside the loop copies all rows each time.
    records = []
    total_time = 0

    for i in range(number_of_recipes):
        start_time = time.time()
        randomIngredients = takeRandomInput()
        novelRecipeGenerated, user_input = startRatatouileModel(randomIngredients)
        generated_recipe = process_recipe(novelRecipeGenerated)

        # Locate the section headers by their tails; str.find returns -1 when absent.
        # NOTE(review): the fixed offsets below assume the exact header text and
        # spacing produced by process_recipe -- confirm if that formatting changes.
        rnidx = generated_recipe.find("Name:- ##\n")
        igidx = generated_recipe.find("dients ##\n")
        instnidx = generated_recipe.find("uctions ##\n")
        lastidx = generated_recipe.find("\n\n\n\n\n\n")

        resname = generated_recipe[rnidx + 11:igidx-12]
        ings = generated_recipe[igidx+10:instnidx-19].lower()
        instn = format_instructions(generated_recipe[instnidx+11:lastidx])

        records.append({'Random Ingredients': randomIngredients, 'Recipe Title': resname, 'Ingredient Phrases': ings, 'Recipe Instructions': instn})

        end_time = time.time()
        time_taken = end_time - start_time
        total_time += time_taken

        # Remaining time is extrapolated from the average duration so far.
        average_time = total_time / (i + 1)
        recipes_left = number_of_recipes - (i + 1)
        estimated_time_left = average_time * recipes_left

        print(f"Generated recipe {i+1}/{number_of_recipes}. Time taken: {time_taken:.2f} seconds. Estimated time remaining: {estimated_time_left:.2f} seconds.")

    # Passing `columns` keeps the header present even when no recipes were generated.
    novelRecipesDataframe = pd.DataFrame(records, columns=columns)
    novelRecipesDataframe.to_csv(output_path, index=False)

def process_recipe(recipe):
    """Convert the model's special-token markup into a readable, sectioned recipe string.

    The substitutions are applied one after another in the order listed. The
    order matters: the bare '.' rule runs before the instruction-separator
    rules, so the '. ' those rules insert is not expanded again.
    """
    substitutions = (
        ('<RECIPE_START> <INPUT_START>', '## User inputs ##\n    -'),
        ('<NEXT_INPUT>', '\n    -'),
        ('<INPUT_END>', '\n------------------------\n\n'),
        ('<TITLE_START>', '## Recipe Name:- ##\n'),
        ('<TITLE_END>', '\n'),
        ('<INGR_START>', '\n## Ingredients ##\n'),
        ('<NEXT_INGR>', '|'),
        ('<INGR_END>', '\n\n'),
        ('<INSTR_START>', '## Cooking instructions ##\n'),
        ('.', '.\n    -'),
        (' <NEXT_INSTR>', '. '),
        (' <INSTR_END>', '. '),
        (' <RECIPE_END>', '\n\n\n\nVoila Enjoy your recipe :)\n\n\n\n\n -----------\n'),
    )
    formatted = recipe
    for marker, replacement in substitutions:
        formatted = formatted.replace(marker, replacement)
    return str(formatted)

def format_instructions(instructions):
    """Insert '-' between every pair of adjacent, space-separated numeric tokens.

    Example: 'bake 10 15 minutes' -> 'bake 10 - 15 minutes'.

    Args:
        instructions: instruction text; tokens are split on single spaces.

    Returns:
        str: the text re-joined with single spaces, with '-' added between
        consecutive numeric tokens.

    The result is built in a new list. Inserting into the list being scanned
    while looping over its original length leaves the last tokens unvisited,
    so a run such as '1 2 3 4' would end up only partly hyphenated.
    """
    tokens = instructions.split(' ')
    formatted = []
    for position, token in enumerate(tokens):
        formatted.append(token)
        has_next = position + 1 < len(tokens)
        if has_next and token.isnumeric() and tokens[position + 1].isnumeric():
            formatted.append("-")
    return " ".join(formatted)

# How many novel recipes to generate, and the folder that receives the CSV.
number_of_recipes = 2500
base_path = '/home/hiren20066/BTP_Dev/Dataset_Project/recipes/'

# Create the output folder up front: the CSV is written only after every recipe
# has been generated, so a missing folder would otherwise fail at the very end.
os.makedirs(base_path, exist_ok=True)

output_path = os.path.join(base_path, 'Output.csv')
generate_and_save_recipes(number_of_recipes, output_path)

print("Recipes generated and saved successfully.")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 23%|██▎       | 173/768 [00:03<00:11, 49.93it/s]

**Loading and displaying the saved CSV files that contain all the Novel Recipes Generated**

In [None]:
directory = '/home/hiren20066/BTP_Dev/Dataset_Project/recipes'

# Print every CSV found in the recipes folder.
# The frame is bound to its own name rather than `df`: `df` is the PMF/CDF table
# that takeRandomInput reads, and rebinding it here would break later generation runs.
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        file_path = os.path.join(directory, filename)
        saved_recipes = pd.read_csv(file_path)
        print(f"Contents of {filename}:\n", saved_recipes, "\n\n")

Contents of Trial1INDIAN SUBCONTINENT.csv:
                                   Random Ingredients  \
0  lemon,sun tomato,pasta,salt black pepper,ancho...   
1                               radish,caraway seed;   
2  pepperoni,green chile pepper,orange peel,butte...   
3  phyllo dough,sultana,chicken stock,broth,corn ...   
4  salsa,ketchup,pineapple tidbit,orange juice co...   
5  yellow pepper,cilantro leaf,water artichoke he...   
6                  rice vinegar,milk chocolate chip;   
7                                      rosemary,tea;   
8  chili sauce,creme fraiche,anchovy,apricot pres...   
9              ice water,parsley sprig,white pepper;   

                                        Recipe Title  \
0                          Sun-Dried Tomato Pasta \n   
1                           Radish-Wrapped Radish \n   
2                              Spicy Spicy Coffee \n   
3                     Makhani Chicken and Sultana \n   
4                Easy Pineapple & Pineapple Chili \n   
5  