In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    runs_on_gpu = True
else:
    device = torch.device('cpu')
    runs_on_gpu = False
print(f"Runs on {device.type}")

Runs on cuda


In [3]:
data = pd.read_json('preprocessed_recipes.json')
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20100 entries, 0 to 20099
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   directions   20100 non-null  object        
 1   fat          15901 non-null  float64       
 2   date         20100 non-null  datetime64[ns]
 3   categories   20100 non-null  object        
 4   calories     15969 non-null  float64       
 5   desc         20100 non-null  object        
 6   protein      15922 non-null  float64       
 7   rating       20100 non-null  float64       
 8   title        20100 non-null  object        
 9   ingredients  20100 non-null  object        
 10  sodium       15967 non-null  float64       
dtypes: datetime64[ns](1), float64(5), object(5)
memory usage: 1.8+ MB


In [4]:
data.head()

Unnamed: 0,directions,fat,date,categories,calories,desc,protein,rating,title,ingredients,sodium
0,"1. Place the stock, lentils, celery, carrot, t...",7.0,2006-09-01 04:00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",426.0,,30.0,2.5,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",559.0
1,Combine first 9 ingredients in heavy medium sa...,23.0,2004-08-20 04:00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",403.0,This uses the same ingredients found in boudin...,18.0,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",1439.0
2,In a large heavy saucepan cook diced fennel an...,7.0,2004-08-20 04:00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",165.0,,6.0,3.75,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",165.0
3,Heat oil in heavy large skillet over medium-hi...,,2009-03-27 04:00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",,The Sicilian-style tomato sauce has tons of Me...,,5.0,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",
4,Preheat oven to 350°F. Lightly grease 8x8x2-in...,32.0,2004-08-20 04:00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",547.0,,20.0,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",452.0


## 4
### Extension
Generate new recipes based on random ingredients

In [5]:
number_of_ingredients = np.random.randint(4, 8)
random_ingredients_list = []
for i in range(number_of_ingredients):
    ingredients = data['ingredients'].sample().iloc[0]
    if len(ingredients) == 0:
        continue
    random_ingredient = ingredients[np.random.randint(0, len(ingredients))]
    random_ingredients_list.append(random_ingredient)

random_ingredients = ', '.join(random_ingredients_list)
print(random_ingredients_list)

['3 tablespoons sour cream', '1 cup finely diced cucumber', '2 tablespoons vegetable oil, divided', '2 tablespoons unsalted butter, plus more', '2 tablespoons cider vinegar', '1 14 1/2-ounce can Mexican-style stewed tomatoes', '3 tablespoons olive oil']


### 4.1
Fine tune transformer to give directions for random ingredients

Tutorial from: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb

In [17]:
from transformers import AutoTokenizer

In [18]:
X = data['ingredients'].apply(lambda x: ', '.join(x)).tolist()
y = data['directions'].tolist()

In [19]:
tokenizer = AutoTokenizer.from_pretrained('t5-small')  # You can choose other models like `t5-base`, `gpt-2`, etc.

def preprocess_function(X, y):
    model_inputs = tokenizer(X, max_length=512, padding='max_length', truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(y, max_length=512, padding='max_length', truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_datasets = list(map(preprocess_function, X, y))



In [20]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(tokenized_datasets, test_size=0.2)

In [21]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [24]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',
    #evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=3,  # You may need to adjust the number of epochs
    weight_decay=0.01,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train
)

trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,2.6268
1000,1.3853
1500,1.2876
2000,1.2653
2500,1.2126
3000,1.234
3500,1.2077
4000,1.2035
4500,1.1784
5000,1.1869


TrainOutput(global_step=6030, training_loss=1.3435772840458757, metrics={'train_runtime': 4192.2562, 'train_samples_per_second': 11.507, 'train_steps_per_second': 1.438, 'total_flos': 6528888503009280.0, 'train_loss': 1.3435772840458757, 'epoch': 3.0})

In [None]:
trainer.evaluate()
model.save_pretrained('recipe-directions-model')
tokenizer.save_pretrained('recipe-directions-tokenizer')

In [22]:
model.eval()

ingredients = "1 cup flour, 2 eggs, 1/2 cup milk"
inputs = tokenizer(ingredients, return_tensors='pt').input_ids

outputs = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
print("Generated Recipe Directions:", tokenizer.decode(outputs[0], skip_special_tokens=True))

Generated Recipe Directions: 1 cup flour, 2 eggs, 1/2 cup milk


### 4.2
Compare to directions and title made by groq

In [None]:
from groq import Groq
import os
from dotenv import load_dotenv
import json

In [None]:
load_dotenv()

API_KEY = os.getenv('API_KEY')

client = Groq(
    api_key=API_KEY
)

prompt = f'I have following ingredients: {random_ingredients}. Give title and directions for a recipe. Your answer is a json-file wrapped in ```. The json looks like this: {{ "title" : "title", "directions" : ["direction1","direction2"] }}'

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model="llama3-8b-8192",
)
reply = chat_completion.choices[0].message.content
print(reply)

In [None]:
new_recipe = reply[reply.find('```')+3:reply.rfind('```')]
new_recipe = json.loads(new_recipe)
new_recipe['ingredients'] = random_ingredients_list

new_recipes = pd.read_json('new_recipes.json')

new_recipe = pd.DataFrame([new_recipe])
new_recipes = pd.concat([new_recipes, new_recipe], ignore_index=True)
new_recipes.to_json('new_recipes.json')

new_recipes.head()

In [None]:
new_recipe