In [2]:
import pandas as pd
import numpy as np
import torch

In [3]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    runs_on_gpu = True
    from google.colab import drive
    drive.mount('/content/drive')
    directory = 'drive/MyDrive/'
    !pip install evaluate
    !pip install rouge_score
else:
    device = torch.device('cpu')
    runs_on_gpu = False
    directory = ''
print(f"Runs on {device.type}")

Runs on cpu


In [4]:
data = pd.read_json('preprocessed_recipes.json')
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20100 entries, 0 to 20099
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   directions         20100 non-null  object        
 1   date               20100 non-null  datetime64[ns]
 2   categories         20100 non-null  object        
 3   desc               20100 non-null  object        
 4   rating             20100 non-null  float64       
 5   title              20100 non-null  object        
 6   ingredients        20100 non-null  object        
 7   num_categories     20100 non-null  int64         
 8   num_ingredients    20100 non-null  int64         
 9   num_directions     20100 non-null  int64         
 10  numerical_columns  20100 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(3), object(5)
memory usage: 1.8+ MB


In [5]:
data.head()

Unnamed: 0,directions,date,categories,desc,rating,title,ingredients,num_categories,num_ingredients,num_directions,numerical_columns
0,"1. Place the stock, lentils, celery, carrot, t...",2006-09-01 04:00:00,"[Sandwich, Bean, Fruit, Tomato, turkey, Vegeta...",,2.5,"Lentil, Apple, and Turkey Wrap","[4 cups low-sodium vegetable or chicken stock,...",11,15,3,-0.034044
1,Combine first 9 ingredients in heavy medium sa...,2004-08-20 04:00:00,"[Food Processor, Onion, Pork, Bake, Bastille D...",This uses the same ingredients found in boudin...,4.375,Boudin Blanc Terrine with Red Onion Confit,"[1 1/2 cups whipping cream, 2 medium onions, c...",11,28,5,-0.033669
2,In a large heavy saucepan cook diced fennel an...,2004-08-20 04:00:00,"[Soup/Stew, Dairy, Potato, Vegetable, Fennel, ...",,3.75,Potato and Fennel Soup Hodge,"[1 fennel bulb (sometimes called anise), stalk...",7,6,2,-0.037781
3,Heat oil in heavy large skillet over medium-hi...,2009-03-27 04:00:00,"[Fish, Olive, Tomato, Sauté, Low Fat, Low Cal,...",The Sicilian-style tomato sauce has tons of Me...,5.0,Mahi-Mahi in Tomato Olive Sauce,"[2 tablespoons extra-virgin olive oil, 1 cup c...",17,10,2,-0.000106
4,Preheat oven to 350°F. Lightly grease 8x8x2-in...,2004-08-20 04:00:00,"[Cheese, Dairy, Pasta, Vegetable, Side, Bake, ...",,3.125,Spinach Noodle Casserole,"[1 12-ounce package frozen spinach soufflé, th...",11,6,1,-0.034548


## 4
### Extension
Generate new recipes based on random ingredients

In [6]:
# get 4 to 7 random recipes from the dataset and select a random ingredient from each recipe
number_of_ingredients = np.random.randint(4, 8)
random_ingredients_list = []
for i in range(number_of_ingredients):
    ingredients = data['ingredients'].sample().iloc[0]
    if len(ingredients) == 0:
        continue
    random_ingredient = ingredients[np.random.randint(0, len(ingredients))]
    random_ingredients_list.append(random_ingredient)

# take the random ingredients and create a string for later use
random_ingredients = ', '.join(random_ingredients_list)
print(random_ingredients_list)

['1 teaspoon fenugreek seeds', 'Corn, grapeseed, or other neutral oil as needed', 'Flaky sea salt', '4 garlic cloves, minced', '1 1/2 sticks (3/4 cup) unsalted butter, softened', '1 tablespoon golden brown sugar']


In [31]:
safe_state = ['Tangy Chocolate Frosting', 'Kosher salt', '1 jalapeño pepper', '2 green onions, thinly sliced', '1/2 cup chilled plain nonfat yogurt']
safe_state_string = ', '.join(safe_state)

### 4.1 Just a test
Fine tune transformer to give directions for random ingredients

In [32]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.model_selection import train_test_split
import nltk
import evaluate
import accelerate

In [33]:
tokenizer = AutoTokenizer.from_pretrained('t5-small')
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')
model.to(device) 

metric = evaluate.load("rouge")

Tutorial from: https://medium.com/nlplanet/a-full-guide-to-finetuning-t5-for-text2text-and-building-a-demo-with-streamlit-c72009631887

In [34]:
def preprocess_data(X, y):
  model_inputs = tokenizer(X, max_length=512, truncation=True)

  with tokenizer.as_target_tokenizer():
    labels = tokenizer(y, max_length=512, truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [35]:
if not runs_on_gpu:
    X = data['ingredients'].sample(5).apply(lambda x: ', '.join(x)).values.tolist()
    y = data['directions'].sample(5).apply(lambda x: ', '.join(x)).values.tolist()
else:
    X = data['ingredients'].apply(lambda x: ', '.join(x)).values.tolist()
    y = data['directions'].apply(lambda x: ', '.join(x)).values.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}
        return item

    def __len__(self):
        return len(self.data)

In [36]:
train = Dataset(preprocess_data(X_train, y_train))
test = Dataset(preprocess_data(X_test, y_test))



In [37]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    
    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [46]:
args = Seq2SeqTrainingArguments(
    output_dir=directory + "t5-base-directions",
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to=[],
)

data_collator = DataCollatorForSeq2Seq(tokenizer)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [39]:
accelerate.__version__

'1.2.1'

In [None]:
trainer.train()

In [23]:
inputs = tokenizer(random_ingredients, return_tensors='pt', max_length=512, truncation=True, padding='max_length').to(device)
outputs = model.generate(**inputs)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

minced, 1 cup water, 1 cup milk, 1 cup milk, 1 cup water,


### 4.2
Compare to directions and title made by groq

In [53]:
from groq import Groq
import os
from dotenv import load_dotenv
import json

In [55]:
# This is the prompt we use as input for the models
prompt = f'I have following ingredients: {safe_state}. Give title and directions for a recipe. Your answer is a json-file wrapped in ```. The json looks like this: {{ "title" : "title", "directions" : ["direction1","direction2"] }}'
prompt

'I have following ingredients: [\'Tangy Chocolate Frosting\', \'Kosher salt\', \'1 jalapeño pepper\', \'2 green onions, thinly sliced\', \'1/2 cup chilled plain nonfat yogurt\']. Give title and directions for a recipe. Your answer is a json-file wrapped in ```. The json looks like this: { "title" : "title", "directions" : ["direction1","direction2"] }'

In [56]:
load_dotenv()

API_KEY = os.getenv('API_KEY') # Get my API key from the .env file

client = Groq( #init the groq client
    api_key=API_KEY
)

chat_completion = client.chat.completions.create( #create and send a request to the groq api
    messages=[
        {
            "role": "user", #the role is user, because we send the prompt
            "content": prompt, #the prompt we defined earlier
        }
    ],
    model="llama3-8b-8192", #this is a opensource model, that is free to use
)
reply = chat_completion.choices[0].message.content
print(reply)

```
{
  "title" : "Spicy Yogurt Sauce",
  "directions" : [
    "Slice the jalapeño pepper thinly and set aside.",
    "In a small bowl, combine the yogurt, Kosher salt, and Tangy Chocolate Frosting. Mix until smooth.",
    "Add the sliced green onions and jalapeño pepper to the yogurt mixture. Stir until well combined.",
    "Refrigerate the sauce for at least 30 minutes to allow the flavors to meld.",
    "Serve chilled or at room temperature."
  ]
}
```


In [57]:
new_recipe = reply[reply.find('```')+3:reply.rfind('```')] #extract the json content from the reply
new_recipe = json.loads(new_recipe) 
new_recipe['ingredients'] = random_ingredients_list

new_recipes = pd.read_json('new_recipes.json')

new_recipe = pd.DataFrame([new_recipe])
new_recipes = pd.concat([new_recipes, new_recipe], ignore_index=True) # make it one dataframe
new_recipes.to_json('new_recipes.json')

new_recipes.head()

Unnamed: 0,title,directions,ingredients
0,Shrimp with Apricot Glaze and Salt,"[Preheat oven to 400°F (200°C)., In a small bo...","[2 tbsp/20 g finely chopped red onion, 3 table..."
1,Roasted Root Vegetables with Coriander and Salt,"[Preheat the oven to 425°F (220°C)., Peel the ...","[1 1/2 teaspoons kosher salt, 2 teaspoons grou..."
2,Roasted Bell Pepper and Apricot Soup,[Preheat the oven to 400°F (200°C). Place the ...,"1 cup tomato juice, 1/2 cup extra-virgin olive..."
3,Roasted Bell Pepper and Apricot Soup,[Preheat the oven to 400°F (200°C). Place the ...,"[1 cup tomato juice, 1/2 cup extra-virgin oliv..."
4,Spicy Yogurt Sauce,[Slice the jalapeño pepper thinly and set asid...,[1 baguette (preferably day-old; about 18 by 2...


In [58]:
new_recipe

Unnamed: 0,title,directions,ingredients
0,Spicy Yogurt Sauce,[Slice the jalapeño pepper thinly and set asid...,[1 baguette (preferably day-old; about 18 by 2...


# 4.1 Use transformer

In [60]:
import torch
from transformers import pipeline

token = "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"
pipe = pipeline("text-generation", "meta-llama/Llama-3.1-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto", token=token)
response = pipe(prompt, max_new_tokens=512)
print(response[0]['generated_text'][-1]['content'])


ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=0.26.0'`