In [21]:
import pandas as pd
import json

df = pd.read_csv("processed_recipes.csv", encoding='latin1')

df['inputs'] = df['Processed_Ingredients'].apply(lambda x: x.replace("'", "").replace("[", "").replace("]", ""))
df['outputs'] = df['Recipe Name'] + '\nIngredients: ' + df['Processed_Ingredients_Quantities'].apply(lambda x: x.replace("'", "").replace("[", "").replace("]", "")) + '\nDirections: ' + df['Directions']

df.head()
df[['inputs', 'outputs']].to_csv('processed_recipes_2.csv', index=False)

from datasets import load_dataset

data = load_dataset('csv', data_files='processed_recipes_2.csv')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [22]:
data

DatasetDict({
    train: Dataset({
        features: ['inputs', 'outputs'],
        num_rows: 6411
    })
})

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from torch.utils.data import TensorDataset
import torch

# Initialize the tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenize the 'Processed_Ingredients' and 'Directions' columns
input_text = ["translate ingredients to recipe: " + text for text in df['Processed_Ingredients'].tolist()]
input_encoding = tokenizer(input_text, padding=True, truncation=True)
target_encoding = tokenizer(df['Directions'].tolist(), padding=True, truncation=True)

# Convert tokenized inputs and targets to PyTorch tensors
input_ids = torch.tensor(input_encoding['input_ids'])
target_ids = torch.tensor(target_encoding['input_ids'])

from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, input_ids, target_ids):
        self.input_ids = input_ids
        self.target_ids = target_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        item = {
            'input_ids': self.input_ids[idx],
            'labels': self.target_ids[idx]
        }
        return item

# Initialize the CustomDataset
dataset = CustomDataset(input_ids, target_ids)

In [3]:
# Set up the training configurations
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Initialize the model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

 42%|████▏     | 500/1203 [27:55<37:56,  3.24s/it]  

{'loss': 1.8889, 'learning_rate': 5e-05, 'epoch': 1.25}


 83%|████████▎ | 1000/1203 [55:19<10:53,  3.22s/it]

{'loss': 0.971, 'learning_rate': 1.4438122332859174e-05, 'epoch': 2.49}


100%|██████████| 1203/1203 [1:07:38<00:00,  3.37s/it]

{'train_runtime': 4058.6306, 'train_samples_per_second': 4.739, 'train_steps_per_second': 0.296, 'train_loss': 1.3465254253282808, 'epoch': 3.0}





TrainOutput(global_step=1203, training_loss=1.3465254253282808, metrics={'train_runtime': 4058.6306, 'train_samples_per_second': 4.739, 'train_steps_per_second': 0.296, 'train_loss': 1.3465254253282808, 'epoch': 3.0})

In [6]:
# Save the model
model.save_pretrained("fine_tuned_model")

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import transformers
import torch

model = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model)

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

generation_config = GenerationConfig.from_dict(
    {
        "max_length": 200,
        "do_sample": True,
        "top_k": 10,
        "num_return_sequences": 1,
        "eos_token_id": tokenizer.eos_token_id,
    }
)

sequences = pipeline(
    f"Ingredients: {df['Processed_Ingredients'][10]}.\nInstructions:",
    generation_config=generation_config,
)

for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Result: Ingredients: egg, sugar, oil, zucchini, vanilla extract, flour, walnut, cinnamon, salt, baking soda, baking powder.
Instructions: 1. Wash and peel the zucchini.
2. Grate the zucchini (I use the biggest hole of the grater).
3. Mix all the ingredients in a bowl.
4. Preheat the oven to 360ºF.
5. Butter and dust with flour a 13 x 9 inch baking pan and bake the cake for 45 - 50 minutes.
6. Cool the cake for 10 minutes before slicing.
This is a perfect recipe for the zucchini lovers! I made it a couple of times and it always turned out perfect. I used to add chocolate chips, raisins, and other ingredients. The texture is soft and it's very easy to prepare.
I like this recipe, and I have made it a few times. I also like using it in muffins, and it


In [2]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, TrainingArguments, Trainer

# Preprocess data
df = pd.read_csv("processed_recipes.csv", encoding='latin1')
df['Directions'] = df['Recipe Name'] + '\n' + df['Directions']
df['Processed_Ingredients'] = df['Processed_Ingredients'].apply(lambda x: x.replace("'", "").replace("[", "").replace("]", ""))
df['full_text'] = "Ingredients: " + df['Processed_Ingredients'] + "\nInstructions: " + df['Directions']

# Save the full_text to a text file
with open("recipe_text.txt", "w") as f:
    for item in df['full_text']:
        f.write("%s\n" % item)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")

# Prepare the dataset
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="recipe_text.txt",
    block_size=128,
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./falcon7b_results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize the model
model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained("fine_tuned_falcon7b")

# Inference
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_falcon7b")
model = AutoModelForCausalLM.from_pretrained("fine_tuned_falcon7b")

# Generate recipe
input_text = f"Ingredients: {df['Processed_Ingredients'][10]}.\nInstructions:"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

output = model.generate(input_ids, max_length=200, num_return_sequences=1, top_k=10, eos_token_id=tokenizer.eos_token_id)

output_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Generated recipe:\n{output_text}")




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/295 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 284.00 MiB (GPU 0; 8.00 GiB total capacity; 37.64 GiB already allocated; 0 bytes free; 37.79 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF