# Importing Libraries and Reading Data

In [1]:
import pandas as pd
from  sklearn.model_selection import train_test_split
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')

In [2]:
data=pd.read_csv('/content/drive/MyDrive/unique_prompts_generated_recipes_v2.csv')

In [3]:
data.head(8)

Unnamed: 0,Prompt,Generated Recipe
0,Generate a dairy-free recipe for lunch with yo...,"Dish: Ingredients: yogurt, chickpeas, spinach,..."
1,Generate a dairy-free recipe for dinner with g...,"Dish: Ingredients: ginger, olive oil, tomato, ..."
2,Generate a vegetarian recipe for dinner with c...,"Dish: Ingredients: cucumber, potato, tofu, bre..."
3,Generate a dairy-free recipe for lunch with le...,"Dish: Ingredients: lentils, basil, spinach, on..."
4,Generate a vegetarian recipe for dinner with b...,"Dish: Ingredients: basil, lemongrass, pasta, b..."
5,Generate a vegetarian recipe for dinner with g...,"Dish: Ingredients: garlic, basil, bell peppers..."
6,Generate a vegetarian recipe for dinner with o...,"Dish: Ingredients: onion, eggplant, chickpeas,..."
7,Generate a dairy-free recipe for lunch with to...,"Dish: Ingredients: tofu, rice, chickpeas, pota..."


### Data Cleaning

In [4]:
def clean_text(text):
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # convert to lowercase
    text = text.lower()
    return text


In [5]:
data['cleaned_prompt']=data['Prompt'].apply(clean_text)
data['cleaned_recipe']=data['Generated Recipe'].apply(clean_text)



# Splitting Data

In [6]:
train_data, test_data= train_test_split(data, test_size=0.2, random_state=44, shuffle =True)

In [7]:
# pip install datasets

#Initializing Tokenizer and Model

In [8]:
# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')


# Tokenize the input prompts and generated recipes
train_encodings = tokenizer(list(train_data['cleaned_prompt']), truncation=True, padding=True, max_length=128)
train_labels = tokenizer(list(train_data['cleaned_recipe']), truncation=True, padding=True, max_length=128)

test_encodings = tokenizer(list(test_data['cleaned_prompt']), truncation=True, padding=True, max_length=128)
test_labels = tokenizer(list(test_data['cleaned_recipe']), truncation=True, padding=True, max_length=128)

# Create a custom dataset for use in the Trainer
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels['input_ids']
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels['input_ids']
})


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Setting Training Arguments

In [9]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_steps=1000,
    load_best_model_at_end=True,
    save_total_limit=3,
)


# Training the Model



In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer, )

In [11]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mahmedlotfyalt[0m ([33mahmedlotfyalt-github[0m). Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.2299,0.01582
2,0.0078,8.6e-05
3,0.0031,5.1e-05


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=900, training_loss=1.8617472116380103, metrics={'train_runtime': 103.9745, 'train_samples_per_second': 69.248, 'train_steps_per_second': 8.656, 'total_flos': 55194078412800.0, 'train_loss': 1.8617472116380103, 'epoch': 3.0})

#  Save the fine-tuned model

In [12]:
model.save_pretrained('./fine_tuned_t5_recipe_model')
tokenizer.save_pretrained('./fine_tuned_t5_recipe_model')

('./fine_tuned_t5_recipe_model/tokenizer_config.json',
 './fine_tuned_t5_recipe_model/special_tokens_map.json',
 './fine_tuned_t5_recipe_model/spiece.model',
 './fine_tuned_t5_recipe_model/added_tokens.json')

# recipe generation system

In [13]:
import torch

model_path = './fine_tuned_t5_recipe_model'
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)



def generate_recipe(prompt,model,tokenizer,max_length=160):
  prompt=clean_text(prompt)
  inputs=tokenizer(prompt,return_tensors='pt',max_length=max_length,truncation=True,padding='max_length')
  input_ids = inputs['input_ids'].to(device)
  attention_mask = inputs['attention_mask'].to(device)

  # Generate the recipe
  output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1)

  return tokenizer.decode(output[0], skip_special_tokens=True)


In [14]:
# Test the recipe generation
prompt = "Generate a vegetarian recipe for dinner with tomatoes and spinach"
generated_recipe = generate_recipe(prompt, model, tokenizer)
print(generated_recipe)

dish ingredients tomatoes and spinach instructions combine tomatoes and spinach add spinach cook thoroughly and serve hot


In [15]:
# Test the recipe generation with a different prompt
prompt = "Generate a vegan dessert recipe with chocolate and almonds"
generated_recipe = generate_recipe(prompt, model, tokenizer)
print(generated_recipe)

dish ingredients chocolate and almonds instructions combine chocolate and almonds cook thoroughly and serve hot


In [16]:
# Test the recipe generation with another prompt
prompt = "Generate a gluten-free recipe for breakfast with eggs and avocado"
generated_recipe = generate_recipe(prompt, model, tokenizer)
print(generated_recipe)

dish ingredients eggs and avocado instructions combine eggs and avocado add avocado cook thoroughly and serve hot


# Exporting the Fine-Tuned Model

In [17]:
import shutil
from google.colab import files

#spicify the path of your model directory
model_dir='/content/fine_tuned_t5_recipe_model'

shutil.make_archive('/content/fine_tuned_t5_recipe_model', 'zip', model_dir)
files.download('/content/fine_tuned_t5_recipe_model.zip')



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>