In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Load the dataset
dataset = pd.read_csv("RecipeNLG_dataset.csv")

# Keep only 10,000 samples (to reduce training time)
dataset = dataset.sample(n=10000, random_state=42).reset_index(drop=True)

# Keep only relevant columns. Rows missing either field are dropped after
# sampling, so the final row count can be slightly below 10,000.
dataset = dataset[['ingredients', 'directions']].dropna()

# Normalise both text columns: lowercase, then strip special characters.
# '.', '/' and '-' are deliberately kept: deleting them turned "1/2" into "12",
# "1.5" into "15" and "2-quart" into "2quart", silently corrupting quantities.
# (Uppercase ranges are unnecessary because the text is lowercased first.)
disallowed_chars = r"[^a-z0-9,./\- ]"
for column in ("ingredients", "directions"):
    dataset[column] = (
        dataset[column]
        .str.lower()
        .str.replace(disallowed_chars, "", regex=True)
    )

# Format dataset for text generation
dataset["input_text"] = "Ingredients: " + dataset["ingredients"]
dataset["target_text"] = "Directions: " + dataset["directions"]

# Train-validation split (80/20, fixed seed for reproducibility)
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Save cleaned datasets
train_data.to_csv("train_data.csv", index=False)
val_data.to_csv("val_data.csv", index=False)

print("✅ Data Preprocessing Done!")

✅ Data Preprocessing Done!


In [2]:
from torch.utils.data import Dataset
from transformers import T5Tokenizer

# Instantiate the tokenizer that belongs to the "t5-small" checkpoint
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")

class RecipeDataset(Dataset):
    """Map-style dataset that tokenizes (input_text -> target_text) pairs on access.

    Args:
        data: Table-like object whose ``"input_text"`` and ``"target_text"``
            columns expose ``.tolist()`` (e.g. a pandas DataFrame).
        tokenizer: Callable tokenizer to use. When ``None`` (the default) the
            module-level ``tokenizer`` is looked up each time an item is
            fetched, which matches the previous behaviour.
        max_input_length: Fixed length inputs are padded/truncated to.
        max_target_length: Fixed length targets are padded/truncated to.
    """

    def __init__(self, data, tokenizer=None, max_input_length=32, max_target_length=32):
        self.input_texts = data["input_text"].tolist()
        self.target_texts = data["target_text"].tolist()
        self._tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_texts)

    def _encode(self, text, max_length):
        """Tokenize one string to a fixed-length, batch-of-one PyTorch encoding."""
        # Fall back to the module-level tokenizer when none was injected.
        active_tokenizer = self._tokenizer if self._tokenizer is not None else tokenizer
        return active_tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for row ``idx``."""
        input_enc = self._encode(self.input_texts[idx], self.max_input_length)
        target_enc = self._encode(self.target_texts[idx], self.max_target_length)

        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse the sequence axis whenever max_length is 1.
        # NOTE: "labels" still contains pad token ids; mask them (e.g. to -100)
        # before computing a loss if padding should not contribute to it.
        return {
            "input_ids": input_enc["input_ids"].squeeze(0),
            "attention_mask": input_enc["attention_mask"].squeeze(0),
            "labels": target_enc["input_ids"].squeeze(0)
        }

# Wrap each split in a RecipeDataset (training split first, then validation)
train_dataset, val_dataset = (
    RecipeDataset(split) for split in (train_data, val_data)
)

print("✅ Dataset Created!")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


✅ Dataset Created!


In [3]:
from torch.utils.data import DataLoader

# Batch both splits; only the training loader is shuffled
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=2,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
    shuffle=False,
    num_workers=2,
)

print("✅ Data Loaders Created!")

✅ Data Loaders Created!


In [4]:
from transformers import T5ForConditionalGeneration
import torch

# Use the GPU when torch reports one as available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained "t5-small" weights, then move the module onto `device`
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model = model.to(device)

print("✅ T5 Model Loaded!")


✅ T5 Model Loaded!


In [6]:
from torch.optim import AdamW
from transformers import get_scheduler

# AdamW over every model parameter with a 5e-5 learning rate
optimizer = AdamW(params=model.parameters(), lr=5e-5)

# Linear schedule without warmup, sized for the whole run
num_training_steps = 3 * len(train_loader)  # Assuming 3 epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

print("✅ Optimizer and Scheduler Set Up!")

✅ Optimizer and Scheduler Set Up!


In [None]:
from tqdm import tqdm

# Training loop
num_epochs = 3
best_val_loss = float("inf")

# Label value excluded from the model's loss computation.
IGNORE_INDEX = -100


def _prepare_batch(batch):
    """Move a batch to `device` and exclude padded label positions from the loss.

    The labels arrive padded to a fixed length with the pad token id. Left
    as-is, the model is also trained (and validated) on predicting padding,
    which distorts both the gradients and the reported loss.
    """
    batch = {key: val.to(device) for key, val in batch.items()}
    pad_id = model.config.pad_token_id
    if pad_id is not None:
        labels = batch["labels"]
        batch["labels"] = labels.masked_fill(labels == pad_id, IGNORE_INDEX)
    return batch


for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    # The description is constant within an epoch, so set it once up front.
    loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch+1}")

    for batch in loop:
        batch = _prepare_batch(batch)

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Read the scalar once; each .item() call forces a device sync.
        step_loss = loss.item()
        total_train_loss += step_loss
        loop.set_postfix(loss=step_loss)

    # Report the epoch's mean training loss (previously accumulated but unused).
    avg_train_loss = total_train_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f}")

    # Validation phase
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = _prepare_batch(batch)
            outputs = model(**batch)
            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Epoch {epoch+1} | Validation Loss: {avg_val_loss:.4f}")

    # Save best model (lowest mean validation loss seen so far)
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_t5_model.pth")

print("✅ Training Completed!")

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
from nltk.translate.bleu_score import sentence_bleu

def evaluate_bleu(model, dataloader, tokenizer):
    """Return the mean word-level sentence BLEU of generated text vs. reference labels.

    Args:
        model: Sequence-to-sequence model exposing ``eval()`` and ``generate()``.
        dataloader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        tokenizer: Tokenizer providing ``batch_decode`` and ``pad_token_id``.

    Returns:
        The average of per-example ``sentence_bleu`` scores, or ``0.0`` when
        the dataloader yields no examples.
    """
    model.eval()
    references = []
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # Pass the attention mask so padded input positions are ignored;
            # without it the encoder attends to the padding as well.
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=32,
            )

            # Labels may carry -100 at ignored positions, which cannot be
            # decoded; map those back to the pad id (skipped as a special token).
            labels = batch["labels"]
            labels = labels.masked_fill(labels == -100, tokenizer.pad_token_id)

            predictions.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
            references.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))

    if not references:
        return 0.0

    # sentence_bleu expects token lists. Raw strings are iterated character by
    # character, which yields a character-level score instead of a word-level one.
    total_bleu = sum(
        sentence_bleu([ref.split()], pred.split())
        for ref, pred in zip(references, predictions)
    )
    return total_bleu / len(references)

# Score both splits with the same routine: training first, then validation
train_bleu, val_bleu = (
    evaluate_bleu(model, loader, tokenizer)
    for loader in (train_loader, val_loader)
)
print(f"Train BLEU: {train_bleu:.4f}, Val BLEU: {val_bleu:.4f}")

In [None]:
def generate_recipe(ingredients, max_length=32):
    """Generate recipe directions for a comma-separated ingredient string.

    Args:
        ingredients: Ingredient list as a single string, e.g. ``"eggs, flour"``.
        max_length: Maximum length passed to ``model.generate`` (default 32,
            the value that was previously hard-coded).

    Returns:
        The decoded model output with special tokens removed.

    Raises:
        TypeError: If ``ingredients`` is not a string.
        ValueError: If ``ingredients`` is empty or only whitespace.
    """
    if not isinstance(ingredients, str):
        raise TypeError("ingredients must be a string")
    if not ingredients.strip():
        raise ValueError("ingredients must not be empty")

    model.eval()

    # Format input. Training inputs were lowercased before the prefix was
    # added, so lowercase here too to keep inference consistent with training.
    input_text = "Ingredients: " + ingredients.strip().lower()
    input_enc = tokenizer(input_text, return_tensors="pt").to(device)

    # Generate output (no gradients needed at inference time)
    with torch.no_grad():
        output = model.generate(**input_enc, max_length=max_length)

    # Decode the first (and only) generated sequence
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Smoke test: generate directions for a hand-written ingredient list
ingredients = "chicken, garlic, onions, tomatoes, salt, pepper"
recipe = generate_recipe(ingredients=ingredients)
print("Generated Recipe:", recipe)