In [2]:
print("hello")

hello


In [3]:
import pandas as pd
import re

# Load dataset
dataset = pd.read_csv("RecipeNLG_dataset.csv")[['ingredients', 'directions']].dropna()

# Keep only 100,000 samples
dataset = dataset.sample(n=50000, random_state=42).reset_index(drop=True)


# Preprocess data
dataset["ingredients"] = dataset["ingredients"].str.lower()
dataset["directions"] = dataset["directions"].str.lower()
dataset["ingredients"] = dataset["ingredients"].apply(lambda x: re.sub(r"[^a-zA-Z0-9, ]", "", x))
dataset["directions"] = dataset["directions"].apply(lambda x: re.sub(r"[^a-zA-Z0-9, ]", "", x))

dataset["input_text"] = "Ingredients: " + dataset["ingredients"]
dataset["target_text"] = "Directions: " + dataset["directions"]

# Display the first few rows
print(dataset.head())


                                         ingredients  \
0  1 12 pound flank steak, 12 c finely minced gre...   
1  1 tablespoon rosemary, 1 teaspoon thyme, 3 bay...   
2  3 to 4 carrots, 1 12 tbsp butter, 13 c brown s...   
3  45 cups flour, 15 tsp salt, pinch baking powde...   
4  2 c crushed small thin pretzels sticks, 34 c m...   

                                          directions  \
0  remove tenderloin from steak, score meat, comb...   
1  combine all ingredients in slow cooker 6 quart...   
2  cook 3 to 4 carrots cut crosswise in 1inch pie...   
3  mix all dry ingredients in a bowl, , add crisc...   
4  mix and press in baking pan, approximately 13 ...   

                                          input_text  \
0  Ingredients: 1 12 pound flank steak, 12 c fine...   
1  Ingredients: 1 tablespoon rosemary, 1 teaspoon...   
2  Ingredients: 3 to 4 carrots, 1 12 tbsp butter,...   
3  Ingredients: 45 cups flour, 15 tsp salt, pinch...   
4  Ingredients: 2 c crushed small thin pretzel

In [4]:
from sklearn.model_selection import train_test_split

# Train-validation-test split (80% train, 10% val, 10% test)
train_data, temp_data = train_test_split(dataset, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Print dataset sizes
print(f"Train size: {len(train_data)}, Validation size: {len(val_data)}, Test size: {len(test_data)}")


Train size: 40000, Validation size: 5000, Test size: 5000


In [5]:
import torch
from torch.utils.data import Dataset

class RecipeDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=64):
        self.input_texts = data["input_text"].tolist()
        self.target_texts = data["target_text"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.input_texts)
    
    def __getitem__(self, idx):
        input_enc = self.tokenizer(
            self.input_texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        target_enc = self.tokenizer(
            self.target_texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        return {
            "input_ids": input_enc["input_ids"].squeeze(),
            "attention_mask": input_enc["attention_mask"].squeeze(),
            "labels": target_enc["input_ids"].squeeze()
        }


In [6]:
from transformers import T5Tokenizer

# Load the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Create dataset objects
train_dataset = RecipeDataset(train_data, tokenizer)
val_dataset = RecipeDataset(val_data, tokenizer)
test_dataset = RecipeDataset(test_data, tokenizer)

# Print sample input-output
print("Sample Input:", train_dataset[0]["input_ids"][:10])  # First 10 tokens of input
print("Sample Target:", train_dataset[0]["labels"][:10])    # First 10 tokens of target


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Sample Input: tensor([23482,     7,    10,   220,  1669,   158, 14693,     6,   158,   400])
Sample Target: tensor([19436,     7,    10, 14514,     8,   158,  1836, 18647,   859,   662])


In [7]:
from torch.utils.data import DataLoader

# Define batch size
batch_size = 16

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Print batch sizes to confirm
print(f"Train batches: {len(train_loader)}, Validation batches: {len(val_loader)}, Test batches: {len(test_loader)}")


Train batches: 2500, Validation batches: 313, Test batches: 313


In [8]:
import torch
from transformers import T5ForConditionalGeneration, get_scheduler
from torch.optim import AdamW
from torch.utils.data import DataLoader  # Replace with actual dataset

# ✅ Check for CPU
device = torch.device("cpu")
print(f"Using device: {device}")

# ✅ Load T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# ✅ Set dropout rates
model.config.dropout_rate = 0.3  # Default is 0.1, increased for better regularization
model.config.attention_dropout = 0.3

# ✅ Move model to CPU
model.to(device)

# ✅ Define DataLoader (Replace with actual dataset)
train_loader = DataLoader([])  # Replace with actual dataset
val_loader = DataLoader([])    # Replace with actual dataset

# ✅ Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# ✅ Define learning rate scheduler
num_epochs = 3
num_training_steps = max(len(train_loader) * num_epochs, 1)  # Prevents division by zero
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

print("✅ Model, Optimizer, Scheduler, and Data Loaders initialized!")



Using device: cpu
✅ Model, Optimizer, Scheduler, and Data Loaders initialized!


In [9]:
import torch
from transformers import T5ForConditionalGeneration, get_scheduler
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm

# ✅ Check for CPU (No GPU usage)
device = torch.device("cpu")
print(f"Using device: {device}")

# ✅ Load T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# ✅ Set dropout rates
model.config.dropout_rate = 0.3  # Default is 0.1, increase if needed
model.config.attention_dropout = 0.3

# ✅ Move model to CPU
model.to(device)

# ✅ Define dummy train_loader (Replace with actual data)
train_loader = DataLoader([])  # Replace with your dataset
val_loader = DataLoader([])  # Replace with actual validation data

# ✅ Define optimizer (using torch's AdamW)
optimizer = AdamW(model.parameters(), lr=5e-5)

# ✅ Define learning rate scheduler
num_epochs = 3
num_training_steps = max(len(train_loader) * num_epochs, 1)  # Prevents division by zero
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

print("✅ Model, Optimizer, and Scheduler initialized!")

# === Training Loop ===
best_val_loss = float("inf")

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}", total=len(train_loader), leave=True)

    for batch in loop:
        batch = {k: v.to(device, dtype=torch.long) for k, v in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()

        # ✅ Gradient Clipping (Prevents Exploding Gradients)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        total_train_loss += loss.item()

        # ✅ Show loss in tqdm
        loop.set_postfix(train_loss=loss.item())

    # ✅ Move scheduler step **outside** batch loop (once per epoch)
    lr_scheduler.step()

    # === Validation Loop ===
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        val_loop = tqdm(val_loader, desc=f"Validating Epoch {epoch+1}", total=len(val_loader), leave=True)
        for batch in val_loop:
            batch = {k: v.to(device, dtype=torch.long) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss.item()
            total_val_loss += loss

            val_loop.set_postfix(val_loss=loss)

    # === Logging Loss ===
    avg_train_loss = total_train_loss / max(len(train_loader), 1)  # Avoid division by zero
    avg_val_loss = total_val_loss / max(len(val_loader), 1)
    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Validation Loss: {avg_val_loss:.4f}")

    # ✅ Save Best Model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_t5_model.pth")

print("✅ Training Completed!")


Using device: cpu
✅ Model, Optimizer, and Scheduler initialized!


Epoch 1: 0it [00:00, ?it/s]
Validating Epoch 1: 0it [00:00, ?it/s]

Epoch 1 | Train Loss: 0.0000 | Validation Loss: 0.0000



Epoch 2: 0it [00:00, ?it/s]
Validating Epoch 2: 0it [00:00, ?it/s]


Epoch 2 | Train Loss: 0.0000 | Validation Loss: 0.0000


Epoch 3: 0it [00:00, ?it/s]
Validating Epoch 3: 0it [00:00, ?it/s]

Epoch 3 | Train Loss: 0.0000 | Validation Loss: 0.0000
✅ Training Completed!





In [10]:
# Model evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    total_test_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            batch = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**batch)
            total_test_loss += outputs.loss.item()
    
    avg_test_loss = total_test_loss / len(test_loader)
    print(f"Test Loss: {avg_test_loss:.4f}")

# Evaluate model on test set
evaluate_model(model, test_loader)


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Test Loss: 7.5454


In [11]:
# Recipe generation function
def generate_recipe(ingredients):
    model.eval()
    input_text = "Ingredients: " + ingredients
    input_enc = tokenizer(input_text, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(**input_enc, max_length=150)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test the recipe generation function
sample_ingredients = test_data.iloc[0]["ingredients"]
generated_recipe = generate_recipe(sample_ingredients)

print(f"Sample Ingredients: {sample_ingredients}")
print(f"Generated Recipe: {generated_recipe}")


Sample Ingredients: 1 c selfrising flour, 1 c plain flour, 1 teaspoon baking soda, 2 large eggs, 12 stick butter, 2 2 onions, minced, 4 tbsp sugar, healthy pinch of salt, 1 stick celery, minced, add in beer till thick
Generated Recipe: : 1 c selfrising flour, 1 c plain flour, 1 teaspoon baking soda, 2 large eggs, 12 stick butter, 2 2 onions, minced, 4 tbsp sugar, healthy pinch of salt, 1 stick celery, minced, add in beer till thick, 1 c selfrising flour, 1 c plain flour, 1 teaspoon baking soda, 2 large eggs, 12 stick butter, 2 2 onions, minced, 4 tbsp sugar, healthy pinch of salt, 1 stick cele
