In [1]:
# Prints a fixed greeting; presumably a leftover kernel smoke test — confirm before removing.
print("hello")

hello


In [1]:
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load dataset
# Only these two columns are ever used, so skip the rest at parse time instead
# of materialising every CSV column and discarding it afterwards.
TEXT_COLUMNS = ["ingredients", "directions"]
RANDOM_SEED = 42  # shared by the subsampling step and both splits

dataset = pd.read_csv("RecipeNLG_dataset.csv", usecols=TEXT_COLUMNS)
dataset = dataset[TEXT_COLUMNS].dropna()

# Keep a random 50% sample of the rows (i.e. drop the other half)
dataset = dataset.sample(frac=0.5, random_state=RANDOM_SEED).reset_index(drop=True)


# Preprocess data
def _normalize_text(column):
    """Lowercase a string column and strip every character outside ``[a-zA-Z0-9, ]``.

    Args:
        column: pandas Series of strings.

    Returns:
        A new Series with the cleaned text; the input Series is not modified.
    """
    return column.str.lower().str.replace(r"[^a-zA-Z0-9, ]", "", regex=True)


for _column_name in TEXT_COLUMNS:
    dataset[_column_name] = _normalize_text(dataset[_column_name])

# Task prefixes prepended to every model input / target
dataset["input_text"] = "Ingredients: " + dataset["ingredients"]
dataset["target_text"] = "Directions: " + dataset["directions"]

# Split data into train (80%), validation (10%), and test (10%):
# first hold out 20%, then cut that hold-out in half.
train_data, temp_data = train_test_split(dataset, test_size=0.2, random_state=RANDOM_SEED)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=RANDOM_SEED)

# Define dataset class
class RecipeDataset(Dataset):
    """Map-style dataset that tokenizes (input_text, target_text) pairs on access.

    Args:
        data: Table-like object whose ``"input_text"`` and ``"target_text"``
            entries support ``.tolist()`` (e.g. a pandas DataFrame).
        tokenizer: Optional tokenizer callable. When omitted, the module-level
            ``tokenizer`` is looked up each time an item is fetched, so it may
            be defined after the dataset is constructed.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    # Label value that marks positions the loss should skip.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer=None, max_length=128):
        self.input_texts = data["input_text"].tolist()
        self.target_texts = data["target_text"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of text pairs."""
        return len(self.input_texts)

    def _encode(self, tok, text):
        """Tokenize one string to a fixed-length encoding with a leading batch dim of 1."""
        return tok(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for item ``idx``."""
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        input_enc = self._encode(tok, self.input_texts[idx])
        target_enc = self._encode(tok, self.target_texts[idx])

        # Clone so the in-place masking below cannot touch the tokenizer's own output.
        labels = target_enc["input_ids"].squeeze(0).clone()
        pad_id = getattr(tok, "pad_token_id", None)
        if pad_id is not None:
            # Padding positions must not contribute to the training loss.
            labels[labels == pad_id] = self.IGNORE_INDEX

        return {
            # squeeze(0) drops only the batch dim added by return_tensors="pt"
            "input_ids": input_enc["input_ids"].squeeze(0),
            "attention_mask": input_enc["attention_mask"].squeeze(0),
            "labels": labels,
        }

# Load tokenizer and dataset
tokenizer = T5Tokenizer.from_pretrained("t5-small")
train_dataset = RecipeDataset(train_data)
val_dataset = RecipeDataset(val_data)
test_dataset = RecipeDataset(test_data)

# Define dataloaders (only the training split is shuffled)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Define optimizer & scheduler
# num_epochs is defined before the scheduler so the linear decay covers exactly
# the number of optimizer steps the loop below performs.
num_epochs = 3
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * num_epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
best_val_loss = float("inf")
best_model_path = "best_t5_model.pth"

for epoch in range(num_epochs):
    # ---- train for one epoch ----
    model.train()
    total_train_loss = 0
    # The description is constant within an epoch, so set it once here.
    loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch+1}")

    for batch in loop:
        batch = {key: val.to(device) for key, val in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # scheduler advances per optimizer step, not per epoch

        step_loss = loss.item()
        total_train_loss += step_loss
        loop.set_postfix(loss=step_loss)

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f}")

    # ---- validate ----
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**batch)
            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Epoch {epoch+1} | Validation Loss: {avg_val_loss:.4f}")

    # Keep only the weights with the lowest validation loss seen so far.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), best_model_path)

# Restore the best checkpoint so later evaluation/generation does not silently
# use the last epoch's weights. Skipped if no epoch ever produced a checkpoint.
if best_val_loss < float("inf"):
    model.load_state_dict(torch.load(best_model_path, map_location=device))

print("✅ Training Completed!")

# Evaluate on test dataset
def evaluate_model(loader=None, split_name="Test"):
    """Compute, print and return the mean per-batch loss of ``model`` over a loader.

    Uses the module-level ``model`` and ``device``.

    Args:
        loader: Iterable of batches (dicts of tensors) that also supports
            ``len()``. Defaults to the module-level ``test_loader``.
        split_name: Label used in the printed line (``"<split_name> Loss: ..."``).

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.
    """
    if loader is None:
        loader = test_loader

    model.eval()
    total_test_loss = 0
    with torch.no_grad():
        for batch in loader:
            batch = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**batch)
            total_test_loss += outputs.loss.item()

    avg_test_loss = total_test_loss / len(loader)
    print(f"{split_name} Loss: {avg_test_loss:.4f}")
    return avg_test_loss

# Print the loss over test_loader using the model's current weights.
evaluate_model()

# Recipe generation function
def generate_recipe(ingredients, max_length=150, max_input_length=128):
    """Generate recipe text for an ingredient string with the module-level model.

    The ingredient text is lowercased and stripped of every character outside
    ``[a-zA-Z0-9, ]`` before the ``"Ingredients: "`` prefix is added, and the
    encoded prompt is truncated to ``max_input_length`` tokens, so inference
    inputs are prepared the same way as the training inputs.

    Args:
        ingredients: Ingredient list as a single string (e.g. comma-separated).
        max_length: ``max_length`` passed to ``model.generate``.
        max_input_length: Token limit applied to the encoded prompt.

    Returns:
        str: The decoded generation with special tokens removed.
    """
    model.eval()
    cleaned = re.sub(r"[^a-zA-Z0-9, ]", "", ingredients.lower())
    input_text = "Ingredients: " + cleaned
    input_enc = tokenizer(
        input_text,
        truncation=True,
        max_length=max_input_length,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        output = model.generate(**input_enc, max_length=max_length)

    # generate() returns a batch; only one prompt was submitted
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test recipe generation
# Sample ingredient string fed to generate_recipe; the returned text is printed.
ingredients = "chicken, garlic, onions, tomatoes, salt, pepper"
recipe = generate_recipe(ingredients)
print("Generated Recipe:", recipe)





tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  0%|          | 0/111557 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1:   0%|          | 21/111557 [02:36<231:12:10,  7.46s/it, loss=5.74]


KeyboardInterrupt: 

In [None]:
#Evaluation & Accuracy Calculation
# Token-level accuracy over val_loader. Predictions are the argmax of the logits
# from a single forward pass with labels supplied (not from model.generate()).
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for batch in val_loader:
        batch = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**batch)

        # Get predictions
        predictions = torch.argmax(outputs.logits, dim=-1)
        labels = batch["labels"]

        # Score only real target tokens: skip padding ids and the -100 ignore
        # marker, otherwise padded positions distort the metric.
        scored = (labels != tokenizer.pad_token_id) & (labels != -100)
        correct += ((predictions == labels) & scored).sum().item()
        # Denominator counts tokens, matching the token-level numerator.
        total += scored.sum().item()

# Avoid ZeroDivisionError when the loader is empty or contains only padding
accuracy = correct / total if total else 0.0
print(f"Validation Accuracy: {accuracy * 100:.2f}%")