In [None]:
import pandas as pd

dataset= pd.read_csv("RecipeNLG_dataset.csv")
dataset.head()
print(dataset.columns)  # Check available columns


Index(['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source',
       'NER'],
      dtype='object')


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split


# Keep only relevant columns
dataset = dataset[['ingredients', 'directions']].dropna()

# Convert to lowercase
dataset["ingredients"] = dataset["ingredients"].str.lower()
dataset["directions"] = dataset["directions"].str.lower()

# Remove special characters
dataset["ingredients"] = dataset["ingredients"].apply(lambda x: re.sub(r"[^a-zA-Z0-9, ]", "", x))
dataset["directions"] = dataset["directions"].apply(lambda x: re.sub(r"[^a-zA-Z0-9, ]", "", x))

# Format dataset for text generation
dataset["input_text"] = "Ingredients: " + dataset["ingredients"]
dataset["target_text"] = "Directions: " + dataset["directions"]

# Train-validation split
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Save cleaned datasets
train_data.to_csv("train_data.csv", index=False)
val_data.to_csv("val_data.csv", index=False)

print("âœ… Data Preprocessing Done!")


Unique Labels: [0]


In [None]:
import pandas as pd

def safe_join(lst):
    return " ".join(lst) if isinstance(lst, list) else ""

# Convert ingredients and directions into single strings
train_data["ingredients"] = train_data["ingredients"].apply(safe_join)
train_data["directions"] = train_data["directions"].apply(safe_join)

# Convert ingredients and directions into single strings for validation data
val_data["ingredients"] = val_data["ingredients"].apply(safe_join)
val_data["directions"] = val_data["directions"].apply(safe_join)

# Convert ingredients and directions into a single text per recipe
train_texts = train_data.apply(lambda x: f"{x['ingredients']} {x['directions']}", axis=1).tolist()
val_texts = val_data.apply(lambda x: f"{x['ingredients']} {x['directions']}", axis=1).tolist()


overlap = set(train_texts) & set(val_texts)
print(f"Tokenized Text Overlap: {len(overlap)}")  


print("Text processing completed! âœ…")

Text processing completed! âœ…


In [None]:
from transformers import BertTokenizer

# Load TinyBERT tokenizer
tokenizer = BertTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")

# Tokenize train texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=64)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=64)

print("Tokenization completed!")


Tokenization completed!


In [None]:
from transformers import BertForSequenceClassification
import torch
import torch.nn as nn

# Load TinyBERT for classification
model = BertForSequenceClassification.from_pretrained(
    "huawei-noah/TinyBERT_General_4L_312D",
    num_labels=10,  
    hidden_dropout_prob=0.5,  # Increase dropout
    attention_probs_dropout_prob=0.5  # Dropout for attention layers
)

# Freeze some TinyBERT layers to reduce overfitting
for param in model.bert.parameters():
    param.requires_grad = False  # Freeze base TinyBERT layers

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("TinyBERT model with regularization loaded! ðŸš€")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TinyBERT model with regularization loaded! ðŸš€


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader

# Load T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

class RecipeDataset(Dataset):
    def __init__(self, data):
        self.input_texts = data["input_text"].tolist()
        self.target_texts = data["target_text"].tolist()

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_enc = tokenizer(self.input_texts[idx], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        target_enc = tokenizer(self.target_texts[idx], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        
        return {
            "input_ids": input_enc["input_ids"].squeeze(),
            "attention_mask": input_enc["attention_mask"].squeeze(),
            "labels": target_enc["input_ids"].squeeze()
        }

# Load dataset
train_dataset = RecipeDataset(train_data)
val_dataset = RecipeDataset(val_data)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

print("âœ… Data Loaded for Training!")


Dataset ready!


In [None]:
from torch.utils.data import DataLoader

# Define DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

print("Dataloader ready!")


In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

# Define loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Learning rate scheduler
num_training_steps = len(train_loader) * 3  # Assuming 3 epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

print("Loss function & optimizer set up!")


Loss function & optimizer set up!


In [None]:
from transformers import AdamW, get_scheduler
from tqdm import tqdm

# Load T5 model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

# Optimizer & Scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_loader) * 3  # Assuming 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
num_epochs = 3
best_val_loss = float("inf")

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        batch = {key: val.to(device) for key, val in batch.items()}
        
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_train_loss += loss.item()
        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

    # Validation phase
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**batch)
            total_val_loss += outputs.loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Epoch {epoch+1} | Validation Loss: {avg_val_loss:.4f}")

    # Save best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "best_t5_model.pth")

print("âœ… Training Completed!")


Epoch 1: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5000/5000 [19:54<00:00,  4.19it/s, loss=1.03] 


Epoch 1 | Validation Loss: 0.6246


Epoch 2: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5000/5000 [17:35<00:00,  4.74it/s, loss=0.162]


Epoch 2 | Validation Loss: 0.1584


Epoch 3: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5000/5000 [18:48<00:00,  4.43it/s, loss=0.0911] 


Epoch 3 | Validation Loss: 0.0949
Training completed! âœ…


In [None]:
# Set model to evaluation mode
model.eval()

correct, total = 0, 0
with torch.no_grad():
    for batch in train_loader:
        batch = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**batch)

        # Get predictions
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == batch["labels"]).sum().item()
        total += batch["labels"].size(0)

accuracy = correct / total
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 100.00%


In [None]:
correct, total = 0, 0
with torch.no_grad():
    for batch in val_loader:  # Use validation set
        batch = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**batch)

        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == batch["labels"]).sum().item()
        total += batch["labels"].size(0)

accuracy = correct / total
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


Validation Accuracy: 100.00%


In [1]:
def generate_recipe(ingredients):
    model.eval()
    
    # Format input
    input_text = "Ingredients: " + ingredients
    input_enc = tokenizer(input_text, return_tensors="pt").to(device)

    # Generate output
    with torch.no_grad():
        output = model.generate(**input_enc, max_length=150)
    
    # Decode output
    recipe = tokenizer.decode(output[0], skip_special_tokens=True)
    return recipe

# Test with a sample ingredient list
ingredients = "chicken, garlic, onions, tomatoes, salt, pepper"
recipe = generate_recipe(ingredients)
print("Generated Recipe:", recipe)


NameError: name 'model' is not defined