# **CONFIGURATION & DEPENDENCIES**

In [None]:
"""
Alternative Implementation: GPT-2 Recipe Generation System
Dataset: 3A2M Extended Recipe Collection
Model: GPT-2 (OpenAI)
Author: Alternative Approach
"""

# ═══════════════════════════════════════════════════════════════════════════
# CONFIGURATION & DEPENDENCIES
# ═══════════════════════════════════════════════════════════════════════════

import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import (
    GPT2TokenizerFast,
    GPT2LMHeadModel,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from dataclasses import dataclass
from typing import List, Dict, Tuple
import gradio as gr

# Mount Google Drive so files under /content/drive can be read by later cells.
# `google.colab` is imported here, so this cell presumably only works inside a
# Colab runtime — NOTE(review): confirm before running it anywhere else.
from google.colab import drive
# force_remount=False keeps an existing mount in place instead of remounting.
drive.mount('/content/drive', force_remount=False)

# Status banner printed once the imports and the mount call have returned.
print("│ ✓ Dependencies loaded successfully")
print("└─" + "─" * 58)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
│ ✓ Dependencies loaded successfully
└───────────────────────────────────────────────────────────


# **HYPERPARAMETERS & CONFIGURATION**

In [None]:
# ═══════════════════════════════════════════════════════════════════════════
# HYPERPARAMETERS & CONFIGURATION
# ═══════════════════════════════════════════════════════════════════════════

@dataclass
class Config:
    """Hyperparameters and paths for the GPT-2 recipe fine-tuning notebook.

    Every field has a default, so ``Config()`` yields the baseline settings.
    Values are range-checked when the instance is created so that a bad
    setting fails immediately instead of deep inside data splitting,
    training, or generation.

    Raises:
        ValueError: If one or more fields are outside their valid range.
    """
    # Paths
    data_path: str = '/content/drive/MyDrive/nlp-p2/3A2M_EXTENDED.csv'  # recipe CSV to read
    save_dir: str = './recipe_gpt2_checkpoint'  # where model + tokenizer are saved

    # Data
    sample_size: int = 3000  # maximum number of recipes sampled from the CSV
    train_ratio: float = 0.8  # fraction of the sampled recipes used for training
    random_seed: int = 42  # seed for row sampling and the train/validation split

    # Model
    model_name: str = 'gpt2'  # checkpoint name passed to from_pretrained
    max_seq_len: int = 256  # token length recipes are truncated/padded to

    # Training
    batch_size: int = 4
    num_epochs: int = 2
    learning_rate: float = 5e-5  # AdamW learning rate
    warmup_steps: int = 100  # warmup steps for the linear schedule
    # NOTE(review): confirm the training loop actually reads this value.
    gradient_accum_steps: int = 1

    # Generation
    temp: float = 0.8  # sampling temperature
    top_p: float = 0.9  # nucleus-sampling probability mass
    max_gen_length: int = 200  # default max_length used when generating

    def __post_init__(self) -> None:
        """Validate field ranges, reporting every problem in one error."""
        checks = (
            (self.sample_size > 0, "sample_size must be > 0"),
            (0.0 < self.train_ratio < 1.0,
             "train_ratio must be strictly between 0 and 1"),
            (self.max_seq_len > 0, "max_seq_len must be > 0"),
            (self.batch_size > 0, "batch_size must be > 0"),
            (self.num_epochs >= 0, "num_epochs must be >= 0"),
            (self.learning_rate > 0, "learning_rate must be > 0"),
            (self.warmup_steps >= 0, "warmup_steps must be >= 0"),
            (self.gradient_accum_steps >= 1,
             "gradient_accum_steps must be >= 1"),
            (self.temp > 0, "temp must be > 0"),
            (0.0 < self.top_p <= 1.0, "top_p must be in (0, 1]"),
            (self.max_gen_length > 0, "max_gen_length must be > 0"),
        )
        problems = [message for ok, message in checks if not ok]
        if problems:
            raise ValueError("Invalid Config: " + "; ".join(problems))

# Single shared configuration instance; later cells read their settings from it.
config = Config()

# Echo the key settings so they are recorded in the cell output.
print("\n┌─ CONFIGURATION")
print(f"│ Dataset: {config.data_path}")
print(f"│ Model: {config.model_name}")
print(f"│ Samples: {config.sample_size}")
print(f"│ Epochs: {config.num_epochs}")
# Reports CUDA when torch can see a GPU, otherwise CPU.
print(f"│ Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")
print("└─" + "─" * 58)


┌─ CONFIGURATION
│ Dataset: /content/drive/MyDrive/nlp-p2/3A2M_EXTENDED.csv
│ Model: gpt2
│ Samples: 3000
│ Epochs: 2
│ Device: CUDA
└───────────────────────────────────────────────────────────


# **DATA LOADING & PREPROCESSING**

In [None]:

# ═══════════════════════════════════════════════════════════════════════════
# DATA LOADING & PREPROCESSING
# ═══════════════════════════════════════════════════════════════════════════

class RecipeDataLoader:
    """Load a recipe CSV, sample it, and build one training string per recipe.

    Args:
        filepath: Path of the CSV file to read.
        sample_size: Maximum number of complete recipes to keep.
        seed: Seed used for row sampling. It is also applied to NumPy's
            global random state when the loader is constructed.
    """

    def __init__(self, filepath: str, sample_size: int, seed: int):
        self.filepath = filepath
        self.sample_size = sample_size
        self.seed = seed
        # Kept for backward compatibility: this seeds NumPy's *global* RNG,
        # which affects any later np.random use in the same process.
        np.random.seed(seed)

    def load(self) -> pd.DataFrame:
        """Read the CSV and return the sampled recipes with a text column.

        Title, ingredient and direction columns are located by
        case-insensitive substring match on the column names. Rows missing
        any of the three are dropped, at most ``sample_size`` rows are
        sampled, and a ``recipe_text`` column is added.

        Returns:
            A DataFrame holding the three detected columns plus
            ``recipe_text``. It is empty (but still has all four columns)
            when no complete rows exist.

        Raises:
            ValueError: If a title, ingredient or direction column cannot
                be found.
        """
        print("\n┌─ DATA LOADING")

        # Read CSV
        df = pd.read_csv(self.filepath)
        print(f"│ Loaded {len(df):,} total recipes")
        print(f"│ Columns: {list(df.columns)}")

        # Identify columns (first match in file order wins)
        cols = df.columns.tolist()
        title_col = self._find_column(cols, ('title',))
        ingr_col = self._find_column(cols, ('ingredient', 'ner'))
        dir_col = self._find_column(cols, ('direction',))

        found = {
            'title': title_col,
            'ingredients': ingr_col,
            'directions': dir_col,
        }
        missing = [role for role, col in found.items() if col is None]
        if missing:
            raise ValueError(
                "Required columns not found "
                f"(missing: {', '.join(missing)}; available: {cols})"
            )

        print(f"│ Title column: {title_col}")
        print(f"│ Ingredients: {ingr_col}")
        print(f"│ Directions: {dir_col}")

        # Clean and sample. The explicit copy makes the result an independent
        # frame, so adding a column below never writes into a slice.
        df_clean = df[[title_col, ingr_col, dir_col]].dropna()
        if df_clean.empty:
            df_sample = df_clean.copy()
        else:
            df_sample = df_clean.sample(
                n=min(self.sample_size, len(df_clean)),
                random_state=self.seed
            ).copy()

        # Format recipes. A plain list is used instead of a row-wise apply:
        # apply(axis=1) yields a DataFrame for an empty frame, which cannot be
        # assigned to a single column.
        df_sample['recipe_text'] = [
            self._format_recipe(title, ingredients, directions)
            for title, ingredients, directions in zip(
                df_sample[title_col],
                df_sample[ingr_col],
                df_sample[dir_col],
            )
        ]

        print(f"│ Preprocessed {len(df_sample)} recipes")
        print("└─" + "─" * 58)

        return df_sample

    @staticmethod
    def _find_column(columns, keywords):
        """Return the first column whose lowercased name contains a keyword.

        Returns ``None`` when no column matches.
        """
        for column in columns:
            lowered = str(column).lower()
            if any(keyword in lowered for keyword in keywords):
                return column
        return None

    @staticmethod
    def _format_recipe(title: str, ingredients: str, directions: str) -> str:
        """Join the three fields into one training string ending in the EOS marker."""
        return f"Title: {title} | Ingredients: {ingredients} | Directions: {directions}<|endoftext|>"

# Load data: build the loader from the shared config and run it once.
# `recipe_df` carries the `recipe_text` column that the split cell reads later.
loader = RecipeDataLoader(config.data_path, config.sample_size, config.random_seed)
recipe_df = loader.load()


┌─ DATA LOADING
│ Loaded 2,231,143 total recipes
│ Columns: ['title', 'NER', 'Extended_NER', 'genre', 'label', 'directions']
│ Title column: title
│ Ingredients: NER
│ Directions: directions
│ Preprocessed 3000 recipes
└───────────────────────────────────────────────────────────


# **TOKENIZER & MODEL INITIALIZATION**

In [None]:
# ═══════════════════════════════════════════════════════════════════════════
# TOKENIZER & MODEL INITIALIZATION
# ═══════════════════════════════════════════════════════════════════════════

print("\n┌─ MODEL INITIALIZATION")

# Initialize tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained(config.model_name)
# Reuse the EOS token as the padding token; the dataset cell pads every
# sequence with padding='max_length', which needs a pad token to be set.
tokenizer.pad_token = tokenizer.eos_token
print(f"│ Tokenizer loaded: {config.model_name}")

# Initialize model
model = GPT2LMHeadModel.from_pretrained(config.model_name)
# Prefer the GPU when torch reports one; otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"│ Model loaded on: {device}")
# Total element count across all model parameters.
print(f"│ Parameters: {sum(p.numel() for p in model.parameters()):,}")
print("└─" + "─" * 58)


┌─ MODEL INITIALIZATION


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

│ Tokenizer loaded: gpt2


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

│ Model loaded on: cuda
│ Parameters: 124,439,808
└───────────────────────────────────────────────────────────


# **CUSTOM DATASET CLASS**

In [None]:

# ═══════════════════════════════════════════════════════════════════════════
# CUSTOM DATASET CLASS
# ═══════════════════════════════════════════════════════════════════════════

class RecipeTokenDataset(Dataset):
    """PyTorch Dataset of recipes tokenized to a fixed length.

    All texts are tokenized once in the constructor, truncated and padded to
    ``max_length``. Each item provides ``input_ids``, ``attention_mask`` and
    ``labels`` for causal language modelling.

    Args:
        texts: Recipe strings to tokenize.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors (called with ``return_tensors='pt'``).
        max_length: Sequence length every sample is truncated/padded to.
    """

    # Label value skipped by the cross-entropy loss (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, texts: List[str], tokenizer, max_length: int):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.encodings = tokenizer(
            texts,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_tensors='pt'
        )

    def __len__(self) -> int:
        """Return the number of tokenized samples."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Return the tensors for sample ``idx``.

        ``labels`` is a copy of ``input_ids`` in which every padding position
        (attention mask 0) is replaced by ``IGNORE_INDEX`` so padding does not
        contribute to the loss. The mask, not the token id, decides what is
        padding: the pad token may be the same id as a real end-of-text token,
        and that real token must stay in the labels.
        """
        input_ids = self.encodings['input_ids'][idx]
        attention_mask = self.encodings['attention_mask'][idx]

        # Clone first so the stored encodings are never modified.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Split data: hold out a (1 - train_ratio) fraction of the recipe strings for
# validation, using the configured seed for the shuffle.
train_texts, val_texts = train_test_split(
    recipe_df['recipe_text'].tolist(),
    test_size=1-config.train_ratio,
    random_state=config.random_seed
)

# Create datasets (each one tokenizes its texts when it is constructed)
train_dataset = RecipeTokenDataset(train_texts, tokenizer, config.max_seq_len)
val_dataset = RecipeTokenDataset(val_texts, tokenizer, config.max_seq_len)

# Create dataloaders: only the training loader is shuffled.
train_loader = DataLoader(
    train_dataset,
    batch_size=config.batch_size,
    shuffle=True
)
val_loader = DataLoader(
    val_dataset,
    batch_size=config.batch_size
)

# len(train_loader) is the number of batches in one pass over the training set.
print(f"\n┌─ DATASET STATISTICS")
print(f"│ Training samples: {len(train_texts)}")
print(f"│ Validation samples: {len(val_texts)}")
print(f"│ Batches per epoch: {len(train_loader)}")
print("└─" + "─" * 58)


┌─ DATASET STATISTICS
│ Training samples: 2400
│ Validation samples: 600
│ Batches per epoch: 600
└───────────────────────────────────────────────────────────


# **TRAINING ENGINE**

In [None]:
# ═══════════════════════════════════════════════════════════════════════════
# TRAINING ENGINE
# ═══════════════════════════════════════════════════════════════════════════

class Trainer:
    """Training orchestrator for GPT-2.

    Owns the AdamW optimizer and the linear warmup/decay schedule, and runs
    the train/validate loop for ``config.num_epochs`` epochs.

    Args:
        model: Model whose forward call returns an object with a ``loss``
            attribute when ``labels`` are passed.
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        val_loader: Same batch format, used for validation.
        config: Object providing ``learning_rate``, ``warmup_steps``,
            ``num_epochs`` and optionally ``gradient_accum_steps``.
        device: Device the batches are moved to.
    """

    def __init__(self, model, train_loader, val_loader, config, device):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.config = config
        self.device = device

        # Number of batches whose gradients are summed before one optimizer
        # update. Falls back to 1 (update every batch) when the config has no
        # such field or holds a value below 1.
        self.accum_steps = max(
            1, int(getattr(config, 'gradient_accum_steps', 1))
        )

        # Optimizer
        self.optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=config.learning_rate
        )

        # Scheduler: it advances once per optimizer update, so the total is
        # counted in updates (ceil(batches / accum_steps) per epoch).
        updates_per_epoch = -(-len(train_loader) // self.accum_steps)
        total_steps = updates_per_epoch * config.num_epochs
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=config.warmup_steps,
            num_training_steps=total_steps
        )

        self.history = {'train_loss': [], 'val_loss': []}

    def train_epoch(self) -> float:
        """Run one pass over the training loader.

        Returns:
            The mean of the per-batch losses for the epoch.
        """
        self.model.train()
        epoch_loss = 0.0
        num_batches = len(self.train_loader)

        # Start from clean gradients so nothing left over from earlier
        # backward passes leaks into the first update.
        self.optimizer.zero_grad()

        progress_bar = tqdm(self.train_loader, desc="Training")
        for step, batch in enumerate(progress_bar, start=1):
            # Move to device
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['labels'].to(self.device)

            # Forward pass
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss

            # Backward pass. The loss is scaled so the accumulated gradient
            # is the average over the accumulated batches.
            (loss / self.accum_steps).backward()

            # Update after every `accum_steps` batches, and on the last batch
            # so a trailing partial group is not discarded.
            if step % self.accum_steps == 0 or step == num_batches:
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()

            # Report the unscaled loss.
            epoch_loss += loss.item()
            progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

        return epoch_loss / len(self.train_loader)

    def validate(self) -> float:
        """Evaluate on the validation loader without tracking gradients.

        Returns:
            The mean of the per-batch validation losses.
        """
        self.model.eval()
        val_loss = 0.0

        with torch.no_grad():
            for batch in tqdm(self.val_loader, desc="Validation"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['labels'].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                val_loss += outputs.loss.item()

        return val_loss / len(self.val_loader)

    def fit(self):
        """Train for ``config.num_epochs`` epochs, validating after each.

        Returns:
            The history dict with one ``train_loss`` and one ``val_loss``
            entry per completed epoch.
        """
        print("\n┌─ TRAINING STARTED")

        for epoch in range(self.config.num_epochs):
            print(f"\n│ Epoch {epoch + 1}/{self.config.num_epochs}")
            print("│ " + "─" * 56)

            # Train
            train_loss = self.train_epoch()
            self.history['train_loss'].append(train_loss)
            print(f"│ Train Loss: {train_loss:.4f}")

            # Validate
            val_loss = self.validate()
            self.history['val_loss'].append(val_loss)
            print(f"│ Val Loss: {val_loss:.4f}")

        print("\n└─ TRAINING COMPLETED")
        return self.history

# Initialize and run training
trainer = Trainer(model, train_loader, val_loader, config, device)
training_history = trainer.fit()

# Save model. parents=True lets save_dir be a nested path whose parent
# directories do not exist yet; exist_ok=True allows re-running the cell.
Path(config.save_dir).mkdir(parents=True, exist_ok=True)
model.save_pretrained(config.save_dir)
tokenizer.save_pretrained(config.save_dir)
print(f"\n✓ Model saved to: {config.save_dir}")



┌─ TRAINING STARTED

│ Epoch 1/2
│ ────────────────────────────────────────────────────────


Training:   0%|          | 0/600 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


│ Train Loss: 1.6151


Validation:   0%|          | 0/150 [00:00<?, ?it/s]

│ Val Loss: 1.2989

│ Epoch 2/2
│ ────────────────────────────────────────────────────────


Training:   0%|          | 0/600 [00:00<?, ?it/s]

│ Train Loss: 1.3126


Validation:   0%|          | 0/150 [00:00<?, ?it/s]

│ Val Loss: 1.2779

└─ TRAINING COMPLETED

✓ Model saved to: ./recipe_gpt2_checkpoint


# **GENERATION ENGINE**

In [None]:
# ═══════════════════════════════════════════════════════════════════════════
# GENERATION ENGINE
# ═══════════════════════════════════════════════════════════════════════════

class RecipeGenerator:
    """Recipe generation inference engine.

    Wraps a causal language model and its tokenizer and samples a
    continuation for a text prompt.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        device: Device the encoded prompt is moved to.
        config: Object providing ``max_gen_length``, ``temp`` and ``top_p``.
    """

    def __init__(self, model, tokenizer, device, config):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.config = config
        # Switch to inference mode once, when the generator is created.
        self.model.eval()

    def generate(self, prompt: str, max_length: int = None) -> str:
        """Sample a continuation of ``prompt`` and return the decoded text.

        Args:
            prompt: Text the model continues.
            max_length: Value passed to the model's ``generate`` as
                ``max_length``. Defaults to ``config.max_gen_length``.

        Returns:
            The decoded output sequence with special tokens removed.
        """
        if max_length is None:
            max_length = self.config.max_gen_length

        # Tokenize
        inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)

        # Generate. The attention mask is passed explicitly: the pad token is
        # the same as the EOS token, so it cannot be inferred from the ids.
        with torch.no_grad():
            output_ids = self.model.generate(
                inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=max_length,
                temperature=self.config.temp,
                top_p=self.config.top_p,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # Decode
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Shared generator instance; the evaluation loop and the Gradio callback in
# the next cell both call generator.generate().
generator = RecipeGenerator(model, tokenizer, device, config)

# **EVALUATION METRICS AND INTERACTIVE DEMO**

In [None]:

# ═══════════════════════════════════════════════════════════════════════════
# EVALUATION METRICS
# ═══════════════════════════════════════════════════════════════════════════

print("\n┌─ EVALUATION")

# Install metrics. Running pip through the current interpreter guarantees the
# packages land in the environment this notebook imports from, rather than
# whichever `pip` happens to be first on PATH.
import subprocess
import sys
subprocess.run(
    [sys.executable, '-m', 'pip', 'install', '-q', 'nltk', 'rouge-score'],
    check=True
)

import nltk
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer

nltk.download('punkt', quiet=True)

# Marker appended to every training text. Generated text is decoded with
# special tokens skipped, so the marker is removed from the references too;
# otherwise the last reference token could never match a prediction.
EOS_MARKER = '<|endoftext|>'

# Generate predictions for the first validation recipes
test_samples = val_texts[:10]
references = []
predictions = []

print("│ Generating predictions...")
for sample in tqdm(test_samples, desc="│ Evaluating"):
    # Prompt with everything up to and including the first '|' (the title part).
    title = sample.split('|')[0] + '|'
    generated = generator.generate(title)

    reference_text = sample.replace(EOS_MARKER, '')
    # corpus_bleu expects a list of reference token lists per hypothesis.
    references.append([reference_text.split()])
    predictions.append(generated.split())

# BLEU
bleu = corpus_bleu(references, predictions)
print(f"│ BLEU Score: {bleu:.4f}")

# ROUGE: accumulate F-measures, then average over the evaluated samples
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}

for ref, pred in zip(references, predictions):
    ref_text = ' '.join(ref[0])
    pred_text = ' '.join(pred)
    scores = scorer.score(ref_text, pred_text)

    for key in rouge_scores:
        rouge_scores[key] += scores[key].fmeasure

for key in rouge_scores:
    rouge_scores[key] /= len(test_samples)

print(f"│ ROUGE-1: {rouge_scores['rouge1']:.4f}")
print(f"│ ROUGE-2: {rouge_scores['rouge2']:.4f}")
print(f"│ ROUGE-L: {rouge_scores['rougeL']:.4f}")
print("└─" + "─" * 58)

# ═══════════════════════════════════════════════════════════════════════════
# INTERACTIVE DEMO
# ═══════════════════════════════════════════════════════════════════════════

def _format_recipe_markdown(result: str) -> str:
    """Turn 'Title | Ingredients | Directions' text into Markdown.

    The text is split on the first two '|' only, so any further '|' stays
    inside the directions instead of cutting them short. Text with fewer
    than three parts is returned unchanged.
    """
    parts = result.split('|', 2)
    if len(parts) < 3:
        return result

    heading, ingredients, directions = (part.strip() for part in parts)

    # The sections get bold labels below, so drop a leading label that the
    # generated text already carries to avoid showing it twice.
    if ingredients.startswith('Ingredients:'):
        ingredients = ingredients[len('Ingredients:'):].lstrip()
    if directions.startswith('Directions:'):
        directions = directions[len('Directions:'):].lstrip()

    return (
        f"### {heading}\n\n"
        f"**Ingredients:**\n{ingredients}\n\n"
        f"**Directions:**\n{directions}"
    )


def create_recipe_interface():
    """Build the Gradio interface for title-to-recipe generation.

    Reads the module-level ``generator``, ``config``, ``bleu`` and
    ``rouge_scores``, so the evaluation cell must have run first.

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """

    def generate_from_title(title: str, length: int) -> str:
        """Generate a recipe for ``title`` and format it as Markdown."""
        if not title or not title.strip():
            return "⚠️ Please provide a recipe title"

        prompt = f"Title: {title.strip()} |"
        # The slider value may arrive as a float depending on the Gradio
        # version; int() makes that harmless for the generation call.
        result = generator.generate(prompt, max_length=int(length))

        return _format_recipe_markdown(result)

    # Build interface
    interface = gr.Interface(
        fn=generate_from_title,
        inputs=[
            gr.Textbox(
                label="🍳 Recipe Title",
                placeholder="Enter recipe name...",
                lines=1
            ),
            gr.Slider(
                minimum=150,
                maximum=400,
                value=300,
                step=50,
                label="Generation Length"
            )
        ],
        outputs=gr.Markdown(label="📝 Generated Recipe"),
        title="🤖 GPT-2 Recipe Generator",
        description=f"""
        ### AI-Powered Recipe Generation System

        **Model:** GPT-2 (124M params)
        **Dataset:** 3A2M Extended ({config.sample_size:,} recipes)
        **Performance:** BLEU={bleu:.4f}, ROUGE-1={rouge_scores['rouge1']:.4f}

        Enter a recipe title to generate ingredients and cooking instructions!
        """,
        examples=[
            ["Chocolate Chip Cookies", 300],
            ["Chicken Tikka Masala", 300],
            ["Greek Salad", 250],
            ["Banana Bread", 300],
            ["Spaghetti Bolognese", 300]
        ],
        theme="soft"
    )

    return interface

# Launch demo
print("\n┌─ LAUNCHING DEMO")
demo = create_recipe_interface()
# share=True serves the app on a public URL, so anyone with the link can use
# the model. The captured output reports that with debug=True the cell runs
# indefinitely in Colab, so the banner below prints only once launch() returns.
demo.launch(share=True, debug=True)

print("\n" + "═" * 60)
print("✓ ALL TASKS COMPLETED SUCCESSFULLY")
print("═" * 60)


┌─ EVALUATION
│ Generating predictions...


│ Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


│ BLEU Score: 0.0779
│ ROUGE-1: 0.2886
│ ROUGE-2: 0.0792
│ ROUGE-L: 0.2232
└───────────────────────────────────────────────────────────

┌─ LAUNCHING DEMO
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://f4771e8c6bd108201d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
