In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, logging
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import json
import ast
from tqdm import tqdm
import pickle
import warnings
import os


# Quieten the notebook: transformers logs only errors, Python warnings are hidden.
logging.set_verbosity_error()
warnings.filterwarnings('ignore')

# Module-wide device handle used by the model and the batches below.
_cuda_ready = torch.cuda.is_available()
device = torch.device('cuda') if _cuda_ready else torch.device('cpu')
print(f"Using device: {device}")

if _cuda_ready:
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {_gpu_props.total_memory / 1e9:.2f} GB")

    # Turn on cuDNN together with its benchmark mode.
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.enabled = True

class RecipeDataset(Dataset):
    """Map-style dataset of tokenized recipe texts, with an on-disk cache.

    Every row of ``recipes_df`` is rendered to one text string, tokenized to
    ``max_length`` tokens and kept in memory. The tokenized rows are pickled
    under ``cache_dir`` so a later construction with the same inputs is fast.

    The cache file name contains a fingerprint of the recipe ids, ``max_length``
    and the tokenizer, so a dataset built from a different DataFrame (or with a
    different ``max_length``) never reuses another dataset's tokenizations.

    Args:
        recipes_df: DataFrame with at least the columns ``id``, ``name``,
            ``tags``, ``ingredients``, ``description`` and ``minutes``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors='pt'``.
        max_length: Length every text is truncated/padded to.
        cache_dir: Directory holding the pickle cache (created if missing).
    """

    def __init__(self, recipes_df, tokenizer, max_length=256, cache_dir='cache'):
        self.recipes_df = recipes_df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.cache_dir = cache_dir

        os.makedirs(cache_dir, exist_ok=True)

        self.processed_data = self._preprocess_all_recipes()

    def _cache_file(self):
        """Return the cache path specific to this DataFrame/tokenizer/max_length."""
        import hashlib  # local import keeps this block self-contained

        tokenizer_name = getattr(self.tokenizer, 'name_or_path', type(self.tokenizer).__name__)
        fingerprint = hashlib.sha1()
        fingerprint.update(f"{tokenizer_name}|{self.max_length}|".encode('utf-8'))
        # The ordered id list distinguishes samples/subsets of the same table.
        # Edits to a recipe's text under an unchanged id are NOT detected.
        fingerprint.update(','.join(map(str, self.recipes_df['id'].tolist())).encode('utf-8'))
        return os.path.join(self.cache_dir, f'processed_recipes_{fingerprint.hexdigest()[:16]}.pkl')

    def _preprocess_all_recipes(self):
        """Tokenize every recipe, or load the matching cache if one exists."""
        cache_file = self._cache_file()

        if os.path.exists(cache_file):
            # NOTE: pickle.load executes code embedded in the file; only point
            # cache_dir at a location you control.
            with open(cache_file, 'rb') as f:
                return pickle.load(f)

        processed_data = []

        rows = self.recipes_df.iterrows()
        for _, recipe in tqdm(rows, total=len(self.recipes_df), desc="Processing recipes"):
            text = self._create_recipe_text(recipe)

            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )

            processed_data.append({
                # The tokenizer returns a batch of one; drop that leading axis.
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'recipe_id': recipe['id'],
                'recipe_name': recipe['name']
            })

        # Write to a temporary name first so an interrupted run cannot leave a
        # truncated pickle behind under the real cache name.
        tmp_file = cache_file + '.tmp'
        with open(tmp_file, 'wb') as f:
            pickle.dump(processed_data, f)
        os.replace(tmp_file, cache_file)

        return processed_data

    def __len__(self):
        return len(self.processed_data)

    def __getitem__(self, idx):
        return self.processed_data[idx]

    @staticmethod
    def _as_str_list(value):
        """Coerce a list, or the string repr of a list, into a list of strings.

        Anything else (NaN, malformed text, a literal that is not a list)
        yields an empty list instead of raising.
        """
        if isinstance(value, str):
            try:
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return []
        if not isinstance(value, (list, tuple)):
            return []
        return [str(item) for item in value]

    def _create_recipe_text(self, recipe):
        """Render one recipe row (Series or dict) as the text fed to the tokenizer."""
        tags = self._as_str_list(recipe['tags'])
        ingredients = self._as_str_list(recipe['ingredients'])

        text_parts = [
            f"Recipe: {recipe['name']}",
            f"Tags: {' '.join(tags)}",
            f"Ingredients: {' '.join(ingredients)}",
            f"Description: {recipe['description']}" if pd.notna(recipe['description']) else "",
            f"Time: {recipe['minutes']} minutes" if pd.notna(recipe['minutes']) else ""
        ]

        return " ".join([part for part in text_parts if part])

class RecipeBERTModel(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear projection.

    Args:
        bert_model_name: Checkpoint name handed to ``BertModel.from_pretrained``.
        hidden_size: Input width of the projection layer.
        num_labels: Output width of the projection layer, i.e. the size of the
            embedding returned by ``forward``.
    """

    def __init__(self, bert_model_name='bert-base-uncased', hidden_size=768, num_labels=256):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)

        # Exclude the embedding block and encoder layers 0-5 from training;
        # whatever layers come after them stay trainable.
        frozen_blocks = [self.bert.embeddings]
        frozen_blocks.extend(self.bert.encoder.layer[index] for index in range(6))
        for block in frozen_blocks:
            block.requires_grad_(False)

        self.dropout = nn.Dropout(0.1)
        self.projection = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the projected pooled BERT output for a batch of token ids."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.projection(self.dropout(encoded.pooler_output))

class RecipeSearchEngine:
    """Fine-tunes a recipe encoder and ranks recipes against tag queries.

    Typical use: ``train_model()`` -> ``generate_embeddings()`` -> ``search()``.
    ``save_model()`` / ``load_model()`` persist the model weights together with
    the embedding matrix and the matching recipe ids.
    """

    def __init__(self, model_path=None):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = RecipeBERTModel()
        self.model.to(device)

        # Initialise state BEFORE loading: resetting these afterwards would
        # throw away the embeddings restored from the checkpoint.
        self.recipe_embeddings = None
        self.recipe_ids = None
        self.recipes_df = None

        if model_path:
            self.load_model(model_path)

    def train_model(self, recipes_df, interactions_df, epochs=5, batch_size=64, learning_rate=2e-5, accumulation_steps=4):
        """Fine-tune the encoder with an in-batch contrastive objective.

        Each batch is encoded twice while the model is in training mode, so the
        active dropout yields two different views of every recipe. A recipe's
        second view is its positive; the other recipes in the batch are its
        negatives. This costs two forward passes per batch.

        Args:
            recipes_df: Recipe table; more than 50000 rows are down-sampled to
                50000 (``random_state=42``).
            interactions_df: Accepted for interface compatibility; not used here.
            epochs: Number of passes over the (sampled) recipes.
            batch_size: Requested batch size; capped by detected GPU memory.
            learning_rate: Peak AdamW learning rate (linear warmup, then decay).
            accumulation_steps: Micro-batches accumulated per optimizer update.

        Raises:
            ValueError: If the data yields no full batch (``drop_last=True``).
        """
        from torch.cuda.amp import GradScaler, autocast
        from transformers import get_linear_schedule_with_warmup

        accumulation_steps = max(1, int(accumulation_steps))

        if len(recipes_df) > 50000:
            print(f"Dataset has {len(recipes_df)} recipes. Sampling 50000 for training...")
            recipes_df = recipes_df.sample(n=50000, random_state=42)

        dataset = RecipeDataset(recipes_df, self.tokenizer, max_length=256)

        use_cuda = torch.cuda.is_available()
        if use_cuda:
            # Adjust batch size based on available GPU memory
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
            if gpu_memory < 8:  # Less than 8GB
                batch_size = min(batch_size, 32)
            elif gpu_memory < 16:  # Less than 16GB
                batch_size = min(batch_size, 64)
            print(f"Using batch size: {batch_size}")

        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=0,
            pin_memory=use_cuda,
            drop_last=True
        )
        num_batches = len(dataloader)
        if num_batches == 0:
            raise ValueError(
                f"Need at least {batch_size} recipes to form one training batch, got {len(dataset)}."
            )

        optimizer = AdamW(
            [param for param in self.model.parameters() if param.requires_grad],
            lr=learning_rate,
            weight_decay=0.01
        )

        # The scheduler advances once per optimizer update, not once per
        # micro-batch, so size it in updates (ceil division covers the final
        # partial accumulation group of every epoch).
        updates_per_epoch = -(-num_batches // accumulation_steps)
        total_steps = updates_per_epoch * epochs
        warmup_steps = int(0.1 * total_steps)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )

        # With enabled=False both helpers are pass-throughs, so one code path
        # serves CPU and GPU.
        use_amp = use_cuda
        scaler = GradScaler(enabled=use_amp)

        self.model.train()

        for epoch in range(epochs):
            total_loss = 0.0
            progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{epochs}')

            optimizer.zero_grad()

            for i, batch in enumerate(progress_bar):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                with autocast(enabled=use_amp):
                    view_a = self.model(input_ids, attention_mask)
                    view_b = self.model(input_ids, attention_mask)
                    loss = self._contrastive_loss(view_a, positives=view_b)
                    scaled_loss = loss / accumulation_steps

                scaler.scale(scaled_loss).backward()

                # Also step on the last batch so gradients left over from an
                # incomplete accumulation group are applied instead of dropped.
                if (i + 1) % accumulation_steps == 0 or (i + 1) == num_batches:
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()
                    optimizer.zero_grad()

                batch_loss = loss.item()
                total_loss += batch_loss

                progress_bar.set_postfix({
                    'loss': f'{batch_loss:.4f}',
                    'lr': f'{scheduler.get_last_lr()[0]:.2e}'
                })

                if i % 100 == 0 and use_cuda:
                    torch.cuda.empty_cache()

            avg_loss = total_loss / num_batches
            print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")

    def _contrastive_loss(self, embeddings, temperature=0.07, positives=None):
        """Cross-entropy over temperature-scaled cosine similarities.

        Args:
            embeddings: ``(batch, dim)`` anchor embeddings.
            temperature: Divisor applied to the cosine similarities.
            positives: Optional ``(batch, dim)`` second view; row ``i`` is the
                positive for anchor ``i`` and all other rows are negatives.
                When omitted, anchors are compared with themselves as in the
                original implementation. That objective is trivially satisfied
                (each row is most similar to itself) and gives no useful
                training signal, so training code should pass ``positives``.

        Returns:
            Scalar loss tensor; averaged over both matching directions when
            ``positives`` is given.
        """
        anchors = nn.functional.normalize(embeddings, p=2, dim=1)
        if positives is None:
            targets = anchors
        else:
            targets = nn.functional.normalize(positives, p=2, dim=1)

        similarity_matrix = torch.matmul(anchors, targets.T) / temperature
        labels = torch.arange(anchors.shape[0], device=similarity_matrix.device)

        loss = nn.functional.cross_entropy(similarity_matrix, labels)
        if positives is not None:
            # Symmetrise: also match every positive back to its anchor.
            loss = 0.5 * (loss + nn.functional.cross_entropy(similarity_matrix.T, labels))

        return loss

    def generate_embeddings(self, recipes_df):
        """Embed every recipe in ``recipes_df`` and keep the result for search.

        Stores ``recipes_df``, the stacked embedding matrix and the recipe ids
        (row-aligned with the matrix) on the instance.
        """
        from torch.cuda.amp import autocast

        self.recipes_df = recipes_df
        self.model.eval()

        use_cuda = torch.cuda.is_available()
        inference_batch_size = 64 if use_cuda else 32

        dataset = RecipeDataset(recipes_df, self.tokenizer, max_length=128)
        dataloader = DataLoader(
            dataset,
            batch_size=inference_batch_size,
            shuffle=False,
            num_workers=0,  # Avoid multiprocessing issues
            pin_memory=use_cuda,
            drop_last=False
        )

        all_embeddings = []
        recipe_ids = []

        with torch.no_grad():
            for batch in tqdm(dataloader, desc="Generating embeddings"):
                input_ids = batch['input_ids'].to(device, non_blocking=True)
                attention_mask = batch['attention_mask'].to(device, non_blocking=True)

                with autocast(enabled=use_cuda):
                    embeddings = self.model(input_ids, attention_mask)

                all_embeddings.append(embeddings.cpu().numpy())
                recipe_ids.extend(batch['recipe_id'].tolist())

                if use_cuda and len(all_embeddings) % 20 == 0:
                    torch.cuda.empty_cache()

        self.recipe_embeddings = np.vstack(all_embeddings)
        self.recipe_ids = np.array(recipe_ids)

        if len(self.recipe_embeddings) != len(recipes_df):
            # Symptom of the dataset cache returning rows tokenized for a
            # different DataFrame; search results would then be incomplete.
            print(
                f"Warning: embedded {len(self.recipe_embeddings)} recipes but received "
                f"{len(recipes_df)}; the tokenization cache may be stale."
            )

        print(f"Generated embeddings for {len(self.recipe_embeddings)} recipes")

    @staticmethod
    def _parse_list(value):
        """Parse the string repr of a list; pass non-strings through unchanged.

        Returns ``[]`` when the string cannot be parsed.
        """
        if not isinstance(value, str):
            return value
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []

    def search(self, query_tags, top_k=10):
        """Return the ``top_k`` recipes most similar to the query tags.

        Args:
            query_tags: Iterable of tag strings (a single string is treated as
                one tag).
            top_k: Maximum number of results.

        Returns:
            List of dicts with ``id``, ``name``, ``score``, ``tags``,
            ``ingredients``, ``minutes`` and ``description``, best match first.

        Raises:
            ValueError: If embeddings or recipe metadata are not available.
        """
        if self.recipe_embeddings is None:
            raise ValueError("Recipe embeddings not generated. Call generate_embeddings first.")
        if self.recipes_df is None:
            raise ValueError("Recipe metadata not available. Call generate_embeddings first.")

        if isinstance(query_tags, str):
            query_tags = [query_tags]
        query_text = f"Tags: {' '.join(query_tags)}"

        # A single query needs no padding; padding it to 512 tokens only made
        # the forward pass slower.
        encoding = self.tokenizer(
            query_text,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        self.model.eval()
        with torch.no_grad():
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            query_embedding = self.model(input_ids, attention_mask).cpu().numpy()

        similarities = cosine_similarity(query_embedding, self.recipe_embeddings)[0]

        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for idx in top_indices:
            recipe_id = self.recipe_ids[idx]
            matches = self.recipes_df[self.recipes_df['id'] == recipe_id]
            if matches.empty:
                # Embedding without metadata (e.g. checkpoint from another table).
                continue
            recipe = matches.iloc[0]

            results.append({
                'id': int(recipe_id),
                'name': recipe['name'],
                'score': float(similarities[idx]),
                'tags': self._parse_list(recipe['tags']),
                'ingredients': self._parse_list(recipe['ingredients']),
                'minutes': int(recipe['minutes']) if pd.notna(recipe['minutes']) else None,
                'description': recipe['description'] if pd.notna(recipe['description']) else ""
            })

        return results

    def save_model(self, path):
        """Write model weights, embeddings and recipe ids to ``path``."""
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'recipe_embeddings': self.recipe_embeddings,
            'recipe_ids': self.recipe_ids
        }, path)

    def load_model(self, path):
        """Load a trained model"""
        # NOTE(review): the checkpoint stores NumPy arrays next to the weights;
        # PyTorch releases whose torch.load defaults to weights_only=True may
        # refuse to unpickle them - confirm against the installed version.
        checkpoint = torch.load(path, map_location=device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.recipe_embeddings = checkpoint.get('recipe_embeddings')
        self.recipe_ids = checkpoint.get('recipe_ids')
        print(f"Model loaded from {path}")

def main():
    """Train on the raw CSVs, embed all recipes, save the model and run demo queries.

    Side effects: writes 'recipe_bert_model.pth' and 'search_results.json' to
    the working directory and prints the top results for each demo query.
    """
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
    recipes_df = pd.read_csv('RAW_recipes.csv')
    # Lower-case "interactions": the exact spelling matters on case-sensitive
    # filesystems.
    interactions_df = pd.read_csv('RAW_interactions.csv')

    print(f"Loaded {len(recipes_df)} recipes and {len(interactions_df)} interactions")

    search_engine = RecipeSearchEngine()

    search_engine.train_model(
        recipes_df,
        interactions_df,
        epochs=2,
        batch_size=32,
        learning_rate=5e-5,
        accumulation_steps=2
    )

    search_engine.generate_embeddings(recipes_df)

    search_engine.save_model('recipe_bert_model.pth')

    test_queries = [
        ['healthy', 'quick', 'vegetarian'],
        ['mexican', 'spicy', 'chicken'],
        ['dessert', 'chocolate', 'easy'],
        ['pasta', 'italian', 'cheese'],
        ['breakfast', 'eggs', 'quick']
    ]

    # Embed each query once: the ten results go to the JSON file and the first
    # five of them are printed.
    all_results = {}
    for query in test_queries:
        print(f"\nQuery: {query}")
        results = search_engine.search(query, top_k=10)
        all_results[' '.join(query)] = results

        for i, result in enumerate(results[:5], 1):
            print(f"{i}. {result['name']} (Score: {result['score']:.3f})")
            if len(result['tags']) > 0:
                print(f"   Tags: {', '.join(result['tags'][:5])}...")
            print(f"   Time: {result['minutes']} minutes")

    # Open the file only after every search succeeded, so a failure cannot
    # leave an empty JSON file behind.
    with open('search_results.json', 'w') as f:
        json.dump(all_results, f, indent=2)

    print("\nSearch results saved to 'search_results.json'")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()


if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
GPU: NVIDIA GeForce RTX 4070 Laptop GPU
GPU Memory: 8.59 GB
Loading data...
Loaded 231637 recipes and 1132367 interactions
Preparing data for training...
Dataset has 231637 recipes. Sampling 50000 for training...
Pre-processing all recipes (this will be cached for future runs)...


Processing recipes: 100%|██████████| 50000/50000 [01:51<00:00, 448.99it/s]


Using batch size: 32
Starting training...


Epoch 1/2: 100%|██████████| 1562/1562 [06:30<00:00,  4.00it/s, loss=0.0001, lr=4.17e-05]


Epoch 1 - Average Loss: 0.0505


Epoch 2/2: 100%|██████████| 1562/1562 [06:31<00:00,  3.99it/s, loss=0.0001, lr=2.78e-05]


Epoch 2 - Average Loss: 0.0001
Training completed!
Generating recipe embeddings...
Loading cached processed recipes...


Generating embeddings: 100%|██████████| 782/782 [03:05<00:00,  4.21it/s]


Generated embeddings for 50000 recipes
Model saved to recipe_bert_model.pth

TESTING SEARCH ENGINE

Query: ['healthy', 'quick', 'vegetarian']
1. smoky baked beans   originally canary baked beans (Score: 0.820)
   Tags: weeknight, time-to-make, course, main-ingredient, cuisine...
   Time: 140 minutes
2. halvah parfait (Score: 0.802)
   Tags: course, cuisine, preparation, 5-ingredients-or-less, desserts...
   Time: 510 minutes
3. thai style coconut eggnog  no eggs (Score: 0.789)
   Tags: ...
   Time: 4 minutes
4. soda bread (Score: 0.769)
   Tags: course, breads...
   Time: 55 minutes
5. pan haggerty (Score: 0.769)
   Tags: 60-minutes-or-less, time-to-make, main-ingredient, preparation, potatoes...
   Time: 50 minutes

Query: ['mexican', 'spicy', 'chicken']
1. smoky baked beans   originally canary baked beans (Score: 0.824)
   Tags: weeknight, time-to-make, course, main-ingredient, cuisine...
   Time: 140 minutes
2. light as a feather cake (Score: 0.816)
   Tags: time-to-make, course, ma

## Alternative Model: DistilBERT

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel, logging
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import json
import ast
from tqdm import tqdm
import pickle
import warnings
import os
from collections import defaultdict

# Quieten the notebook: transformers logs only errors, Python warnings are hidden.
logging.set_verbosity_error()
warnings.filterwarnings('ignore')

# Module-wide device handle used by the model and the batches below.
_cuda_ready = torch.cuda.is_available()
device = torch.device('cuda') if _cuda_ready else torch.device('cpu')
print(f"Using device: {device}")

if _cuda_ready:
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {_gpu_props.total_memory / 1e9:.2f} GB")
    # Turn on cuDNN together with its benchmark mode.
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.enabled = True

def preprocess_csv_data(recipes_path='RAW_recipes.csv', interactions_path='RAW_interactions.csv', cache_dir='cache'):
    """Return cleaned ``(recipes_df, interactions_df)``, using a pickle cache.

    If both cache files already exist in ``cache_dir`` they are loaded and
    returned as-is (the CSV paths are not consulted). Otherwise the CSVs are
    read, cleaned with ``preprocess_recipes`` / ``preprocess_interactions``
    and the results are pickled for the next call.
    """
    os.makedirs(cache_dir, exist_ok=True)

    cache_paths = {
        'recipes': os.path.join(cache_dir, 'preprocessed_recipes.pkl'),
        'interactions': os.path.join(cache_dir, 'preprocessed_interactions.pkl'),
    }

    # Fast path: both pickles are present.
    if all(os.path.exists(location) for location in cache_paths.values()):
        print("Loading cached preprocessed data...")
        with open(cache_paths['recipes'], 'rb') as handle:
            recipes_df = pickle.load(handle)
        with open(cache_paths['interactions'], 'rb') as handle:
            interactions_df = pickle.load(handle)
        print(f"Loaded {len(recipes_df)} recipes and {len(interactions_df)} interactions from cache")
        return recipes_df, interactions_df

    # Slow path: read both raw files, then clean them (interactions are
    # filtered against the already-cleaned recipes).
    raw_recipes = pd.read_csv(recipes_path)
    raw_interactions = pd.read_csv(interactions_path)
    recipes_df = preprocess_recipes(raw_recipes)
    interactions_df = preprocess_interactions(raw_interactions, recipes_df)

    for frame, key in ((recipes_df, 'recipes'), (interactions_df, 'interactions')):
        with open(cache_paths[key], 'wb') as handle:
            pickle.dump(frame, handle)

    print(f"Preprocessed data: {len(recipes_df)} recipes, {len(interactions_df)} interactions")
    return recipes_df, interactions_df

def preprocess_recipes(recipes_df):
    """Clean the raw recipe table and add parsed tag/ingredient lists.

    Steps: drop rows missing ``name``, ``id``, ``tags`` or ``ingredients``;
    parse the ``tags`` / ``ingredients`` columns into lower-cased, stripped
    string lists (``parsed_tags`` / ``parsed_ingredients``); drop rows where
    either list is empty; replace missing descriptions with ``''``; coerce
    ``minutes`` to a number (unparseable -> 0) and drop rows above 1440.

    Args:
        recipes_df: Raw recipe DataFrame. It is not modified.

    Returns:
        A new, filtered DataFrame with the two ``parsed_*`` columns added.
    """
    def parse_str_list(raw):
        # Accept the string repr of a list or an already-parsed list/tuple;
        # anything else (NaN, malformed text, other literals) becomes [].
        if isinstance(raw, str):
            try:
                raw = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return []
        if not isinstance(raw, (list, tuple)):
            return []
        return [item.lower().strip() for item in raw if isinstance(item, str)]

    # Work on explicit copies so the column assignments below never write into
    # a view of the caller's frame.
    cleaned = recipes_df.dropna(subset=['name', 'id', 'tags', 'ingredients']).copy()

    cleaned['parsed_tags'] = cleaned['tags'].apply(parse_str_list)
    cleaned['parsed_ingredients'] = cleaned['ingredients'].apply(parse_str_list)

    has_tags = cleaned['parsed_tags'].apply(len) > 0
    has_ingredients = cleaned['parsed_ingredients'].apply(len) > 0
    cleaned = cleaned[has_tags & has_ingredients].copy()

    cleaned['description'] = cleaned['description'].fillna('')
    cleaned['minutes'] = pd.to_numeric(cleaned['minutes'], errors='coerce').fillna(0)

    # 1440 minutes = 24 hours; longer times are discarded.
    return cleaned[cleaned['minutes'] <= 1440]

def preprocess_interactions(interactions_df, recipes_df):
    """Keep only well-formed interactions that refer to a known recipe.

    ``user_id``, ``recipe_id`` and ``rating`` are coerced to numbers first, so
    the range and membership filters also work when a column arrives as
    strings. Rows with a missing or unparseable value, a rating outside 1-5,
    or a ``recipe_id`` absent from ``recipes_df['id']`` are dropped.

    Args:
        interactions_df: Raw interactions DataFrame. It is not modified.
        recipes_df: Cleaned recipes; its ``id`` column defines valid recipes.

    Returns:
        A new, filtered DataFrame.
    """
    key_columns = ['user_id', 'recipe_id', 'rating']

    cleaned = interactions_df.copy()
    for column in key_columns:
        cleaned[column] = pd.to_numeric(cleaned[column], errors='coerce')
    cleaned = cleaned.dropna(subset=key_columns)

    cleaned = cleaned[cleaned['rating'].between(1, 5)]

    valid_recipe_ids = set(recipes_df['id'].values)
    return cleaned[cleaned['recipe_id'].isin(valid_recipe_ids)]

class RecipeDataset(Dataset):
    """Map-style dataset of tokenized recipe texts, with an on-disk cache.

    Every row of ``recipes_df`` is rendered to one text string, tokenized to
    ``max_length`` tokens and kept in memory. The tokenized rows are pickled
    under ``cache_dir`` so a later construction with the same inputs is fast.

    The cache file name contains a fingerprint of the recipe ids, ``max_length``
    and the tokenizer, so a dataset built from a different DataFrame (or with a
    different ``max_length``) never reuses another dataset's tokenizations.

    Args:
        recipes_df: DataFrame with ``id``, ``name``, ``description``,
            ``minutes`` and either ``parsed_tags`` / ``parsed_ingredients``
            lists or raw ``tags`` / ``ingredients`` strings.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors='pt'``.
        max_length: Length every text is truncated/padded to.
        cache_dir: Directory holding the pickle cache (created if missing).
    """

    def __init__(self, recipes_df, tokenizer, max_length=128, cache_dir='cache'):
        self.recipes_df = recipes_df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

        self.processed_data = self._preprocess_all_recipes()

    def _cache_file(self):
        """Return the cache path specific to this DataFrame/tokenizer/max_length."""
        import hashlib  # local import keeps this block self-contained

        tokenizer_name = getattr(self.tokenizer, 'name_or_path', type(self.tokenizer).__name__)
        fingerprint = hashlib.sha1()
        fingerprint.update(f"{tokenizer_name}|{self.max_length}|".encode('utf-8'))
        # The ordered id list distinguishes samples/subsets of the same table.
        # Edits to a recipe's text under an unchanged id are NOT detected.
        fingerprint.update(','.join(map(str, self.recipes_df['id'].tolist())).encode('utf-8'))
        return os.path.join(
            self.cache_dir,
            f'processed_recipes_distilbert_{fingerprint.hexdigest()[:16]}.pkl'
        )

    def _preprocess_all_recipes(self):
        """Tokenize every recipe, or load the matching cache if one exists."""
        cache_file = self._cache_file()

        if os.path.exists(cache_file):
            # NOTE: pickle.load executes code embedded in the file; only point
            # cache_dir at a location you control.
            with open(cache_file, 'rb') as f:
                return pickle.load(f)

        processed_data = []

        rows = self.recipes_df.iterrows()
        for _, recipe in tqdm(rows, total=len(self.recipes_df), desc="Processing recipes"):
            text = self._create_recipe_text(recipe)

            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )

            processed_data.append({
                # The tokenizer returns a batch of one; drop that leading axis.
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'recipe_id': recipe['id'],
                'recipe_name': recipe['name']
            })

        # Write to a temporary name first so an interrupted run cannot leave a
        # truncated pickle behind under the real cache name.
        tmp_file = cache_file + '.tmp'
        with open(tmp_file, 'wb') as f:
            pickle.dump(processed_data, f)
        os.replace(tmp_file, cache_file)

        return processed_data

    def __len__(self):
        return len(self.processed_data)

    def __getitem__(self, idx):
        return self.processed_data[idx]

    @staticmethod
    def _parse_str_list(raw):
        """Parse the string repr of a list into lower-cased, stripped strings.

        Returns ``[]`` for anything that is not a parseable list.
        """
        if isinstance(raw, str):
            try:
                raw = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return []
        if not isinstance(raw, (list, tuple)):
            return []
        return [item.lower().strip() for item in raw if isinstance(item, str)]

    def _field_as_list(self, recipe, parsed_key, raw_key):
        """Prefer the pre-parsed list column; fall back to parsing the raw one."""
        parsed = recipe[parsed_key] if parsed_key in recipe else None
        if isinstance(parsed, (list, tuple)) and parsed:
            return list(parsed)
        raw = recipe[raw_key] if raw_key in recipe else None
        return self._parse_str_list(raw)

    def _create_recipe_text(self, recipe):
        """Render one recipe row (Series or dict) as the text fed to the tokenizer.

        At most 15 tags and 10 ingredients are included; the description is
        added only when shorter than 150 characters, the time only when > 0.
        """
        tags = self._field_as_list(recipe, 'parsed_tags', 'tags')
        ingredients = self._field_as_list(recipe, 'parsed_ingredients', 'ingredients')

        text_parts = [
            f"Recipe: {recipe['name']}",
            f"Tags: {' '.join(tags[:15])}",
            f"Ingredients: {' '.join(ingredients[:10])}",
        ]

        if pd.notna(recipe['description']) and len(str(recipe['description'])) < 150:
            text_parts.append(f"Description: {recipe['description']}")

        if pd.notna(recipe['minutes']) and recipe['minutes'] > 0:
            text_parts.append(f"Time: {int(recipe['minutes'])} minutes")

        return " ".join([part for part in text_parts if part])

class RecipeDistilBERTModel(nn.Module):
    """Pretrained DistilBERT encoder followed by dropout and a linear projection.

    Args:
        model_name: Checkpoint name handed to ``DistilBertModel.from_pretrained``.
        hidden_size: Input width of the projection layer.
        embedding_dim: Width of the embedding returned by ``forward``.
    """

    def __init__(self, model_name='distilbert-base-uncased', hidden_size=768, embedding_dim=256):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(model_name)

        # Exclude transformer layers 0-2 from training; the rest stay trainable.
        for layer_index in range(3):
            self.distilbert.transformer.layer[layer_index].requires_grad_(False)

        self.dropout = nn.Dropout(0.1)
        self.projection = nn.Linear(hidden_size, embedding_dim)

    def forward(self, input_ids, attention_mask):
        """Return the projection of the first token's final hidden state."""
        encoded = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.projection(self.dropout(first_token))

class RecipeSearchEngine:
    """Fine-tunes a DistilBERT recipe encoder and exports it with its embeddings.

    Typical use: ``train_model()`` -> ``generate_embeddings()`` ->
    ``save_model()``. The checkpoint bundles the weights, the embedding
    matrix, the matching recipe ids and a plain-Python copy of the recipes.
    """

    def __init__(self, model_path=None):
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.model = RecipeDistilBERTModel()
        self.model.to(device)

        # Initialise state BEFORE loading: resetting these afterwards would
        # throw away the embeddings restored from the checkpoint.
        self.recipe_embeddings = None
        self.recipe_ids = None
        self.recipes_df = None
        self.recipes_data = None

        if model_path and os.path.exists(model_path):
            self.load_model(model_path)

    def train_model(self, recipes_df, interactions_df, epochs=2, batch_size=64, learning_rate=3e-5, accumulation_steps=2):
        """Fine-tune the encoder with an in-batch contrastive objective.

        Each batch is encoded twice while the model is in training mode, so the
        active dropout yields two different views of every recipe. A recipe's
        second view is its positive; the other recipes in the batch are its
        negatives. This costs two forward passes per batch.

        Args:
            recipes_df: Recipe table to train on (also stored on the instance).
            interactions_df: Accepted for interface compatibility; not used here.
            epochs: Number of passes over the recipes.
            batch_size: Requested batch size; capped by detected GPU memory.
            learning_rate: Peak AdamW learning rate (linear warmup, then decay).
            accumulation_steps: Micro-batches accumulated per optimizer update.

        Raises:
            ValueError: If the data yields no full batch (``drop_last=True``).
        """
        from torch.cuda.amp import GradScaler, autocast
        from transformers import get_linear_schedule_with_warmup

        accumulation_steps = max(1, int(accumulation_steps))

        self.recipes_df = recipes_df

        dataset = RecipeDataset(recipes_df, self.tokenizer, max_length=128)

        use_cuda = torch.cuda.is_available()
        if use_cuda:
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
            if gpu_memory < 8:
                batch_size = min(batch_size, 32)
            elif gpu_memory < 16:
                batch_size = min(batch_size, 64)
            else:
                batch_size = min(batch_size, 96)
            print(f"Using batch size: {batch_size}")

        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=0,
            pin_memory=use_cuda,
            drop_last=True
        )
        num_batches = len(dataloader)
        if num_batches == 0:
            raise ValueError(
                f"Need at least {batch_size} recipes to form one training batch, got {len(dataset)}."
            )

        optimizer = AdamW(
            [param for param in self.model.parameters() if param.requires_grad],
            lr=learning_rate,
            weight_decay=0.01
        )

        # The scheduler advances once per optimizer update, not once per
        # micro-batch, so size it in updates (ceil division covers the final
        # partial accumulation group of every epoch).
        updates_per_epoch = -(-num_batches // accumulation_steps)
        total_steps = updates_per_epoch * epochs
        warmup_steps = int(0.1 * total_steps)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )

        # With enabled=False both helpers are pass-throughs, so one code path
        # serves CPU and GPU.
        use_amp = use_cuda
        scaler = GradScaler(enabled=use_amp)

        self.model.train()

        for epoch in range(epochs):
            total_loss = 0.0
            progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{epochs}')

            optimizer.zero_grad()

            for i, batch in enumerate(progress_bar):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                with autocast(enabled=use_amp):
                    view_a = self.model(input_ids, attention_mask)
                    view_b = self.model(input_ids, attention_mask)
                    loss = self._contrastive_loss(view_a, positives=view_b)
                    scaled_loss = loss / accumulation_steps

                scaler.scale(scaled_loss).backward()

                # Also step on the last batch so gradients left over from an
                # incomplete accumulation group are applied instead of dropped.
                if (i + 1) % accumulation_steps == 0 or (i + 1) == num_batches:
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()
                    optimizer.zero_grad()

                batch_loss = loss.item()
                total_loss += batch_loss

                progress_bar.set_postfix({
                    'loss': f'{batch_loss:.4f}',
                    'lr': f'{scheduler.get_last_lr()[0]:.2e}'
                })

                if i % 100 == 0 and use_cuda:
                    torch.cuda.empty_cache()

            avg_loss = total_loss / num_batches
            print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")

        print("Training completed!")

    def _contrastive_loss(self, embeddings, temperature=0.07, positives=None):
        """Cross-entropy over temperature-scaled cosine similarities.

        Args:
            embeddings: ``(batch, dim)`` anchor embeddings.
            temperature: Divisor applied to the cosine similarities.
            positives: Optional ``(batch, dim)`` second view; row ``i`` is the
                positive for anchor ``i`` and all other rows are negatives.
                When omitted, anchors are compared with themselves as in the
                original implementation. That objective is trivially satisfied
                (each row is most similar to itself) and gives no useful
                training signal, so training code should pass ``positives``.

        Returns:
            Scalar loss tensor; averaged over both matching directions when
            ``positives`` is given.
        """
        anchors = nn.functional.normalize(embeddings, p=2, dim=1)
        if positives is None:
            targets = anchors
        else:
            targets = nn.functional.normalize(positives, p=2, dim=1)

        similarity_matrix = torch.matmul(anchors, targets.T) / temperature
        labels = torch.arange(anchors.shape[0], device=similarity_matrix.device)

        loss = nn.functional.cross_entropy(similarity_matrix, labels)
        if positives is not None:
            # Symmetrise: also match every positive back to its anchor.
            loss = 0.5 * (loss + nn.functional.cross_entropy(similarity_matrix.T, labels))
        return loss

    def generate_embeddings(self, recipes_df):
        """Embed every recipe in ``recipes_df`` and keep the result on the instance.

        Stores ``recipes_df``, the stacked embedding matrix and the recipe ids
        (row-aligned with the matrix).
        """
        from torch.cuda.amp import autocast

        self.recipes_df = recipes_df
        self.model.eval()

        use_cuda = torch.cuda.is_available()
        inference_batch_size = 96 if use_cuda else 32

        dataset = RecipeDataset(recipes_df, self.tokenizer, max_length=128)
        dataloader = DataLoader(
            dataset,
            batch_size=inference_batch_size,
            shuffle=False,
            num_workers=0,
            pin_memory=use_cuda,
            drop_last=False
        )

        all_embeddings = []
        recipe_ids = []

        with torch.no_grad():
            for batch in tqdm(dataloader, desc="Generating embeddings"):
                input_ids = batch['input_ids'].to(device, non_blocking=True)
                attention_mask = batch['attention_mask'].to(device, non_blocking=True)

                with autocast(enabled=use_cuda):
                    embeddings = self.model(input_ids, attention_mask)

                all_embeddings.append(embeddings.cpu().numpy())
                recipe_ids.extend(batch['recipe_id'].tolist())

                if use_cuda and len(all_embeddings) % 20 == 0:
                    torch.cuda.empty_cache()

        self.recipe_embeddings = np.vstack(all_embeddings)
        self.recipe_ids = np.array(recipe_ids)

        if len(self.recipe_embeddings) != len(recipes_df):
            # Symptom of the dataset cache returning rows tokenized for a
            # different DataFrame; the saved recipe records would not line up
            # with the embeddings.
            print(
                f"Warning: embedded {len(self.recipe_embeddings)} recipes but received "
                f"{len(recipes_df)}; the tokenization cache may be stale."
            )

        print(f"Generated embeddings for {len(self.recipe_embeddings)} recipes")

    def _recipe_records(self):
        """Convert ``self.recipes_df`` into a list of plain-Python dicts."""
        if self.recipes_df is None:
            return []

        def as_int(value):
            # Missing or NaN numeric fields are stored as 0.
            return int(value) if pd.notna(value) else 0

        records = []
        # to_dict('records') avoids building one Series per row as iterrows does.
        for recipe in self.recipes_df.to_dict('records'):
            records.append({
                'id': int(recipe['id']),
                'name': str(recipe['name']),
                'minutes': as_int(recipe.get('minutes')),
                'description': str(recipe.get('description', '')),
                'n_steps': as_int(recipe.get('n_steps')),
                'parsed_tags': recipe.get('parsed_tags', []),
                'parsed_ingredients': recipe.get('parsed_ingredients', []),
                # Keep original formats as backup
                'tags': recipe.get('tags', []),
                'ingredients': recipe.get('ingredients', [])
            })
        return records

    def save_model(self, path):
        """Write weights, embeddings, recipe ids and recipe records to ``path``."""
        save_data = {
            'model_state_dict': self.model.state_dict(),
            'recipe_embeddings': self.recipe_embeddings,
            'recipe_ids': self.recipe_ids,
            # Recipe metadata travels with the checkpoint.
            'recipes_data': self._recipe_records()
        }

        torch.save(save_data, path)

    def load_model(self, path):
        """Restore weights, embeddings, recipe ids and recipe records from ``path``."""
        # NOTE(review): the checkpoint stores NumPy arrays next to the weights;
        # PyTorch releases whose torch.load defaults to weights_only=True may
        # refuse to unpickle them - confirm against the installed version.
        checkpoint = torch.load(path, map_location=device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.recipe_embeddings = checkpoint.get('recipe_embeddings')
        self.recipe_ids = checkpoint.get('recipe_ids')
        # Saved by save_model; previously written but never read back.
        self.recipes_data = checkpoint.get('recipes_data')

def main():
    """Run the DistilBERT pipeline end to end: preprocess, train, embed, save."""
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

    print("Starting enhanced recipe model training...")

    recipes_df, interactions_df = preprocess_csv_data()
    engine = RecipeSearchEngine()

    # Hyper-parameters for this run, gathered in one place.
    training_options = {
        'epochs': 3,
        'batch_size': 64,
        'learning_rate': 3e-5,
        'accumulation_steps': 2,
    }
    engine.train_model(recipes_df, interactions_df, **training_options)

    engine.generate_embeddings(recipes_df)
    engine.save_model('recipe_distilbert_model.pth')

    # Release cached GPU memory once everything is written.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


if __name__ == "__main__":
    main()

Using device: cuda
GPU: NVIDIA GeForce RTX 4070 Laptop GPU
GPU Memory: 8.59 GB
🚀 Starting enhanced recipe model training...
Loading and preprocessing data...
Loading cached preprocessed data...
Loaded 229636 recipes and 1063999 interactions from cache
Training model...
Preparing data for training...
Training on FULL dataset: 229636 recipes
Loading cached processed recipes...
Using batch size: 64
Starting training...


Epoch 1/3: 100%|██████████| 3619/3619 [07:27<00:00,  8.09it/s, loss=0.0001, lr=2.78e-05]


Epoch 1 - Average Loss: 0.0694


Epoch 2/3: 100%|██████████| 3619/3619 [07:21<00:00,  8.20it/s, loss=0.0001, lr=2.22e-05]


Epoch 2 - Average Loss: 0.0001


Epoch 3/3: 100%|██████████| 3619/3619 [07:27<00:00,  8.09it/s, loss=0.0001, lr=1.67e-05]


Epoch 3 - Average Loss: 0.0001
Training completed!
Generating embeddings...
Generating recipe embeddings...
Loading cached processed recipes...


Generating embeddings: 100%|██████████| 2413/2413 [02:26<00:00, 16.44it/s]


Generated embeddings for 231637 recipes
Saving enhanced model...
Preparing recipe data for backend...
✅ Enhanced model saved to recipe_distilbert_model.pth
✅ Included 229636 recipe records for backend use
✅ Model ready for Streamlit app!

✅ TRAINING COMPLETE!
✅ Model saved with recipe data included
✅ Ready for Streamlit backend!
✅ No CSV files needed for inference
✅ Model file: recipe_distilbert_model.pth
✅ Recipe count: 229636
