# Data Inspection

In [15]:
import pandas as pd
import random
import json
import ast
import math
from typing import List, Dict, Any

In [16]:
dtypes = {
    "ingredient_food_kg_urls": "string",
    "ingredient_food_kg_names": "string"
}

In [17]:
df = pd.read_csv(
    "../data/pp_recipes.csv", 
    index_col=0, 
    low_memory=True, 
    dtype=dtypes
)

In [18]:
df.head()

Unnamed: 0,recipe_id,title,description,author_id,duration,directions,ingredients,serves,last_changed_date,food_kg_locator,...,sugars [g],protein [g],direction_size,ingredients_sizes,who_score,fsa_score,nutri_score,normalization_comment,ingredient_food_kg_urls,ingredient_food_kg_names
0,447642,Cranberry-Orange Caramel Corn,"Taken from Beter Homes and Gardens, November 2...",409184,40.0,['Preheat oven to 275 degrees. In a very large...,"{'': [('popcorn, popped', '12 time(s) cups ')...",20,2011-01-29,http://idea.rpi.edu/heals/kb/recipe/5e6ae80d-C...,...,6.9,0.8,9,9,0.168254,0.0,0.0,,['http://idea.rpi.edu/heals/kb/ingredientname/...,"['baking soda', 'brown sugar', 'butter', 'corn..."
1,53662,Cocktail De Camarones,I ordered shrimp cocktail at a Mexican restaur...,64251,20.0,"['After you chop the tomatoes, onions and cila...",{'': [('shelled and cooked shrimp (50-60 count...,2-4,2009-06-03,http://idea.rpi.edu/heals/kb/recipe/d6b47db2-C...,...,6.3,51.3,7,10,0.335455,0.625,0.25,,['http://idea.rpi.edu/heals/kb/ingredientname/...,"['avocado', 'cilantro', 'cooked shrimp', 'garl..."
2,520465,Inbal Jerusalem Hotel Snowball,This recipe was created in response to a Janua...,1803632771,210.0,"['Combine gelatin with cold water', 'Combine c...","{'': [('cream', '200 time(s) g '), ('vanilla ...",5,2015-01-08,http://idea.rpi.edu/heals/kb/recipe/29080638-I...,...,78.0,87.7,12,17,0.128082,0.0,0.0,,['http://idea.rpi.edu/heals/kb/ingredientname/...,"['cold water', 'condensed milk', 'cornflour', ..."
3,213601,Santorini Roasted Leg of Lamb,A succulent slow-roasted leg of lamb with herb...,183057,375.0,['Trim excess fat and silver skin from lamb le...,"{'': [('of lamb', '6 time(s) lbs leg '), ('dr...",10-12,2009-05-17,http://idea.rpi.edu/heals/kb/recipe/6af68948-S...,...,3.6,51.1,8,8,0.320622,0.375,0.0,,['http://idea.rpi.edu/heals/kb/ingredientname/...,"['dried mint flakes', 'dried oregano', 'dried ..."
4,67731,Cantonese Chicken Burgers,Adapted from a recipe in BHG's grilling magazine.,37779,38.0,"['In a mixing bowl, add the egg, sesame oil, a...","{'': [('egg, beaten', '1 time(s) '), ('toaste...",4,2007-11-21,http://idea.rpi.edu/heals/kb/recipe/83f130d0-C...,...,4.2,30.3,6,12,0.254427,0.5,0.25,,['http://idea.rpi.edu/heals/kb/ingredientname/...,"['carrots', 'egg', 'fine dry breadcrumbs', 'ga..."


In [19]:
df.columns

Index(['recipe_id', 'title', 'description', 'author_id', 'duration',
       'directions', 'ingredients', 'serves', 'last_changed_date',
       'food_kg_locator', 'recipe_url', 'tags', 'new_recipe_id',
       'new_author_id', 'average_rating', 'number_of_ratings',
       'servingsPerRecipe', 'servingSize [g]', 'calories [cal]',
       'caloriesFromFat [cal]', 'totalFat [g]', 'saturatedFat [g]',
       'cholesterol [mg]', 'sodium [mg]', 'totalCarbohydrate [g]',
       'dietaryFiber [g]', 'sugars [g]', 'protein [g]', 'direction_size',
       'ingredients_sizes', 'who_score', 'fsa_score', 'nutri_score',
       'normalization_comment', 'ingredient_food_kg_urls',
       'ingredient_food_kg_names'],
      dtype='object')

In [20]:
df.shape

(507335, 36)

# Fine-Tunning Dataset Generation

In [25]:
import pandas as pd
import json
import random
from typing import List, Dict, Any

def parse_ingredients(ingredients_dict):
    """Extract ingredient names from the ingredients dictionary"""
    if pd.isna(ingredients_dict) or not isinstance(ingredients_dict, dict):
        return []
    
    ingredient_list = []
    for category, items in ingredients_dict.items():
        for item in items:
            if isinstance(item, tuple) and len(item) > 0:
                ingredient_list.append(item[0])
    return ingredient_list

def generate_instruction_variants(recipe: Dict[str, Any]) -> List[Dict[str, str]]:
    """Generate multiple instruction-response pairs for a single recipe"""
    variants = []
    
    # Extract recipe details with NaN handling
    title = recipe.get('title', 'Unknown Recipe')
    
    # Helper function to safely convert to float
    def safe_float(value, default=0):
        try:
            if pd.isna(value):
                return default
            return float(value)
        except (ValueError, TypeError):
            return default
    
    # Helper function to safely convert to int
    def safe_int(value, default=None):
        try:
            if pd.isna(value):
                return default
            return int(float(value))
        except (ValueError, TypeError):
            return default
    
    calories = safe_float(recipe.get('calories [cal]', 0))
    protein = safe_float(recipe.get('protein [g]', 0))
    duration = safe_int(recipe.get('duration', 0))
    sodium = safe_float(recipe.get('sodium [mg]', 0))
    total_fat = safe_float(recipe.get('totalFat [g]', 0))
    carbs = safe_float(recipe.get('totalCarbohydrate [g]', 0))
    fiber = safe_float(recipe.get('dietaryFiber [g]', 0))
    serves = safe_int(recipe.get('serves', 1), default=1)
    
    # Parse ingredients
    ingredients = parse_ingredients(recipe.get('ingredients'))
    main_ingredient = ingredients[0] if ingredients else "ingredients"
    
    # Variant 1: Calorie-based query
    if calories > 0:
        calorie_range = "under" if calories < 400 else "around"
        instruction = f"Suggest a {main_ingredient} recipe {calorie_range} {int(calories)} calories."
        response = f"{title} - {calories:.1f} calories, {protein:.1f}g protein"
        if duration:
            response += f", ready in {duration} minutes."
        else:
            response += "."
        variants.append({"instruction": instruction, "output": response})
    
    # Variant 2: Time-based query
    if duration is not None and duration > 0:
        instruction = f"Recommend a quick recipe that takes less than {duration + 10} minutes."
        response = f"{title} - Takes {duration} minutes, {calories:.1f} calories"
        if serves and serves > 0:
            response += f", serves {serves}."
        else:
            response += "."
        variants.append({"instruction": instruction, "output": response})
    
    # Variant 3: Nutrition-focused query
    if protein > 5:
        instruction = f"Find a high-protein recipe with at least {int(protein - 2)}g protein."
        response = f"{title} - {protein:.1f}g protein, {calories:.1f} calories, {total_fat:.1f}g fat, {carbs:.1f}g carbs."
        variants.append({"instruction": instruction, "output": response})
    
    # Variant 4: Low-sodium query
    if sodium > 0:
        instruction = f"List a low-sodium recipe under {int(sodium + 50)}mg sodium."
        response = f"{title} - {sodium:.1f}mg sodium, {calories:.1f} calories"
        if duration is not None and duration > 0:
            response += f", {duration} minutes prep time."
        else:
            response += "."
        variants.append({"instruction": instruction, "output": response})
    
    # Variant 5: Ingredient-based query
    if ingredients and len(ingredients) >= 2:
        ing1, ing2 = ingredients[0], ingredients[1] if len(ingredients) > 1 else ingredients[0]
        instruction = f"Suggest a recipe using {ing1} and {ing2}."
        serves_text = serves if serves and serves > 0 else "multiple people"
        response = f"{title} - Features {ing1} and {ing2}, {calories:.1f} calories, serves {serves_text}."
        variants.append({"instruction": instruction, "output": response})
    
    # Variant 6: Dietary fiber query
    if fiber > 2:
        instruction = f"Recommend a high-fiber recipe with at least {int(fiber - 1)}g fiber."
        response = f"{title} - {fiber:.1f}g dietary fiber, {calories:.1f} calories, {carbs:.1f}g total carbs."
        variants.append({"instruction": instruction, "output": response})
    
    # Variant 7: Balanced meal query
    if calories > 0 and protein > 0 and total_fat > 0:
        instruction = "Suggest a balanced meal with moderate calories and good protein."
        response = f"{title} - {calories:.1f} calories, {protein:.1f}g protein, {total_fat:.1f}g fat, {carbs:.1f}g carbs."
        variants.append({"instruction": instruction, "output": response})
    
    return variants

def generate_multi_recipe_queries(recipes_df: pd.DataFrame, n_queries: int = 100) -> List[Dict[str, str]]:
    """Generate queries that require listing multiple recipes"""
    multi_queries = []
    
    # Helper function to safely get int values
    def safe_duration(val):
        try:
            return int(float(val)) if pd.notna(val) else None
        except (ValueError, TypeError):
            return None
    
    # Low-calorie recipes
    low_cal = recipes_df[recipes_df['calories [cal]'] < 400].head(3)
    if len(low_cal) >= 3:
        instruction = "List three low-calorie recipes under 400 calories."
        response_parts = []
        for idx, (_, row) in enumerate(low_cal.iterrows(), 1):
            response_parts.append(
                f"{idx}. {row['title']} ({row['calories [cal]']:.1f} calories, "
                f"{row['protein [g]']:.1f}g protein)"
            )
        multi_queries.append({
            "instruction": instruction,
            "output": " ".join(response_parts)
        })
    
    # Low-sodium recipes
    low_sodium = recipes_df[recipes_df['sodium [mg]'] < 300].head(3)
    if len(low_sodium) >= 3:
        instruction = "List three low-sodium recipes under 300mg sodium."
        response_parts = []
        for idx, (_, row) in enumerate(low_sodium.iterrows(), 1):
            response_parts.append(
                f"{idx}. {row['title']} ({row['sodium [mg]']:.1f}mg sodium, "
                f"{row['calories [cal]']:.1f} calories)"
            )
        multi_queries.append({
            "instruction": instruction,
            "output": " ".join(response_parts)
        })
    
    # Quick recipes
    quick = recipes_df[recipes_df['duration'].notna() & (recipes_df['duration'] < 30)].head(3)
    if len(quick) >= 3:
        instruction = "Suggest three quick recipes under 30 minutes."
        response_parts = []
        for idx, (_, row) in enumerate(quick.iterrows(), 1):
            dur = safe_duration(row['duration'])
            if dur:
                response_parts.append(
                    f"{idx}. {row['title']} ({dur} minutes, "
                    f"{row['calories [cal]']:.1f} calories)"
                )
        if len(response_parts) >= 3:
            multi_queries.append({
                "instruction": instruction,
                "output": " ".join(response_parts[:3])
            })
    
    # High-protein recipes
    high_protein = recipes_df[recipes_df['protein [g]'] > 20].head(3)
    if len(high_protein) >= 3:
        instruction = "List three high-protein recipes with over 20g protein."
        response_parts = []
        for idx, (_, row) in enumerate(high_protein.iterrows(), 1):
            response_parts.append(
                f"{idx}. {row['title']} ({row['protein [g]']:.1f}g protein, "
                f"{row['calories [cal]']:.1f} calories)"
            )
        multi_queries.append({
            "instruction": instruction,
            "output": " ".join(response_parts)
        })
    
    return multi_queries

def create_finetuning_dataset(csv_path: str, output_path: str, max_recipes: int = 1000):
    """
    Main function to create fine-tuning dataset from hummus dataset
    
    Args:
        csv_path: Path to the hummus dataset CSV file
        output_path: Path where the JSON output will be saved
        max_recipes: Maximum number of recipes to process
    """
    # Load dataset
    print("Loading dataset...")
    df = pd.read_csv(csv_path)
    
    # Clean data - remove recipes with missing critical info
    df_clean = df.dropna(subset=['title', 'calories [cal]'])
    df_clean = df_clean[df_clean['calories [cal]'] > 0]
    
    # Sample recipes if dataset is large
    if len(df_clean) > max_recipes:
        df_clean = df_clean.sample(n=max_recipes, random_state=42)
    
    print(f"Processing {len(df_clean)} recipes...")
    
    # Generate single-recipe queries
    all_examples = []
    for _, row in df_clean.iterrows():
        recipe_dict = row.to_dict()
        variants = generate_instruction_variants(recipe_dict)
        all_examples.extend(variants)
    
    # Generate multi-recipe queries
    print("Generating multi-recipe queries...")
    multi_queries = generate_multi_recipe_queries(df_clean, n_queries=100)
    all_examples.extend(multi_queries)
    
    # Shuffle the dataset
    random.shuffle(all_examples)
    
    # Save to JSON
    print(f"Saving {len(all_examples)} examples to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_examples, f, indent=2, ensure_ascii=False)
    
    print(f"✓ Fine-tuning dataset created successfully!")
    print(f"  Total examples: {len(all_examples)}")
    print(f"  Output file: {output_path}")
    
    # Print sample examples
    print("\n--- Sample Examples ---")
    for i, example in enumerate(random.sample(all_examples, min(3, len(all_examples))), 1):
        print(f"\nExample {i}:")
        print(f"Instruction: {example['instruction']}")
        print(f"Response: {example['output']}")

# Usage example
if __name__ == "__main__":
    # Replace with your actual file path
    csv_path = "../data/pp_recipes.csv"
    output_path = "../data/recipe_finetuning_data.json"
    
    create_finetuning_dataset(csv_path, output_path, max_recipes=3000)

Loading dataset...


  df = pd.read_csv(csv_path)


Processing 3000 recipes...
Generating multi-recipe queries...
Saving 15481 examples to ../data/recipe_finetuning_data.json...
✓ Fine-tuning dataset created successfully!
  Total examples: 15481
  Output file: ../data/recipe_finetuning_data.json

--- Sample Examples ---

Example 1:
Instruction: List a low-sodium recipe under 490mg sodium.
Response: Baked Bean Scrolls - 440.2mg sodium, 145.0 calories, 55 minutes prep time.

Example 2:
Instruction: Suggest a ingredients recipe around 592 calories.
Response: Chablis Blanc Cake With Wine Glaze - 592.4 calories, 5.4g protein, ready in 70 minutes.

Example 3:
Instruction: Find a high-protein recipe with at least 8g protein.
Response: Amish Onion Cake - 10.4g protein, 648.1 calories, 44.5g fat, 52.6g carbs.


## Formatting the dataset for llama2

In [51]:
import re
import json

# Load the dataset
with open("../data/recipe_finetuning_data.json", "r") as f:
    dataset = json.load(f)
    



In [52]:
with open("../data/recipe_data_llama.jsonl", "w", encoding="utf-8") as f:
    for item in dataset:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

In [None]:
reformatted = []
for data in dataset:
    human_text, assistant_text = data["instruction"], data["output"]
    reformatted.append({'text': f'<s>[INST] {human_text} [/INST] {assistant_text} </s>'})

In [32]:
reformatted[:2]

[{'text': '<s>[INST] List a low-sodium recipe under 457mg sodium. [/INST] Roast Potatoes - 407.8mg sodium, 336.2 calories, 60 minutes prep time. </s>'},
 {'text': '<s>[INST] Recommend a quick recipe that takes less than 28 minutes. [/INST] Super Healthy Spicy Oatmeal Custard All-Day Breakfast - Takes 18 minutes, 413.3 calories, serves 1. </s>'}]

In [47]:
with open("../data/recipe_data_llama.jsonl", "w", encoding="utf-8") as f:
    for item in reformatted:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

    

In [53]:
from datasets import load_dataset

ds = load_dataset(
    "json",
    data_files="../data/recipe_data_llama.jsonl",
    split="train",
    field=None  # important: None means top-level list
)

Generating train split: 0 examples [00:00, ? examples/s]

In [54]:
print(ds)
print(ds[0])

Dataset({
    features: ['instruction', 'output'],
    num_rows: 15481
})
{'instruction': 'List a low-sodium recipe under 457mg sodium.', 'output': 'Roast Potatoes - 407.8mg sodium, 336.2 calories, 60 minutes prep time.'}


In [56]:
ds.to_parquet("../data/recipe_recom.parquet")

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

2194742

In [None]:
ds