# In [1]:
import pandas as pd
import json
import ast

def parse_nutrition(nutrition_str):
    """Parse a bracketed, comma-separated nutrition string.

    The first six numbers are read positionally as calories, total fat,
    sugar, sodium, protein and saturated fat. Any further values are ignored.

    Args:
        nutrition_str: Text such as ``"[51.5, 0.0, 13.0, 0.0, 2.0, 0.0]"``.
            Square brackets anywhere in the text are discarded before the
            remaining text is split on commas.

    Returns:
        A ``(calories, macros)`` tuple. ``calories`` is a float and
        ``macros`` maps ``total_fat``, ``sugar``, ``sodium``, ``protein`` and
        ``saturated_fat`` to floats. If the input is not a string, holds
        fewer than six values, or contains a value that is not a number,
        ``calories`` is ``None`` and every entry of ``macros`` is ``None``.
    """
    macro_names = ("total_fat", "sugar", "sodium", "protein", "saturated_fat")
    # Built fresh on every call so callers never share one mutable dict.
    missing = (None, dict.fromkeys(macro_names))

    # Non-string input (e.g. the float NaN that stands in for an empty CSV
    # cell) has no .replace(); report it as missing instead of raising.
    if not isinstance(nutrition_str, str):
        return missing

    cleaned = nutrition_str.replace("[", "").replace("]", "")
    try:
        # float() tolerates surrounding whitespace, so no explicit strip.
        values = [float(part) for part in cleaned.split(",")]
    except ValueError:
        return missing

    if len(values) < 6:
        return missing
    return values[0], dict(zip(macro_names, values[1:6]))
#Look for diet-related keywords
def extract_diet(tags_str):
    """Return the first recognised diet label mentioned in ``tags_str``.

    Matching is a case-insensitive substring search. Hyphens and underscores
    are treated as spaces, so slug-style tags such as ``"gluten-free"`` or
    ``"low_carb"`` match the multi-word keywords.

    Args:
        tags_str: Free text or a stringified list of tags.

    Returns:
        The matched keyword with its first letter capitalised (for example
        ``"Gluten free"``), or ``""`` when nothing matches or the input is
        not a string. Keywords are tried in the order vegan, vegetarian,
        gluten free, keto, low carb, and the first hit wins.
    """
    diet_keywords = ("vegan", "vegetarian", "gluten free", "keto", "low carb")

    # Non-string input (e.g. NaN for an empty CSV cell) carries no tags.
    if not isinstance(tags_str, str):
        return ""

    # Normalise slug separators so "gluten-free" is seen as "gluten free".
    normalized = tags_str.lower().replace("-", " ").replace("_", " ")
    for keyword in diet_keywords:
        if keyword in normalized:
            return keyword.capitalize()
    return ""

def extract_allergies(text):
    """List the allergen keywords mentioned in ``text``.

    Matching is a case-insensitive substring search, so ``"eggs"`` counts as
    a mention of ``"egg"``. A mention that is immediately followed by
    "free" (``"gluten-free"``, ``"dairy free"``, ``"egg_free"``) declares the
    allergen absent and is therefore not counted.

    Args:
        text: Free text or a stringified list of tags or ingredients.

    Returns:
        The matched keywords, in the fixed order nuts, dairy, gluten, soy,
        egg, shellfish. Empty when nothing matches or the input is not a
        string.
    """
    import re  # local import: the file's import block does not provide re

    allergen_keywords = ["nuts", "dairy", "gluten", "soy", "egg", "shellfish"]

    # Non-string input (e.g. NaN for an empty CSV cell) mentions nothing.
    if not isinstance(text, str):
        return []

    text_lower = text.lower()
    found = []
    for keyword in allergen_keywords:
        # The negative lookahead skips "<keyword>-free" style mentions while
        # still accepting any other occurrence of the keyword in the text.
        pattern = re.escape(keyword) + r"(?![\s_-]*free\b)"
        if re.search(pattern, text_lower):
            found.append(keyword)
    return found

def _parse_list_field(raw, fallback_sep):
    """Turn a CSV cell that holds a list into a list of non-empty strings.

    The cell is first parsed as a Python list/tuple literal. If it is not
    one, the raw text is split on ``fallback_sep`` instead. Non-string cells
    yield an empty list.
    """
    if not isinstance(raw, str):
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        items = [str(item) for item in parsed]
    else:
        items = raw.split(fallback_sep)
    return [item.strip() for item in items if item.strip()]


def _text_or_empty(value):
    """Return ``value`` when it is a string, otherwise an empty string."""
    return value if isinstance(value, str) else ""


def _int_or_none(value):
    """Coerce ``value`` to a built-in int, or None when that is impossible."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return None


# Convert every recipe row into one chat-style training example: the user
# message lists the ingredients, the assistant message is the recipe as
# compact JSON. One example is written per line (JSON Lines).
recipes_df = pd.read_csv("RAW_recipes.csv")

# json.dumps escapes non-ASCII by default, so the encoding is stated only to
# keep the output independent of the platform's default.
with open("recipes_chat_finetune.jsonl", "w", encoding="utf-8") as fout:
    for _, row in recipes_df.iterrows():
        ingredients_text = row["ingredients"]
        if not isinstance(ingredients_text, str):
            # Without ingredients there is nothing to build a prompt from;
            # skip the row rather than crash on a non-string cell.
            continue

        # Create a prompt
        prompt_content = f"Ingredients: {ingredients_text}."

        # Ingredients and steps are parsed the same way: list literal first,
        # then a plain-text split ("," for ingredients, "." for steps).
        ingredient_list = _parse_list_field(ingredients_text, ",")
        instructions = _parse_list_field(row["steps"], ".")

        # Extract diet and allergies
        tags_text = _text_or_empty(row["tags"])
        diet = extract_diet(tags_text)
        # Sorted so the output file is identical from run to run; plain set
        # iteration order changes with string hash randomisation.
        allergies = sorted(
            set(extract_allergies(tags_text))
            | set(extract_allergies(ingredients_text))
        )

        calories_value, macros_values = parse_nutrition(
            _text_or_empty(row["nutrition"])
        )

        recipe_obj = {
            # A non-string title (e.g. NaN) would be emitted as a bare NaN
            # token, which is not valid JSON.
            "title": _text_or_empty(row["name"]),
            "ingredients": ingredient_list,
            "instructions": instructions,
            "macros": macros_values,
            "calories": calories_value,
            "diet": diet,
            "allergies": allergies,
            # json cannot serialise NumPy integer scalars; use a built-in int.
            "cookTime": _int_or_none(row["minutes"]),
            "servings": 1,
        }

        completion_content = json.dumps(recipe_obj, separators=(',', ':'))

        # Format it for training
        training_example = {
            "messages": [
                {"role": "user", "content": prompt_content},
                {"role": "assistant", "content": completion_content},
            ]
        }
        fout.write(json.dumps(training_example, separators=(',', ':')) + "\n")
