In [1]:
import pandas as pd

# Read File and create a new dataframe called 'df'
df = pd.read_csv('dataset_1.csv')

# Removing useless columns
df = df.drop(columns = ['Unnamed: 0', 'link', 'source'])

df

Unnamed: 0,title,ingredients,directions,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...","[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....","[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...","[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...","[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...","[""peanut butter"", ""graham cracker crumbs"", ""bu..."
...,...,...,...,...
223111,Hot Rolls,"[""1 3/4 c. warm water"", ""1/2 c. sugar"", ""1 Tbs...","[""Combine water, sugar, salt and butter, yeast...","[""water"", ""sugar"", ""salt"", ""butter"", ""yeast"", ..."
223112,Cheesy Vegetable Casserole,"[""1/2 lb. American cheese"", ""1/2 c. butter"", ""...","[""Cut cheese into cubes and place in saucepan ...","[""American cheese"", ""butter"", ""containing broc..."
223113,Volcanic Shake,"[""2 medium bananas"", ""4 ice cubes"", ""1/2 c. yo...","[""Put all ingredients in blender. Blend on low...","[""bananas"", ""cubes"", ""yogurt"", ""orange juice"",..."
223114,Yogurt Popsicles,"[""2 cartons plain yogurt"", ""1 (12 oz.) can con...","[""Combine ingredients in a bowl. Mix well. Fre...","[""yogurt"", ""fruit juice"", ""vanilla""]"


In [2]:
import re
from fractions import Fraction

def convert_fractions(text):
    """Defining a method that convert fractions"""
    # Match optional whole number + fraction, or just fraction
    pattern = r'(?:(\d+)\s+)?(\d+)/(\d+)'

    def replacer(match):
        whole = int(match.group(1)) if match.group(1) else 0
        numerator = int(match.group(2))
        denominator = int(match.group(3))
        decimal = whole + Fraction(numerator, denominator)
        return "{:.10g}".format(float(decimal))  # Clean, no trailing zeros

    return re.sub(pattern, replacer, text)

In [3]:
from fractions import Fraction

# Fixed conversions
CONVERSIONS = {
    "oz": 30,      # ounce → gram
    "lb": 450,      # pounds → gram
    "pt": 475,      # pint → milliliter
    "qt": 950,      # quart → milliliter
    "inch": 2.5        # inches → centimeter
}

def convert_units(text):
    """Defining a method that converts units by using the fixed conversions"""
    if pd.isna(text):
        return text

    # Convert oz, lb, pt, qt (form: "4 oz", "2.5 lb", etc.)
    for unit, factor in CONVERSIONS.items():
        pattern = r'(\d+(\.\d+)?)\s*' + unit
        text = re.sub(pattern, lambda m: f"{round(float(m.group(1)) * factor, 1)} {unit_to_metric(unit)}", text)

    # Convert measure in inches in pans
    text = re.sub(r'(\d+)\s*x\s*(\d+)[-\s]*inch', lambda
        m: f"{int(m.group(1)) * CONVERSIONS['inch']:.0f} x {int(m.group(2)) * CONVERSIONS['inch']:.0f} cm", text)

    return text

def convert_fahrenheit_to_celsius(text):
    # Match degrees like 275°, 275 °F, 275\u00b0F (escaped), etc.
    pattern = r'(\d+)\s*(?:°|\\u00b0)'

    def replacer(match):
        fahrenheit = int(match.group(1))
        celsius = round((fahrenheit - 32) * 5 / 9)
        return f"{celsius}°C"

    return re.sub(pattern, replacer, text)

def unit_to_metric(unit):
    return {
        "oz": "g",
        "lb": "g",
        "pt": "ml",
        "qt": "ml",
        "inch": "cm"
    }[unit]

# Applying conversions on the INGREDIENTS and DIRECTIONS columns
df['INGREDIENTS'] = df['ingredients'].apply(convert_fractions)
df['DIRECTIONS'] = df['directions'].apply(convert_fractions)

df['INGREDIENTS'] = df['INGREDIENTS'].apply(convert_units)
df['DIRECTIONS'] = df['DIRECTIONS'].apply(convert_units)

df['DIRECTIONS'] = df['DIRECTIONS'].apply(convert_fahrenheit_to_celsius)

# Renaming the title column and dropping the converted columns
df = df.rename(columns={"title": "TITLE"})
df = df.drop(['ingredients', 'directions'], axis=1)

df[['TITLE', 'INGREDIENTS', 'DIRECTIONS']].head(200)

Unnamed: 0,TITLE,INGREDIENTS,DIRECTIONS
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""0.5 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish...."
2,Creamy Corn,"[""2 (480.0 g.) pkg. frozen corn"", ""1 (240.0 g....","[""In a slow cooker, combine all ingredients. C..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (315.0 g.) cans c...","[""Boil and debone chicken."", ""Put bite size pi..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""0.75 c. graham cracker...","[""Combine first four ingredients and press in ..."
...,...,...,...
195,Golf Balls,"[""1 c. cocoa"", ""1 c. butter, softened"", ""3 (45...","[""Beat all ingredients together except peanut ..."
196,Crispy Herb Bread,"[""4 tsp. olive oil"", ""0.5 tsp. garlic powder"",...","[""In a small bowl, combine oil and garlic powd..."
197,Festive Fruit Salad,"[""1 (600.0 g.) can pineapple chunks, drained (...","[""Combine pineapple, oranges, grapes, strawber..."
198,Fresh Strawberry Pie,"[""950.0 ml. strawberries"", ""0.75 c. water"", ""p...","[""Cut up 1 cup strawberries."", ""Add water."", ""..."


In [4]:
import ast
import re

# Patterns to detect time expressions
TIME_PATTERN = re.compile(
    r'(\d+(?:\.\d+)?)\s*(minute|minutes|min|mins|m|hour|hours|hr|hrs|h|day|days|d)\b',
    re.IGNORECASE
)

RANGE_PATTERN = re.compile(
    r'(\d+)\s*-\s*(\d+)\s*(minute|minutes|min|mins|m|hour|hours|hr|hrs|h)\b',
    re.IGNORECASE
)

# Time estimates for common preparation methods
PREP_ESTIMATES = {
    'bake': 45, 'boil': 20, 'fry': 15, 'grill': 25, 'chill': 120,
    'simmer': 30, 'marinate': 60, 'microwave': 10, 'no-bake': 20,
    'refrigerate': 180, 'freeze': 240
}

# Special cases for recipe types
RECIPE_TYPE_ESTIMATES = {
    'pasta': 10, 'salad': 15, 'cake': 45, 'pie': 60, 'stew': 120,
    'casserole': 60, 'soup': 30, 'cookies': 30, 'bread': 90,
    'fudge': 20, 'candy': 30
}

def convert_to_minutes(qty, unit):
    """Converts time quantity to minutes."""
    qty = float(qty)
    unit = unit.lower()
    if unit.startswith('m'): return qty
    if unit.startswith('h'): return qty * 60
    if unit.startswith('d'): return qty * 1440
    return 0

def estimate_by_recipe_type(title, ingredients):
    """Fallback estimation based on recipe title or ingredients."""
    title = str(title).lower()
    ingredients = str(ingredients).lower()

    # Check for specific recipe types
    for keyword, est in RECIPE_TYPE_ESTIMATES.items():
        if keyword in title:
            return est

    if any(word in ingredients for word in ['raw', 'fresh']):
        return 15
    if 'frozen' in ingredients:
        return 20
    if 'canned' in ingredients:
        return 10

    # Default fallback
    return 30

def clean_instruction_step(step):
    """Normalize range formats and identify special cases."""
    # Replace ranges (e.g., "10-15 minutes" → "15 minutes")
    step = RANGE_PATTERN.sub(lambda m: f"{m.group(2)} {m.group(3)}", step)

    # Handle special keywords
    lowered = step.lower()
    if 'overnight' in lowered:
        return 480
    if 'until set' in lowered or 'until firm' in lowered:
        return 60

    # Sum all time expressions
    return sum(
        convert_to_minutes(qty, unit)
        for qty, unit in TIME_PATTERN.findall(step)
    )

def parse_instructions(instructions):
    """Parses instructions and extracts total estimated time in minutes."""
    total_time = 0

    if not isinstance(instructions, list):
        try:
            instructions = ast.literal_eval(str(instructions))
        except:
            instructions = [instructions]

    for step in instructions:
        if isinstance(step, str):
            total_time += clean_instruction_step(step)

    return total_time

def categorize_time(total_time):
    """Classify total time into labeled categories."""
    if total_time == 0:
        return 'Not specified'
    if total_time < 10:
        return 'Very fast (0-10 mins)'
    if total_time < 20:
        return 'Fast (10-20 mins)'
    if total_time < 40:
        return 'Medium (20-40 mins)'
    if total_time < 90:
        return 'Slow (40-90 mins)'
    return 'Very slow (90+ mins)'

# Calculate total preparation time
df['total_time'] = df['DIRECTIONS'].apply(parse_instructions)

# Estimate time for unspecified recipes
missing = df['total_time'] == 0
df.loc[missing, 'total_time'] = df[missing].apply(
    lambda row: estimate_by_recipe_type(row['TITLE'], row['INGREDIENTS']),
    axis=1
)

# Categorize preparation times
df['PREPARATION_TIME'] = df['total_time'].apply(categorize_time)

df[['TITLE', 'INGREDIENTS', 'DIRECTIONS', 'PREPARATION_TIME']].head(200)

Unnamed: 0,TITLE,INGREDIENTS,DIRECTIONS,PREPARATION_TIME
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""0.5 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",Slow (40-90 mins)
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",Very slow (90+ mins)
2,Creamy Corn,"[""2 (480.0 g.) pkg. frozen corn"", ""1 (240.0 g....","[""In a slow cooker, combine all ingredients. C...",Very slow (90+ mins)
3,Chicken Funny,"[""1 large whole chicken"", ""2 (315.0 g.) cans c...","[""Boil and debone chicken."", ""Put bite size pi...",Medium (20-40 mins)
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""0.75 c. graham cracker...","[""Combine first four ingredients and press in ...",Medium (20-40 mins)
...,...,...,...,...
195,Golf Balls,"[""1 c. cocoa"", ""1 c. butter, softened"", ""3 (45...","[""Beat all ingredients together except peanut ...",Medium (20-40 mins)
196,Crispy Herb Bread,"[""4 tsp. olive oil"", ""0.5 tsp. garlic powder"",...","[""In a small bowl, combine oil and garlic powd...",Very fast (0-10 mins)
197,Festive Fruit Salad,"[""1 (600.0 g.) can pineapple chunks, drained (...","[""Combine pineapple, oranges, grapes, strawber...",Fast (10-20 mins)
198,Fresh Strawberry Pie,"[""950.0 ml. strawberries"", ""0.75 c. water"", ""p...","[""Cut up 1 cup strawberries."", ""Add water."", ""...",Very slow (90+ mins)


In [5]:
'''import pandas as pd
import re
import ast

# 2. Conversione unità → kg
UNIT_CONVERSION = {
    'cup': 0.24, 'c.': 0.24, 'c': 0.24,
    'teaspoon': 0.005, 'tsp': 0.005, 'tsp.': 0.005,
    'tablespoon': 0.015, 'tbsp': 0.015, 'tbsp.': 0.015,
    'package': 0.2, 'pkg': 0.2, 'pkg.': 0.2,
    'can': 0.4, 'carton': 1.0,
    'g': 0.001, 'g.': 0.001, 'gram': 0.001,
    'kg': 1.0, 'kilogram': 1.0,
    'lb': 0.4536, 'pound': 0.4536,
    'oz': 0.02835, 'ounce': 0.02835,
    'large package': 0.5, 'large pkg.': 0.5, 'large pkg': 0.5
}

# 3. Parsing function (qty, unit, name, grams_in_paren)
def parse_ingredient(ing_str):
    # peso fra parentesi
    m = re.search(r'\(\s*([0-9]+(?:\.[0-9]+)?)\s*g\.?\s*\)', ing_str, re.IGNORECASE)
    grams = float(m.group(1)) if m else None

    # quantity e unit
    patt = r'^\s*([\d\/.\s]+)?\s*([a-zA-Z\.]+)?'
    m2 = re.match(patt, ing_str)
    qty_raw = m2.group(1) if m2 else None
    try:
        qty = eval(qty_raw.replace(' ', '+')) if qty_raw else 1.0
    except:
        qty = 1.0
    unit = (m2.group(2) or '').strip('.').lower() if m2 else ''

    # name pulito
    name = re.sub(r'\(.*?\)|optional', '', ing_str, flags=re.IGNORECASE)
    # rimuovo quantità/unità
    name = re.sub(r'^[\d\/.\s]+\s*[a-zA-Z\.]*', '', name).strip().lower()
    return qty, unit, name, grams

# 4. Calcolo posizionale con fallback substring e token
def calculate_recipe_cost_positional(ingredients_list, ner_list):
    total = 0.0
    missing = []

    for ing_str, ner_item in zip(ingredients_list, ner_list):
        key = ner_item.lower()
        qty, unit, name, grams = parse_ingredient(ing_str)

        # scorporo se passo i grams
        if grams is not None:
            kg = (grams / 1000) * qty
        elif unit == '':  # a pezzo
            # prendo prezzo direct key
            price = price_dict.get(key)
            # fallback substring/token
            if price is None:
                # substring match
                for k, v in price_dict.items():
                    if k in key or key in k:
                        price = v
                        break
                # token match
                if price is None:
                    for token in key.split():
                        if token in price_dict:
                            price = price_dict[token]
                            break
            if price:
                total += qty * price
            else:
                missing.append(key)
            continue
        else:
            conv = UNIT_CONVERSION.get(unit, 0.0)
            if conv == 0:
                missing.append(key)
                continue
            kg = qty * conv

        # prezzo per kg
        price = price_dict.get(key)
        if price is None:
            for k, v in price_dict.items():
                if k in key or key in k:
                    price = v
                    break
        if price is None:
            missing.append(key)
            continue

        total += kg * price

    return round(total, 2), missing

# 5. Applica al DataFrame
    df['INGREDIENTS'] = df['INGREDIENTS'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df['NER_clean']  = df['NER_clean'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

def compute_cost_row(row):
     cost, _ = calculate_recipe_cost_positional(row['INGREDIENTS'], row['NER_clean'])
     return cost

def categorize_cost(total_cost):
    if total_cost == 0:
        return 'Not specified'
    elif total_cost < 10:
        return 'Very cheap'
    elif 10 <= total_cost < 20:
        return 'Cheap'
    elif 20 <= total_cost < 45:
        return 'Medium'
    elif 45 <= total_cost < 90:
        return 'Expensive'
    else:
        return 'Rich'

df['total_cost'] = df.apply(compute_cost_row, axis=1)

df['CATEGORY_COST'] = df['total_cost'].apply(categorize_cost)

# Mostra risultati
df[['INGREDIENTS', 'NER', 'CATEGORY_COST']].head(200)'''

"import pandas as pd\nimport re\nimport ast\n\n# 2. Conversione unità → kg\nUNIT_CONVERSION = {\n    'cup': 0.24, 'c.': 0.24, 'c': 0.24,\n    'teaspoon': 0.005, 'tsp': 0.005, 'tsp.': 0.005,\n    'tablespoon': 0.015, 'tbsp': 0.015, 'tbsp.': 0.015,\n    'package': 0.2, 'pkg': 0.2, 'pkg.': 0.2,\n    'can': 0.4, 'carton': 1.0,\n    'g': 0.001, 'g.': 0.001, 'gram': 0.001,\n    'kg': 1.0, 'kilogram': 1.0,\n    'lb': 0.4536, 'pound': 0.4536,\n    'oz': 0.02835, 'ounce': 0.02835,\n    'large package': 0.5, 'large pkg.': 0.5, 'large pkg': 0.5\n}\n\n# 3. Parsing function (qty, unit, name, grams_in_paren)\ndef parse_ingredient(ing_str):\n    # peso fra parentesi\n    m = re.search(r'\\(\\s*([0-9]+(?:\\.[0-9]+)?)\\s*g\\.?\\s*\\)', ing_str, re.IGNORECASE)\n    grams = float(m.group(1)) if m else None\n\n    # quantity e unit\n    patt = r'^\\s*([\\d\\/.\\s]+)?\\s*([a-zA-Z\\.]+)?'\n    m2 = re.match(patt, ing_str)\n    qty_raw = m2.group(1) if m2 else None\n    try:\n        qty = eval(qty_raw.repla

In [6]:
import pandas as pd
import ast
import re

df['NER_clean'] = df['NER'].apply(ast.literal_eval)

# List containing keywords
NON_VEGAN_KEYWORDS = {
    'milk', 'cheese', 'butter', 'cream', 'yogurt', 'gelatin', 'lard', 'honey',
    'egg', 'eggs', 'fish', 'meat', 'chicken', 'beef', 'pork', 'gelatina',
    'collagen', 'casein', 'whey', 'lactose', 'ghee', 'isinglass', 'carmine',
    'shellac', 'albumen', 'pepsin', 'royal jelly', 'propolis', 'cocoa butter',
    'bacon', 'sour cream', 'condensed milk', 'shredded cheese', 'cheddar',
    'paraffin', 'marshmallows', 'buttermilk', 'ground beef', 'steak'
}

VEGAN_EXCEPTIONS = {
    'milk': {'soy', 'almond', 'oat', 'rice', 'coconut', 'cashew', 'hazelnut'},
    'cheese': {'vegan', 'nutritional yeast', 'cashew', 'tofu'},
    'meat': {'soy', 'seitan', 'tofu', 'tempeh', 'jackfruit', 'plant-based'},
    'butter': {'vegan', 'plant', 'peanut', 'almond', 'soy'},
    'cream': {'coconut', 'soy', 'oat', 'vegan'},
    'bacon': {'vegan', 'tempeh', 'coconut'}
}

VEGAN_MODIFIERS = {
    'vegan', 'vegetable', 'plant-based', 'no milk', 'no eggs',
    'dairy-free', 'without animal derivatives', 'cruelty-free', '100% vegetable'
}

# --- Utility
def parse_ingredients(ingredients_input):
    """Always returning a parsed list of ingredients"""
    if isinstance(ingredients_input, list):
        return ingredients_input
    if isinstance(ingredients_input, str) and ingredients_input.strip():
        try:
            return ast.literal_eval(ingredients_input)
        except:
            return [
                x.strip().strip('"').strip("'")
                for x in re.split(r',(?![^[]*\])', ingredients_input.strip('[]'))
                if x.strip()
            ]
    return []

def contains_vegan_exception(ingredient, keyword):
    return any(re.search(rf'\b{re.escape(ex)}\b', ingredient, re.IGNORECASE)
               for ex in VEGAN_EXCEPTIONS.get(keyword, set()))

# --- Classification
def classify_vegan(ingredients_input):
    """Returning True or False based on the ingredients"""
    ingredients = parse_ingredients(ingredients_input)
    for ing in ingredients:
        if not isinstance(ing, str):
            continue
        ing_lower = ing.lower()
        if any(skip in ing_lower for skip in ('water', 'salt')):
            continue
        if any(re.search(rf'\b{re.escape(mod)}\b', ing_lower) for mod in VEGAN_MODIFIERS):
            continue
        for keyword in NON_VEGAN_KEYWORDS:
            if re.search(rf'\b{re.escape(keyword)}(?:s)?\b', ing_lower):
                if not contains_vegan_exception(ing_lower, keyword):
                    return False
    return True

def get_non_vegan_ingredients(ingredients_input):
    """Returning list of ingredients without vegan exceptions"""
    ingredients = parse_ingredients(ingredients_input)
    non_vegan = []
    for ing in ingredients:
        if not isinstance(ing, str):
            continue
        ing_lower = ing.lower()
        if any(skip in ing_lower for skip in ('water', 'salt')):
            continue
        for keyword in NON_VEGAN_KEYWORDS:
            if re.search(rf'\b{re.escape(keyword)}(?:s)?\b', ing_lower):
                if not contains_vegan_exception(ing_lower, keyword):
                    non_vegan.append(ing)
                break
    return non_vegan

# --- Apply
df['VEGAN'] = df['NER_clean'].apply(classify_vegan)
df['NON_VEGAN_INGREDIENTS'] = df['NER_clean'].apply(get_non_vegan_ingredients)

# --- Final validation
anti_patterns = re.compile(
    r'\b(not vegan|meat|steak|fish|cheese|egg|eggs|beef|chicken|pork|bacon|cream)\b',
    re.IGNORECASE
)

df['VEGAN'] = df.apply(
    lambda row: False if row['VEGAN'] and isinstance(row['TITLE'], str) and anti_patterns.search(row['TITLE']) else row['VEGAN'],
    axis=1
)

# View results
df[['TITLE', 'NER', 'VEGAN', 'NON_VEGAN_INGREDIENTS']].head(200)

Unnamed: 0,TITLE,NER,VEGAN,NON_VEGAN_INGREDIENTS
0,No-Bake Nut Cookies,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...",False,"[milk, butter]"
1,Jewell Ball'S Chicken,"[""beef"", ""chicken breasts"", ""cream of mushroom...",False,"[beef, chicken breasts, cream of mushroom soup..."
2,Creamy Corn,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar...",False,"[cream cheese, butter]"
3,Chicken Funny,"[""chicken"", ""chicken gravy"", ""cream of mushroo...",False,"[chicken, chicken gravy, cream of mushroom sou..."
4,Reeses Cups(Candy),"[""peanut butter"", ""graham cracker crumbs"", ""bu...",False,[butter]
...,...,...,...,...
195,Golf Balls,"[""cocoa"", ""butter"", ""powdered sugar"", ""milk"", ...",False,"[butter, milk]"
196,Crispy Herb Bread,"[""olive oil"", ""garlic powder"", ""bread"", ""thyme...",False,[Parmesan cheese]
197,Festive Fruit Salad,"[""pineapple"", ""mandarin oranges"", ""grapes"", ""m...",False,[marshmallows]
198,Fresh Strawberry Pie,"[""strawberries"", ""water"", ""salt"", ""sugar"", ""co...",False,[butter]


In [7]:
# List of non vegetarian ingredients
NON_VEGETARIAN_KEYWORDS = {
    'meat', 'chicken', 'beef', 'pork', 'fish', 'anchovy', 'tuna', 'salmon',
    'shellfish', 'shrimp', 'crab', 'lobster', 'bacon', 'gelatin', 'lard',
    'collagen', 'isinglass', 'pepsin', 'ground beef', 'steak'
}

# Exceptions for vegetarian subs
VEGETARIAN_EXCEPTIONS = {
    'meat': {'soy', 'seitan', 'tofu', 'tempeh', 'jackfruit', 'plant-based'},
    'bacon': {'vegan', 'tempeh', 'coconut'},
    'gelatin': {'agar', 'pectin'},
    'fish': {'banana blossom', 'tofu', 'plant-based'}
}

VEGETARIAN_MODIFIERS = {
    'vegetarian', 'veggie', 'plant-based', 'meatless', 'no meat',
    'without meat', 'cruelty-free'
}

# --- Utility
def parse_ingredients(ingredients_input):
    """Always returning a parsed list of ingredients"""
    if isinstance(ingredients_input, list):
        return ingredients_input
    if isinstance(ingredients_input, str) and ingredients_input.strip():
        try:
            return ast.literal_eval(ingredients_input)
        except:
            return [
                x.strip().strip('"').strip("'")
                for x in re.split(r',(?![^[]*\])', ingredients_input.strip('[]'))
                if x.strip()
            ]
    return []

def contains_vegetarian_exception(ingredient, keyword):
    exceptions = VEGETARIAN_EXCEPTIONS.get(keyword, set())
    return any(re.search(rf'\b{re.escape(ex)}\b', ingredient, re.IGNORECASE) for ex in exceptions)

# --- Classification
def classify_vegetarian(ingredients_input):
    """Returning True or False based on the ingredients"""
    ingredients = parse_ingredients(ingredients_input)
    for ing in ingredients:
        if not isinstance(ing, str):
            continue
        ing_lower = ing.lower()
        if any(skip in ing_lower for skip in ('water', 'salt')):
            continue
        if any(re.search(rf'\b{re.escape(mod)}\b', ing_lower) for mod in VEGETARIAN_MODIFIERS):
            continue
        for keyword in NON_VEGETARIAN_KEYWORDS:
            if re.search(rf'\b{re.escape(keyword)}(?:s)?\b', ing_lower):
                if not contains_vegetarian_exception(ing_lower, keyword):
                    return False
    return True

def get_non_vegetarian_ingredients(ingredients_input):
    """Returning list of ingredients without vegetarian exceptions"""
    ingredients = parse_ingredients(ingredients_input)
    non_vegetarian = []
    for ing in ingredients:
        if not isinstance(ing, str):
            continue
        ing_lower = ing.lower()
        if any(skip in ing_lower for skip in ('water', 'salt')):
            continue
        for keyword in NON_VEGETARIAN_KEYWORDS:
            if re.search(rf'\b{re.escape(keyword)}(?:s)?\b', ing_lower):
                if not contains_vegetarian_exception(ing_lower, keyword):
                    non_vegetarian.append(ing)
                break
    return non_vegetarian

# Apply to dataframe
df['VEGETARIAN'] = df['NER_clean'].apply(classify_vegetarian)
df['NON_VEGETARIAN_INGREDIENTS'] = df['NER_clean'].apply(get_non_vegetarian_ingredients)

anti_patterns = re.compile(
    r'\b(not vegetarian|not veg|meat|steak|fish|beef|chicken|pork|bacon)\b',
    re.IGNORECASE
)

df['VEGETARIAN'] = df.apply(
    lambda row: False if row['VEGETARIAN'] and isinstance(row['TITLE'], str) and anti_patterns.search(row['TITLE']) else row['VEGETARIAN'],
    axis=1
)

# Show results
df[['TITLE', 'NER', 'VEGETARIAN', 'NON_VEGETARIAN_INGREDIENTS']].head(200)

Unnamed: 0,TITLE,NER,VEGETARIAN,NON_VEGETARIAN_INGREDIENTS
0,No-Bake Nut Cookies,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...",True,[]
1,Jewell Ball'S Chicken,"[""beef"", ""chicken breasts"", ""cream of mushroom...",False,"[beef, chicken breasts]"
2,Creamy Corn,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar...",True,[]
3,Chicken Funny,"[""chicken"", ""chicken gravy"", ""cream of mushroo...",False,"[chicken, chicken gravy]"
4,Reeses Cups(Candy),"[""peanut butter"", ""graham cracker crumbs"", ""bu...",True,[]
...,...,...,...,...
195,Golf Balls,"[""cocoa"", ""butter"", ""powdered sugar"", ""milk"", ...",True,[]
196,Crispy Herb Bread,"[""olive oil"", ""garlic powder"", ""bread"", ""thyme...",True,[]
197,Festive Fruit Salad,"[""pineapple"", ""mandarin oranges"", ""grapes"", ""m...",True,[]
198,Fresh Strawberry Pie,"[""strawberries"", ""water"", ""salt"", ""sugar"", ""co...",True,[]


Qui inizia la magia:

In [8]:
import pandas as pd
import numpy as np
import torch
from transformers import pipeline
import re
from tqdm import tqdm

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
print(f"Current device: {torch.cuda.current_device()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")

classifier = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/deberta-v3-base-zeroshot-v1.1-all-33",
    device=0,
    torch_dtype=torch.float16,
    model_kwargs={"cache_dir": "./cache"},
    batch_size=32,  # Optimal for RTX 3060 Ti
    framework="pt"
)

# %% Data Loading
df = pd.read_csv("dataset_1.csv")
df['NER'] = df['NER'].apply(eval)  # Convert string lists to actual lists

# %% Food Categories & Prices (EUR/kg)
CATEGORIES = [
    "dairy", "meat", "seafood", "grain", "vegetable",
    "fruit", "spice/herb", "processed", "sweetener",
    "condiment", "legume", "oil/fat"
]

MEDIAN_PRICES = {
    "dairy": 3.50,       # Milk, cheese
    "meat": 7.50,        # Chicken, beef
    "seafood": 12.00,    # Fish, shrimp
    "grain": 2.20,       # Flour, rice
    "vegetable": 1.80,   # Onions, garlic
    "fruit": 2.50,       # Tomatoes, bananas
    "spice/herb": 18.00, # Vanilla, cinnamon
    "processed": 4.50,   # Pasta, canned goods
    "sweetener": 2.20,   # Sugar
    "condiment": 5.00,   # Mayo, dressings
    "legume": 3.00,      # Beans, lentils
    "oil/fat": 8.00      # Olive oil
}

# %% Ingredient Cleaning (Fixed)
def clean_ingredient(ingredient: str) -> str:
    """Conservative cleaning preserving ingredient names"""
    # Remove quantities (e.g., "200g", "1/2 cup")
    cleaned = re.sub(r'\b\d+[\d/\.]*\s*[a-z]*\b', '', ingredient, flags=re.IGNORECASE)
    # Remove special chars except spaces
    cleaned = re.sub(r'[^\w\s]', '', cleaned).strip().lower()
    return cleaned if cleaned else "unknown"

# %% Batch Classification (GPU-optimized)
classification_cache = {}

def batch_classify(ingredients: list, batch_size: int = 16) -> dict:  # Reduced batch size
    unique_ingredients = list(set(ingredients))

    with torch.no_grad():  # Disable gradient tracking
        for batch in tqdm([unique_ingredients[i:i+batch_size]
                           for i in range(0, len(unique_ingredients), batch_size)],
                          desc="Classifying Ingredients"):
            # Process batch on GPU
            results = classifier(batch, CATEGORIES, multi_label=False)

            # Cache results
            for ing, result in zip(batch, results):
                classification_cache[ing] = result['labels'][0]

            # Clear GPU cache
            torch.cuda.empty_cache()

    return classification_cache

# %% Process Entire Dataset
# Get all unique ingredients
all_ingredients = [clean_ingredient(ing)
                   for recipe in df['NER']
                   for ing in recipe]
unique_ingredients = list(set(all_ingredients))

# Batch classify with progress bar
_ = batch_classify(unique_ingredients, batch_size=64)

# Calculate recipe costs
df['total_cost'] = df['NER'].apply(
    lambda x: round(sum(
        MEDIAN_PRICES.get(classification_cache[clean_ingredient(ing)], 3.00)
        for ing in x
    ), 2)
)

# Dynamic price categorization
costs = df['total_cost'].values
q33, q66 = np.percentile(costs, [33, 66])

df['price_tag'] = df['total_cost'].apply(
    lambda x: 'cheap' if x <= q33 else 'medium' if x <= q66 else 'expensive'
)

# %% Generate Ingredient-Category Dataset
ingredient_df = pd.DataFrame(
    [(ing, cat) for ing, cat in classification_cache.items()],
    columns=['ingredient', 'category']
)

# Add dietary tags
ingredient_df['vegan'] = ~ingredient_df['category'].isin(['dairy', 'meat', 'seafood'])
ingredient_df['vegetarian'] = ~ingredient_df['category'].isin(['meat', 'seafood'])

# Save outputs
df.to_csv('recipes_with_prices.csv', index=False)
ingredient_df.to_csv('ingredient_categories.csv', index=False)

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

Qui la magia finisce :)

In [23]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import pickle

# === 1. Pre-elaborazione: selezione prime 10.000 righe e colonne necessarie ===
df_subset = df[["INGREDIENTS", "DIRECTIONS", "PREPARATION_TIME"]]

# === 2. Unione ingredienti + passaggi in full_text ===
df_subset["INGREDIENTS"] = df_subset["INGREDIENTS"].apply(eval)
df_subset["DIRECTIONS"] = df_subset["DIRECTIONS"].apply(eval)
df_subset["full_text"] = df_subset["INGREDIENTS"].apply(lambda x: " ".join(x)) + " " + df_subset["DIRECTIONS"].apply(lambda x: " ".join(x))

# === 3. Codifica etichette PREPARATION_TIME ===
label_encoder = LabelEncoder()
df_subset["PREP_TIME_ENCODED"] = label_encoder.fit_transform(df_subset["PREPARATION_TIME"])
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Etichette PREPARATION_TIME:", label_mapping)

# === 4. Train/test split ===
X = df_subset["full_text"]
y = df_subset["PREP_TIME_ENCODED"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# === 5. Tokenizzazione e padding ===
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

max_len = 500
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post", truncating="post")
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding="post", truncating="post")

# === 6. Costruzione del modello ===
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=max_len),
    GlobalMaxPooling1D(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(len(label_mapping), activation='softmax')
])

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()

# === 7. Addestramento ===
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=10,
    batch_size=32,
    callbacks=[early_stop]
)

# === 8. Salvataggio ===
model.save("prep_time_classifier.h5")

with open("prep_time_label_mapping.pkl", "wb") as f:
    pickle.dump(label_mapping, f)

print("Modello e label mapping salvati.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["INGREDIENTS"] = df_subset["INGREDIENTS"].apply(eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["DIRECTIONS"] = df_subset["DIRECTIONS"].apply(eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["full_text"] = df_subset["INGREDIENTS"].apply(lambda x: " ".join(x)) 

Etichette PREPARATION_TIME: {'Fast (10-20 mins)': 0, 'Medium (20-40 mins)': 1, 'Slow (40-90 mins)': 2, 'Very fast (0-10 mins)': 3, 'Very slow (90+ mins)': 4}




Epoch 1/10
[1m5578/5578[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 41ms/step - accuracy: 0.6922 - loss: 0.8318 - val_accuracy: 0.8383 - val_loss: 0.4659
Epoch 2/10
[1m5578/5578[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 36ms/step - accuracy: 0.8216 - loss: 0.5015 - val_accuracy: 0.8487 - val_loss: 0.4304
Epoch 3/10
[1m5578/5578[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 33ms/step - accuracy: 0.8403 - loss: 0.4468 - val_accuracy: 0.8556 - val_loss: 0.4061
Epoch 4/10
[1m5578/5578[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 33ms/step - accuracy: 0.8504 - loss: 0.4200 - val_accuracy: 0.8561 - val_loss: 0.4015
Epoch 5/10
[1m5578/5578[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 32ms/step - accuracy: 0.8560 - loss: 0.3981 - val_accuracy: 0.8577 - val_loss: 0.3932
Epoch 6/10
[1m5578/5578[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 32ms/step - accuracy: 0.8618 - loss: 0.3804 - val_accuracy: 0.8590 - val_loss: 0.391



Modello e label mapping salvati.


In [24]:
# Lista di ricette
recipes = [
    {
        "ingredients": "200g spaghetti, 2 cloves garlic, chili pepper, extra virgin olive oil, salt",
        "instructions": "Cook the spaghetti. Meanwhile, sauté garlic and chili in the oil. Mix with the pasta.",
        "expected_time": 15
    },
    {
        "ingredients": "Lettuce, 1 can of tuna, tomatoes, black olives, onion, olive oil, salt",
        "instructions": "Wash and cut the vegetables. Add the drained tuna and olives. Dress with oil and salt.",
        "expected_time": 10
    },
    {
        "ingredients": "300g Carnaroli rice, 1 saffron sachet, vegetable broth, butter, onion, Parmesan cheese",
        "instructions": "Sauté the onion, add rice and deglaze with wine. Gradually pour in the broth. Add saffron and stir in butter and cheese.",
        "expected_time": 30
    },
    {
        "ingredients": "3 eggs, salt, pepper, butter",
        "instructions": "Beat the eggs with salt and pepper. Pour into a pan with melted butter and stir until set.",
        "expected_time": 5
    },
    {
        "ingredients": "Lasagna sheets, meat sauce, béchamel, Parmesan cheese",
        "instructions": "Layer pasta, sauce, béchamel and cheese. Bake at 180°C for 40 minutes.",
        "expected_time": 60
    },
    {
        "ingredients": "Apples, bananas, oranges, strawberries, lemon juice, sugar",
        "instructions": "Cut the fruit into pieces, mix with lemon juice and sugar, refrigerate.",
        "expected_time": 10
    },
    {
        "ingredients": "Chicken breast, onion, curry powder, cream or coconut milk, oil, salt",
        "instructions": "Brown the onion, add diced chicken, curry, and cream. Cook until the chicken is tender.",
        "expected_time": 25
    },
    {
        "ingredients": "Ladyfingers, mascarpone, eggs, sugar, coffee, cocoa powder",
        "instructions": "Make a cream with yolks, sugar and mascarpone. Layer ladyfingers soaked in coffee and the cream. Dust with cocoa and chill.",
        "expected_time": 30
    },
    {
        "ingredients": "2 eggs, 250ml milk, 125g flour, sugar, butter",
        "instructions": "Mix flour, eggs and milk into a batter. Cook in pan with butter. Fill as desired.",
        "expected_time": 20
    },
    {
        "ingredients": "Mixed dried legumes, onion, carrot, celery, vegetable broth, oil, salt",
        "instructions": "Soak legumes for 12 hours. Sauté onion, carrot, celery. Add legumes and broth and cook for at least an hour.",
        "expected_time": 90
    }
]

# Imposta il valore maxlen usato nel training
maxlen = max_len  # ← Cambia questo valore se ne hai usato uno diverso

# Prepara i testi concatenando ingredienti e istruzioni per ogni ricetta
sample_texts = [r["ingredients"] + " " + r["instructions"] for r in recipes]

# Tokenizza e pad
sequences = tokenizer.texts_to_sequences(sample_texts)
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

# Fai la predizione su tutte le ricette
predictions = model.predict(padded_sequences)
predicted_classes = predictions.argmax(axis=1)

# Etichette di output
label_map = {
    0: 'Fast (10-20 mins)',
    1: 'Medium (20-40 mins)',
    2: 'Slow (40-90 mins)',
    3: 'Very fast (0-10 mins)',
    4: 'Very slow (90+ mins)'
}

# Mappa ogni predizione all'etichetta corrispondente
labels = [label_map[pc] for pc in predicted_classes]

# Stampa tutte le etichette predette, in ordine
for i, pc in enumerate(predicted_classes):
    pred_label = label_map[pc]
    expected = recipes[i]["expected_time"]
    print(f"Recipe {i+1} predicted: {pred_label} | Expected time (mins): {expected}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 483ms/step
Recipe 1 predicted: Fast (10-20 mins) | Expected time (mins): 15
Recipe 2 predicted: Fast (10-20 mins) | Expected time (mins): 10
Recipe 3 predicted: Medium (20-40 mins) | Expected time (mins): 30
Recipe 4 predicted: Slow (40-90 mins) | Expected time (mins): 5
Recipe 5 predicted: Slow (40-90 mins) | Expected time (mins): 60
Recipe 6 predicted: Fast (10-20 mins) | Expected time (mins): 10
Recipe 7 predicted: Medium (20-40 mins) | Expected time (mins): 25
Recipe 8 predicted: Medium (20-40 mins) | Expected time (mins): 30
Recipe 9 predicted: Slow (40-90 mins) | Expected time (mins): 20
Recipe 10 predicted: Very slow (90+ mins) | Expected time (mins): 90
