In [1]:
import pandas as pd

# Read File and create a new dataframe called df
df = pd.read_csv('dataset_1.csv')

In [2]:
import re
from fractions import Fraction

def convert_fractions(text):
    # Match optional whole number + fraction, or just fraction
    pattern = r'(?:(\d+)\s+)?(\d+)/(\d+)'

    def replacer(match):
        whole = int(match.group(1)) if match.group(1) else 0
        numerator = int(match.group(2))
        denominator = int(match.group(3))
        decimal = whole + Fraction(numerator, denominator)
        return "{:.10g}".format(float(decimal))  # Clean, no trailing zeros

    return re.sub(pattern, replacer, text)

In [3]:
from fractions import Fraction

# fixed conversions
CONVERSIONS = {
    "oz": 30,      # ounce → gram
    "lb": 450,      # pounds → gram
    "pt": 475,      # pint → milliliter
    "qt": 950,      # quart → milliliter
    "inch": 2.5        # inches → centimeter
}

In [4]:
def convert_units(text):
    if pd.isna(text):
        return text

    # Convert oz, lb, pt, qt (form: "4 oz", "2.5 lb", etc.)
    for unit, factor in CONVERSIONS.items():
        pattern = r'(\d+(\.\d+)?)\s*' + unit
        text = re.sub(pattern, lambda m: f"{round(float(m.group(1)) * factor, 1)} {unit_to_metric(unit)}", text)

    # Convert measure in inches in pans
    text = re.sub(r'(\d+)\s*x\s*(\d+)[-\s]*inch', lambda m: f"{int(m.group(1)) * CONVERSIONS['inch']:.0f} x {int(m.group(2)) * CONVERSIONS['inch']:.0f} cm", text)

    return text

def convert_fahrenheit_to_celsius(text):
    # Match degrees like 275°, 275 °F, 275\u00b0F (escaped), etc.
    pattern = r'(\d+)\s*(?:°|\\u00b0)'

    def replacer(match):
        fahrenheit = int(match.group(1))
        celsius = round((fahrenheit - 32) * 5 / 9)
        return f"{celsius}°C"

    return re.sub(pattern, replacer, text)

def unit_to_metric(unit):
    return {
        "oz": "g",
        "lb": "g",
        "pt": "ml",
        "qt": "ml",
        "inch": "cm"
    }[unit]

In [5]:
#applying conversions on the INGREDIENTS and DIRECTIOND columns
df['INGREDIENTS'] = df['ingredients'].apply(convert_fractions)
df['DIRECTIONS'] = df['directions'].apply(convert_fractions)

df['INGREDIENTS'] = df['INGREDIENTS'].apply(convert_units)
df['DIRECTIONS'] = df['DIRECTIONS'].apply(convert_units)

df['DIRECTIONS'] = df['DIRECTIONS'].apply(convert_fahrenheit_to_celsius)

#renaming the title column
df = df.rename(columns={"title": "TITLE"})

df[['TITLE', 'INGREDIENTS', 'DIRECTIONS']].head(200)

Unnamed: 0,TITLE,INGREDIENTS,DIRECTIONS
0,Western Sizzlin Bread Pudding,"[""16 eggs, beaten"", ""6 -8 evaporated milk, 360...","[""Distribute rolls and cinnamon roll in (4) 2\..."
1,Creamy Tuna And Bacon Salad (Reduced Fat),"[""1 (6 ounce) can tuna, drained and flaked"", ""...","[""Put everything in a bowl and mix together un..."
2,Spinach And Mushroom Lasagna,"[""1 tablespoon olive oil"", ""1 medium onion , c...","[""Heat oven to 375 degrees."", """", ""In medium s..."
3,Three-Bean Tacos,"[""1 teaspoon olive oil"", ""1 cup diced onion"", ...","[""Heat oil in a large skillet over medium-high..."
4,Hearty Hamburger Soup,"[""900.0 gs lean ground beef"", ""1 white onion, ...","[""Brown ground beef and onion in a large pot. ..."
...,...,...,...
195,Sweet Potato Brotchen (Bread Rolls),"[""450 g sweet potatoes"", ""15 g butter or olive...","[""Peel and cut sweet potatoes into chunks."", ""..."
196,Green Cleaner and Bug Repellant in One (Concen...,"[""1 tablespoon essential oil (part)"", ""3 table...","[""Mix all ingredients."", ""Store in a glass jar..."
197,Chocolate Chip Pie,"[""1 graham cracker crust"", ""20 large marshmall...","[""Make crust (I buy the prepared ones from the..."
198,Poppy Seed Dressing,"[""0.75 c. sugar"", ""2 tsp. dry mustard"", ""2 tsp...","[""Combine first 6 ingredients in a blender con..."


In [6]:
import ast
import re

# patterns to detect time expressions
time_pattern = re.compile(
    r'(\d+\.?\d*)\s*(minute|minutes|min|mins|m|hour|hours|hr|hrs|h|day|days|d)\b',
    re.IGNORECASE
)

range_pattern = re.compile(
    r'(\d+)\s*-\s*(\d+)\s*(minute|minutes|min|mins|m|hour|hours|hr|hrs|h)\b',
    re.IGNORECASE
)

# Time estimates for common preparation methods
PREP_ESTIMATES = {
    'bake': 45,
    'boil': 20,
    'fry': 15,
    'grill': 25,
    'chill': 120,
    'simmer': 30,
    'marinate': 60,
    'microwave': 10,
    'no-bake': 20,
    'refrigerate': 180,
    'freeze': 240
}

# Special cases for recipe types
RECIPE_TYPE_ESTIMATES = {
    'pasta': 10,
    'salad': 15,
    'cake': 45,
    'pie': 60,
    'stew': 120,
    'casserole': 60,
    'soup': 30,
    'cookies': 30,
    'bread': 90,
    'fudge': 20,
    'candy': 30
}

# Convert time quantities to minutes
def convert_to_minutes(qty, unit):
    qty = float(qty)
    unit = unit.lower()
    if unit in ['minute', 'minutes', 'min', 'mins', 'm']:
        return qty
    elif unit in ['hour', 'hours', 'hr', 'hrs', 'h']:
        return qty * 60
    elif unit in ['day', 'days', 'd']:
        return qty * 24 * 60
    return 0

# Estimate time on recipe type
def estimate_by_recipe_type(recipe_name, ingredients):
    recipe_name = str(recipe_name).lower()
    ingredients = str(ingredients).lower()

    # Check for specific recipe types
    for recipe_type, time in RECIPE_TYPE_ESTIMATES.items():
        if recipe_type in recipe_name:
            return time

    # Estimate based on ingredients
    if 'raw' in ingredients or 'fresh' in ingredients:
        return 15
    if 'frozen' in ingredients:
        return 20
    if 'canned' in ingredients:
        return 10

    # Default estimation
    return 30

# Extract and sum all time references from recipe instructions
def parse_instructions(instructions):
    if isinstance(instructions, str):
        try:
            instructions = ast.literal_eval(instructions)
        except:
            instructions = [instructions]

    total_time = 0
    for step in instructions:
        if not isinstance(step, str):
            continue

        # Handle time ranges
        step = re.sub(
            range_pattern,
            lambda m: f'{m.group(2)} {m.group(3)}',
            step
        )

        # Special cases
        if 'overnight' in step.lower():
            total_time += 480  # 8 hours
        elif 'until set' in step.lower() or 'until firm' in step.lower():
            total_time += 60  # 1 hour estimation

        # Find all time references
        matches = time_pattern.findall(step)
        for (qty, unit) in matches:
            total_time += convert_to_minutes(qty, unit)

    return total_time

def categorize_time(total_time):
    if total_time == 0:
        return 'Not specified'
    elif total_time < 10:
        return 'Very fast (0-10 mins)'
    elif 10 <= total_time < 20:
        return 'Fast (10-20 mins)'
    elif 20 <= total_time < 40:
        return 'Medium (20-40 mins)'
    elif 40 <= total_time < 90:
        return 'Slow (40-90 mins)'
    else:
        return 'Very slow (90+ mins)'

# Calculate total preparation time
df['total_time'] = df['DIRECTIONS'].apply(parse_instructions)

# Estimate time for unspecified recipes
mask = df['total_time'] == 0
df.loc[mask, 'total_time'] = df[mask].apply(
    lambda x: estimate_by_recipe_type(x['TITLE'], x['INGREDIENTS']),
    axis=1
)

# Categorize preparation times
df['PREPARATION_TIME'] = df['total_time'].apply(categorize_time)

df[['TITLE', 'INGREDIENTS', 'DIRECTIONS', 'PREPARATION_TIME']].head(200)

Unnamed: 0,TITLE,INGREDIENTS,DIRECTIONS,PREPARATION_TIME
0,Western Sizzlin Bread Pudding,"[""16 eggs, beaten"", ""6 -8 evaporated milk, 360...","[""Distribute rolls and cinnamon roll in (4) 2\...",Very slow (90+ mins)
1,Creamy Tuna And Bacon Salad (Reduced Fat),"[""1 (6 ounce) can tuna, drained and flaked"", ""...","[""Put everything in a bowl and mix together un...",Fast (10-20 mins)
2,Spinach And Mushroom Lasagna,"[""1 tablespoon olive oil"", ""1 medium onion , c...","[""Heat oven to 375 degrees."", """", ""In medium s...",Slow (40-90 mins)
3,Three-Bean Tacos,"[""1 teaspoon olive oil"", ""1 cup diced onion"", ...","[""Heat oil in a large skillet over medium-high...",Medium (20-40 mins)
4,Hearty Hamburger Soup,"[""900.0 gs lean ground beef"", ""1 white onion, ...","[""Brown ground beef and onion in a large pot. ...",Slow (40-90 mins)
...,...,...,...,...
195,Sweet Potato Brotchen (Bread Rolls),"[""450 g sweet potatoes"", ""15 g butter or olive...","[""Peel and cut sweet potatoes into chunks."", ""...",Very slow (90+ mins)
196,Green Cleaner and Bug Repellant in One (Concen...,"[""1 tablespoon essential oil (part)"", ""3 table...","[""Mix all ingredients."", ""Store in a glass jar...",Medium (20-40 mins)
197,Chocolate Chip Pie,"[""1 graham cracker crust"", ""20 large marshmall...","[""Make crust (I buy the prepared ones from the...",Slow (40-90 mins)
198,Poppy Seed Dressing,"[""0.75 c. sugar"", ""2 tsp. dry mustard"", ""2 tsp...","[""Combine first 6 ingredients in a blender con...",Very fast (0-10 mins)


In [7]:
import ast

# Converte la colonna da stringhe a liste Python vere e proprie
df['NER_clean'] = df['NER'].apply(ast.literal_eval)

# Appiattisce tutte le liste in un'unica lista di ingredienti
all_ner = [item.strip().lower() for sublist in df['NER_clean'] for item in sublist]

# Rimuove duplicati e ordina
unique_ingredients = sorted(set(all_ner))

# Crea un DataFrame
df_unique_ingredients = pd.DataFrame(unique_ingredients, columns=['Ingredient'])

#df_unique_ingredients

In [8]:
import pandas as pd
import re
import ast

# 1. Carica prezzi e normalizza le chiavi
prices_df = pd.read_csv('ingredients_prices')  # Assicurati dell'estensione .csv
price_dict = {k.strip().lower(): v for k, v in prices_df.set_index('Ingredient')['Cost'].items()}

# 2. Conversione unità → kg
UNIT_CONVERSION = {
    'cup': 0.24, 'c.': 0.24, 'c': 0.24,
    'teaspoon': 0.005, 'tsp': 0.005, 'tsp.': 0.005,
    'tablespoon': 0.015, 'tbsp': 0.015, 'tbsp.': 0.015,
    'package': 0.2, 'pkg': 0.2, 'pkg.': 0.2,
    'can': 0.4, 'carton': 1.0,
    'g': 0.001, 'g.': 0.001, 'gram': 0.001,
    'kg': 1.0, 'kilogram': 1.0,
    'lb': 0.4536, 'pound': 0.4536,
    'oz': 0.02835, 'ounce': 0.02835,
    'large package': 0.5, 'large pkg.': 0.5, 'large pkg': 0.5
}

# 3. Parsing function (qty, unit, name, grams_in_paren)
def parse_ingredient(ing_str):
    # peso fra parentesi
    m = re.search(r'\(\s*([0-9]+(?:\.[0-9]+)?)\s*g\.?\s*\)', ing_str, re.IGNORECASE)
    grams = float(m.group(1)) if m else None

    # quantity e unit
    patt = r'^\s*([\d\/.\s]+)?\s*([a-zA-Z\.]+)?'
    m2 = re.match(patt, ing_str)
    qty_raw = m2.group(1) if m2 else None
    try:
        qty = eval(qty_raw.replace(' ', '+')) if qty_raw else 1.0
    except:
        qty = 1.0
    unit = (m2.group(2) or '').strip('.').lower() if m2 else ''

    # name pulito
    name = re.sub(r'\(.*?\)|optional', '', ing_str, flags=re.IGNORECASE)
    # rimuovo quantità/unità
    name = re.sub(r'^[\d\/.\s]+\s*[a-zA-Z\.]*', '', name).strip().lower()
    return qty, unit, name, grams

# 4. Calcolo posizionale con fallback substring e token
def calculate_recipe_cost_positional(ingredients_list, ner_list):
    total = 0.0
    missing = []

    for ing_str, ner_item in zip(ingredients_list, ner_list):
        key = ner_item.lower()
        qty, unit, name, grams = parse_ingredient(ing_str)

        # scorporo se passo i grams
        if grams is not None:
            kg = (grams / 1000) * qty
        elif unit == '':  # a pezzo
            # prendo prezzo direct key
            price = price_dict.get(key)
            # fallback substring/token
            if price is None:
                # substring match
                for k, v in price_dict.items():
                    if k in key or key in k:
                        price = v
                        break
                # token match
                if price is None:
                    for token in key.split():
                        if token in price_dict:
                            price = price_dict[token]
                            break
            if price:
                total += qty * price
            else:
                missing.append(key)
            continue
        else:
            conv = UNIT_CONVERSION.get(unit, 0.0)
            if conv == 0:
                missing.append(key)
                continue
            kg = qty * conv

        # prezzo per kg
        price = price_dict.get(key)
        if price is None:
            for k, v in price_dict.items():
                if k in key or key in k:
                    price = v
                    break
        if price is None:
            missing.append(key)
            continue

        total += kg * price

    return round(total, 2), missing

# 5. Applica al DataFrame
    df['INGREDIENTS'] = df['INGREDIENTS'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df['NER_clean']  = df['NER_clean'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

def compute_cost_row(row):
     cost, _ = calculate_recipe_cost_positional(row['INGREDIENTS'], row['NER_clean'])
     return cost

def categorize_cost(total_cost):
    if total_cost == 0:
        return 'Not specified'
    elif total_cost < 10:
        return 'Very cheap'
    elif 10 <= total_cost < 20:
        return 'Cheap'
    elif 20 <= total_cost < 45:
        return 'Medium'
    elif 45 <= total_cost < 90:
        return 'Expensive'
    else:
        return 'Rich'

df['total_cost'] = df.apply(compute_cost_row, axis=1)

df['CATEGORY_COST'] = df['total_cost'].apply(categorize_cost)

# Mostra risultati
df[['INGREDIENTS', 'NER', 'CATEGORY_COST']].head(200)

Unnamed: 0,INGREDIENTS,NER,CATEGORY_COST
0,"[""16 eggs, beaten"", ""6 -8 evaporated milk, 360...","[""eggs"", ""milk"", ""sugar"", ""vanilla"", ""gallon w...",Expensive
1,"[""1 (6 ounce) can tuna, drained and flaked"", ""...","[""tuna"", ""mayonnaise"", ""low-fat sour cream"", ""...",Medium
2,"[""1 tablespoon olive oil"", ""1 medium onion , c...","[""olive oil"", ""onion"", ""garlic"", ""salt"", ""grou...",Medium
3,"[""1 teaspoon olive oil"", ""1 cup diced onion"", ...","[""olive oil"", ""onion"", ""red bell pepper"", ""gre...",Expensive
4,"[""900.0 gs lean ground beef"", ""1 white onion, ...","[""lean ground beef"", ""white onion"", ""ground bl...",Rich
...,...,...,...
195,"[""450 g sweet potatoes"", ""15 g butter or olive...","[""sweet potatoes"", ""butter"", ""salt"", ""flour"", ...",Medium
196,"[""1 tablespoon essential oil (part)"", ""3 table...","[""essential oil"", ""vodka"", ""liquid soap""]",Medium
197,"[""1 graham cracker crust"", ""20 large marshmall...","[""graham cracker crust"", ""marshmallows"", ""milk...",Medium
198,"[""0.75 c. sugar"", ""2 tsp. dry mustard"", ""2 tsp...","[""sugar"", ""dry mustard"", ""salt"", ""vinegar"", ""o...",Expensive


In [9]:
import pandas as pd

# Carica il file CSV
vegan_df = pd.read_csv('ingredients_prices')

# Pulizia degli ingredienti (rimuovi prefissi speciali)
vegan_df['Ingredient'] = vegan_df['Ingredient'].str.replace(r'^[s+()\\.",:/-]+\s*', '', regex=True)

# Lista di parole chiave non vegane
NON_VEGAN_KEYWORDS = {
    'milk', 'cheese', 'butter', 'cream', 'yogurt', 'gelatin', 'lard', 'honey',
    'egg', 'eggs', 'fish', 'meat', 'chicken', 'beef', 'pork', 'gelatina',
    'collagen', 'casein', 'whey', 'lactose', 'ghee', 'isinglass', 'carmine',
    'shellac', 'albumen', 'pepsin', 'royal jelly', 'propolis', 'cocoa butter'
}

VEGAN_EXCEPTIONS = {
    'milk': {'soy', 'almond', 'oat', 'rice', 'coconut', 'cashew', 'hazelnut'},
    'cheese': {'vegan', 'nutritional yeast', 'cashew', 'tofu'},
    'meat': {'soy', 'seitan', 'tofu', 'tempeh', 'jackfruit', 'plant-based'},
    'butter': {'vegan', 'plant', 'peanut', 'almond', 'soy'},
    'cream': {'coconut', 'soy', 'oat', 'vegan'}
}

VEGAN_MODIFIERS = {
    'vegan', 'vegetale', 'plant-based', 'senza latte', 'senza uova',
    'dairy-free', 'senza derivati animali', 'cruelty-free', '100% vegetale'
}

def contains_vegan_exception(ingredient, keyword):
    exceptions = VEGAN_EXCEPTIONS.get(keyword, set())
    return any(re.search(rf'\b{ex}\b', ingredient, re.IGNORECASE) for ex in exceptions)

# Funzione di classificazione
def is_vegan(ingredient):
    ingredient_lower = ingredient.lower()

    # Controlla parole chiave non vegane
    for keyword in NON_VEGAN_KEYWORDS:
        if keyword in ingredient_lower:
            return 'No'

    # Eccezioni speciali
    if 'margarine' in ingredient_lower and 'vegan' not in ingredient_lower:
        return 'No'

    if 'broth' in ingredient_lower and ('chicken' in ingredient_lower or 'beef' in ingredient_lower):
        return 'No'

    return 'Yes'

# Applica la classificazione
vegan_df['Vegan'] = vegan_df['Ingredient'].apply(is_vegan)

vegan_df

Unnamed: 0,Ingredient,Cost,Vegan
0,A Cassell Courtyard,8.0,Yes
1,A heartfol,8.0,Yes
2,A.P.,8.0,Yes
3,ABCs,8.0,Yes
4,Abates tomato sauce,8.0,Yes
...,...,...,...
25693,zuckini,8.0,Yes
25694,zuke,8.0,Yes
25695,zwieback,8.0,Yes
25696,zwieback crackers,8.0,Yes


In [4]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import ast

# ---------- 1. Category Rules ----------
CATEGORY_RULES = {
    "Meat": ["bacon", "beef", "chicken", "ham", "turkey", "liver", "hamburger", "pancetta", "prosciutto"],
    "Seafood": ["shrimp", "tuna", "anchovy", "salmon"],
    "Vegetable": ["onion", "garlic", "lettuce", "carrot", "pepper", "tomato", "potato", "celery", "scallion", "bean sprouts"],
    "Fruit": ["lemon", "banana", "orange", "raisins", "apple", "avocado"],
    "Grain": ["flour", "cornmeal", "tortilla", "rice", "bread", "wafer", "cake", "noodles", "linguine", "rolls"],
    "Dairy": ["milk", "cheese", "butter", "sour cream", "cream cheese", "parmesan", "whiz", "pecorino"],
    "Fat/Oil": ["olive oil", "vegetable oil", "sunflower oil", "margarine", "sesame oil"],
    "Spice": ["salt", "pepper", "cinnamon", "nutmeg", "ginger", "paprika", "oregano", "thyme", "sage", "cayenne"],
    "Sweetener": ["sugar", "molasses", "honey", "chocolate", "syrup"],
    "Other": []
}

# ---------- 2. Ingredient Classification from DataFrame ----------
def extract_ingredients_and_categories(df, column="NER"):
    data = []
    for _, row in df.iterrows():
        try:
            items = ast.literal_eval(row[column])
        except Exception:
            continue  # skip malformed rows
        for item in items:
            item_lower = item.lower()
            matched = False
            for category, keywords in CATEGORY_RULES.items():
                if any(k in item_lower for k in keywords):
                    data.append({"ingredient": item.strip(), "category": category})
                    matched = True
                    break
            if not matched:
                data.append({"ingredient": item.strip(), "category": "Other"})
    return pd.DataFrame(data)

# ---------- 3. Load and process your CSV ----------
df = pd.read_csv("dataset_1.csv")  # your original dataset
ingredient_df = extract_ingredients_and_categories(df)  # expanded dataset

# ---------- 4. Label encoding ----------
label_encoder = LabelEncoder()
ingredient_df["category"] = ingredient_df["category"].astype(str)
label_encoder.fit(ingredient_df["category"])
ingredient_df["label"] = label_encoder.transform(ingredient_df["category"])

# Debug info
print(f"Number of categories: {len(label_encoder.classes_)}")
print(f"Label range: {ingredient_df['label'].min()} - {ingredient_df['label'].max()}")
print(f"Classes: {label_encoder.classes_}")
print(f"Labels: {sorted(ingredient_df['label'].unique())}")

# ---------- 5. Prepare HuggingFace dataset ----------
dataset = Dataset.from_pandas(ingredient_df[["ingredient", "label"]])

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize(batch):
    return tokenizer(batch["ingredient"], padding=True, truncation=True)

tokenized_dataset = dataset.map(tokenize, batched=True)

# ---------- 6. Load model ----------
# FIX: Explicitly set num_labels parameter to match the actual number of classes
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels-1
)

training_args = TrainingArguments(
    output_dir="./results",  # Add an output directory
    per_device_train_batch_size=16,
    num_train_epochs=3,
    save_strategy="no",
    logging_steps=10,
    disable_tqdm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

# ---------- 7. Save final model and labels ----------
model_path = "ingredient_classifier_model"
label_mapping_path = "category_labels.txt"

trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

with open(label_mapping_path, "w") as f:
    for i, c in enumerate(label_encoder.classes_):
        f.write(f"{i},{c}\n")

Number of categories: 10
Label range: 0 - 9
Classes: ['Dairy' 'Fat/Oil' 'Fruit' 'Grain' 'Meat' 'Other' 'Seafood' 'Spice'
 'Sweetener' 'Vegetable']
Labels: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


Map:   0%|          | 0/378537 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: expected sequence of length 9 at dim 1 (got 10)

In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_path = "ingredient_classifier_model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load label mapping
label_map = {}
with open("category_labels.txt", "r") as f:
    for line in f:
        idx, label = line.strip().split(",", 1)
        label_map[int(idx)] = label

# Function to classify one ingredient
def classify_ingredient(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()
    return label_map[prediction]

# Example usage
ingredients = [
    "ground beef",
    "sugar",
    "cheddar cheese",
    "fresh basil",
    "shrimp",
    "coconut milk"
]

for ing in ingredients:
    category = classify_ingredient(ing)
    print(f"{ing:20} ➜ {category}")


ground beef          ➜ Vegetable
sugar                ➜ Vegetable
cheddar cheese       ➜ Vegetable
fresh basil          ➜ Vegetable
shrimp               ➜ Vegetable
coconut milk         ➜ Vegetable


In [9]:
from ast import literal_eval
import pandas as pd
import numpy as np
from transformers import pipeline
import re

data = {'NER': [
    ["eggs", "milk", "sugar", "vanilla", "gallon water", "dinner rolls", "cinnamon roll"],
    ["tuna", "mayonnaise", "low-fat sour cream", "bacon bits", "dill", "salt"],
    ["olive oil", "onion", "garlic", "salt", "ground black pepper", "tomatoes",
     "tomatoes", "ricotta cheese", "Parmesan cheese", "fresh basil", "egg", "salt",
     "ground black pepper", "lasagna noodles", "mozzarella", "mushrooms"],
    ["olive oil", "onion", "red bell pepper", "green bell pepper", "chili powder",
     "oregano", "ground cumin", "garlic", "garbanzo beans", "black beans", "pinto beans",
     "tomato sauce", "taco", "shredded iceberg lettuce", "tomato", "cheddar cheese", "salsa"]
]}

df = pd.DataFrame(data)

# Food categories and prices
CATEGORIES = [
    "dairy", "meat", "seafood", "grain", "vegetable",
    "fruit", "spice/herb", "processed", "sweetener",
    "condiment", "legume", "oil/fat"
]

MEDIAN_PRICES = {
    "dairy": 3.00,       # Milk, cheese (per kg)
    "meat": 8.00,        # Chicken, beef (per kg)
    "seafood": 12.00,    # Fish, shrimp (per kg)
    "grain": 2.00,       # Flour, rice (per kg)
    "vegetable": 1.50,   # Onions, garlic (per kg)
    "fruit": 2.00,       # Tomatoes, bananas (per kg)
    "spice/herb": 15.00, # Vanilla, cinnamon (per kg)
    "processed": 5.00,   # Pasta, canned goods (per kg)
    "sweetener": 1.80,   # Sugar (per kg)
    "condiment": 4.00,   # Mayo, dressings (per kg)
    "legume": 2.50,      # Beans, lentils (per kg)
    "oil/fat": 6.00      # Olive oil (per kg)
}

# Initialize classifier
classifier = pipeline(
    "zero-shot-classification",
    model="valhalla/distilbart-mnli-12-3",
    device=0,  # Requires CUDA-enabled environment
    torch_dtype=torch.float16  # Enable mixed precision
)

classification_cache = {}

# Improved cleaning function
def clean_ingredient(ingredient):
    """More conservative cleaning"""
    # Remove quantities like "1/2 cup" or "200g"
    cleaned = re.sub(r'\b\d+[\d/\.°]*\s*(\w+)?\b', '', ingredient, flags=re.IGNORECASE)
    # Remove special characters but keep spaces
    cleaned = re.sub(r'[^\w\s]', '', cleaned).strip().lower()
    return cleaned if cleaned else "unknown_ingredient"

def categorize_ingredient(ingredient):
    """Safe classification with error handling"""
    cleaned = clean_ingredient(ingredient)

    if not cleaned or cleaned == "unknown_ingredient":
        return "processed"  # Default category

    if cleaned in classification_cache:
        return classification_cache[cleaned]

    try:
        result = classifier(cleaned, CATEGORIES, multi_label=False)
        category = result['labels'][0]
        classification_cache[cleaned] = category
        return category
    except Exception as e:
        print(f"Error classifying '{cleaned}': {str(e)}")
        return "processed"

def calculate_recipe_cost(ingredients):
    """Cost calculation with validation"""
    valid_ingredients = [ing for ing in ingredients if ing.strip()]
    return round(sum(MEDIAN_PRICES.get(categorize_ingredient(ing), 3.00)
                     for ing in valid_ingredients), 2)

# Process recipes
df['total_cost'] = df['NER'].apply(calculate_recipe_cost)

# Dynamic price categorization
costs = df['total_cost'].values
q33, q66 = np.percentile(costs, [33, 66]) if len(costs) > 0 else (0, 0)

def price_category(cost):
    if cost <= q33: return 'cheap'
    elif cost <= q66: return 'medium'
    return 'expensive'


df['price_tag'] = df['total_cost'].apply(price_category)


ingredient_df = pd.DataFrame.from_dict(classification_cache,
                                       orient='index',
                                       columns=['category']).reset_index()
ingredient_df.columns = ['ingredient', 'category']

# Add vegan/vegetarian tags
VEGAN_UNSAFE = {'meat', 'seafood', 'dairy', 'egg'}
VEGETARIAN_UNSAFE = {'meat', 'seafood'}

# ingredient_df['vegan'] = ~ingredient_df['category'].isin(VEGAN_UNSAFE)
# ingredient_df['vegetarian'] = ~ingredient_df['category'].isin(VEGETARIAN_UNSAFE)

# Save comprehensive dataset
ingredient_df.to_csv('ingredient_categories.csv', index=False)

# Display sample
print("Ingredient Categories with Dietary Tags:")
print(ingredient_df.sample(5))

df[['NER', 'total_cost', 'price_tag']]

Device set to use cpu


KeyboardInterrupt: 