In [1]:
import pandas as pd

# Read File and create a new dataframe called df
df = pd.read_csv('dataset_1.csv')

In [2]:
import re
from fractions import Fraction

def convert_fractions(text):
    # Match optional whole number + fraction, or just fraction
    pattern = r'(?:(\d+)\s+)?(\d+)/(\d+)'

    def replacer(match):
        whole = int(match.group(1)) if match.group(1) else 0
        numerator = int(match.group(2))
        denominator = int(match.group(3))
        decimal = whole + Fraction(numerator, denominator)
        return "{:.10g}".format(float(decimal))  # Clean, no trailing zeros

    return re.sub(pattern, replacer, text)

In [3]:
from fractions import Fraction

# fixed conversions
CONVERSIONS = {
    "oz": 30,      # ounce → gram
    "lb": 450,      # pounds → gram
    "pt": 475,      # pint → milliliter
    "qt": 950,      # quart → milliliter
    "inch": 2.5        # inches → centimeter
}

In [4]:
def convert_units(text):
    if pd.isna(text):
        return text

    # Convert oz, lb, pt, qt (form: "4 oz", "2.5 lb", etc.)
    for unit, factor in CONVERSIONS.items():
        pattern = r'(\d+(\.\d+)?)\s*' + unit
        text = re.sub(pattern, lambda m: f"{round(float(m.group(1)) * factor, 1)} {unit_to_metric(unit)}", text)

    # Convert measure in inches in pans
    text = re.sub(r'(\d+)\s*x\s*(\d+)[-\s]*inch', lambda m: f"{int(m.group(1)) * CONVERSIONS['inch']:.0f} x {int(m.group(2)) * CONVERSIONS['inch']:.0f} cm", text)

    return text

def convert_fahrenheit_to_celsius(text):
    # Match degrees like 275°, 275 °F, 275\u00b0F (escaped), etc.
    pattern = r'(\d+)\s*(?:°|\\u00b0)'

    def replacer(match):
        fahrenheit = int(match.group(1))
        celsius = round((fahrenheit - 32) * 5 / 9)
        return f"{celsius}°C"

    return re.sub(pattern, replacer, text)

def unit_to_metric(unit):
    return {
        "oz": "g",
        "lb": "g",
        "pt": "ml",
        "qt": "ml",
        "inch": "cm"
    }[unit]

In [5]:
#applying conversions on the INGREDIENTS and DIRECTIOND columns
df['INGREDIENTS'] = df['ingredients'].apply(convert_fractions)
df['DIRECTIONS'] = df['directions'].apply(convert_fractions)

df['INGREDIENTS'] = df['INGREDIENTS'].apply(convert_units)
df['DIRECTIONS'] = df['DIRECTIONS'].apply(convert_units)

df['DIRECTIONS'] = df['DIRECTIONS'].apply(convert_fahrenheit_to_celsius)

#renaming the title column
df = df.rename(columns={"title": "TITLE"})

df[['TITLE', 'INGREDIENTS', 'DIRECTIONS']].head(200)

Unnamed: 0,TITLE,INGREDIENTS,DIRECTIONS
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""0.5 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish...."
2,Creamy Corn,"[""2 (480.0 g.) pkg. frozen corn"", ""1 (240.0 g....","[""In a slow cooker, combine all ingredients. C..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (315.0 g.) cans c...","[""Boil and debone chicken."", ""Put bite size pi..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""0.75 c. graham cracker...","[""Combine first four ingredients and press in ..."
...,...,...,...
195,Golf Balls,"[""1 c. cocoa"", ""1 c. butter, softened"", ""3 (45...","[""Beat all ingredients together except peanut ..."
196,Crispy Herb Bread,"[""4 tsp. olive oil"", ""0.5 tsp. garlic powder"",...","[""In a small bowl, combine oil and garlic powd..."
197,Festive Fruit Salad,"[""1 (600.0 g.) can pineapple chunks, drained (...","[""Combine pineapple, oranges, grapes, strawber..."
198,Fresh Strawberry Pie,"[""950.0 ml. strawberries"", ""0.75 c. water"", ""p...","[""Cut up 1 cup strawberries."", ""Add water."", ""..."


In [6]:
import ast
import re

# patterns to detect time expressions
time_pattern = re.compile(
    r'(\d+\.?\d*)\s*(minute|minutes|min|mins|m|hour|hours|hr|hrs|h|day|days|d)\b',
    re.IGNORECASE
)

range_pattern = re.compile(
    r'(\d+)\s*-\s*(\d+)\s*(minute|minutes|min|mins|m|hour|hours|hr|hrs|h)\b',
    re.IGNORECASE
)

# Time estimates for common preparation methods
PREP_ESTIMATES = {
    'bake': 45,
    'boil': 20,
    'fry': 15,
    'grill': 25,
    'chill': 120,
    'simmer': 30,
    'marinate': 60,
    'microwave': 10,
    'no-bake': 20,
    'refrigerate': 180,
    'freeze': 240
}

# Special cases for recipe types
RECIPE_TYPE_ESTIMATES = {
    'pasta': 10,
    'salad': 15,
    'cake': 45,
    'pie': 60,
    'stew': 120,
    'casserole': 60,
    'soup': 30,
    'cookies': 30,
    'bread': 90,
    'fudge': 20,
    'candy': 30
}

# Convert time quantities to minutes
def convert_to_minutes(qty, unit):
    qty = float(qty)
    unit = unit.lower()
    if unit in ['minute', 'minutes', 'min', 'mins', 'm']:
        return qty
    elif unit in ['hour', 'hours', 'hr', 'hrs', 'h']:
        return qty * 60
    elif unit in ['day', 'days', 'd']:
        return qty * 24 * 60
    return 0

# Estimate time on recipe type
def estimate_by_recipe_type(recipe_name, ingredients):
    recipe_name = str(recipe_name).lower()
    ingredients = str(ingredients).lower()

    # Check for specific recipe types
    for recipe_type, time in RECIPE_TYPE_ESTIMATES.items():
        if recipe_type in recipe_name:
            return time

    # Estimate based on ingredients
    if 'raw' in ingredients or 'fresh' in ingredients:
        return 15
    if 'frozen' in ingredients:
        return 20
    if 'canned' in ingredients:
        return 10

    # Default estimation
    return 30

# Extract and sum all time references from recipe instructions
def parse_instructions(instructions):
    if isinstance(instructions, str):
        try:
            instructions = ast.literal_eval(instructions)
        except:
            instructions = [instructions]

    total_time = 0
    for step in instructions:
        if not isinstance(step, str):
            continue

        # Handle time ranges
        step = re.sub(
            range_pattern,
            lambda m: f'{m.group(2)} {m.group(3)}',
            step
        )

        # Special cases
        if 'overnight' in step.lower():
            total_time += 480  # 8 hours
        elif 'until set' in step.lower() or 'until firm' in step.lower():
            total_time += 60  # 1 hour estimation

        # Find all time references
        matches = time_pattern.findall(step)
        for (qty, unit) in matches:
            total_time += convert_to_minutes(qty, unit)

    return total_time

def categorize_time(total_time):
    if total_time == 0:
        return 'Not specified'
    elif total_time < 10:
        return 'Very fast (0-10 mins)'
    elif 10 <= total_time < 20:
        return 'Fast (10-20 mins)'
    elif 20 <= total_time < 40:
        return 'Medium (20-40 mins)'
    elif 40 <= total_time < 90:
        return 'Slow (40-90 mins)'
    else:
        return 'Very slow (90+ mins)'

# Calculate total preparation time
df['total_time'] = df['DIRECTIONS'].apply(parse_instructions)

# Estimate time for unspecified recipes
mask = df['total_time'] == 0
df.loc[mask, 'total_time'] = df[mask].apply(
    lambda x: estimate_by_recipe_type(x['TITLE'], x['INGREDIENTS']),
    axis=1
)

# Categorize preparation times
df['PREPARATION_TIME'] = df['total_time'].apply(categorize_time)

df[['TITLE', 'INGREDIENTS', 'DIRECTIONS', 'PREPARATION_TIME']].head(200)

Unnamed: 0,TITLE,INGREDIENTS,DIRECTIONS,PREPARATION_TIME
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""0.5 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",Very slow (90+ mins)
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",Very slow (90+ mins)
2,Creamy Corn,"[""2 (480.0 g.) pkg. frozen corn"", ""1 (240.0 g....","[""In a slow cooker, combine all ingredients. C...",Very slow (90+ mins)
3,Chicken Funny,"[""1 large whole chicken"", ""2 (315.0 g.) cans c...","[""Boil and debone chicken."", ""Put bite size pi...",Medium (20-40 mins)
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""0.75 c. graham cracker...","[""Combine first four ingredients and press in ...",Medium (20-40 mins)
...,...,...,...,...
195,Golf Balls,"[""1 c. cocoa"", ""1 c. butter, softened"", ""3 (45...","[""Beat all ingredients together except peanut ...",Medium (20-40 mins)
196,Crispy Herb Bread,"[""4 tsp. olive oil"", ""0.5 tsp. garlic powder"",...","[""In a small bowl, combine oil and garlic powd...",Very fast (0-10 mins)
197,Festive Fruit Salad,"[""1 (600.0 g.) can pineapple chunks, drained (...","[""Combine pineapple, oranges, grapes, strawber...",Fast (10-20 mins)
198,Fresh Strawberry Pie,"[""950.0 ml. strawberries"", ""0.75 c. water"", ""p...","[""Cut up 1 cup strawberries."", ""Add water."", ""...",Very slow (90+ mins)


In [7]:
import ast

# Converte la colonna da stringhe a liste Python vere e proprie
df['NER_clean'] = df['NER'].apply(ast.literal_eval)

# Appiattisce tutte le liste in un'unica lista di ingredienti
all_ner = [item.strip().lower() for sublist in df['NER_clean'] for item in sublist]

# Rimuove duplicati e ordina
unique_ingredients = sorted(set(all_ner))

# Crea un DataFrame
df_unique_ingredients = pd.DataFrame(unique_ingredients, columns=['Ingredient'])

#df_unique_ingredients

In [8]:
import pandas as pd
import re
import ast

# 1. Carica prezzi e normalizza le chiavi
prices_df = pd.read_csv('ingredients_prices')  # Assicurati dell'estensione .csv
price_dict = {k.strip().lower(): v for k, v in prices_df.set_index('Ingredient')['Cost'].items()}

# 2. Conversione unità → kg
UNIT_CONVERSION = {
    'cup': 0.24, 'c.': 0.24, 'c': 0.24,
    'teaspoon': 0.005, 'tsp': 0.005, 'tsp.': 0.005,
    'tablespoon': 0.015, 'tbsp': 0.015, 'tbsp.': 0.015,
    'package': 0.2, 'pkg': 0.2, 'pkg.': 0.2,
    'can': 0.4, 'carton': 1.0,
    'g': 0.001, 'g.': 0.001, 'gram': 0.001,
    'kg': 1.0, 'kilogram': 1.0,
    'lb': 0.4536, 'pound': 0.4536,
    'oz': 0.02835, 'ounce': 0.02835,
    'large package': 0.5, 'large pkg.': 0.5, 'large pkg': 0.5
}

# 3. Parsing function (qty, unit, name, grams_in_paren)
def parse_ingredient(ing_str):
    # peso fra parentesi
    m = re.search(r'\(\s*([0-9]+(?:\.[0-9]+)?)\s*g\.?\s*\)', ing_str, re.IGNORECASE)
    grams = float(m.group(1)) if m else None

    # quantity e unit
    patt = r'^\s*([\d\/.\s]+)?\s*([a-zA-Z\.]+)?'
    m2 = re.match(patt, ing_str)
    qty_raw = m2.group(1) if m2 else None
    try:
        qty = eval(qty_raw.replace(' ', '+')) if qty_raw else 1.0
    except:
        qty = 1.0
    unit = (m2.group(2) or '').strip('.').lower() if m2 else ''

    # name pulito
    name = re.sub(r'\(.*?\)|optional', '', ing_str, flags=re.IGNORECASE)
    # rimuovo quantità/unità
    name = re.sub(r'^[\d\/.\s]+\s*[a-zA-Z\.]*', '', name).strip().lower()
    return qty, unit, name, grams

# 4. Calcolo posizionale con fallback substring e token
def calculate_recipe_cost_positional(ingredients_list, ner_list):
    total = 0.0
    missing = []

    for ing_str, ner_item in zip(ingredients_list, ner_list):
        key = ner_item.lower()
        qty, unit, name, grams = parse_ingredient(ing_str)

        # scorporo se passo i grams
        if grams is not None:
            kg = (grams / 1000) * qty
        elif unit == '':  # a pezzo
            # prendo prezzo direct key
            price = price_dict.get(key)
            # fallback substring/token
            if price is None:
                # substring match
                for k, v in price_dict.items():
                    if k in key or key in k:
                        price = v
                        break
                # token match
                if price is None:
                    for token in key.split():
                        if token in price_dict:
                            price = price_dict[token]
                            break
            if price:
                total += qty * price
            else:
                missing.append(key)
            continue
        else:
            conv = UNIT_CONVERSION.get(unit, 0.0)
            if conv == 0:
                missing.append(key)
                continue
            kg = qty * conv

        # prezzo per kg
        price = price_dict.get(key)
        if price is None:
            for k, v in price_dict.items():
                if k in key or key in k:
                    price = v
                    break
        if price is None:
            missing.append(key)
            continue

        total += kg * price

    return round(total, 2), missing

# 5. Applica al DataFrame
    df['INGREDIENTS'] = df['INGREDIENTS'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df['NER_clean']  = df['NER_clean'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

def compute_cost_row(row):
     cost, _ = calculate_recipe_cost_positional(row['INGREDIENTS'], row['NER_clean'])
     return cost

def categorize_cost(total_cost):
    if total_cost == 0:
        return 'Not specified'
    elif total_cost < 10:
        return 'Very cheap'
    elif 10 <= total_cost < 20:
        return 'Cheap'
    elif 20 <= total_cost < 45:
        return 'Medium'
    elif 45 <= total_cost < 90:
        return 'Expensive'
    else:
        return 'Rich'

df['total_cost'] = df.apply(compute_cost_row, axis=1)

df['CATEGORY_COST'] = df['total_cost'].apply(categorize_cost)

# Mostra risultati
df[['INGREDIENTS', 'NER', 'CATEGORY_COST']].head(200)

Unnamed: 0,INGREDIENTS,NER,CATEGORY_COST
0,"[""1 c. firmly packed brown sugar"", ""0.5 c. eva...","[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...",Medium
1,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""beef"", ""chicken breasts"", ""cream of mushroom...",Medium
2,"[""2 (480.0 g.) pkg. frozen corn"", ""1 (240.0 g....","[""frozen corn"", ""cream cheese"", ""butter"", ""gar...",Rich
3,"[""1 large whole chicken"", ""2 (315.0 g.) cans c...","[""chicken"", ""chicken gravy"", ""cream of mushroo...",Medium
4,"[""1 c. peanut butter"", ""0.75 c. graham cracker...","[""peanut butter"", ""graham cracker crumbs"", ""bu...",Medium
...,...,...,...
195,"[""1 c. cocoa"", ""1 c. butter, softened"", ""3 (45...","[""cocoa"", ""butter"", ""powdered sugar"", ""milk"", ...",Medium
196,"[""4 tsp. olive oil"", ""0.5 tsp. garlic powder"",...","[""olive oil"", ""garlic powder"", ""bread"", ""thyme...",Expensive
197,"[""1 (600.0 g.) can pineapple chunks, drained (...","[""pineapple"", ""mandarin oranges"", ""grapes"", ""m...",Rich
198,"[""950.0 ml. strawberries"", ""0.75 c. water"", ""p...","[""strawberries"", ""water"", ""salt"", ""sugar"", ""co...",Medium
