In [11]:
import pandas as pd

# Load the raw recipe dataset. Later cells read its 'ingredients',
# 'directions', 'title' and 'NER' columns.
df = pd.read_csv('dataset_1.csv')

In [12]:
import re
from fractions import Fraction

def convert_fractions(text):
    """Rewrite fractions and mixed numbers in ``text`` as decimal numbers.

    "1/2" becomes "0.5" and "1 1/2" becomes "1.5". Values are formatted with
    up to 10 significant digits and no trailing zeros.

    Args:
        text: The string to process. Non-string values (for example the NaN
            pandas uses for a missing cell) are returned unchanged.

    Returns:
        The text with every fraction replaced by its decimal form. A fraction
        with a zero denominator is left exactly as written.
    """
    # Missing cells are not strings; pass them through instead of crashing
    # inside re.sub.
    if not isinstance(text, str):
        return text

    # Match optional whole number + fraction, or just fraction
    pattern = r'(?:(\d+)\s+)?(\d+)/(\d+)'

    def replacer(match):
        denominator = int(match.group(3))
        if denominator == 0:
            # Not a real quantity (Fraction would raise); keep the original text.
            return match.group(0)
        whole = int(match.group(1)) if match.group(1) else 0
        value = whole + Fraction(int(match.group(2)), denominator)
        return "{:.10g}".format(float(value))  # Clean, no trailing zeros

    return re.sub(pattern, replacer, text)

In [13]:
from fractions import Fraction

# Fixed multipliers from each imperial unit abbreviation to its metric unit
CONVERSIONS = dict(
    oz=30,     # ounce → gram
    lb=450,    # pound → gram
    pt=475,    # pint → milliliter
    qt=950,    # quart → milliliter
    inch=2.5,  # inch → centimeter
)

In [14]:
def convert_units(text):
    """Convert imperial measurements in ``text`` to metric ones.

    Pan dimensions such as "9 x 13-inch" are rewritten in centimetres first.
    Then every "<number> <unit>" pair whose unit is a key of ``CONVERSIONS``
    is multiplied by that unit's factor, rounded to one decimal, and
    relabelled with ``unit_to_metric``.

    Args:
        text: The string to convert. Missing values (``pd.isna``) are
            returned unchanged.

    Returns:
        The text with the recognised measurements replaced.
    """
    if pd.isna(text):
        return text

    inch_factor = CONVERSIONS['inch']

    def pan_to_cm(m):
        width = float(m.group(1)) * inch_factor
        length = float(m.group(2)) * inch_factor
        return f"{width:.0f} x {length:.0f} cm"

    # Pan dimensions must be handled BEFORE the generic loop: "inch" is also a
    # key of CONVERSIONS, so the loop would otherwise turn "9 x 13 inch" into
    # "9 x 32.5 cm" and leave the first dimension unconverted. Decimals are
    # accepted because fractions have already been decimalised upstream.
    text = re.sub(
        r'(\d+(?:\.\d+)?)\s*x\s*(\d+(?:\.\d+)?)[-\s]*inch(?:es)?\b',
        pan_to_cm,
        text,
    )

    # Convert oz, lb, pt, qt, inch (form: "4 oz", "2.5 lb", etc.)
    for unit, factor in CONVERSIONS.items():
        metric = unit_to_metric(unit)
        # The optional plural suffix plus \b stops "2 lbs" from becoming
        # "900.0 gs" and stops the unit matching the start of a longer word.
        pattern = r'(\d+(?:\.\d+)?)\s*' + re.escape(unit) + r'(?:es|s)?\b'
        text = re.sub(
            pattern,
            lambda m: f"{round(float(m.group(1)) * factor, 1)} {metric}",
            text,
        )

    return text

def convert_fahrenheit_to_celsius(text):
    """Rewrite Fahrenheit temperatures in ``text`` as whole-degree Celsius.

    Recognises a number followed by a degree sign, written either as "°" or
    as the escaped sequence "\\u00b0", with an optional "F"/"Fahrenheit"
    suffix: "275°", "275 °F", "275\\u00b0F".

    Args:
        text: The string to process. Non-string values (e.g. NaN) are
            returned unchanged.

    Returns:
        The text with each match replaced by "<celsius>°C". Temperatures
        already marked as Celsius ("180°C") are left untouched.
    """
    if not isinstance(text, str):
        return text

    # The optional group consumes the "F" suffix so "350°F" does not become
    # "177°CF". The negative lookahead skips values already given in Celsius.
    pattern = r'(\d+)\s*(?:°|\\u00b0)(?:\s*[Ff](?:ahrenheit)?\b)?(?!\s*C\b)'

    def replacer(match):
        fahrenheit = int(match.group(1))
        celsius = round((fahrenheit - 32) * 5 / 9)
        return f"{celsius}°C"

    return re.sub(pattern, replacer, text)

def unit_to_metric(unit):
    """Return the metric unit label that replaces an imperial abbreviation.

    Args:
        unit: One of "oz", "lb", "pt", "qt" or "inch".

    Returns:
        "g", "ml" or "cm".

    Raises:
        KeyError: If ``unit`` is not one of the supported abbreviations.
    """
    metric_labels = dict(oz="g", lb="g", pt="ml", qt="ml", inch="cm")
    return metric_labels[unit]

In [15]:
# Decimalise fractions first, so the unit converter only sees plain numbers
ingredients_decimal = df['ingredients'].apply(convert_fractions)
directions_decimal = df['directions'].apply(convert_fractions)

# Metric units for both columns; temperatures only occur in the directions
df['INGREDIENTS'] = ingredients_decimal.apply(convert_units)
df['DIRECTIONS'] = (
    directions_decimal
    .apply(convert_units)
    .apply(convert_fahrenheit_to_celsius)
)

# Keep only the processed columns and give the title an upper-case header
df = (
    df
    .drop(columns=['Unnamed: 0', 'ingredients', 'directions', 'link', 'source'])
    .rename(columns={"title": "TITLE"})
)

df

Unnamed: 0,TITLE,NER,INGREDIENTS,DIRECTIONS
0,No-Bake Nut Cookies,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...","[""1 c. firmly packed brown sugar"", ""0.5 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar..."
1,Jewell Ball'S Chicken,"[""beef"", ""chicken breasts"", ""cream of mushroom...","[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish...."
2,Creamy Corn,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar...","[""2 (480.0 g.) pkg. frozen corn"", ""1 (240.0 g....","[""In a slow cooker, combine all ingredients. C..."
3,Chicken Funny,"[""chicken"", ""chicken gravy"", ""cream of mushroo...","[""1 large whole chicken"", ""2 (315.0 g.) cans c...","[""Boil and debone chicken."", ""Put bite size pi..."
4,Reeses Cups(Candy),"[""peanut butter"", ""graham cracker crumbs"", ""bu...","[""1 c. peanut butter"", ""0.75 c. graham cracker...","[""Combine first four ingredients and press in ..."
...,...,...,...,...
223111,Hot Rolls,"[""water"", ""sugar"", ""salt"", ""butter"", ""yeast"", ...","[""1.75 c. warm water"", ""0.5 c. sugar"", ""1 Tbsp...","[""Combine water, sugar, salt and butter, yeast..."
223112,Cheesy Vegetable Casserole,"[""American cheese"", ""butter"", ""containing broc...","[""225.0 g. American cheese"", ""0.5 c. butter"", ...","[""Cut cheese into cubes and place in saucepan ..."
223113,Volcanic Shake,"[""bananas"", ""cubes"", ""yogurt"", ""orange juice"",...","[""2 medium bananas"", ""4 ice cubes"", ""0.5 c. yo...","[""Put all ingredients in blender. Blend on low..."
223114,Yogurt Popsicles,"[""yogurt"", ""fruit juice"", ""vanilla""]","[""2 cartons plain yogurt"", ""1 (360.0 g.) can c...","[""Combine ingredients in a bowl. Mix well. Fre..."


In [16]:
import ast
import re

# Patterns to detect time expressions.
# Minute/hour spellings are shared by both patterns.
_MINUTE_HOUR_UNITS = r'minute|minutes|min|mins|m|hour|hours|hr|hrs|h'

# A single quantity followed by a unit: group 1 = number, group 2 = unit.
time_pattern = re.compile(
    r'(\d+\.?\d*)\s*(' + _MINUTE_HOUR_UNITS + r'|day|days|d)\b',
    re.IGNORECASE
)

# A "low-high unit" range: group 1 = low, group 2 = high, group 3 = unit.
# Both bounds accept decimals. With integer-only bounds "1.5-2 hours" matched
# from the "5" onwards and was rewritten to the bogus "1.2 hours".
range_pattern = re.compile(
    r'(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)\s*(' + _MINUTE_HOUR_UNITS + r')\b',
    re.IGNORECASE
)

# Time estimate per preparation-method keyword (presumably minutes, like the
# other estimates in this cell — confirm).
# NOTE(review): no code visible in this notebook reads this table.
PREP_ESTIMATES = {
    'bake': 45, 'boil': 20, 'fry': 15, 'grill': 25,
    'chill': 120, 'simmer': 30, 'marinate': 60, 'microwave': 10,
    'no-bake': 20, 'refrigerate': 180, 'freeze': 240,
}

# Fallback time estimate per recipe-type keyword found in the title.
# Insertion order matters: the first keyword contained in a title wins.
RECIPE_TYPE_ESTIMATES = {
    'pasta': 10, 'salad': 15, 'cake': 45, 'pie': 60,
    'stew': 120, 'casserole': 60, 'soup': 30, 'cookies': 30,
    'bread': 90, 'fudge': 20, 'candy': 30,
}

def convert_to_minutes(qty, unit):
    """Express a time quantity in minutes.

    Args:
        qty: The amount, as a number or numeric string.
        unit: The unit spelling (case-insensitive): minute, hour or day
            variants.

    Returns:
        The amount in minutes as a float, or 0 when the unit is not
        recognised.
    """
    amount = float(qty)
    spelling = unit.lower()

    if spelling in ('minute', 'minutes', 'min', 'mins', 'm'):
        return amount
    if spelling in ('hour', 'hours', 'hr', 'hrs', 'h'):
        return amount * 60
    if spelling in ('day', 'days', 'd'):
        return amount * 24 * 60
    return 0

def estimate_by_recipe_type(recipe_name, ingredients):
    """Estimate a preparation time from the recipe title and ingredients.

    The first ``RECIPE_TYPE_ESTIMATES`` keyword contained in the title
    decides the estimate. Otherwise ingredient keywords are checked, and 30
    is returned when nothing matches.

    Args:
        recipe_name: The recipe title; converted with ``str`` and lower-cased.
        ingredients: The ingredients (any object); converted with ``str`` and
            lower-cased.

    Returns:
        The estimated time as a number, on the same scale as the values in
        ``RECIPE_TYPE_ESTIMATES``.
    """
    recipe_name = str(recipe_name).lower()
    ingredients = str(ingredients).lower()

    # Check for specific recipe types (substring match, first hit wins)
    for recipe_type, estimate in RECIPE_TYPE_ESTIMATES.items():
        if recipe_type in recipe_name:
            return estimate

    # Estimate based on ingredients. "raw" must match as a whole word: a plain
    # substring test also fires on "strawberries".
    if re.search(r'\braw\b', ingredients) or 'fresh' in ingredients:
        return 15
    if 'frozen' in ingredients:
        return 20
    if 'canned' in ingredients:
        return 10

    # Default estimation
    return 30

def parse_instructions(instructions):
    """Extract and sum all time references from recipe instructions.

    Args:
        instructions: Either an iterable of step strings, or a string holding
            the Python-literal form of such a list. A string that is not a
            list/tuple literal is treated as one free-text step. Non-iterable
            values (e.g. NaN for a missing cell) yield 0.

    Returns:
        The total time in minutes: the explicit durations found in each step
        (a range counts as its upper bound), plus 480 for a step mentioning
        "overnight" or 60 for one mentioning "until set" / "until firm".
    """
    if isinstance(instructions, str):
        raw_text = instructions
        try:
            parsed = ast.literal_eval(raw_text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Only the errors literal_eval raises for malformed input are
            # caught, so KeyboardInterrupt and real bugs still surface.
            parsed = None
        # A literal that is not a list of steps (e.g. "30" evaluates to an
        # int) cannot be iterated as steps; use the raw text as a single step.
        instructions = parsed if isinstance(parsed, (list, tuple)) else [raw_text]
    elif not hasattr(instructions, '__iter__'):
        return 0

    total_time = 0
    for step in instructions:
        if not isinstance(step, str):
            continue

        # Handle time ranges: keep only the upper bound ("10-15 min" → "15 min")
        step = range_pattern.sub(lambda m: f'{m.group(2)} {m.group(3)}', step)

        # Special cases
        lowered = step.lower()
        if 'overnight' in lowered:
            total_time += 480  # 8 hours
        elif 'until set' in lowered or 'until firm' in lowered:
            total_time += 60  # 1 hour estimation

        # Find all time references
        for qty, unit in time_pattern.findall(step):
            total_time += convert_to_minutes(qty, unit)

    return total_time

def categorize_time(total_time):
    """Map a total time in minutes to a preparation-speed label.

    Args:
        total_time: The time in minutes.

    Returns:
        'Not specified' for exactly 0; otherwise the label of the first band
        whose upper bound exceeds ``total_time``, or 'Very slow (90+ mins)'
        when no band does.
    """
    if total_time == 0:
        return 'Not specified'

    # (exclusive upper bound, label), in ascending order
    bands = (
        (10, 'Very fast (0-10 mins)'),
        (20, 'Fast (10-20 mins)'),
        (40, 'Medium (20-40 mins)'),
        (90, 'Slow (40-90 mins)'),
    )
    for upper_bound, label in bands:
        if total_time < upper_bound:
            return label
    return 'Very slow (90+ mins)'

# Total of the explicit time references found in each recipe's directions
df['total_time'] = df['DIRECTIONS'].apply(parse_instructions)

# Recipes without any time reference fall back to the title/ingredient estimate
mask = df['total_time'].eq(0)


def _estimate_row(row):
    """Estimate the time of one recipe row from its title and ingredients."""
    return estimate_by_recipe_type(row['TITLE'], row['INGREDIENTS'])


df.loc[mask, 'total_time'] = df.loc[mask].apply(_estimate_row, axis=1)

# Turn the minutes into a label, then discard the helper column
df['PREPARATION_TIME'] = df['total_time'].apply(categorize_time)
df = df.drop(columns=['total_time'])

df

Unnamed: 0,TITLE,NER,INGREDIENTS,DIRECTIONS,PREPARATION_TIME
0,No-Bake Nut Cookies,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...","[""1 c. firmly packed brown sugar"", ""0.5 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",Very slow (90+ mins)
1,Jewell Ball'S Chicken,"[""beef"", ""chicken breasts"", ""cream of mushroom...","[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",Very slow (90+ mins)
2,Creamy Corn,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar...","[""2 (480.0 g.) pkg. frozen corn"", ""1 (240.0 g....","[""In a slow cooker, combine all ingredients. C...",Very slow (90+ mins)
3,Chicken Funny,"[""chicken"", ""chicken gravy"", ""cream of mushroo...","[""1 large whole chicken"", ""2 (315.0 g.) cans c...","[""Boil and debone chicken."", ""Put bite size pi...",Medium (20-40 mins)
4,Reeses Cups(Candy),"[""peanut butter"", ""graham cracker crumbs"", ""bu...","[""1 c. peanut butter"", ""0.75 c. graham cracker...","[""Combine first four ingredients and press in ...",Medium (20-40 mins)
...,...,...,...,...,...
223111,Hot Rolls,"[""water"", ""sugar"", ""salt"", ""butter"", ""yeast"", ...","[""1.75 c. warm water"", ""0.5 c. sugar"", ""1 Tbsp...","[""Combine water, sugar, salt and butter, yeast...",Medium (20-40 mins)
223112,Cheesy Vegetable Casserole,"[""American cheese"", ""butter"", ""containing broc...","[""225.0 g. American cheese"", ""0.5 c. butter"", ...","[""Cut cheese into cubes and place in saucepan ...",Medium (20-40 mins)
223113,Volcanic Shake,"[""bananas"", ""cubes"", ""yogurt"", ""orange juice"",...","[""2 medium bananas"", ""4 ice cubes"", ""0.5 c. yo...","[""Put all ingredients in blender. Blend on low...",Fast (10-20 mins)
223114,Yogurt Popsicles,"[""yogurt"", ""fruit juice"", ""vanilla""]","[""2 cartons plain yogurt"", ""1 (360.0 g.) can c...","[""Combine ingredients in a bowl. Mix well. Fre...",Medium (20-40 mins)


In [17]:
import ast

# Parse the NER column from list-literal strings into real Python lists
df['NER_clean'] = df['NER'].apply(ast.literal_eval)

# Flatten all per-recipe lists into one list of trimmed, lower-cased names
all_ner = []
for ner_items in df['NER_clean']:
    for item in ner_items:
        all_ner.append(item.strip().lower())

# Remove duplicates and sort
unique_ingredients = list(set(all_ner))
unique_ingredients.sort()

# Build a one-column DataFrame of the distinct ingredient names
df_unique_ingredients = pd.DataFrame(unique_ingredients, columns=['Ingredient'])

df_unique_ingredients

Unnamed: 0,Ingredient
0,""" sauce"
1,"""great garlic"
2,' powder
3,'lil smokies
4,'s
...,...
23978,zuke
23979,zwieback
23980,zwieback crackers
23981,zwieback crumbs


In [22]:
import pandas as pd
import re

# Load the ingredient price table
prices_df = pd.read_csv('ingredients_prices')

# Build a dictionary mapping each 'Ingredient' value to its 'Cost'
cost_by_ingredient = prices_df.set_index('Ingredient')['Cost']
price_dict = cost_by_ingredient.to_dict()

def convert_to_kg(quantity, unit):
    """Convert a quantity expressed in a kitchen unit to kilograms.

    The unit is matched case-insensitively, ignoring a trailing "." and a
    plural "s", so "Tbsp", "c.", "g." and "cans" are all recognised.

    Args:
        quantity: The numeric amount.
        unit: The unit abbreviation as written in the ingredient line.

    Returns:
        ``quantity`` multiplied by the unit's kg factor, or 0 for an unknown
        unit.
    """
    # Approximate kilograms per unit. Keys are lower-case with no trailing
    # dot, because the lookup key is normalised that way below. The previous
    # 'Tbsp' key could never match the lower-cased unit.
    conversion = {
        'c': 0.24,        # cup ≈ 240 ml ≈ 0.24 kg for dry ingredients
        'tsp': 0.005,     # teaspoon ≈ 5 ml ≈ 0.005 kg
        'tbsp': 0.015,    # tablespoon ≈ 15 ml ≈ 0.015 kg
        'pkg': 0.1,       # package ≈ 100 g
        'can': 0.4,       # can ≈ 400 g
        'carton': 1.0,    # carton ≈ 1 kg
        'g': 0.001,       # grams -> kg
        'kg': 1.0,        # kilograms
    }

    key = unit.strip().lower().rstrip('.')
    if key not in conversion and key.endswith('s'):
        key = key[:-1]  # plural form, e.g. "cans" -> "can"

    return quantity * conversion.get(key, 0)  # 0 if the unit is unknown

def calculate_recipe_cost(ingredients_list):
    """Estimate the total cost of a recipe from its ingredient lines.

    Each line is expected to look like "<quantity> <unit> <name>", e.g.
    "2 c. sugar" or "225.0 g. American cheese". Lines that do not fit this
    shape, and non-string entries, are skipped. Unknown units and ingredient
    names missing from ``price_dict`` contribute 0.

    Args:
        ingredients_list: Iterable of ingredient strings.

    Returns:
        The summed cost, rounded to 2 decimals.
    """
    total_cost = 0

    for ingredient in ingredients_list:
        if not isinstance(ingredient, str):
            continue

        # Extract quantity, unit and name. The unit may carry a trailing dot
        # ("c.", "g.", "pkg."). Without the optional dot those lines failed to
        # match and were silently skipped.
        match = re.match(r'\s*(\d+\.?\d*)\s*([a-zA-Z]+\.?)\s+(.*)', ingredient)
        if not match:
            continue

        quantity = float(match.group(1))
        unit = match.group(2)
        ingredient_name = match.group(3).strip().lower()

        # Convert to kg
        kg = convert_to_kg(quantity, unit)

        # Compute the cost; price_dict values are treated as a price per kg
        price_per_kg = price_dict.get(ingredient_name, 0)
        total_cost += kg * price_per_kg

    return round(total_cost, 2)

import ast

# Apply the cost function to every recipe. INGREDIENTS stores each list as a
# string, so it is parsed with ast.literal_eval, as the NER column is.
# eval() would execute any expression embedded in the CSV data.
df['total_cost'] = df['INGREDIENTS'].apply(
    lambda x: calculate_recipe_cost(ast.literal_eval(x))
)

df