In [15]:
import pandas as pd

# Read File
# Load the raw recipe CSV (relative path, resolved against the working directory).
# NOTE(review): `data` is not referenced by any later cell visible here; the
# In [19] cell reads the same file again into `df` — confirm this load is still needed.
data = pd.read_csv('dataset_1.csv')

In [16]:
import re
from fractions import Fraction

def convert_fractions(text):
    """Replace fractions and mixed numbers in ``text`` with decimal numbers.

    "1/2" becomes "0.5" and "1 1/2" becomes "1.5". Values are formatted with
    up to 10 significant digits and no trailing zeros.

    Args:
        text: The string to rewrite. Anything that is not a string (NaN, None,
            numbers) is returned unchanged so the function can be applied to
            DataFrame columns that contain missing values.

    Returns:
        The rewritten string, or ``text`` itself when it is not a string.
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    if not isinstance(text, str):
        return text

    # Match optional whole number + fraction, or just fraction
    pattern = r'(?:(\d+)\s+)?(\d+)/(\d+)'

    def replacer(match):
        denominator = int(match.group(3))
        if denominator == 0:
            # "n/0" is not a quantity; keep the original text instead of
            # letting Fraction raise ZeroDivisionError for the whole cell.
            return match.group(0)
        whole = int(match.group(1)) if match.group(1) else 0
        numerator = int(match.group(2))
        decimal = whole + Fraction(numerator, denominator)
        return "{:.10g}".format(float(decimal))  # Clean, no trailing zeros

    return re.sub(pattern, replacer, text)

In [17]:
import pandas as pd
from fractions import Fraction
import re

# Fixed imperial → metric multipliers (round figures rather than exact factors).
# Insertion order is the order in which convert_units() walks the units.
CONVERSIONS = dict(
    oz=30,      # ounce → gram
    lb=450,     # pound → gram
    pt=475,     # pint → milliliter
    qt=950,     # quart → milliliter
    inch=2.5,   # inch → centimeter
)



In [18]:
import re

def convert_units(text):
    """Convert imperial quantities in ``text`` to metric ones.

    Pan dimensions such as "9 x 13-inch" are rewritten first (both numbers
    multiplied by the inch factor and formatted without decimals), then single
    quantities such as "4 oz", "2.5 lb" or "2 lbs" are multiplied by the
    factor in CONVERSIONS, rounded to one decimal and suffixed with the unit
    returned by unit_to_metric().

    Args:
        text: The string to rewrite. Anything that is not a string (NaN, None,
            numbers) is returned unchanged.

    Returns:
        The rewritten string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    inch_factor = CONVERSIONS['inch']

    # Pan dimensions must be handled BEFORE the per-unit loop: otherwise the
    # loop turns "9 x 13 inch" into "9 x 32.5 cm" and the first dimension is
    # never converted.
    text = re.sub(
        r'(\d+)\s*x\s*(\d+)[-\s]*inch(?:es)?\b',
        lambda m: f"{int(m.group(1)) * inch_factor:.0f} x {int(m.group(2)) * inch_factor:.0f} cm",
        text,
    )

    # Convert oz, lb, pt, qt, inch (form: "4 oz", "2.5 lb", etc.)
    for unit, factor in CONVERSIONS.items():
        # The optional plural suffix and the word boundary keep "2 lbs" from
        # becoming "900.0 gs" and stop the unit matching inside a longer word.
        pattern = r'(\d+(?:\.\d+)?)\s*' + re.escape(unit) + r'(?:es|s)?\b'
        metric = unit_to_metric(unit)
        text = re.sub(
            pattern,
            lambda m, factor=factor, metric=metric: f"{round(float(m.group(1)) * factor, 1)} {metric}",
            text,
        )

    return text

def convert_fahrenheit_to_celsius(text):
    """Rewrite Fahrenheit temperatures in ``text`` as whole-degree Celsius.

    A temperature is an integer followed by a degree sign (the character "°"
    or the literal escaped form "\\u00b0"), optionally followed by a scale
    marker ("F"/"Fahrenheit" or "C"/"Celsius", any case). A bare degree sign
    is treated as Fahrenheit; values already marked as Celsius are left alone.

    Args:
        text: The string to rewrite. Anything that is not a string (NaN, None,
            numbers) is returned unchanged.

    Returns:
        The rewritten string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # The optional scale group consumes the trailing "F" so that "350°F"
    # becomes "177°C" instead of "177°CF". The \b prevents the "f" of a
    # following word ("350° for ...") from being taken as the scale marker.
    pattern = r'(\d+)\s*(?:°|\\u00b0)(?:\s*(F(?:ahrenheit)?|C(?:elsius)?)\b)?'

    def replacer(match):
        scale = match.group(2)
        if scale and scale[0] in 'Cc':
            # Already Celsius: converting again would corrupt the value.
            return match.group(0)
        fahrenheit = int(match.group(1))
        celsius = round((fahrenheit - 32) * 5 / 9)
        return f"{celsius}°C"

    return re.sub(pattern, replacer, text, flags=re.IGNORECASE)

def unit_to_metric(unit):
    """Return the metric unit symbol that replaces the imperial ``unit``.

    Known units are "oz", "lb", "pt", "qt" and "inch"; any other value
    raises KeyError.
    """
    metric_symbols = dict(oz="g", lb="g", pt="ml", qt="ml", inch="cm")
    return metric_symbols[unit]


In [19]:
# Reload the raw recipes and normalise the two free-text columns.
df = pd.read_csv('dataset_1.csv')

# Fractions are converted first so the unit converter sees plain decimals.
df['INGREDIENTS'] = df['ingredients'].apply(convert_fractions).apply(convert_units)

# Directions additionally get their oven temperatures converted.
df['DIRECTIONS'] = (
    df['directions']
    .apply(convert_fractions)
    .apply(convert_units)
    .apply(convert_fahrenheit_to_celsius)
)

# Drop the raw/unused columns and upper-case the title header.
df = (
    df
    .drop(columns=['Unnamed: 0', 'ingredients', 'directions', 'link', 'source'])
    .rename(columns={"title": "TITLE"})
)

df

Unnamed: 0,TITLE,NER,INGREDIENTS,DIRECTIONS
0,No-Bake Nut Cookies,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...","[""1 c. firmly packed brown sugar"", ""0.5 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar..."
1,Jewell Ball'S Chicken,"[""beef"", ""chicken breasts"", ""cream of mushroom...","[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish...."
2,Creamy Corn,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar...","[""2 (480.0 g.) pkg. frozen corn"", ""1 (240.0 g....","[""In a slow cooker, combine all ingredients. C..."
3,Chicken Funny,"[""chicken"", ""chicken gravy"", ""cream of mushroo...","[""1 large whole chicken"", ""2 (315.0 g.) cans c...","[""Boil and debone chicken."", ""Put bite size pi..."
4,Reeses Cups(Candy),"[""peanut butter"", ""graham cracker crumbs"", ""bu...","[""1 c. peanut butter"", ""0.75 c. graham cracker...","[""Combine first four ingredients and press in ..."
...,...,...,...,...
223111,Hot Rolls,"[""water"", ""sugar"", ""salt"", ""butter"", ""yeast"", ...","[""1.75 c. warm water"", ""0.5 c. sugar"", ""1 Tbsp...","[""Combine water, sugar, salt and butter, yeast..."
223112,Cheesy Vegetable Casserole,"[""American cheese"", ""butter"", ""containing broc...","[""225.0 g. American cheese"", ""0.5 c. butter"", ...","[""Cut cheese into cubes and place in saucepan ..."
223113,Volcanic Shake,"[""bananas"", ""cubes"", ""yogurt"", ""orange juice"",...","[""2 medium bananas"", ""4 ice cubes"", ""0.5 c. yo...","[""Put all ingredients in blender. Blend on low..."
223114,Yogurt Popsicles,"[""yogurt"", ""fruit juice"", ""vanilla""]","[""2 cartons plain yogurt"", ""1 (360.0 g.) can c...","[""Combine ingredients in a bowl. Mix well. Fre..."


In [20]:
import pandas as pd
import ast
import re
from collections import defaultdict

# patterns to detect time expressions
# time_pattern: a number (digits, optional ".", optional decimals), optional
# whitespace, then a minute/hour/day unit word or abbreviation, matched
# case-insensitively. Group 1 is the quantity, group 2 the unit.
# The trailing \b stops a unit from matching as the prefix of a longer word,
# but the single-letter units (m, h, d) still match a lone letter that follows
# a number, e.g. "5 m".
time_pattern = re.compile(
    r'(\d+\.?\d*)\s*(minute|minutes|min|mins|m|hour|hours|hr|hrs|h|day|days|d)\b',
    re.IGNORECASE
)

# range_pattern: "<low> - <high> <unit>" time ranges such as "10-15 minutes",
# "1.5-2 hours" or "2-3 days". Group 1 is the low bound, group 2 the high
# bound, group 3 the unit.
# Both bounds accept decimals: with integer-only bounds "1.5-2 hours" matched
# from the "5" and collapsed to "1.2 hours". The lookbehind keeps a match from
# starting in the middle of a number, and the unit list mirrors time_pattern
# (including day units).
range_pattern = re.compile(
    r'(?<![\d.])(\d+\.?\d*)\s*-\s*(\d+\.?\d*)\s*(minute|minutes|min|mins|m|hour|hours|hr|hrs|h|day|days|d)\b',
    re.IGNORECASE
)

# Time estimates for common preparation methods, keyed by method keyword
# (values presumably in minutes, like the other estimates in this cell — confirm).
# NOTE(review): no code visible in this notebook section reads this table.
PREP_ESTIMATES = dict([
    ('bake', 45),
    ('boil', 20),
    ('fry', 15),
    ('grill', 25),
    ('chill', 120),
    ('simmer', 30),
    ('marinate', 60),
    ('microwave', 10),
    ('no-bake', 20),
    ('refrigerate', 180),
    ('freeze', 240),
])

# Fallback duration in minutes per recipe type, looked up by keyword in the
# recipe title. Insertion order matters: estimate_by_recipe_type() returns the
# value of the first keyword found in the title.
RECIPE_TYPE_ESTIMATES = dict(
    pasta=10,
    salad=15,
    cake=45,
    pie=60,
    stew=120,
    casserole=60,
    soup=30,
    cookies=30,
    bread=90,
    fudge=20,
    candy=30,
)

# Convert time quantities to minutes
def convert_to_minutes(qty, unit):
    """Convert a quantity expressed in ``unit`` to minutes.

    Args:
        qty: Anything ``float()`` accepts (typically the digits captured by a
            regex).
        unit: Unit word or abbreviation; compared case-insensitively.

    Returns:
        The duration in minutes as a float, or 0 when the unit is not a
        recognised minute/hour/day spelling.
    """
    amount = float(qty)
    label = unit.lower()

    if label in ('day', 'days', 'd'):
        return amount * 24 * 60
    if label in ('hour', 'hours', 'hr', 'hrs', 'h'):
        return amount * 60
    if label in ('minute', 'minutes', 'min', 'mins', 'm'):
        return amount
    return 0

# Estimate time on recipe type
def estimate_by_recipe_type(recipe_name, ingredients):
    """Guess a preparation time in minutes when the directions give none.

    The title is checked first: the first RECIPE_TYPE_ESTIMATES keyword that
    occurs anywhere in the lower-cased title decides the result (substring
    match on purpose, so "cheesecake" and "cupcakes" count as cake). If no
    keyword matches, the ingredient text is checked for the whole words
    raw/fresh, frozen and canned, in that order. Otherwise 30 is returned.

    Args:
        recipe_name: Recipe title; converted with ``str()``.
        ingredients: Ingredient text or list; converted with ``str()``.

    Returns:
        The estimated time in minutes.
    """
    recipe_name = str(recipe_name).lower()
    ingredients = str(ingredients).lower()

    # Check for specific recipe types
    for recipe_type, minutes in RECIPE_TYPE_ESTIMATES.items():
        if recipe_type in recipe_name:
            return minutes

    def mentions(*words):
        # Whole-word match: a plain substring test finds "raw" inside
        # "strawberries" and "fresh" inside "refreshing".
        return any(
            re.search(r'\b' + re.escape(word) + r'\b', ingredients)
            for word in words
        )

    # Estimate based on ingredients
    if mentions('raw', 'fresh'):
        return 15
    if mentions('frozen'):
        return 20
    if mentions('canned'):
        return 10

    # Default estimation
    return 30

# Extract and sum all time references from recipe instructions
def parse_instructions(instructions):
    """Sum every duration mentioned in a recipe's directions, in minutes.

    Args:
        instructions: Either a string holding a Python/JSON-style list literal
            of step strings, a plain string (treated as one step), or an
            iterable of steps. Missing values (NaN, None) yield 0.

    Returns:
        Total minutes found. Ranges such as "10-15 minutes" count as their
        upper bound; "overnight" adds 480, otherwise "until set"/"until firm"
        adds 60. Returns 0 when nothing is found.
    """
    if isinstance(instructions, str):
        try:
            parsed = ast.literal_eval(instructions)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid literal: fall back to treating the text as one step.
            parsed = None
        if isinstance(parsed, (list, tuple)):
            steps = parsed
        elif isinstance(parsed, str):
            # A quoted single string: one step, not a sequence of characters.
            steps = [parsed]
        else:
            steps = [instructions]
    else:
        try:
            steps = list(instructions)
        except TypeError:
            # NaN / None / other scalars: no directions to scan.
            return 0

    total_time = 0
    for step in steps:
        if not isinstance(step, str):
            continue

        # Handle time ranges: keep only the upper bound and its unit
        step = re.sub(
            range_pattern,
            lambda m: f'{m.group(2)} {m.group(3)}',
            step
        )
        lowered = step.lower()

        # Special cases
        if 'overnight' in lowered:
            total_time += 480  # 8 hours
        elif 'until set' in lowered or 'until firm' in lowered:
            total_time += 60  # 1 hour estimation

        # Find all time references
        for qty, unit in time_pattern.findall(step):
            total_time += convert_to_minutes(qty, unit)

    return total_time

def categorize_time(total_time):
    """Map a duration in minutes to its speed-category label.

    Exactly 0 means no time was found ('Not specified'). Otherwise the first
    tier whose upper bound exceeds the value is used; anything not below 90
    falls into the slowest tier.
    """
    if total_time == 0:
        return 'Not specified'

    # (exclusive upper bound, label), ascending
    tiers = (
        (10, 'Very fast (0-10 mins)'),
        (20, 'Fast (10-20 mins)'),
        (40, 'Medium (20-40 mins)'),
        (90, 'Slow (40-90 mins)'),
    )
    for upper_bound, label in tiers:
        if total_time < upper_bound:
            return label
    return 'Very slow (90+ mins)'

# Sum the durations found in each recipe's directions (minutes).
df['total_time'] = df['DIRECTIONS'].apply(parse_instructions)

# Recipes with no detectable duration fall back to the title/ingredient heuristic.
mask = df['total_time'].eq(0)
df.loc[mask, 'total_time'] = df.loc[mask].apply(
    lambda row: estimate_by_recipe_type(row['TITLE'], row['INGREDIENTS']),
    axis=1
)

# Turn the minutes into a category label, then discard the helper column.
df['PREPARATION_TIME'] = df['total_time'].apply(categorize_time)
df = df.drop(columns=['total_time'])

df

Unnamed: 0,TITLE,NER,INGREDIENTS,DIRECTIONS,PREPARATION_TIME
0,No-Bake Nut Cookies,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...","[""1 c. firmly packed brown sugar"", ""0.5 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",Very slow (90+ mins)
1,Jewell Ball'S Chicken,"[""beef"", ""chicken breasts"", ""cream of mushroom...","[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",Very slow (90+ mins)
2,Creamy Corn,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar...","[""2 (480.0 g.) pkg. frozen corn"", ""1 (240.0 g....","[""In a slow cooker, combine all ingredients. C...",Very slow (90+ mins)
3,Chicken Funny,"[""chicken"", ""chicken gravy"", ""cream of mushroo...","[""1 large whole chicken"", ""2 (315.0 g.) cans c...","[""Boil and debone chicken."", ""Put bite size pi...",Medium (20-40 mins)
4,Reeses Cups(Candy),"[""peanut butter"", ""graham cracker crumbs"", ""bu...","[""1 c. peanut butter"", ""0.75 c. graham cracker...","[""Combine first four ingredients and press in ...",Medium (20-40 mins)
...,...,...,...,...,...
223111,Hot Rolls,"[""water"", ""sugar"", ""salt"", ""butter"", ""yeast"", ...","[""1.75 c. warm water"", ""0.5 c. sugar"", ""1 Tbsp...","[""Combine water, sugar, salt and butter, yeast...",Medium (20-40 mins)
223112,Cheesy Vegetable Casserole,"[""American cheese"", ""butter"", ""containing broc...","[""225.0 g. American cheese"", ""0.5 c. butter"", ...","[""Cut cheese into cubes and place in saucepan ...",Medium (20-40 mins)
223113,Volcanic Shake,"[""bananas"", ""cubes"", ""yogurt"", ""orange juice"",...","[""2 medium bananas"", ""4 ice cubes"", ""0.5 c. yo...","[""Put all ingredients in blender. Blend on low...",Fast (10-20 mins)
223114,Yogurt Popsicles,"[""yogurt"", ""fruit juice"", ""vanilla""]","[""2 cartons plain yogurt"", ""1 (360.0 g.) can c...","[""Combine ingredients in a bowl. Mix well. Fre...",Medium (20-40 mins)


In [21]:
# Disabled experiment kept as a bare string literal, so none of it executes.
# The text inside would build a CLEAN_INGREDIENTS column from INGREDIENTS and
# display every unique cleaned ingredient.
# NOTE(review): a string statement is not a comment; consider deleting this
# block or restoring it as real code in its own cell.
"""import pandas as pd
import ast
import re

# Preprocessa e ripulisce ogni elemento
def clean_ingredient(ingredient):
    if not isinstance(ingredient, str):
        return ""

    # Rimuovi numeri, frazioni, unità di misura comuni
    ingredient = re.sub(r'\b\d*\.?\d+\b', '', ingredient)  # rimuove numeri decimali
    ingredient = re.sub(r'\b(?:tsp|tbsp|c|cup|oz|g|kg|ml|l|stick|bottle|box|block|bar|approx|approximately|to|or|and|optional|if desired|very fine|chopped|diced|minced|crushed|defatted)\b', '', ingredient, flags=re.IGNORECASE)

    # Rimuovi caratteri speciali e spazi extra
    ingredient = re.sub(r'[^a-zA-Zà-ùÀ-Ù0-9 ]+', '', ingredient)
    ingredient = re.sub(r'\s+', ' ', ingredient).strip().lower()
    return ingredient

# Applica parsing e pulizia alla colonna INGREDIENTS
def parse_and_clean(val):
    try:
        items = ast.literal_eval(val) if isinstance(val, str) else []
    except:
        items = []

    return [clean_ingredient(item) for item in items if clean_ingredient(item)]

# Applichiamo tutto
df["CLEAN_INGREDIENTS"] = df["INGREDIENTS"].apply(parse_and_clean)

# Otteniamo tutti gli ingredienti unici puliti
unique_ingredients = set()
for ing_list in df["CLEAN_INGREDIENTS"]:
    unique_ingredients.update(ing_list)

df_unique_cleaned = pd.DataFrame(sorted(unique_ingredients), columns=["ingredient"])
pd.set_option('display.max_rows', None)
display(df_unique_cleaned)
"""

# Disabled cost-tagging experiment, also kept as a bare string literal. As the
# last expression of the cell, the string itself is echoed as the cell output.
# NOTE(review): the text reads df["ingredients"], a column the In [19] cell
# drops, and its ingredient_prices table is empty — it would need both fixed
# before being re-enabled.
"""# non ricarico df, uso quello già trasformato
ingredient_prices = {
    # qui popoli i prezzi basandoti sulla tabella visibile
}

def estimate_cost(ings):
    return sum(ingredient_prices.get(ing.strip().lower(),0) for ing in ings)

df["estimated_cost"] = df["ingredients"].apply(estimate_cost)

def tag_cost(c):
    if c<3:  return "very cheap"
    if c<5:  return "cheap"
    if c<10: return "medium"
    if c<20: return "expensive"
    return "rich"

df["cost_tag"] = df["estimated_cost"].apply(tag_cost)
display(df[["TITLE","estimated_cost","cost_tag"]])
"""

'# non ricarico df, uso quello già trasformato\ningredient_prices = {\n    # qui popoli i prezzi basandoti sulla tabella visibile\n}\n\ndef estimate_cost(ings):\n    return sum(ingredient_prices.get(ing.strip().lower(),0) for ing in ings)\n\ndf["estimated_cost"] = df["ingredients"].apply(estimate_cost)\n\ndef tag_cost(c):\n    if c<3:  return "very cheap"\n    if c<5:  return "cheap"\n    if c<10: return "medium"\n    if c<20: return "expensive"\n    return "rich"\n\ndf["cost_tag"] = df["estimated_cost"].apply(tag_cost)\ndisplay(df[["TITLE","estimated_cost","cost_tag"]])\n'

In [22]:
import pandas as pd
import ast
import re

# Column that ingredient names are collected from, selected by position.
# NOTE(review): after the In [19] drop/rename the displayed columns are TITLE,
# NER, INGREDIENTS, DIRECTIONS, ..., so position 1 appears to be NER rather
# than INGREDIENTS — confirm this is intended, or select the column by name.
ingredients_col = df.iloc[:, 1]  # Adjust if needed

# Function to clean and extract ingredient name (remove quantity/unit)
def clean_ingredient(text):
    """Strip a leading quantity/unit and parenthesised notes, then lower-case.

    Only text that starts with a digit, "/", ".", "(" or ")" has a prefix
    removed. The prefix pattern is greedy: it removes everything up to the
    last whitespace inside the leading run of word characters, spaces and
    dots, so "1 c. brown sugar" becomes "sugar" while "2 cups flour, sifted"
    becomes "flour, sifted".
    """
    leading_quantity = re.compile(r'^[\d\/\.\(\)]+[\s\w\.]*\s+')
    parenthetical = re.compile(r'\(.*?\)')

    without_quantity = leading_quantity.sub('', text)
    without_notes = parenthetical.sub('', without_quantity)
    return without_notes.lower().strip()

# Collect all ingredients
# Each cell is expected to hold the text of a list literal. Cells that cannot
# be parsed or do not evaluate to a list/tuple are skipped, and a non-string
# item is skipped on its own instead of discarding the rest of its row.
all_ingredients = set()
for row in ingredients_col:
    try:
        ingredients = ast.literal_eval(row)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Missing value or malformed literal: nothing to collect from this row.
        continue
    if not isinstance(ingredients, (list, tuple)):
        continue
    for item in ingredients:
        if not isinstance(item, str):
            continue
        cleaned = clean_ingredient(item)
        if cleaned:
            all_ingredients.add(cleaned)

# Convert to sorted list
unique_ingredients = sorted(all_ingredients)
#print(unique_ingredients)
# Disabled quantity-estimation experiment kept as a bare string literal, so
# none of it executes. As the last expression of the cell, the string itself
# is echoed as the cell output.
# NOTE(review): a string statement is not a comment; consider deleting this
# block or restoring it as real code in its own cell.
'''df['Ingredient'] = unique_ingredients

# Funzioni di conversione
def convert_to_kg(text):
    text = text.lower()
    conversions = {
        'a bunch': 0.1, 'a little': 0.02, 'a bit': 0.02, 'a pinch': 0.001, 'a dash': 0.001,
        'a few': 0.03, 'a slice': 0.025, 'a piece': 0.05, 'a handful': 0.05, 'a scoop': 0.05,
        'a chunk': 0.05, 'a spoon': 0.015, 'a big chunk': 0.1, 'a dab': 0.005,
        'a couple dashes': 0.002, 'a couple': 0.06, 'a small jar': 0.25, 'a large jar': 0.5,
        'a can': 0.4, 'a jar': 0.4, 'a bottle': 0.75, 'a bag': 0.5, 'a box': 0.5,
        'a container': 0.5, 'a head': 0.4, 'a stick': 0.113,
    }
    for phrase, kg in conversions.items():
        if phrase in text:
            return kg
    return None

def parse_quantity_to_kg(text):
    text = text.lower()
    if match := re.search(r'(\d+(\.\d+)?)\s*(g|ml)', text):
        return float(match.group(1)) / 1000
    if match := re.search(r'(\d+(\.\d+)?)\s*(kg|l)', text):
        return float(match.group(1))
    if match := re.search(r'(\d+(\.\d+)?)\s*c\.', text):
        return float(match.group(1)) * 0.24
    if match := re.search(r'(\d+(\.\d+)?)\s*tbsp', text):
        return float(match.group(1)) * 0.015
    if match := re.search(r'(\d+(\.\d+)?)\s*tsp', text):
        return float(match.group(1)) * 0.005
    return convert_to_kg(text)

def estimate_missing_quantities(text):
    text = text.lower()
    if 'whole chicken' in text:
        return 1.5
    if 'boned chicken breasts' in text:
        return 0.6
    if 'jar' in text:
        return 0.4
    if 'carton' in text:
        return 0.5
    if 'can' in text:
        return 0.4
    if 'pkg' in text:
        return 0.5
    if 'box' in text:
        return 0.5
    return None

# Calcolo delle quantità
ingredient_quantities_kg = []
for row in ingredients_col:
    try:
        ingredient_list = ast.literal_eval(row)
        row_estimations = []
        for ing in ingredient_list:
            est_kg = parse_quantity_to_kg(ing)
            if est_kg is None:
                est_kg = estimate_missing_quantities(ing)
            row_estimations.append((ing, est_kg))
        ingredient_quantities_kg.append(row_estimations)
    except Exception:
        ingredient_quantities_kg.append([])

# Costruzione DataFrame
flattened_data = []
for row in ingredient_quantities_kg:
    for ing, qty in row:
        flattened_data.append({
            'Ingredient': ing,
            'Estimated_kg': qty
        })

final_df = pd.DataFrame(flattened_data)

final_df'''

"df['Ingredient'] = unique_ingredients\n\n# Funzioni di conversione\ndef convert_to_kg(text):\n    text = text.lower()\n    conversions = {\n        'a bunch': 0.1, 'a little': 0.02, 'a bit': 0.02, 'a pinch': 0.001, 'a dash': 0.001,\n        'a few': 0.03, 'a slice': 0.025, 'a piece': 0.05, 'a handful': 0.05, 'a scoop': 0.05,\n        'a chunk': 0.05, 'a spoon': 0.015, 'a big chunk': 0.1, 'a dab': 0.005,\n        'a couple dashes': 0.002, 'a couple': 0.06, 'a small jar': 0.25, 'a large jar': 0.5,\n        'a can': 0.4, 'a jar': 0.4, 'a bottle': 0.75, 'a bag': 0.5, 'a box': 0.5,\n        'a container': 0.5, 'a head': 0.4, 'a stick': 0.113,\n    }\n    for phrase, kg in conversions.items():\n        if phrase in text:\n            return kg\n    return None\n\ndef parse_quantity_to_kg(text):\n    text = text.lower()\n    if match := re.search(r'(\\d+(\\.\\d+)?)\\s*(g|ml)', text):\n        return float(match.group(1)) / 1000\n    if match := re.search(r'(\\d+(\\.\\d+)?)\\s*(kg|l)', text):

In [23]:
# NOTE: despite the df_ prefix this is not a DataFrame. It is the sorted list
# of unique ingredient strings built in the previous cell (the same list
# object, not a copy), and the bare expression below prints all of it.
df_costs = unique_ingredients

df_costs

['" sauce',
 '"great garlic',
 "' powder",
 "'lil smokies",
 "'s",
 "'s applesauce",
 "'s beans",
 "'s beefy onion soup",
 "'s cajun",
 "'s cheese",
 "'s cherries",
 "'s chili",
 "'s chili beans",
 "'s chili mix",
 "'s chili seasoning",
 "'s chips",
 "'s chocolate",
 "'s choice",
 "'s chunky",
 "'s cocoa",
 "'s coconut macaroons",
 "'s country",
 "'s crabmeat",
 "'s fajita marinating sauce",
 "'s feet",
 "'s great northern beans",
 "'s italian",
 "'s ketchup",
 "'s kidneys",
 "'s kraut",
 "'s krazy mixed",
 "'s krazy mixed-up salt",
 "'s krazy salt",
 "'s margarine",
 "'s mayonnaise",
 "'s meat",
 "'s meat marinade",
 "'s natural",
 "'s natural apple sauce",
 "'s natures seasoning",
 "'s noodles",
 "'s original",
 "'s red",
 "'s regular",
 "'s relish",
 "'s rub",
 "'s salsa",
 "'s sauce",
 "'s sauerkraut",
 "'s sausage",
 "'s seasoning",
 "'s seasons",
 "'s secret ingredients",
 "'s special recipe chili",
 "'s special tomato sauce",
 "'s stuffing mix",
 "'s sugar",
 "'s sugar substitut