In [None]:
import os
from pathlib import Path
import pandas as pd
from utils.html_parser import extract_dishes_from_html
from utils.diet_rules import apply_heuristics
from utils.diet_agent import classify_dish

"""
This script processes raw HTML data of dishes, applies both heuristic
and LLM-based classification logic, compares both results, and finalizes
a dataset with inferred dietary tags for further usage.

Key idea:
- Heuristic rules are fast and reliable for obvious patterns (e.g., low kcal).
- LLM inference is preferred when domain judgment is required (e.g., is_gourmet).
- Final flags are taken from the LLM agent (`df_agent`), assuming deeper reasoning.
  However, manual overrides or forced heuristic rules can be applied later if needed.
"""

# 1. Parse HTML input into structured DataFrame
# `df` is the parser's output for data/raw.html; its rows are counted as dishes below.
df = extract_dishes_from_html(Path("data/raw.html"))
if len(df) == 0:
    # Stop here with a clear message: with nothing parsed, the comparison and
    # export steps further down have no data to work on.
    raise ValueError("No dishes were parsed from data/raw.html")
print(f"✅ Parsed {len(df)} dishes")

# 2. Apply rule-based heuristics to extract simple, logic-driven tags
# Work on a copy so the parsed frame `df` stays untouched.
df_heur = df.copy()
# One heuristic result per dish, in row order (a plain list, despite the name).
heuristics_df = df_heur.apply(apply_heuristics, axis=1).tolist()
# `join` aligns on index labels. Give the flags frame the dishes' own index so
# every result lands on the row it was computed from, even when `df` does not
# carry a default 0..n-1 index (a bare DataFrame(list) would get 0..n-1).
df_heur = df_heur.join(pd.DataFrame(heuristics_df, index=df_heur.index))
print("✅ Heuristics applied")

# 3. Use LLM-based agent to classify the dishes, considering full context
# Each row of `df_heur` (dish data plus heuristic flags) is passed to the agent
# as a plain dict; the object it returns is flattened with `.model_dump()`.
records = df_heur.to_dict(orient="records")
agent_outputs = [classify_dish(record).model_dump() for record in records]
# Rows keep the order of `records`, i.e. the row order of `df_heur`.
df_agent = pd.DataFrame(agent_outputs)
print("✅ Agent classification done")

# 4. Compare Heuristic vs LLM output to spot conflicts or review borderline cases

# Reset indexes to align both DataFrames for row-wise comparison
df_heur = df_heur.reset_index(drop=True)
df_agent = df_agent.reset_index(drop=True)

# Column-name prefixes that identify the boolean classification tags.
# "is_" already covers every is_* flag (is_gourmet included).
FLAG_PREFIXES = (
    "is_", "bajo_en_calorias", "es_postre", "de_cuchara",
    "alto_proteina", "sin_lactosa", "para_diabeticos", "sin_gluten", "congelar",
)

# Combine both outputs for side-by-side review
df_cmp = pd.concat([df_heur, df_agent.add_suffix("_agent")], axis=1)

# Identify which flags will be compared: heuristic flag columns that also have
# an agent counterpart. A flag the agent did not return cannot be compared and
# would otherwise raise a KeyError below, so it is reported and skipped.
candidate_flags = [c for c in df_heur.columns if c.startswith(FLAG_PREFIXES)]
flag_cols = [c for c in candidate_flags if c + "_agent" in df_cmp.columns]
flags_without_agent = [c for c in candidate_flags if c not in flag_cols]
if flags_without_agent:
    print(f"Skipping flags with no agent column: {flags_without_agent}")


def _differs(left, right):
    """Element-wise inequality that treats two missing values as equal.

    A plain `!=` reports NaN vs NaN as a difference, which would flag rows
    where neither source produced a value.
    """
    return (left != right) & ~(left.isna() & right.isna())


# Identify rows where any flag differs between heuristic and agent.
# The explicit index keeps the mask row-aligned with `df_cmp` even when there
# are no flags to compare (the mask is then all-False instead of index-less).
mask_diff = pd.DataFrame(
    {col: _differs(df_cmp[col], df_cmp[col + "_agent"]) for col in flag_cols},
    index=df_cmp.index,
)
rows_diff = df_cmp[mask_diff.any(axis=1)]

print(f"🔍 Found {len(rows_diff)} mismatches between heuristic and agent")

# Print details of each conflict to aid human-in-the-loop inspection
for idx, r in rows_diff.iterrows():
    # Reuse the mask so the report lists exactly the flags that were counted.
    row_diffs = mask_diff.loc[idx]
    print("-" * 50)
    print(f"🍽️ Dish: {r['nombre_plato']}")
    print(f"Ingredients: {r['ingredientes']}")
    print(f"Kcal: {r['kcal']}")
    print("Differences found:")
    for col in flag_cols:
        if row_diffs[col]:
            print(f" • {col}: heuristic={r[col]}, agent={r[col + '_agent']}")
    print("-" * 50 + "\n")

# 5. Finalize output DataFrame using LLM results
#
# At this point, we build the final dataset by combining the original dish data
# with the flags inferred by the LLM. For this version, we **trust the LLM agent's
# inference** as the most comprehensive classification source, given its broader
# context awareness.
#
# However, if necessary, one could force specific tags using:
#   - a rule-based override for certain fields
#   - a manual correction for certain dishes
#
# This allows full flexibility between automation and human review.

# `pd.concat(axis=1)` aligns on index labels, so both sides are put on the same
# 0..n-1 index first; rows are then paired by position (the agent rows were
# produced in dish order). Without this, a non-default index on `df` would
# silently yield misaligned or NaN-padded rows.
dishes_positional = df.reset_index(drop=True)
agent_positional = df_agent.reset_index(drop=True)
if len(dishes_positional) != len(agent_positional):
    # Typically a sign of stale kernel state (cells run out of order).
    raise ValueError(
        f"Row count mismatch: {len(dishes_positional)} dishes vs "
        f"{len(agent_positional)} agent classifications"
    )

df_final_official = pd.concat([dishes_positional, agent_positional], axis=1)
print("✅ df_final_official is ready:")
display(df_final_official.head())


✅ Parsed 241 dishes
✅ Heuristics applied
✅ Agent classification done
🔍 Found 165 mismatches between heuristic and agent
--------------------------------------------------
🍽️ Nombre: Ají de Pollo
Ingredientes: Pollo, 26.38%, Agua, Cebolla, Huevo (HUEVO), leche (LÁCTEOS), Arroz, Pan (GLUTEN), Almendra (FRUTOS DE CÁSCARA), Queso (LÁCTEOS), Aceite de Oliva, Ajo, Aceitunas, Ají Amarillo 0.73%, Mantequilla (LÁCTEOS), Sal,Pollo, Cúrcuma, Comino, Pimienta Negra y Laurel
Kcal: 146.0
Diferencias detectadas:
 • sin_gluten: heurístico=True, agente=False
--------------------------------------------------

--------------------------------------------------
🍽️ Nombre: Arroz Campero
Ingredientes: Agua, Pollo, Pimientos, Arroz 13.23%, Chorizo, Morcilla, Tomate, Aceite de Oliva, Pimentón Dulce, Ajo, Sal y Pimienta Blanca
Kcal: 138.0
Diferencias detectadas:
 • es_postre: heurístico=True, agente=False
 • sin_gluten: heurístico=True, agente=False
 ‚Ä¢ congelar: heur√≠sti

Unnamed: 0,nombre_plato,ingredientes,precio,kcal,proteinas,hidratos,grasas,peso,alergenos,is_vegetariano,...,is_keto,bajo_en_calorias,es_postre,de_cuchara,alto_proteina,sin_lactosa,is_gourmet,para_diabeticos,sin_gluten,congelar
0,Aj√≠ de Pollo,"Pollo, 26.38%, Agua, Cebolla, Huevo (HUEVO), l...",6.95,146.0,8.9,13.6,6.0,380.0,"HUEVO, L√ÅCTEOS, GLUTEN, FRUTOS DE C√ÅSCARA",False,...,False,False,False,False,False,False,True,False,False,False
1,Arroz Campero,"Agua, Pollo, Pimientos, Arroz 13.23%, Chorizo,...",6.45,138.0,6.2,13.3,6.5,430.0,,False,...,False,False,False,False,False,True,False,False,False,False
2,Arroz con Carrilleras al Pedro Xim√©nez,"Agua, Cerdo 22.67%, Arroz 13.60%, Vino, Ceboll...",6.45,198.0,10.0,14.1,11.1,430.0,,False,...,False,False,False,False,False,True,True,False,True,False
3,Arroz con Pollo,"Pollo 29.33%, Agua, Arroz 19.69%, Cebolla, Vin...",6.45,154.0,6.0,19.2,5.7,430.0,,False,...,False,False,False,False,False,True,False,False,True,False
4,Arroz Meloso al Parmesano y Setas,"Agua, Arroz, Setas, Leche Sin Lactosa(L√ÅCTEOS...",5.55,146.0,3.4,17.0,6.5,430.0,L√ÅCTEOS,True,...,False,False,False,True,False,True,True,False,True,False


In [None]:
# Saving
# Rows whose dish name contains "pack" (case-insensitive) are dropped before
# export; rows with a missing name do not match (na=False) and are kept.
is_pack = df_final_official['nombre_plato'].str.contains("pack", case=False, na=False)
df_final_official = df_final_official[~is_pack]

output_csv = Path("data/df_final_official.csv")
# Make sure the target folder exists so the write cannot fail on a missing directory.
output_csv.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is not written to the file.
df_final_official.to_csv(output_csv, index=False)
print("✅ Saved df_final_official.csv")

✅ Saved df_final_official.csv


In [3]:
# Show the final DataFrame as this cell's output for a visual check.
df_final_official

Unnamed: 0,nombre_plato,ingredientes,precio,kcal,proteinas,hidratos,grasas,peso,alergenos,is_vegetariano,...,is_keto,bajo_en_calorias,es_postre,de_cuchara,alto_proteina,sin_lactosa,is_gourmet,para_diabeticos,sin_gluten,congelar
0,Aj√≠ de Pollo,"Pollo, 26.38%, Agua, Cebolla, Huevo (HUEVO), l...",6.95,146.0,8.9,13.6,6.0,380.0,"HUEVO, L√ÅCTEOS, GLUTEN, FRUTOS DE C√ÅSCARA",False,...,False,False,False,False,False,False,True,False,False,False
1,Arroz Campero,"Agua, Pollo, Pimientos, Arroz 13.23%, Chorizo,...",6.45,138.0,6.2,13.3,6.5,430.0,,False,...,False,False,False,False,False,True,False,False,False,False
2,Arroz con Carrilleras al Pedro Xim√©nez,"Agua, Cerdo 22.67%, Arroz 13.60%, Vino, Ceboll...",6.45,198.0,10.0,14.1,11.1,430.0,,False,...,False,False,False,False,False,True,True,False,True,False
3,Arroz con Pollo,"Pollo 29.33%, Agua, Arroz 19.69%, Cebolla, Vin...",6.45,154.0,6.0,19.2,5.7,430.0,,False,...,False,False,False,False,False,True,False,False,True,False
4,Arroz Meloso al Parmesano y Setas,"Agua, Arroz, Setas, Leche Sin Lactosa(L√ÅCTEOS...",5.55,146.0,3.4,17.0,6.5,430.0,L√ÅCTEOS,True,...,False,False,False,True,False,True,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,Lentejas Riojanas FAMILIAR,"Agua, Lentejas 18.25%, Patata, Chorizo 10.48%,...",15.55,94.4,6.3,9.3,2.8,1100.0,,False,...,False,True,False,True,False,True,False,True,False,False
237,Carrilleras de Cerdo en Salsa con Crema de Pat...,"Cerdo 60.82%, Agua, Patatas 8.12%, Cebolla, Za...",20.05,172.0,13.5,14.5,5.9,1100.0,L√ÅCTEOS,False,...,False,False,False,True,True,False,True,False,True,False
238,Lentejas con Verduras FAMILIAR,"Agua, Lentejas 13.61%, Patata 10.20%, Zanahori...",15.55,73.7,4.0,8.2,2.0,1100.0,,True,...,False,True,False,True,False,True,False,True,True,False
239,Arroz Tikka Masala con Heura,"Agua, concentrado de Prote√≠na de Soja (SOJA), ...",5.45,188.0,8.9,17.7,8.9,380.0,SOJA,True,...,False,False,False,False,False,True,True,True,True,True
