In [2]:
import requests
import pandas as pd
import streamlit as st


In [4]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [5]:
fields = [
    "product_name",               # Nom du produit
    "code",                       # Code-barres (identifiant unique)
    "brands",                     # Marque
    "categories_tags",           # Catégorie (utile pour filtrer viande/lait)
    "nutriscore_grade",          # Nutri-Score (A-E)
    "nova_group",                # Groupe NOVA (1 à 4)
    "ingredients_text",          # Liste brute des ingrédients
    "ingredients",               # Liste structurée (parfois mieux que text)
    "additives_tags",            # Additifs présents
    "labels_tags",               # Labels (bio, AB, etc.)
    "origins_tags",              # Origine des ingrédients
    "countries_tags",            # Pays de commercialisation
    "manufacturing_places_tags", # Lieux de fabrication (quand dispo)
    "packaging_tags",            # Emballage (recyclable ou non)
    "ecoscore_score",            # Score environnemental (si dispo)
    "ecoscore_grade",            # Grade (A-E) éco-score
    "environment_impact_level_tags",  # Impact environnemental
]


In [8]:
def fetch_openfoodfacts_products(category_tags="salty-snacks", page_size=2):
    url = f"https://world.openfoodfacts.org/api/v2/search"
    params = {
        "categories_tags": category_tags,
        "fields": ",".join(fields),
        "page_size": page_size,
    }
    response = requests.get(url, params=params)
    data = response.json()
    products = data.get("products", [])
    return pd.DataFrame(products)

df = fetch_openfoodfacts_products()
# df.head()

# display(df)

In [None]:
import requests

def fetch_categories(lang="en", limit=10):
    url = "https://world.openfoodfacts.org/api/v2/taxonomy_suggestions"
    params = {
        "tagtype": "categories",
        "lc": lang,
        "get_synonyms": 0,
        "limit": limit
    }
    data = requests.get(url, params=params).json()
    # Chaque entrée est une string "en:category-name"
    cats = [c.split(":", 1)[1] for c in data.get("suggestions", [])]
    return cats

cats = fetch_categories(lang="en")
print(f"{len(cats)} catégories trouvées :")
print(cats[:20], "...")


0 catégories trouvées :
[] ...


In [29]:
display(df[['origins_tags', 'countries_tags']])

Unnamed: 0,origins_tags,countries_tags
0,[],"[en:france, en:united-kingdom]"
1,[en:union-europeenne],[en:france]
2,"[en:united-kingdom, en:ireland]",[en:united-kingdom]
3,[],[en:france]
4,[fr:viande-de-porc-union-europeenne],[en:france]
5,,[en:spain]
6,[en:united-kingdom],[en:united-kingdom]
7,[en:european-union],[en:france]
8,[en:european-union],[en:france]
9,[en:france],[en:france]


In [None]:
# Liste d'ingrédients suspects pour la pénalisation
SUSPECT_INGREDIENTS = ["sirop de glucose", "colorant", "conservateur", "arôme artificiel"]
CONTROVERSIAL_ADDITIVES = ["e951", "e150d", "e102"]

In [24]:

# Récupérer les produits par nom
def fetch_products_by_name(product_name, page_size=10):
    url = "https://world.openfoodfacts.org/api/v2/search"
    params = {
        "page_size": page_size,
        "fields": "product_name,ingredients_text,nutriscore_grade,nova_group,labels_tags",
        "search_terms": product_name,
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        data = response.json()
        return pd.DataFrame(data.get("products", []))
    return pd.DataFrame()

# Nettoyage et calcul du NutriScore+
def compute_nutriscore_plus(row):
    score = 0
    ingr = str(row.get("ingredients_text", "")).lower()
    labels = row.get("labels_tags", [])

    # Pénalités
    if row.get("nova_group") == 4:
        score -= 2
    if any(add in ingr for add in SUSPECT_INGREDIENTS):
        score -= 1
    if any(add in ingr for add in CONTROVERSIAL_ADDITIVES):
        score -= 1
    if len(ingr.split(",")) > 15:
        score -= 1
    if not any(lab in labels for lab in ["en:organic", "en:vegan", "en:no-added-sugars"]):
        score -= 0.5

    # Bonus
    if any(lab in labels for lab in ["en:organic", "en:vegan", "en:no-added-sugars"]):
        score += 1

    # Conversion de Nutri-score lettre -> base
    base = {"a": 5, "b": 4, "c": 3, "d": 2, "e": 1}
    nutri_letter = str(row.get("nutriscore_grade", "c")).lower()
    nutri_value = base.get(nutri_letter, 3)

    final_score = nutri_value + score
    final_score = max(1, min(5, final_score))  # entre 1 et 5

    inverse = {5: "A", 4: "B", 3: "C", 2: "D", 1: "E"}
    return inverse[final_score]

# Interface Streamlit
st.title("NutriScore+ – Recherche Produit")

product_query = st.text_input("Entrez le nom d'un produit :", value="cereal")

if product_query:
    with st.spinner("Recherche des produits..."):
        df = fetch_products_by_name(product_query)
    
    if not df.empty:
        df["nutriscore_plus"] = df.apply(compute_nutriscore_plus, axis=1)
        display_df = df[["product_name", "nutriscore_grade", "nova_group", "nutriscore_plus", "ingredients_text"]]
        st.dataframe(display_df)

        csv = display_df.to_csv(index=False).encode("utf-8")
        st.download_button("📥 Télécharger CSV", csv, file_name="nutriscore_plus.csv", mime="text/csv")
    else:
        st.warning("Aucun produit trouvé.")

2025-07-11 10:20:07.471 
  command:

    streamlit run /home/clara/.local/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
2025-07-11 10:20:07.478 Session state does not function when running a script without `streamlit run`


In [10]:
df = pd.read_parquet("../nutri_score_project/data/raw/open_food_data.parquet")
df.head(1)

Unnamed: 0,additives_tags,brands,categories_tags,code,countries_tags,ecoscore_grade,ecoscore_score,environment_impact_level_tags,ingredients,ingredients_text,labels_tags,manufacturing_places_tags,nova_group,nutriscore_grade,origins_tags,packaging_tags,product_name
0,[],Tyrrell's,"[en:plant-based-foods-and-beverages, en:plant-based-foods, en:snacks, en:cereals-and-potatoes, en:salty-snacks, en:appetizers, en:chips-and-fries, en:crisps, en:potato-crisps, en:potato-crisps-in-sunflower-oil, fr:chips-de-pommes-de-terre-classiques]",5060042641000,"[en:france, en:germany, en:spain, en:united-kingdom]",a,83.0,[],"[{'ciqual_food_code': '4003', 'ciqual_proxy_food_code': None, 'ecobalyse_code': 'potato-industry-fr', 'from_palm_oil': None, 'id': 'en:potato', 'ingredients': None, 'is_in_taxonomy': 1, 'labels': None, 'origins': None, 'percent': None, 'percent_estimate': 74.5875, 'percent_max': 100.0, 'percent_min': 49.175, 'processing': None, 'quantity': None, 'quantity_g': None, 'text': 'Pommes de terre', 'vegan': 'yes', 'vegetarian': 'yes'}, {'ciqual_food_code': '17440', 'ciqual_proxy_food_code': None, 'ecobalyse_code': '3cfab110-363e-442a-9170-1af337fe9ea3', 'from_palm_oil': 'no', 'id': 'en:sunflower-oil', 'ingredients': None, 'is_in_taxonomy': 1, 'labels': None, 'origins': None, 'percent': None, 'percent_estimate': 12.70625, 'percent_max': 50.0, 'percent_min': 0.0, 'processing': None, 'quantity': None, 'quantity_g': None, 'text': 'Huile de tournesol', 'vegan': 'yes', 'vegetarian': 'yes'}, {'ciqual_food_code': '11082', 'ciqual_proxy_food_code': None, 'ecobalyse_code': None, 'from_palm_oil': None, 'id': 'en:sea-salt', 'ingredients': None, 'is_in_taxonomy': 1, 'labels': None, 'origins': None, 'percent': None, 'percent_estimate': 12.70625, 'percent_max': 0.825, 'percent_min': 0.0, 'processing': None, 'quantity': None, 'quantity_g': None, 'text': 'Sel de mer', 'vegan': 'yes', 'vegetarian': 'yes'}]","Pommes de terre, Huile de tournesol, Sel de mer","[en:vegetarian, en:no-gluten, en:no-artificial-flavors, en:vegan, en:assured-food-standards, en:green-dot, en:no-artificial-colors, en:no-flavour-enhancer, en:no-msg, fr:triman, en:made-in-england, en:terracycle]",[united-kingdom],3.0,c,[en:united-kingdom],"[en:mixedplasticfilm-packet, en:mixed-plastic-film-packet]",Lightly sea salted crisps


In [11]:
df.columns

Index(['additives_tags', 'brands', 'categories_tags', 'code', 'countries_tags',
       'ecoscore_grade', 'ecoscore_score', 'environment_impact_level_tags',
       'ingredients', 'ingredients_text', 'labels_tags',
       'manufacturing_places_tags', 'nova_group', 'nutriscore_grade',
       'origins_tags', 'packaging_tags', 'product_name'],
      dtype='object')

In [39]:
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
import re

# Charger le modèle français
nlp = spacy.load("fr_core_news_sm")

# 🟢 Bons ingrédients pour salty snacks
GOOD_INGREDIENTS = [
    "pomme de terre", "patate douce", "betterave", "carotte", "courge", "pois", "maïs", "pois chiche",
    "blé complet", "riz complet", "avoine", "quinoa", "orge", "épeautre", "millet",
    "amande", "noix", "noisette", "cacahuète", "pistache", "graines de tournesol",
    "graines de courge", "graines de lin", "graines de sésame",
    "lentille", "haricot", "pois cassé", "paprika", "curcuma", "cumin", "piment", "ail", "oignon",
    "herbes de Provence", "thym", "romarin",
    "sel marin", "sel de mer", "huile d'olive", "huile de colza", "huile de tournesol non raffinée"
]

# 🔴 Mauvais ingrédients
BAD_INGREDIENTS = [
    "sucre", "glucose", "fructose", "saccharose", "maltose", "maltodextrine",
    "sirop de glucose", "sirop de maïs", "sirop de fructose", "sirop inverti",
    "sirop d'agave", "huile de palme", "huile de coco raffinée", "huile de soja raffinée",
    "huile végétale partiellement hydrogénée", "margarine industrielle",
    "graisse végétale hydrogénée", "farine raffinée", "farine blanche",
    "produit instantané", "arômes artificiels", "arôme artificiel", "arôme synthétique",
    "exhausteur de goût", "stabilisant chimique", "émulsifiant industriel",
    "agent de conservation", "gomme xanthane synthétique", "colorant artificiel",
    "concentré de jus industriel", "produit enrichi artificiellement", "poudre aromatisée",
    "boisson instantanée sucrée", "protéines végétales texturées", "sirop de riz",
    "huile partiellement hydrogénée"
]

# Regex pour E-numbers
E_NUMBER_PATTERN = re.compile(r"\bE\d{2,3}\b", re.IGNORECASE)

# PhraseMatchers
matcher_good = PhraseMatcher(nlp.vocab, attr="LOWER")
matcher_bad = PhraseMatcher(nlp.vocab, attr="LOWER")

patterns_good = [nlp(text) for text in GOOD_INGREDIENTS]
patterns_bad = [nlp(text) for text in BAD_INGREDIENTS]

matcher_good.add("GOOD", patterns_good)
matcher_bad.add("BAD", patterns_bad)

def score_ingredients_spacy(text: str) -> int:
    if not text or pd.isna(text):
        return 0
    doc = nlp(str(text).lower())
    score = 0

    # Bons ingrédients
    matches = matcher_good(doc)
    score += len(matches)

    # Mauvais ingrédients
    matches = matcher_bad(doc)
    score -= len(matches)

    # E-numbers
    score -= len(E_NUMBER_PATTERN.findall(str(text)))

    return score

def nutriscore_plus_spacy_row(row: pd.Series) -> int:
    score = 0

    # 1️⃣ Ingrédients
    ingredients_text = row.get("ingredients_text", "")
    score += score_ingredients_spacy(ingredients_text)

    # 2️⃣ NOVA
    nova = row.get("nova_group", 4)
    if pd.isna(nova):
        nova = 4  # valeur par défaut si vide
    score -= (int(nova) - 1) * 2

    # 3️⃣ Labels
    labels = row.get("labels_tags", [])
    if not isinstance(labels, list):
        labels = []
    if any("bio" in str(l).lower() for l in labels):
        score += 3
    if any("label-rouge" in str(l).lower() for l in labels):
        score += 2

    return score

# Ajouter la colonne NutriScore+ dans df
df["nutriscore_plus"] = df.apply(nutriscore_plus_spacy_row, axis=1)

# Vérification
df[df['nutriscore_plus'] >= -1].head()


Unnamed: 0,additives_tags,brands,categories_tags,code,countries_tags,ecoscore_grade,ecoscore_score,environment_impact_level_tags,ingredients,ingredients_text,labels_tags,manufacturing_places_tags,nova_group,nutriscore_grade,origins_tags,packaging_tags,product_name,nutriscore_plus
88,[],Lidl,"[en:snacks, en:salty-snacks, en:appetizers, en...",4056489546092,[en:france],a-plus,92.0,,"[{'ciqual_food_code': None, 'ciqual_proxy_food...","Farine complète de blé (70 g), flocons d'avoin...","[en:organic, en:eu-organic, en:source-of-fibre...",[],3.0,c,[],[fr:sachet-plastique],Bio crackers,3
