In [None]:
import gzip
import json
import csv

# Source dump (gzip-compressed, one JSON document per line) and the CSV it is reduced to.
INPUT_FILE  = './data/openfoodfacts-products.jsonl.gz'
OUTPUT_FILE = './data/ingredients.csv'

# Stream the dump line by line (it is never loaded whole) and write one
# (code, ingredients) row for every product that carries an ingredient text.
with gzip.open(INPUT_FILE, 'rt', encoding='utf-8') as source, \
     open(OUTPUT_FILE, 'w', newline='', encoding='utf-8') as target:

    writer = csv.writer(target)
    writer.writerow(['code', 'ingredients'])

    for line in source:
        line = line.strip()
        if not line:
            continue
        try:
            product = json.loads(line)
        except json.JSONDecodeError:
            # corrupted or incomplete line: skip it
            continue
        if not isinstance(product, dict):
            # valid JSON but not an object (bare number, list, null...): there is
            # nothing to extract, and calling .get() on it would abort the whole run
            continue

        # prefer the French text when available
        ing = product.get('ingredients_text_fr') or product.get('ingredients_text')
        if not isinstance(ing, str):
            # missing value, or an unexpected non-text value
            continue

        # collapse newlines, tabs and repeated blanks into single spaces
        ing = ' '.join(ing.split())
        if not ing:
            # text was whitespace only: do not emit an empty row
            continue

        writer.writerow([product.get('code', ''), ing])



In [None]:
import pandas as pd
import re

# 1. Read the CSV.
# Both columns are read as text. 'code' is an identifier (presumably a barcode):
# if pandas were left to infer its dtype it would parse it as a number and drop
# any leading zeros, so distinct codes could no longer be told apart or joined
# back to the source data.
df = pd.read_csv(
    './data/ingredients.csv',
    dtype={'code': str, 'ingredients': str},
    low_memory=False
)

# Missing ingredient texts become empty strings so that string operations can be
# applied uniformly to every row.
df['ingredients'] = df['ingredients'].fillna('')

# 2. Cleaning and simple tokenisation
def clean_and_tokenize(text):
    """Split a raw ingredient list into cleaned, lower-cased ingredient tokens.

    The text is lower-cased and split on ',' and ';'. The split happens BEFORE
    punctuation is stripped, so the separators are not lost. Inside each piece,
    every character that is not an ASCII letter, a digit, a common French
    accented letter, a space or a hyphen is replaced by a space, and runs of
    whitespace are collapsed.

    Known limitation: a decimal comma ("12,5 %") is treated as a separator.

    Args:
        text: raw ingredient string. Anything that is not a ``str`` (e.g. NaN)
            yields an empty list.

    Returns:
        list[str]: one entry per non-empty ingredient, in order of appearance.
    """
    if not isinstance(text, str):
        return []

    tokens = []
    for piece in re.split(r'[;,]', text.lower()):
        # keep letters (including the usual French accents), digits, spaces and hyphens
        piece = re.sub(r'[^a-z0-9àâäçéèêëîïôöùûüÿœæ \-]', ' ', piece)
        # collapse the blanks left behind by removed punctuation
        piece = ' '.join(piece.split())
        if piece:
            tokens.append(piece)
    return tokens

# Tokenise every ingredient string and store the resulting list next to it.
df['tokens'] = df['ingredients'].map(clean_and_tokenize)

# Preview the first rows: raw text alongside its tokens.
print(df.loc[:, ['ingredients', 'tokens']].head())

In [None]:
from gensim.models import Word2Vec

# Each product's token list is one training "sentence".
sentences = df['tokens'].tolist()

# Train Word2Vec embeddings over the ingredient tokens.
w2v = Word2Vec(
    sentences,
    vector_size=100,  # embedding dimension
    window=5,
    min_count=5,      # tokens seen fewer than 5 times are left out of the vocabulary
    sg=1,             # 1 selects the skip-gram training algorithm
    epochs=10
)

# Smoke test: look up one token. The membership check avoids a KeyError (which
# would stop a full notebook run) when the token did not reach min_count;
# vec_tomate is None in that case.
vec_tomate = w2v.wv['tomate'] if 'tomate' in w2v.wv else None
if vec_tomate is None:
    print("'tomate' is not in the Word2Vec vocabulary (check tokenisation / min_count)")


In [None]:
import numpy as np

def list_embedding(tokens, model):
    """Return the mean embedding of the tokens that ``model`` knows.

    Args:
        tokens: iterable of token strings.
        model: object exposing ``wv`` (supporting ``token in wv`` and
            ``wv[token]``) and an integer ``vector_size``.

    Returns:
        numpy.ndarray: 1-D array, the element-wise mean of the vectors of the
        known tokens. When no token is known, a float32 zero vector of length
        ``model.vector_size``.
    """
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    if not vecs:
        # gensim keeps word vectors in float32; a default (float64) zero vector
        # would make np.vstack upcast the whole feature matrix to float64.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vecs, axis=0)

# Attach one averaged embedding vector to every product.
df['list_emb'] = df['tokens'].map(lambda token_list: list_embedding(token_list, w2v))

In [None]:
# Check for the target column first: without it the cell would only fail after
# the (potentially expensive) stacking below, and with a bare KeyError: 'score'.
if 'score' not in df.columns:
    raise KeyError(
        "df has no 'score' column; add the target score to the DataFrame "
        "before building y"
    )

# Feature matrix: the per-product embedding vectors stacked row by row.
X = np.vstack(df['list_emb'].values)

# Target: the score divided by 100 (presumably rescaling a 0-100 score to
# [0, 1] - TODO confirm the score range).
y = df['score'].values / 100.0


In [None]:
import torch, torch.nn as nn, torch.optim as optim

class ScoringNet(nn.Module):
    """Small feed-forward network mapping an embedding vector to one score in [0, 1].

    Architecture: Linear(emb_dim, 64) -> ReLU -> Linear(64, 1) -> Sigmoid.
    """

    def __init__(self, emb_dim=100):
        """Build the layer stack.

        Args:
            emb_dim: size of the input embedding vectors.
        """
        super().__init__()
        hidden_dim = 64
        layers = [
            nn.Linear(emb_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),  # scalar output
            nn.Sigmoid(),              # squashes the output into [0, 1]
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the resulting scores."""
        scores = self.net(x)
        return scores

# TODO: prepare the DataLoader:
#   convert X_train, y_train into a TensorDataset, then wrap it in a DataLoader.
# TODO: standard training loop with MSELoss and AdamW.

