In [1]:
import gzip
import os
import shutil
import time
import warnings
from datetime import datetime
from io import BytesIO

import pandas as pd
import requests
from requests.exceptions import RequestException

# Cap DataFrame previews at 50 rows and silence pandas' DtypeWarning.
pd.options.display.max_rows = 50
warnings.simplefilter("ignore", category=pd.errors.DtypeWarning)

In [2]:
# Remote dump to fetch and the local files derived from it.
url = "https://static.openfoodfacts.org/data/openfoodfacts-products.jsonl.gz"
fileNbr = '04'
projectPath = "/home/carolus/Documents/school/green_ia/"
jsonGz = f"{projectPath}data/{fileNbr}_openfoodfacts.jsonl.gz"
csvPath = f"{projectPath}data/{fileNbr}_openfoodfacts.csv"

# Product fields of interest (in the cells shown here this list is only logged).
colToSave = [
    'allergens_from_ingredients', 'nutriscore_tags', 'labels_old',
    'categories_old', 'pnns_groups_1', 'ecoscore_data',
    'brand_owner_imported', 'ingredients_tags', 'packaging',
    'ingredients_hierarchy', 'product_name', 'food_groups_tags',
    'ecoscore_tags', 'nova_group',
    'ingredients_from_or_that_may_be_from_palm_oil_n',
    'categories_tags', 'brand_owner', 'nutrient_levels_tags',
    'allergens_tags', 'ecoscore_extended_data', 'categories',
    'nutriments', 'nutriscore_2021_tags', 'additives_old_n',
    'ecoscore_score', 'labels_tags', 'countries',
]

# Timestamp of this run, with millisecond precision:
# a readable form for log lines and a digits-only form for file names.
currentDateTime = datetime.now()
formattedDate = f"{currentDateTime:%d/%m/%Y %H:%M:%S}.{currentDateTime.microsecond // 1000:03d}"
dateCode = f"{currentDateTime:%d%m%Y%H%M%S}{currentDateTime.microsecond // 1000:03d}"

In [3]:
def addLogs(logData):
    """Print a message and append it to this run's log file.

    The log file is ``<projectPath>logs/<dateCode>_logs.txt``; ``projectPath``
    and ``dateCode`` are module-level globals set in the configuration cell.

    Parameters
    ----------
    logData : object
        Message to record. It is printed as-is and written followed by a
        newline.
    """
    print(logData)
    logDir = f"{projectPath}logs"
    # Nothing in the visible notebook creates the logs folder before the first
    # call, so create it here instead of failing with FileNotFoundError.
    os.makedirs(logDir, exist_ok=True)
    # Explicit encoding so non-ASCII messages do not depend on the locale.
    with open(f"{logDir}/{dateCode}_logs.txt", "a", encoding="utf-8") as logFile:
        logFile.write(f'{logData}\n')

In [4]:
# Record when the run started and the full configuration it uses.
addLogs(f"start date: {formattedDate}")
addLogs(" \n".join([
    f"url: {url}",
    f"fileNbr: {fileNbr}",
    f"projectPath: {projectPath}",
    f"jsonGz: {jsonGz}",
    f"csvPath: {csvPath}",
    f"colToSave: {colToSave}",
]))

start date: 10/08/2024 16:30:04.706
url: https://static.openfoodfacts.org/data/openfoodfacts-products.jsonl.gz 
fileNbr: 04 
projectPath: /home/carolus/Documents/school/green_ia/ 
jsonGz: /home/carolus/Documents/school/green_ia/data/04_openfoodfacts.jsonl.gz 
csvPath: /home/carolus/Documents/school/green_ia/data/04_openfoodfacts.csv 
colToSave: ['allergens_from_ingredients', 'nutriscore_tags', 'labels_old', 'categories_old', 'pnns_groups_1', 'ecoscore_data', 'brand_owner_imported', 'ingredients_tags', 'packaging', 'ingredients_hierarchy', 'product_name', 'food_groups_tags', 'ecoscore_tags', 'nova_group', 'ingredients_from_or_that_may_be_from_palm_oil_n', 'categories_tags', 'brand_owner', 'nutrient_levels_tags', 'allergens_tags', 'ecoscore_extended_data', 'categories', 'nutriments', 'nutriscore_2021_tags', 'additives_old_n', 'ecoscore_score', 'labels_tags', 'countries']


In [5]:
def downloadFile(url, jsonGz, timeout=60, retryDelay=5):
    """Download ``url`` to ``jsonGz``, resuming a partial file when possible.

    If ``jsonGz`` already exists and is non-empty, an HTTP ``Range`` request
    asks only for the missing bytes. Network errors are logged and the
    download is retried (resuming from what is already on disk) until it
    completes or the server answers with an unexpected status code.

    Parameters
    ----------
    url : str
        Address of the file to download.
    jsonGz : str
        Destination path on disk.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout, so a stalled
        connection raises and is retried instead of hanging forever.
    retryDelay : float, optional
        Seconds to wait after a network error before retrying.

    Returns
    -------
    None
        Progress and errors are reported through ``addLogs``.
    """
    addLogs("start downloading file from Open Food Facts")
    while True:
        try:
            # Size of what is already on disk decides where to resume from.
            fileSize = os.path.getsize(jsonGz) if os.path.exists(jsonGz) else 0
            headers = {"Range": f"bytes={fileSize}-"} if fileSize else None

            with requests.get(url, headers=headers, stream=True, timeout=timeout) as response:
                if response.status_code not in (200, 206):
                    addLogs(f"ERROR while downloading: {response.status_code}")
                    break  # unexpected status: give up

                # Append only when the server actually honoured the range
                # (206). A 200 carries the whole file from byte 0, so the
                # partial file must be overwritten, not extended.
                mode = 'ab' if response.status_code == 206 else 'wb'
                with open(jsonGz, mode) as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            file.write(chunk)

            addLogs(f"downloaded: {jsonGz}")
            break  # download finished

        except RequestException as e:
            addLogs(f"warning, continue downloading: {e}")
            # Pause before retrying so an outage does not turn into a busy
            # loop that floods the log file.
            time.sleep(retryDelay)

In [6]:
def unGzFile(jsonGz, fileNbr, projectPath):
    """Decompress a gzip archive into the project's ``data/`` folder.

    Parameters
    ----------
    jsonGz : str
        Path of the gzip-compressed file to read.
    fileNbr : str
        Prefix used in the output file name.
    projectPath : str
        Project root; the output path is built by plain concatenation as
        ``<projectPath>data/<fileNbr>_openfoodfacts.jsonl``.

    Returns
    -------
    str
        Path of the decompressed file.
    """
    addLogs("start unzziping jsonl compressed")
    jsonl = "".join([projectPath, "data/", fileNbr, "_openfoodfacts.jsonl"])

    # Stream the bytes across so the archive is never loaded whole in memory.
    with gzip.open(jsonGz, 'rb') as packed, open(jsonl, 'wb') as plain:
        shutil.copyfileobj(packed, plain)

    addLogs(f'unzziping completed: {jsonl}')
    return jsonl

In [7]:
def convertToCsv(jsonl, fileNbr, projectPath, columns=None, chunksize=10000):
    """Stream a JSON Lines file into a single CSV file, chunk by chunk.

    Parameters
    ----------
    jsonl : str
        Path of the JSON Lines file to read.
    fileNbr : str
        Prefix used in the output file name.
    projectPath : str
        Project root; the CSV path is built by plain concatenation as
        ``<projectPath>data/<fileNbr>_openfoodfacts_00.csv``.
    columns : list of str, optional
        Columns to write, in this order. A column missing from a chunk is
        written as empty cells. When omitted, the columns of the first chunk
        define the layout of the whole file.
    chunksize : int, optional
        Number of JSON lines parsed per chunk.

    Returns
    -------
    str
        Path of the CSV file.
    """
    addLogs("converting jsonl file to csv file")
    heavyCsv = projectPath + "data/" + fileNbr + '_openfoodfacts_00.csv'
    schema = list(columns) if columns is not None else None
    ignored = set()

    # The reader is a context manager; leaving the block closes the input file.
    with pd.read_json(jsonl, lines=True, chunksize=chunksize) as chunkIter:
        for i, chunk in enumerate(chunkIter):
            if schema is None:
                schema = list(chunk.columns)
            ignored.update(chunk.columns.difference(schema))
            # A chunk only has the keys present in its own lines. Appended
            # chunks are written without a header, so every chunk must be
            # aligned to one schema or cells end up under the wrong column.
            chunk = chunk.reindex(columns=schema)
            chunk.to_csv(
                heavyCsv,
                mode='w' if i == 0 else 'a',
                header=(i == 0),
                index=False,
                escapechar='\\',
            )

    if columns is None and ignored:
        addLogs(f"warning, {len(ignored)} columns absent from the first chunk were not written")
    addLogs(f"convert jsonl to heavy csv terminated: {heavyCsv}")
    return heavyCsv

In [8]:
def deleteFile(filePath):
    """Remove ``filePath`` if it exists and log the outcome.

    A missing path is reported through ``addLogs`` rather than raised.
    """
    if not os.path.exists(filePath):
        addLogs(f"ERROR, does not exists: {filePath}")
        return

    os.remove(filePath)
    addLogs(f"file deleted: {filePath}")


In [9]:
def createFolder(folderPath):
    """Create ``folderPath`` (and missing parents) and log the outcome.

    An already existing folder is not an error. Any ``OSError`` is logged
    through ``addLogs`` instead of being raised.
    """
    try:
        os.makedirs(folderPath, exist_ok=True)
        successMessage = f"new folder: {folderPath}"
        addLogs(successMessage)
    except OSError as err:
        addLogs(f"ERROR while creating folder: {err}")

In [10]:
# Main pipeline: download the compressed dump, decompress it, then convert it to CSV.
# Each step runs unconditionally on the path produced by the previous one.
# NOTE(review): downloadFile returns nothing, so a failed download is not detected here.
downloadFile(url, jsonGz)
jsonl = unGzFile(jsonGz, fileNbr, projectPath)
# Removal of the compressed archive is currently disabled:
#deleteFile(jsonGz)
heavyCsv = convertToCsv(jsonl, fileNbr, projectPath)
# Removal of the decompressed JSONL file is currently disabled:
#deleteFile(jsonl)

start downloading file from Open Food Facts
downloaded: /home/carolus/Documents/school/green_ia/data/04_openfoodfacts.jsonl.gz
start unzziping jsonl compressed
unzziping completed: /home/carolus/Documents/school/green_ia/data/04_openfoodfacts.jsonl
converting jsonl file to csv file
convert jsonl to heavy csv terminated: /home/carolus/Documents/school/green_ia/data/04_openfoodfacts_00.csv


In [11]:
# Record when the run finished, with millisecond precision.
currentDateTime = datetime.now()
formattedDate = f"{currentDateTime:%d/%m/%Y %H:%M:%S}.{currentDateTime.microsecond // 1000:03d}"
addLogs(f"end date: {formattedDate}")

end date: 10/08/2024 17:41:04.319
