In [8]:
import gzip
import os
import shutil
import time
import warnings
from datetime import datetime
from io import BytesIO

import pandas as pd
import requests
from requests.exceptions import RequestException

# Notebook-wide display/warning configuration.
# Show at most 50 rows when a DataFrame is rendered.
pd.set_option('display.max_rows', 50)
# Silence pandas' DtypeWarning for the rest of the session.
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

In [9]:
# Remote archive to fetch.
download_url = "https://static.openfoodfacts.org/data/openfoodfacts-products.jsonl.gz"
# Prefix used in the names of the files this notebook writes.
file_id = "03"
# NOTE(review): absolute, machine-specific path; the trailing slash is required
# because the paths below are built by plain string concatenation.
project_path = "/home/carolus/Documents/school/green_ia/"
# Local destination of the downloaded (still compressed) archive.
jsonl_gz = f"{project_path}data/{file_id}_openfoodfacts_00.jsonl.gz"

In [10]:
# Echo the configuration so the run is self-describing (one field per line).
print(
    f"download_url: {download_url} \n"
    f"file_id: {file_id} \n"
    f"project_path: {project_path} \n"
    f"jsonl_gz: {jsonl_gz}"
)

download_url: https://static.openfoodfacts.org/data/openfoodfacts-products.jsonl.gz 
file_id: 03 
project_path: /home/carolus/Documents/school/green_ia/ 
jsonl_gz: /home/carolus/Documents/school/green_ia/data/03_openfoodfacts_00.jsonl.gz


In [11]:
# resumable download helper
def download_file(download_url, jsonl_gz, timeout=(10, 60), retry_delay=5):
    """Download ``download_url`` into ``jsonl_gz``, resuming a partial file.

    The size of any existing ``jsonl_gz`` is sent as the start offset of an
    HTTP ``Range`` request. Network errors (``RequestException``, which
    includes timeouts) are printed and the request is retried after
    ``retry_delay`` seconds, continuing from whatever is on disk by then.
    An unexpected HTTP status is printed as an error and ends the attempt.

    Args:
        download_url: URL of the file to fetch.
        jsonl_gz: Local path to write to.
        timeout: Forwarded to ``requests.get`` so a stalled connection raises
            instead of blocking forever; ``(connect, read)`` seconds.
        retry_delay: Seconds to wait before retrying after a network error.

    Returns:
        None. The outcome is reported through ``print`` only; HTTP errors do
        not raise.
    """
    print("start downloading file from Open Food Facts")
    while True:
        try:
            # bytes already on disk; 0 means nothing to resume
            file_size = os.path.getsize(jsonl_gz) if os.path.exists(jsonl_gz) else 0

            headers = {"range": f"bytes={file_size}-"}
            # context manager releases the streamed connection on every exit path
            with requests.get(
                download_url, headers=headers, stream=True, timeout=timeout
            ) as response:
                status = response.status_code

                if status in (200, 206):
                    # Append only when the server honoured the range (206).
                    # A 200 carries the whole file from byte 0, so appending it
                    # to a partial file would corrupt the archive.
                    mode = 'ab' if (status == 206 and file_size) else 'wb'
                    with open(jsonl_gz, mode) as file:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                file.write(chunk)
                    print(f"downloaded: {jsonl_gz}")
                    break  # finished

                if status == 416 and file_size:
                    # The requested start is at/after the end of the remote
                    # file. Servers normally answer with
                    # "Content-Range: bytes */<total>"; if <total> matches what
                    # is on disk, the file is already complete.
                    total = response.headers.get("Content-Range", "").rpartition("/")[2]
                    if total.isdigit() and int(total) == file_size:
                        print(f"already downloaded: {jsonl_gz}")
                        break

                print(f"ERROR while downloading: {status}")
                break  # give up on an unexpected status

        except RequestException as e:
            print(f"warning, continue downloading: {e}")
            # back off instead of hammering the server in a tight loop
            time.sleep(retry_delay)

In [12]:
# decompress the gzipped JSONL archive
def un_gz_file(jsonl_gz, file_id, project_path):
    """Decompress ``jsonl_gz`` into ``<project_path>data/<file_id>_openfoodfacts_00.jsonl``.

    Args:
        jsonl_gz: Path of the gzip archive to read.
        file_id: Prefix placed at the start of the output file name.
        project_path: Base directory, concatenated as-is (so it must end with
            a path separator).

    Returns:
        The path of the decompressed file.
    """
    print("start unzziping jsonl compressed")
    target_path = "".join((project_path, "data/", file_id, "_openfoodfacts_00.jsonl"))

    # stream from the archive to the output file without loading it in memory
    with gzip.open(jsonl_gz, 'rb') as compressed, open(target_path, 'wb') as extracted:
        shutil.copyfileobj(compressed, extracted)

    print(f'unzziping completed: {target_path}')
    return target_path

In [13]:
def delete_file(file_path):
    """Remove ``file_path`` if it exists, printing what happened either way.

    Args:
        file_path: Path of the file to delete.

    Returns:
        None. A missing path is reported with a message, not an exception.
    """
    if not os.path.exists(file_path):
        print(f"ERROR, does not exists: {file_path}")
        return

    os.remove(file_path)
    print(f"file deleted: {file_path}")


In [14]:
# main pipeline: fetch the archive, decompress it, then remove the archive
# download (or resume) the compressed dump into jsonl_gz
download_file(download_url, jsonl_gz)
# decompress it; `jsonl` is bound to the path of the extracted file
jsonl = un_gz_file(jsonl_gz, file_id, project_path)
# the compressed archive is no longer needed once extraction has returned
delete_file(jsonl_gz)

start downloading file from Open Food Facts
downloaded: /home/carolus/Documents/school/green_ia/data/03_openfoodfacts_00.jsonl.gz
start unzziping jsonl compressed
unzziping completed: /home/carolus/Documents/school/green_ia/data/03_openfoodfacts_00.jsonl
file deleted: /home/carolus/Documents/school/green_ia/data/03_openfoodfacts_00.jsonl.gz
