In [2]:
import pandas as pd
import requests
import json
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


Extract chocolate product data using:
https://world.openfoodfacts.org/api/v2/search?categories=chocolates&fields=code,product_name,brands,nutriments&page_size=100&page=1

In [3]:
# Download product records from the Open Food Facts search API, one page at a
# time, and flatten the nested JSON into a single DataFrame.
# NOTE(review): the first rows displayed for this cell are bottled-water
# products, which suggests the "categories" filter is not being applied by the
# API. Confirm the correct filter parameter (possibly "categories_tags_en")
# against the Open Food Facts API documentation.
all_products = []
url = "https://world.openfoodfacts.org/api/v2/search"
for page in range(1, 121):
    params = {
        "categories": "chocolates",
        "fields": "code,product_name,brands,nutriments",
        "page_size": 100,
        "page": page
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, params=params, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error body as data.
    response.raise_for_status()
    data = response.json()

    # Stop as soon as a page comes back empty rather than requesting the
    # remaining pages for nothing.
    page_products = data["products"]
    if not page_products:
        break
    all_products.extend(page_products)

# Nested "nutriments" dicts become dotted columns such as "nutriments.fat".
df = pd.json_normalize(all_products)

df.head()

Unnamed: 0,brands,code,product_name,nutriments.carbohydrates,nutriments.carbohydrates_100g,nutriments.carbohydrates_serving,nutriments.carbohydrates_unit,nutriments.carbohydrates_value,nutriments.energy,nutriments.energy-kcal,...,nutriments.pantothenic-acid_prepared_100g,nutriments.vitamin-b9_prepared_100g,nutriments.vitamin-pp_prepared_100g,nutriments.phylloquinone_label,nutriments.plant-stanols,nutriments.plant-stanols_100g,nutriments.plant-stanols_label,nutriments.plant-stanols_serving,nutriments.plant-stanols_unit,nutriments.plant-stanols_value
0,Sidi Ali,6111035000430,Sidi Ali,42.0,4.2,42.0,g,42.0,20.0,0.0,...,,,,,,,,,,
1,Jaouda,6111242100992,Perly,9.4,9.4,9.4,g,9.4,406.0,97.0,...,,,,,,,,,,
2,sidi ali,6111035002175,Sidi Ali,,,,,,,,...,,,,,,,,,,
3,"Les Eaux Minérales d'oulmès,Sidi Ali",6111035000058,Eau minérale naturelle,,,,,,,,...,,,,,,,,,,
4,AQUAFINA,6111252421568,اكوافينا,0.0,0.0,0.0,g,0.0,0.0,0.0,...,,,,,,,,,,


In [4]:
# Strip the leading "nutriments." from the flattened nutrient column names.
# The pattern is anchored with ^ so only a true prefix is removed; an
# un-anchored replace would also delete the text from the middle of a name.
df.columns = df.columns.str.replace(r"^nutriments\.", "", regex=True)


df.head()

Unnamed: 0,brands,code,product_name,carbohydrates,carbohydrates_100g,carbohydrates_serving,carbohydrates_unit,carbohydrates_value,energy,energy-kcal,...,pantothenic-acid_prepared_100g,vitamin-b9_prepared_100g,vitamin-pp_prepared_100g,phylloquinone_label,plant-stanols,plant-stanols_100g,plant-stanols_label,plant-stanols_serving,plant-stanols_unit,plant-stanols_value
0,Sidi Ali,6111035000430,Sidi Ali,42.0,4.2,42.0,g,42.0,20.0,0.0,...,,,,,,,,,,
1,Jaouda,6111242100992,Perly,9.4,9.4,9.4,g,9.4,406.0,97.0,...,,,,,,,,,,
2,sidi ali,6111035002175,Sidi Ali,,,,,,,,...,,,,,,,,,,
3,"Les Eaux Minérales d'oulmès,Sidi Ali",6111035000058,Eau minérale naturelle,,,,,,,,...,,,,,,,,,,
4,AQUAFINA,6111252421568,اكوافينا,0.0,0.0,0.0,g,0.0,0.0,0.0,...,,,,,,,,,,


In [5]:
# Raw API column name -> analysis-friendly name.
# Insertion order also defines the column order of the resulting frame.
column_renames = {
    "energy-kcal_value": "energy_kcal",
    "energy-kj_value": "energy_kj",
    "carbohydrates_value": "carbohydrates",
    "sugars_value": "sugars",
    "fat_value": "fat",
    "saturated-fat_value": "saturated_fat",
    "proteins_value": "proteins",
    "fiber_value": "fiber",
    "salt_value": "salt",
    "sodium_value": "sodium",
    "nova-group": "nova_group",
    "nutrition-score-fr": "nutrition_score",
    "fruits-vegetables-nuts-estimate-from-ingredients_100g": "fruits_veg_nuts_percent",
}

# Identifier columns that are kept under their existing names.
id_cols = ["brands", "code", "product_name"]

# Single source of truth: every column to keep, nutrients first, then identifiers.
required_cols = [*column_renames, *id_cols]

# Report columns that are absent instead of dropping them silently. Re-running
# this cell on an already-renamed frame lists every raw nutrient name here,
# which makes that mistake visible.
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    print(f"Columns not found and skipped: {missing_cols}")

# Select first, then rename: the frame also holds un-suffixed columns such as
# "carbohydrates", so renaming before selecting would create duplicate names.
# rename() returns a new frame, so no in-place mutation of a selection is needed.
df = (
    df[[col for col in required_cols if col in df.columns]]
    .rename(columns=column_renames)
)

df.head()

Unnamed: 0,energy_kcal,energy_kj,carbohydrates,sugars,fat,saturated_fat,proteins,fiber,salt,sodium,nova_group,nutrition_score,fruits_veg_nuts_percent,brands,code,product_name
0,0.0,20.0,42.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,0.0,Sidi Ali,6111035000430,Sidi Ali
1,97.0,406.0,9.4,,3.0,,8.0,,,,3.0,,0.0,Jaouda,6111242100992,Perly
2,,,,,,,,,65.0,26.0,,0.0,0.0,sidi ali,6111035002175,Sidi Ali
3,,,,,,,,,65.0,26.0,1.0,0.0,0.0,"Les Eaux Minérales d'oulmès,Sidi Ali",6111035000058,Eau minérale naturelle
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00508,0.002032,,0.0,0.0,AQUAFINA,6111252421568,اكوافينا


In [6]:
# Persist the selected and renamed columns to disk, leaving out the row index
df.to_csv(path_or_buf="choco.csv", index=False)

In [7]:
# Reload the saved CSV. Barcodes are identifiers, not quantities: reading
# "code" as text keeps any leading zeros that integer inference would strip
# and keeps the column comparable with the string codes returned by the API.
choco_df = pd.read_csv("choco.csv", dtype={"code": str})
choco_df.head()

Unnamed: 0,energy_kcal,energy_kj,carbohydrates,sugars,fat,saturated_fat,proteins,fiber,salt,sodium,nova_group,nutrition_score,fruits_veg_nuts_percent,brands,code,product_name
0,0.0,20.0,42.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,0.0,Sidi Ali,6111035000430,Sidi Ali
1,97.0,406.0,9.4,,3.0,,8.0,,,,3.0,,0.0,Jaouda,6111242100992,Perly
2,,,,,,,,,65.0,26.0,,0.0,0.0,sidi ali,6111035002175,Sidi Ali
3,,,,,,,,,65.0,26.0,1.0,0.0,0.0,"Les Eaux Minérales d'oulmès,Sidi Ali",6111035000058,Eau minérale naturelle
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00508,0.002032,,0.0,0.0,AQUAFINA,6111252421568,اكوافينا
