# Food Explorer
Produced using garden-level FAOstat datasets

- [x] QCL
- [ ] FBS
- [ ] FBSH

## Parameters

In [1]:
# Directory in which the output dataset is created (used in the Export section).
dest_dir = "/tmp/food_explorer"

## Imports & paths

In [2]:
from owid import catalog
import pandas as pd
from pathlib import Path
import numpy as np

In [3]:
from etl.paths import BASE_DIR as base_path

In [4]:
# Input: garden-level FAOstat QCL dataset.
path_dataset_qcl = base_path.joinpath("data/garden/faostat/2021-03-18/faostat_QCL")
# Standardisation tables for items and elements.
path_map_item = base_path.joinpath(
    "etl/steps/data/garden/explorers/2021/food_explorer.items.std.csv"
)
path_map_elem = base_path.joinpath(
    "etl/steps/data/garden/explorers/2021/food_explorer.elements.std.csv"
)

## Load garden dataset

In [5]:
# Open the QCL dataset located at `path_dataset_qcl`.
qcl_garden = catalog.Dataset(path_dataset_qcl)

In [6]:
# Bulk data table (the element and item metadata tables are read further down).
qcl_bulk = qcl_garden["bulk"]

In [7]:
# Preview the first rows of the bulk table.
qcl_bulk.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Value
Country,Item Code,Element Code,Variable Name,Year,Flag,Unnamed: 6_level_1
Armenia,221,5312,"Almonds, with shell - Area harvested (ha)",1992,M,
Armenia,221,5312,"Almonds, with shell - Area harvested (ha)",1993,M,
Armenia,221,5312,"Almonds, with shell - Area harvested (ha)",1994,M,
Armenia,221,5312,"Almonds, with shell - Area harvested (ha)",1995,M,
Armenia,221,5312,"Almonds, with shell - Area harvested (ha)",1996,M,


## Select Flags

In [8]:
# Move the index levels back into columns and discard the "Variable Name" column.
qcl_bulk = qcl_bulk.reset_index().drop(columns=["Variable Name"])

In [9]:
# Row labels before and after dropping rows that repeat the same
# (Country, Item Code, Element Code, Year) combination.
key_columns = ["Country", "Item Code", "Element Code", "Year"]
deduplicated = qcl_bulk.drop_duplicates(subset=key_columns)
i_og = qcl_bulk.index.tolist()
i_ne = deduplicated.index.tolist()
print(f"og: {len(i_og)}, new: {len(i_ne)}")

og: 2796737, new: 2766049


In [10]:
# Checks
# Index by the identifying columns so that repeated keys show up as duplicated index entries.
qcl_bulk = qcl_bulk.set_index(["Country", "Item Code", "Element Code", "Year"])
# Marks every repetition of a key after its first occurrence.
dups = qcl_bulk.index.duplicated()
# Prints the number of repeated keys and whether it equals len(i_og) - len(i_ne).
print(f"{dups.sum()}, {len(i_ne) == len(i_og)-dups.sum()}")
# Marks all rows whose key occurs more than once, first occurrences included.
# NOTE(review): this second mask is not used again in the visible cells.
dups = qcl_bulk.index.duplicated(keep=False)
# Turn the key levels back into regular columns.
qcl_bulk = qcl_bulk.reset_index()

30688, True


In [11]:
# Create flag priority (add to df).
# When several rows share the same (Country, Item Code, Element Code, Year)
# key, the row with the HIGHEST priority number is the one kept below
# (ascending sort + keep="last").
flag_priorities = {
    "R": 0,
    "M": 1,
    "*": 2,
    "Fc": 6,
    "A": 7,
    "Im": 8,
    "F": 9,
    np.nan: 10,
}
# Work on plain objects so the mapping behaves the same whatever dtype
# the Flag column is stored with.
flags = qcl_bulk["Flag"].astype(object)
# Fail early and explicitly: a flag without a priority would otherwise stay in
# the priority column as a string and break the sort further down.
unknown_flags = set(flags.dropna().unique()) - set(flag_priorities)
if unknown_flags:
    raise ValueError(
        f"Flags without a defined priority: {sorted(map(str, unknown_flags))}"
    )
# Rows without a flag receive the priority stored under the NaN key.
qcl_bulk["Flag_priority"] = (
    flags.map(flag_priorities).fillna(flag_priorities[np.nan]).tolist()
)
# Rows without a value always lose against rows that have one.
qcl_bulk.loc[qcl_bulk["Value"].isna(), "Flag_priority"] = -1
# Remove duplicates based on Flag value. The sort is stable so that rows with
# equal priority keep their original relative order, which makes the row
# retained by keep="last" reproducible between runs.
qcl_bulk = qcl_bulk.sort_values("Flag_priority", kind="mergesort")
qcl_bulk = qcl_bulk.drop_duplicates(
    subset=["Country", "Item Code", "Element Code", "Year"], keep="last"
)
qcl_bulk = qcl_bulk.drop(columns=["Flag_priority", "Flag"])
qcl_bulk.shape

(2766049, 5)

## Element Overview

In [12]:
# In how many distinct items does each element appear?
res = qcl_bulk.reset_index().groupby("Element Code")["Item Code"].nunique()
# Element table read from `path_map_elem`, indexed by its "code" column.
df = pd.read_csv(path_map_elem, index_col="code")
# Readable label per element code: "<name> -- <unit> -- <code>".
elem_map = df["name"] + " -- " + df["unit"] + " -- " + df.index.astype(str)
# Show the counts under the readable labels, largest first.
res.rename(index=elem_map.to_dict()).sort_values(ascending=False)

Element Code
Production -- tonnes -- 5510                          281
Area harvested -- ha -- 5312                          172
Yield -- hg/ha -- 5419                                171
Producing Animals/Slaughtered -- Head -- 5320          31
Yield/Carcass Weight -- hg/An -- 5417                  14
Stocks -- Head -- 5111                                 12
Yield -- hg/An -- 5420                                 10
Producing Animals/Slaughtered -- 1000 Head -- 5321      8
Yield/Carcass Weight -- 0.1g/An -- 5424                 8
Stocks -- 1000 Head -- 5112                             7
Laying -- 1000 Head -- 5313                             3
Yield -- 100mg/An -- 5410                               3
Yield -- hg -- 5422                                     2
Production -- 1000 No -- 5513                           2
Stocks -- No -- 5114                                    1
Name: Item Code, dtype: int64

## Reshape dataset

In [13]:
# Reshape from one row per element to one column per element code.
qcl_bulk = qcl_bulk.reset_index().pivot(
    index=["Country", "Item Code", "Year"],
    columns="Element Code",
    values="Value",
)

---

## Rename Elements & Items

### Input from Hannah (OPTIONAL)
The purpose of this optional step is to:
- Generate table with item and element codes
- Share with Hannah
- Get standardisations for elements and items 

#### Elements

In [14]:
# Element metadata table of the QCL dataset.
qcl_elem = qcl_garden["meta_element"]

In [15]:
# Final column names of the element overview.
element_column_names = {
    "Element Code": "code",
    0: "number_occurrences",
    "Element": "name",
    "Unit": "unit",
    "Unit Description": "unit_description",
}
elements = (
    # Number of non-null values per element column, most frequent first
    pd.DataFrame(qcl_bulk.notna().sum())
    .reset_index()
    .sort_values(0, ascending=False)
    # Add names + unit info
    .merge(
        qcl_elem[["Element", "Unit", "Unit Description"]],
        left_on="Element Code",
        right_index=True,
    )
    # Rename
    .rename(columns=element_column_names)
)
elements = elements[["code", "name", "unit", "unit_description", "number_occurrences"]]

In [16]:
# Write the element overview to a CSV file in the current working directory.
elements.to_csv("ign.food.elements.csv", index=False)

#### Items

In [17]:
# Item metadata table of the QCL dataset.
qcl_item = qcl_garden["meta_item"]

In [18]:
# Work on string versions of all columns so that codes are compared as text.
item_meta = qcl_item.reset_index().astype(str)

# Group: item group code -> item group name
group_cols = ["Item Group Code", "Item Group"]
group_pairs = item_meta.drop_duplicates(subset=group_cols)[group_cols]
map_item_g = dict(zip(group_pairs[group_cols[0]], group_pairs[group_cols[1]]))

# Item: item code -> item name
item_cols = ["Item Code", "Item"]
item_pairs = item_meta.drop_duplicates(subset=item_cols)[item_cols]
map_item = dict(zip(item_pairs[item_cols[0]], item_pairs[item_cols[1]]))

# Correct: a code that is used both as an item and as a group is kept only in
# the group mapping. (The previous version of this cell asserted that the
# intersection of both key sets equals itself, which is always true, so that
# check has been removed.)
map_item = {k: v for k, v in map_item.items() if k not in map_item_g}

In [19]:
# Load item occurrences: number of rows of `qcl_bulk` per item code.
# The index and the counts are named explicitly instead of renaming the
# columns produced by `value_counts().reset_index()`, because those default
# names differ between pandas 1.x ("index", "Item Code") and pandas 2.x
# ("Item Code", "count").
# NOTE: the spelling "number_occurences" is kept on purpose, it is the column
# name written to the exported CSV.
items = (
    qcl_bulk.reset_index()["Item Code"]
    .value_counts()
    .rename_axis("code")
    .reset_index(name="number_occurences")
    .astype(str)
)
# Add flag for groups
items["type"] = items["code"].isin(map_item_g).apply(lambda x: "Group" if x else None)
# Add name (codes without a known name keep the code itself as name)
map_item_all = {**map_item, **map_item_g}
items["name"] = items.code.replace(map_item_all)
# Order columns
items = items[["code", "name", "type", "number_occurences"]]

In [20]:
# Write the item overview to a CSV file in the current working directory.
items.to_csv("ign.food.items.csv", index=False)

### Renaming
#### Element

In [21]:
# Element standardisation table, restricted to rows that have a standardised name.
df = pd.read_csv(path_map_elem, index_col="code").dropna(
    subset=["name_standardised"]
)

In [22]:
# Keep only the element columns listed in the standardisation table.
qcl_selected = qcl_bulk[df.index]
# Multiply every column by the `unit_factor` of its element code.
unit_factors = df.loc[qcl_selected.columns, "unit_factor"]
qcl_bulk = qcl_selected.multiply(unit_factors)

In [23]:
# Merge 5417,5420,5424,5410 --> 5417
# Gaps in 5417 are filled from the other columns, in this order of preference.
merged_yield = qcl_bulk[5417]
for fallback_code in (5420, 5424, 5410):
    merged_yield = merged_yield.fillna(qcl_bulk[fallback_code])
qcl_bulk[5417] = merged_yield
qcl_bulk = qcl_bulk.drop(columns=[5420, 5424, 5410])

In [24]:
def _snake(text):
    """Lower-case `text` and replace its spaces with underscores."""
    return text.lower().replace(" ", "_")


# Build element name as "<name>__<unit>"
name_part = df["name_standardised"].apply(_snake).astype(str)
unit_part = df["unit_name_standardised_with_conversion"].apply(_snake).astype(str)
df["element_name"] = (name_part + "__" + unit_part).tolist()
# Obtain dict Element Code -> element name
map_elem = df["element_name"].to_dict()
# Change column names
qcl_bulk = qcl_bulk.rename(columns=map_elem)

#### Item

In [25]:
# Item standardisation table: item code -> standardised name,
# skipping items that have no standardised name.
df = pd.read_csv(path_map_item, index_col="code")
map_item_std = df["name_standardised"].dropna().to_dict()

In [26]:
qcl_bulk = qcl_bulk.reset_index()
# Keep only the items that have a standardised name.
has_std_name = qcl_bulk["Item Code"].isin(map_item_std)
# `assign` builds a new frame, which avoids writing into a filtered slice
# (SettingWithCopyWarning). After the filter every code is a key of
# `map_item_std`, so `map` yields the same names as a dict-based `replace`
# while being considerably faster on a large frame.
qcl_bulk = (
    qcl_bulk.loc[has_std_name]
    .assign(Product=lambda frame: frame["Item Code"].map(map_item_std).tolist())
    .drop(columns=["Item Code"])
)

## Final processing

In [27]:
# Set index: rows are identified by product, country and year.
qcl_bulk = qcl_bulk.set_index(["Product", "Country", "Year"])

In [28]:
# Drop rows where every column is null (some products don't have any value
# for the elements of interest).
qcl_bulk = qcl_bulk.dropna(how="all")

In [29]:
# Shape of the final table, followed by a preview of its first rows.
print(qcl_bulk.shape)
qcl_bulk.head()

(829841, 4)


Unnamed: 0_level_0,Unnamed: 1_level_0,Element Code,yield__kg/animal,yield__tonnes/ha,area_harvested__ha,production__tonnes
Product,Country,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Wheat,Afghanistan,1961,,1.022,2230000.0,2279000.0
Wheat,Afghanistan,1962,,0.9735,2341000.0,2279000.0
Wheat,Afghanistan,1963,,0.8317,2341000.0,1947000.0
Wheat,Afghanistan,1964,,0.951,2345000.0,2230000.0
Wheat,Afghanistan,1965,,0.9723,2347000.0,2282000.0


## Export

In [30]:
from owid.catalog.meta import DatasetMeta

In [31]:
# Initialize dataset: create an empty dataset in `dest_dir`.
fe_garden = catalog.Dataset.create_empty(dest_dir)
# Sources and licenses are taken over from the QCL dataset the data comes from.
fe_garden.metadata = DatasetMeta(
    namespace="explorers",
    short_name="food_explorer",
    sources=qcl_garden.metadata.sources,
    licenses=qcl_garden.metadata.licenses,
)
# NOTE(review): presumably persists the metadata set above — confirm against owid.catalog.
fe_garden.save()

### In bulk

Preserve the bulk file for QA or manual analysis.

In [32]:
# Wrap the full table, name it "bulk" and add it to the output dataset.
t = catalog.Table(qcl_bulk)
t.metadata.short_name = "bulk"
fe_garden.add(t)

### One file per product

To work in an explorer, the table needs to be added in CSV format. To keep the files small enough to load,
we split the dataset into many small files, one per product.

In [33]:
def to_short_name(raw):
    """Build a short name from a product name.

    The name is lower-cased, spaces become underscores, and commas,
    parentheses and periods are removed.
    """
    cleaned = raw.lower()
    for old, new in ((" ", "_"), (",", ""), ("(", ""), (")", ""), (".", "")):
        cleaned = cleaned.replace(old, new)
    return cleaned


# The first index level still lists values like "Asses" whose rows have already
# been filtered out of the data; drop those unused entries before iterating.
qcl_bulk.index = qcl_bulk.index.remove_unused_levels()

product_names = sorted(qcl_bulk.index.levels[0])
for product in product_names:
    short_name = to_short_name(product)
    print(f"{product} --> {short_name}.csv")

    # One table per product, holding only that product's rows
    product_table = catalog.Table(qcl_bulk.loc[[product]])
    product_table.metadata.short_name = short_name
    # The CSV format is chosen explicitly here
    fe_garden.add(product_table, format="csv")

Almonds --> almonds.csv
Apples --> apples.csv
Apricots --> apricots.csv
Areca nuts --> areca_nuts.csv
Artichokes --> artichokes.csv
Asparagus --> asparagus.csv
Avocados --> avocados.csv
Bananas --> bananas.csv
Barley --> barley.csv
Beans, dry --> beans_dry.csv
Beans, green --> beans_green.csv
Beef and Buffalo Meat --> beef_and_buffalo_meat.csv
Beeswax --> beeswax.csv
Blueberries --> blueberries.csv
Brazil nuts, with shell --> brazil_nuts_with_shell.csv
Broad beans --> broad_beans.csv
Buckwheat --> buckwheat.csv
Buffalo hides --> buffalo_hides.csv
Butter and Ghee --> butter_and_ghee.csv
Cabbages --> cabbages.csv
Canary seed --> canary_seed.csv
Carrots and turnips --> carrots_and_turnips.csv
Cashew nuts --> cashew_nuts.csv
Cassava --> cassava.csv
Castor oil seed --> castor_oil_seed.csv
Cattle hides --> cattle_hides.csv
Cauliflowers and broccoli --> cauliflowers_and_broccoli.csv
Cereals --> cereals.csv
Cheese --> cheese.csv
Cherries --> cherries.csv
Chestnut --> chestnut.csv
Chickpeas -->

Let's check that the biggest files are still an ok size for an explorer.

In [34]:
# Show the ten largest CSV files in the destination directory, biggest first.
!du -hs {dest_dir}/*.csv | sort -hr | head -n 10

700K	/tmp/food_explorer/oilcrops_oil_equivalent.csv
700K	/tmp/food_explorer/oilcrops_cake_equivalent.csv
636K	/tmp/food_explorer/roots_and_tubers.csv
612K	/tmp/food_explorer/milk_excluding_butter.csv
584K	/tmp/food_explorer/beef_and_buffalo_meat.csv
568K	/tmp/food_explorer/vegetables.csv
536K	/tmp/food_explorer/sheep_and_goat_meat.csv
524K	/tmp/food_explorer/oilcrops.csv
524K	/tmp/food_explorer/meat_poultry.csv
520K	/tmp/food_explorer/fruit.csv


The biggest is 712kB, we're ok ✓ 