In [1]:
import pandas as pd

In [2]:
# Directory holding the raw FAOSTAT CSV exports, relative to this notebook.
# Forward slashes are used because they resolve on Windows as well as on
# Linux/macOS; backslash-separated paths only resolve on Windows.
RAW_DIR = "../data/FAOSTAT/raw"

# Load each "(Normalized)" FAOSTAT export into its own DataFrame.
fao_fbs = pd.read_csv(f"{RAW_DIR}/FoodBalanceSheets_E_All_Data_(Normalized).csv")
fao_fs = pd.read_csv(f"{RAW_DIR}/Food_Security_Data_E_All_Data_(Normalized).csv")
fao_emissions = pd.read_csv(f"{RAW_DIR}/Climate_change_Emissions_indicators_E_All_Data_(Normalized).csv")
fao_cpi = pd.read_csv(f"{RAW_DIR}/ConsumerPriceIndices_E_All_Data_(Normalized).csv")
fao_population = pd.read_csv(f"{RAW_DIR}/Population_E_All_Data_(Normalized).csv")
fao_prices = pd.read_csv(f"{RAW_DIR}/Prices_E_All_Data_(Normalized).csv")
fao_production_indices = pd.read_csv(f"{RAW_DIR}/Production_Indices_E_All_Data_(Normalized).csv")

In [3]:
# Registry of the raw FAOSTAT frames, keyed by the name of the variable that holds each one
dfs = dict(
    fao_fbs=fao_fbs,
    fao_fs=fao_fs,
    fao_emissions=fao_emissions,
    fao_cpi=fao_cpi,
    fao_population=fao_population,
    fao_prices=fao_prices,
    fao_production_indices=fao_production_indices,
)

## Relevant Columns

In [4]:
# List the column labels of every raw dataframe, one line per dataframe
for name, df in dfs.items():
    print(name, df.columns.to_list(), sep=": ")

fao_fbs: ['Area Code', 'Area Code (M49)', 'Area', 'Item Code', 'Item Code (FBS)', 'Item', 'Element Code', 'Element', 'Year Code', 'Year', 'Unit', 'Value', 'Flag', 'Note']
fao_fs: ['Area Code', 'Area Code (M49)', 'Area', 'Item Code', 'Item', 'Element Code', 'Element', 'Year Code', 'Year', 'Unit', 'Value', 'Flag', 'Note']
fao_emissions: ['Area Code', 'Area Code (M49)', 'Area', 'Item Code', 'Item', 'Element Code', 'Element', 'Year Code', 'Year', 'Unit', 'Value', 'Flag']
fao_cpi: ['Area Code', 'Area Code (M49)', 'Area', 'Item Code', 'Item', 'Element Code', 'Element', 'Months Code', 'Months', 'Year Code', 'Year', 'Unit', 'Value', 'Flag', 'Note']
fao_population: ['Area Code', 'Area Code (M49)', 'Area', 'Item Code', 'Item', 'Element Code', 'Element', 'Year Code', 'Year', 'Unit', 'Value', 'Flag', 'Note']
fao_prices: ['Area Code', 'Area Code (M49)', 'Area', 'Item Code', 'Item Code (CPC)', 'Item', 'Element Code', 'Element', 'Year Code', 'Year', 'Months Code', 'Months', 'Unit', 'Value', 'Flag']
f

In [5]:
# Strip redundant metadata columns: any column whose label contains "code" or
# "note" (case-insensitive). The trimmed frames are stored in a separate
# dictionary, so the frames held in `dfs` are not modified.
dfs_relevant = {}
for name, df in dfs.items():
    cols_to_drop = [
        col
        for col in df.columns
        if 'code' in col.lower() or 'note' in col.lower()
    ]
    dfs_relevant[name] = df.drop(columns=cols_to_drop)

# Show which columns remain in each trimmed dataframe
for name, df in dfs_relevant.items():
    print(name, df.columns.to_list(), sep=": ")

fao_fbs: ['Area', 'Item', 'Element', 'Year', 'Unit', 'Value', 'Flag']
fao_fs: ['Area', 'Item', 'Element', 'Year', 'Unit', 'Value', 'Flag']
fao_emissions: ['Area', 'Item', 'Element', 'Year', 'Unit', 'Value', 'Flag']
fao_cpi: ['Area', 'Item', 'Element', 'Months', 'Year', 'Unit', 'Value', 'Flag']
fao_population: ['Area', 'Item', 'Element', 'Year', 'Unit', 'Value', 'Flag']
fao_prices: ['Area', 'Item', 'Element', 'Year', 'Months', 'Unit', 'Value', 'Flag']
fao_production_indices: ['Area', 'Item', 'Element', 'Year', 'Unit', 'Value', 'Flag']


In [6]:
# Report the dtype of every remaining column, one section per dataframe
for name, df in dfs_relevant.items():
    print(f"\n=== {name} ===", df.dtypes, sep="\n")


=== fao_fbs ===
Area        object
Item        object
Element     object
Year         int64
Unit        object
Value      float64
Flag        object
dtype: object

=== fao_fs ===
Area       object
Item       object
Element    object
Year       object
Unit       object
Value      object
Flag       object
dtype: object

=== fao_emissions ===
Area        object
Item        object
Element     object
Year         int64
Unit        object
Value      float64
Flag        object
dtype: object

=== fao_cpi ===
Area        object
Item        object
Element     object
Months      object
Year         int64
Unit        object
Value      float64
Flag        object
dtype: object

=== fao_population ===
Area        object
Item        object
Element     object
Year         int64
Unit        object
Value      float64
Flag        object
dtype: object

=== fao_prices ===
Area        object
Item        object
Element     object
Year         int64
Months      object
Unit        object
Value      float64
Fla

## Items and Elements Overview

In [7]:
# Preview the first five Item and Element labels (in sorted order) of each dataframe
for name, df in dfs_relevant.items():
    print(f"\n=== {name} ===")
    previews = {
        "Items:": sorted(df['Item'].dropna().unique())[:5],
        "Elements:": sorted(df['Element'].dropna().unique())[:5],
    }
    for heading, labels in previews.items():
        print(heading)
        for label in labels:
            print(f"  - {label}")


=== fao_fbs ===
Items:
  - Alcohol, Non-Food
  - Alcoholic Beverages
  - Animal Products
  - Animal fats
  - Apples and products
Elements:
  - Domestic supply quantity
  - Export quantity
  - Fat supply quantity (g/capita/day)
  - Fat supply quantity (t)
  - Feed

=== fao_fs ===
Items:
  - Average dietary energy requirement (kcal/cap/day)
  - Average dietary energy supply adequacy (percent) (3-year average)
  - Average fat supply (g/cap/day) (3-year average)
  - Average protein supply (g/cap/day) (3-year average)
  - Average supply of protein of animal origin (g/cap/day) (3-year average)
Elements:
  - Confidence interval: Lower bound
  - Confidence interval: Upper bound
  - Value

=== fao_emissions ===
Items:
  - AFOLU
  - Agrifood systems
  - All sectors with LULUCF
  - All sectors without LULUCF
  - Emissions from crops
Elements:
  - Emissions Share (CH4)
  - Emissions Share (CO2)
  - Emissions Share (CO2eq) (AR5)
  - Emissions Share (CO2eq) (AR5) (F-gases)
  - Emissions Share (N2O)

## Year

In [8]:
# List the distinct, non-null 'Year' values of each dataframe in sorted order
for name, df in dfs_relevant.items():
    print(f"\n=== {name} ===")
    print("Unique Years:", sorted(df['Year'].dropna().unique()), sep="\n")


=== fao_fbs ===
Unique Years:
[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

=== fao_fs ===
Unique Years:
['2000', '2000-2002', '2001', '2001-2003', '2002', '2002-2004', '2003', '2003-2005', '2004', '2004-2006', '2005', '2005-2007', '2006', '2006-2008', '2007', '2007-2009', '2008', '2008-2010', '2009', '2009-2011', '2010', '2010-2012', '2011', '2011-2013', '2012', '2012-2014', '2013', '2013-2015', '2014', '2014-2016', '2015', '2015-2017', '2016', '2016-2018', '2017', '2017-2019', '2018', '2018-2020', '2019', '2019-2021', '2020', '2020-2022', '2021', '2021-2023', '2022', '2023']

=== fao_emissions ===
Unique Years:
[1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

=== fao_cpi ===
Unique Years:
[2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,

In [9]:
# Keep a direct handle on the trimmed Food Security frame; its 'Year' column
# mixes single years with ranges such as '2000-2002', which is examined below.
fao_fs_relevant = dfs_relevant['fao_fs']
# Bare last expression: shows the frame as the cell output
fao_fs_relevant

Unnamed: 0,Area,Item,Element,Year,Unit,Value,Flag
0,Afghanistan,Average dietary energy supply adequacy (percen...,Value,2000-2002,%,87,E
1,Afghanistan,Average dietary energy supply adequacy (percen...,Value,2001-2003,%,88,E
2,Afghanistan,Average dietary energy supply adequacy (percen...,Value,2002-2004,%,91,E
3,Afghanistan,Average dietary energy supply adequacy (percen...,Value,2003-2005,%,92,E
4,Afghanistan,Average dietary energy supply adequacy (percen...,Value,2004-2006,%,93,E
...,...,...,...,...,...,...,...
283875,Upper-middle-income economies,Average fat supply (g/cap/day) (3-year average),Value,2016-2018,g/cap/d,90.8,E
283876,Upper-middle-income economies,Average fat supply (g/cap/day) (3-year average),Value,2017-2019,g/cap/d,92.5,E
283877,Upper-middle-income economies,Average fat supply (g/cap/day) (3-year average),Value,2018-2020,g/cap/d,94.2,E
283878,Upper-middle-income economies,Average fat supply (g/cap/day) (3-year average),Value,2019-2021,g/cap/d,96.7,E


In [15]:
# Which indicators are reported over multi-year windows? Select the rows whose
# 'Year' is a range (contains a dash, e.g. '2000-2002') and list their distinct Items.
is_year_range = fao_fs_relevant['Year'].str.contains('-')
fao_fs_relevant.loc[is_year_range, 'Item'].unique()

array(['Average dietary energy supply adequacy (percent) (3-year average)',
       'Dietary energy supply used in the estimation of the prevalence of undernourishment (kcal/cap/day) (3-year average)',
       'Share of dietary energy supply derived from cereals, roots and tubers (percent) (3-year average)',
       'Average protein supply (g/cap/day) (3-year average)',
       'Average supply of protein of animal origin (g/cap/day) (3-year average)',
       'Prevalence of undernourishment (percent) (3-year average)',
       'Number of people undernourished (million) (3-year average)',
       'Prevalence of severe food insecurity in the total population (percent) (3-year average)',
       'Prevalence of severe food insecurity in the male adult population (percent) (3-year average)',
       'Prevalence of severe food insecurity in the female adult population (percent) (3-year average)',
       'Prevalence of moderate or severe food insecurity in the total population (percent) (3-year averag

In [25]:
# Split the distinct Item labels by whether they carry the "(3-year average)" tag.
# The row mask is built once and negated for the complementary set.
is_avg_item = fao_fs_relevant['Item'].str.contains(r'\(3-year average\)', regex=True)
items_with_avg = fao_fs_relevant.loc[is_avg_item, 'Item'].unique()
items_no_avg = fao_fs_relevant.loc[~is_avg_item, 'Item'].unique()

In [22]:
# Display the distinct Item labels that contain "(3-year average)"
items_with_avg

array(['Average dietary energy supply adequacy (percent) (3-year average)',
       'Dietary energy supply used in the estimation of the prevalence of undernourishment (kcal/cap/day) (3-year average)',
       'Share of dietary energy supply derived from cereals, roots and tubers (percent) (3-year average)',
       'Average protein supply (g/cap/day) (3-year average)',
       'Average supply of protein of animal origin (g/cap/day) (3-year average)',
       'Prevalence of undernourishment (percent) (3-year average)',
       'Number of people undernourished (million) (3-year average)',
       'Prevalence of severe food insecurity in the total population (percent) (3-year average)',
       'Prevalence of severe food insecurity in the male adult population (percent) (3-year average)',
       'Prevalence of severe food insecurity in the female adult population (percent) (3-year average)',
       'Prevalence of moderate or severe food insecurity in the total population (percent) (3-year averag

In [23]:
# Display the distinct Item labels that do not contain "(3-year average)"
items_no_avg

array(['Dietary energy supply used in the estimation of the prevalence of undernourishment (kcal/cap/day)',
       'Gross domestic product per capita, PPP, (constant 2017 international $)',
       'Political stability and absence of violence/terrorism (index)',
       'Per capita food supply variability (kcal/cap/day)',
       'Percentage of population using safely managed drinking water services (percent)',
       'Percentage of population using at least basic drinking water services (percent)',
       'Percentage of population using at least basic sanitation services (percent)',
       'Percentage of children under 5 years affected by wasting (percent)',
       'Number of children under 5 years affected by wasting (million)',
       'Percentage of children under 5 years of age who are stunted (modelled estimates) (percent)',
       'Number of children under 5 years of age who are stunted (modeled estimates) (million)',
       'Percentage of children under 5 years of age who are overw

In [28]:
# Pair every "(3-year average)" indicator with the non-averaged indicators whose
# label begins with the same base name (the label with that suffix removed)
print("Items with '(3-year average)' and their matching non-average alternatives (by prefix):")
for avg_item in items_with_avg:
    stem = avg_item.replace(' (3-year average)', '')
    counterparts = [candidate for candidate in items_no_avg if candidate.startswith(stem)]
    if not counterparts:
        print(f"- (no non-average alternative found) '{avg_item}'")
        continue
    for counterpart in counterparts:
        print(f"- '{counterpart}' → '{avg_item}'")

Items with '(3-year average)' and their matching non-average alternatives (by prefix):
- (no non-average alternative found) 'Average dietary energy supply adequacy (percent) (3-year average)'
- 'Dietary energy supply used in the estimation of the prevalence of undernourishment (kcal/cap/day)' → 'Dietary energy supply used in the estimation of the prevalence of undernourishment (kcal/cap/day) (3-year average)'
- (no non-average alternative found) 'Share of dietary energy supply derived from cereals, roots and tubers (percent) (3-year average)'
- (no non-average alternative found) 'Average protein supply (g/cap/day) (3-year average)'
- (no non-average alternative found) 'Average supply of protein of animal origin (g/cap/day) (3-year average)'
- 'Prevalence of undernourishment (percent) (annual value)' → 'Prevalence of undernourishment (percent) (3-year average)'
- 'Number of people undernourished (million) (annual value)' → 'Number of people undernourished (million) (3-year average)'
- '

## Months

In [11]:
# List the distinct, non-null 'Months' values for the dataframes that have such a column
for name, df in dfs_relevant.items():
    print(f"\n=== {name} ===")
    if 'Months' not in df.columns:
        print("No 'Months' column in this dataframe.")
        continue
    print("Unique Months:", sorted(df['Months'].dropna().unique()), sep="\n")


=== fao_fbs ===
No 'Months' column in this dataframe.

=== fao_fs ===
No 'Months' column in this dataframe.

=== fao_emissions ===
No 'Months' column in this dataframe.

=== fao_cpi ===
Unique Months:
['April', 'August', 'December', 'February', 'January', 'July', 'June', 'March', 'May', 'November', 'October', 'September']

=== fao_population ===
No 'Months' column in this dataframe.

=== fao_prices ===
Unique Months:
['Annual value', 'April', 'August', 'December', 'February', 'January', 'July', 'June', 'March', 'May', 'November', 'October', 'September']

=== fao_production_indices ===
No 'Months' column in this dataframe.


## Flag

In [12]:
# List the distinct, non-null 'Flag' codes of each dataframe in sorted order
for name, df in dfs_relevant.items():
    print(f"\n=== {name} ===")
    flags_seen = sorted(df['Flag'].dropna().unique())
    print("Unique Flags:", flags_seen, sep="\n")


=== fao_fbs ===
Unique Flags:
['A', 'E', 'I', 'X']

=== fao_fs ===
Unique Flags:
['A', 'E', 'O', 'Q', 'X']

=== fao_emissions ===
Unique Flags:
['E']

=== fao_cpi ===
Unique Flags:
['A', 'E', 'I', 'X', 'x']

=== fao_population ===
Unique Flags:
['E', 'X']

=== fao_prices ===
Unique Flags:
['A', 'I', 'X']

=== fao_production_indices ===
Unique Flags:
['E']


```python
flag_mapping = {
    "A": "Official figure",
    "E": "Estimated value",
    "I": "Imputed value",
    "O": "Missing value",
    "Q": "Missing value; suppressed",
    "X": "Figure from international organization",
}
```

In [13]:
# NOTE(review): this whole cell is commented out, so it does nothing when run.
# If re-enabled it would overwrite the 'Flag' column of every frame in
# dfs_relevant in place: codes are upper-cased, mapped to the descriptions
# below, and codes missing from the mapping keep their original value.
# # Mapping flag codes to their meanings
# flag_mapping = {
#     "A": "Official figure",
#     "E": "Estimated value",
#     "I": "Imputed value",
#     "O": "Missing value",
#     "Q": "Missing value; suppressed",
#     "X": "Figure from international organization",
# }

# # Replace flag codes with their meanings in each dataframe
# for name, df in dfs_relevant.items():
#     if 'Flag' in df.columns:
#         df['Flag'] = df['Flag'].str.upper().map(flag_mapping).fillna(df['Flag'])
#         print(f"\n=== {name} ===")
#         print(df['Flag'].unique())
#     else:
#         print(f"\n=== {name} ===")
#         print("No 'Flag' column in this dataframe.")