# Data Cleaning and Processing

In [1]:
import pandas as pd
import numpy as np
import regex as re

The raw CSV contains characters that the default 'utf-8' codec cannot decode, so its encoding is detected first.

In [2]:
#!pip install chardet

In [3]:
import chardet

# Read the raw bytes once and let chardet guess the file's encoding.
with open('raw data/Production_Crops_Livestock_E_All_Data/Production_Crops_Livestock_E_All_Data_NOFLAG.csv', 'rb') as f:
    raw_bytes = f.read()

result = chardet.detect(raw_bytes)
encoding = result['encoding']
encoding

'ISO-8859-1'

In [4]:
# The file contains characters such as 'é' that 'utf-8' cannot decode,
# so it is read with the 'ISO-8859-1' encoding instead.
production = pd.read_csv(
    'raw data/Production_Crops_Livestock_E_All_Data/Production_Crops_Livestock_E_All_Data_NOFLAG.csv',
    encoding='ISO-8859-1',
)
display(production.shape)
production.head()

(79297, 70)

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Unit,Y1961,...,Y2012,Y2013,Y2014,Y2015,Y2016,Y2017,Y2018,Y2019,Y2020,Y2021
0,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5312,Area harvested,ha,,...,13490.0,14114.0,13703.0,14676.0,19481.0,19793.0,20053.0,29203.0,22134.0,21685.0
1,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5419,Yield,hg/ha,,...,45960.0,29910.0,19996.0,16521.0,16859.0,13788.0,17161.0,13083.0,17759.0,18748.0
2,2,'004,Afghanistan,221,'01371,"Almonds, in shell",5510,Production,tonnes,,...,62000.0,42215.0,27400.0,24246.0,32843.0,27291.0,34413.0,38205.0,39307.0,40655.23
3,2,'004,Afghanistan,711,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",5312,Area harvested,ha,,...,18500.0,18500.0,30000.0,25000.0,24500.0,26160.0,25220.0,27387.0,26255.0,26287.0
4,2,'004,Afghanistan,711,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",5419,Yield,hg/ha,,...,6757.0,6757.0,7167.0,7200.0,7075.0,6970.0,7866.0,6902.0,7409.0,7379.0


The data is in a wide (pivot-style) format with one column per year, so it needs to be 'melted' into long format. Here is an example on a small subset.

In [5]:
# Take the first 30 rows and a handful of columns to prototype the reshape on.
preview_columns = ['Area','Item Code','Element','Unit','Y1961','Y1962','Y1963','Y1964']
sample = production.iloc[:30].loc[:, preview_columns]
sample

Unnamed: 0,Area,Item Code,Element,Unit,Y1961,Y1962,Y1963,Y1964
0,Afghanistan,221,Area harvested,ha,,,,
1,Afghanistan,221,Yield,hg/ha,,,,
2,Afghanistan,221,Production,tonnes,,,,
3,Afghanistan,711,Area harvested,ha,,,,
4,Afghanistan,711,Yield,hg/ha,,,,
5,Afghanistan,711,Production,tonnes,,,,
6,Afghanistan,515,Area harvested,ha,2220.0,2220.0,2220.0,2350.0
7,Afghanistan,515,Yield,hg/ha,68018.0,68018.0,68018.0,78298.0
8,Afghanistan,515,Production,tonnes,15100.0,15100.0,15100.0,18400.0
9,Afghanistan,526,Area harvested,ha,4820.0,4820.0,4820.0,5100.0


In [6]:
# Count the missing values in each column of the sample.
sample.isna().sum()

Area         0
Item Code    0
Element      0
Unit         0
Y1961        9
Y1962        9
Y1963        9
Y1964        9
dtype: int64

In [7]:
# Prototype the wide-to-long reshape: the identifier columns are repeated and
# each selected year column becomes a (variable, value) row.
sample.melt(
    id_vars=['Area','Item Code','Element','Unit'],
    value_vars=['Y1961','Y1962','Y1963'],
)

Unnamed: 0,Area,Item Code,Element,Unit,variable,value
0,Afghanistan,221,Area harvested,ha,Y1961,
1,Afghanistan,221,Yield,hg/ha,Y1961,
2,Afghanistan,221,Production,tonnes,Y1961,
3,Afghanistan,711,Area harvested,ha,Y1961,
4,Afghanistan,711,Yield,hg/ha,Y1961,
...,...,...,...,...,...,...
85,Afghanistan,869,Producing Animals/Slaughtered,Head,Y1963,
86,Afghanistan,1021,Production,tonnes,Y1963,3600.0
87,Afghanistan,984,Production,tonnes,Y1963,11772.0
88,Afghanistan,1057,Stocks,1000 Head,Y1963,5000.0


In [8]:
# Inspect the full list of column names in the raw production table.
production.columns

Index(['Area Code', 'Area Code (M49)', 'Area', 'Item Code', 'Item Code (CPC)',
       'Item', 'Element Code', 'Element', 'Unit', 'Y1961', 'Y1962', 'Y1963',
       'Y1964', 'Y1965', 'Y1966', 'Y1967', 'Y1968', 'Y1969', 'Y1970', 'Y1971',
       'Y1972', 'Y1973', 'Y1974', 'Y1975', 'Y1976', 'Y1977', 'Y1978', 'Y1979',
       'Y1980', 'Y1981', 'Y1982', 'Y1983', 'Y1984', 'Y1985', 'Y1986', 'Y1987',
       'Y1988', 'Y1989', 'Y1990', 'Y1991', 'Y1992', 'Y1993', 'Y1994', 'Y1995',
       'Y1996', 'Y1997', 'Y1998', 'Y1999', 'Y2000', 'Y2001', 'Y2002', 'Y2003',
       'Y2004', 'Y2005', 'Y2006', 'Y2007', 'Y2008', 'Y2009', 'Y2010', 'Y2011',
       'Y2012', 'Y2013', 'Y2014', 'Y2015', 'Y2016', 'Y2017', 'Y2018', 'Y2019',
       'Y2020', 'Y2021'],
      dtype='object')

In [9]:
# Identifier columns are carried through the melt unchanged; every column whose
# name starts with 'Y' is treated as a year column.
id_variables = ['Area','Item Code (CPC)','Item','Element','Unit']
years = production.columns[production.columns.str.startswith('Y')].tolist()

In [10]:
# Reshape the whole production table from one-column-per-year to long format.
production_df = production.melt(id_vars=id_variables, value_vars=years)

display(production_df.shape, production_df.head(30))
production_df.isna().sum()

(4837117, 7)

Unnamed: 0,Area,Item Code (CPC),Item,Element,Unit,variable,value
0,Afghanistan,'01371,"Almonds, in shell",Area harvested,ha,Y1961,
1,Afghanistan,'01371,"Almonds, in shell",Yield,hg/ha,Y1961,
2,Afghanistan,'01371,"Almonds, in shell",Production,tonnes,Y1961,
3,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Area harvested,ha,Y1961,
4,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Yield,hg/ha,Y1961,
5,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Production,tonnes,Y1961,
6,Afghanistan,'01341,Apples,Area harvested,ha,Y1961,2220.0
7,Afghanistan,'01341,Apples,Yield,hg/ha,Y1961,68018.0
8,Afghanistan,'01341,Apples,Production,tonnes,Y1961,15100.0
9,Afghanistan,'01343,Apricots,Area harvested,ha,Y1961,4820.0


Area                     0
Item Code (CPC)          0
Item                     0
Element                  0
Unit                     0
variable                 0
value              1075949
dtype: int64

I'm going to replace all nulls with 0. Note that after this step a missing observation can no longer be told apart from a genuine zero.

In [11]:
# Missing values become 0, and the 'Y1961'-style labels in 'variable' are
# stripped of their 'Y' and converted to integers.
production_df = production_df.fillna(0).assign(
    variable=lambda frame: frame['variable'].str.replace("Y", "").astype(int)
)

In [12]:
# Normalise the column names: lower-case, spaces and hyphens to underscores,
# parentheses dropped. regex=False makes every replacement an explicit literal
# match, so "(" and ")" are never interpreted as regex syntax and pandas no
# longer emits the FutureWarning about the changing `regex` default.
production_df.columns = (
    production_df.columns
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)
# The melted 'variable' column holds the year.
production_df = production_df.rename(columns={"variable": "year"})
production_df.head()

  production_df.columns = production_df.columns.str.replace(" ", "_").str.replace("-", "_").str.replace("(", "").str.replace(")", "")
  production_df.columns = production_df.columns.str.replace(" ", "_").str.replace("-", "_").str.replace("(", "").str.replace(")", "")


Unnamed: 0,area,item_code_cpc,item,element,unit,year,value
0,Afghanistan,'01371,"Almonds, in shell",Area harvested,ha,1961,0.0
1,Afghanistan,'01371,"Almonds, in shell",Yield,hg/ha,1961,0.0
2,Afghanistan,'01371,"Almonds, in shell",Production,tonnes,1961,0.0
3,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Area harvested,ha,1961,0.0
4,Afghanistan,'01654,"Anise, badian, coriander, cumin, caraway, fenn...",Yield,hg/ha,1961,0.0


In [13]:
# Count the missing values remaining in each column after the fill.
production_df.isna().sum()

area             0
item_code_cpc    0
item             0
element          0
unit             0
year             0
value            0
dtype: int64

In [14]:
# Check the dtype of each column of the cleaned table.
production_df.dtypes

area              object
item_code_cpc     object
item              object
element           object
unit              object
year               int32
value            float64
dtype: object

In [15]:
# production_df.to_csv('processed data/production.csv')

### Other data sources

In [16]:
countries = pd.read_csv('raw data/UNSD — Methodology.csv', encoding='ISO-8859-1')

# Normalise the column names: lower-case, spaces and hyphens to underscores,
# parentheses dropped. regex=False makes every replacement an explicit literal
# match, so "(" and ")" are never interpreted as regex syntax and pandas no
# longer emits the FutureWarning about the changing `regex` default.
countries.columns = (
    countries.columns
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)

# Keep only the region, sub-region, country, ISO code and LDC flag columns.
countries = countries[['region_name','sub_region_name','country_or_area','iso_alpha3_code','least_developed_countries_ldc']]
countries

  countries.columns = countries.columns.str.replace(" ", "_").str.replace("-", "_").str.replace("(", "").str.replace(")", "")
  countries.columns = countries.columns.str.replace(" ", "_").str.replace("-", "_").str.replace("(", "").str.replace(")", "")


Unnamed: 0,region_name,sub_region_name,country_or_area,iso_alpha3_code,least_developed_countries_ldc
0,Africa,Northern Africa,Algeria,DZA,
1,Africa,Northern Africa,Egypt,EGY,
2,Africa,Northern Africa,Libya,LBY,
3,Africa,Northern Africa,Morocco,MAR,
4,Africa,Northern Africa,Sudan,SDN,x
...,...,...,...,...,...
243,Oceania,Polynesia,Samoa,WSM,
244,Oceania,Polynesia,Tokelau,TKL,
245,Oceania,Polynesia,Tonga,TON,
246,Oceania,Polynesia,Tuvalu,TUV,x


In [17]:
# Show the rows that have no region assigned.
countries[countries['region_name'].isna()]

Unnamed: 0,region_name,sub_region_name,country_or_area,iso_alpha3_code,least_developed_countries_ldc
117,,,Antarctica,ATA,


In [18]:
# Replace every missing value with 0.
# NOTE(review): this also writes the integer 0 into text columns (region names,
# and the LDC flag that otherwise holds 'x') — confirm downstream consumers
# expect mixed str/int values there.
countries = countries.fillna(0)
countries

Unnamed: 0,region_name,sub_region_name,country_or_area,iso_alpha3_code,least_developed_countries_ldc
0,Africa,Northern Africa,Algeria,DZA,0
1,Africa,Northern Africa,Egypt,EGY,0
2,Africa,Northern Africa,Libya,LBY,0
3,Africa,Northern Africa,Morocco,MAR,0
4,Africa,Northern Africa,Sudan,SDN,x
...,...,...,...,...,...
243,Oceania,Polynesia,Samoa,WSM,0
244,Oceania,Polynesia,Tokelau,TKL,0
245,Oceania,Polynesia,Tonga,TON,0
246,Oceania,Polynesia,Tuvalu,TUV,x


In [19]:
# countries.to_csv('processed data/countries.csv')

In [20]:
# Load the historic food balance sheets (same wide, one-column-per-year layout).
balance_historic = pd.read_csv(
    'raw data/FoodBalanceSheetsHistoric_E_All_Data/FoodBalanceSheetsHistoric_E_All_Data_NOFLAG.csv',
    encoding='ISO-8859-1',
)
display(balance_historic.shape)
balance_historic.head(30)

(238418, 62)

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Unit,Y1961,...,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,1000 persons,8954.0,...,24019.0,24861.0,25631.0,26349.0,27032.0,27708.0,28398.0,29105.0,29825.0,30552.0
1,2,'004,Afghanistan,2901,'S2901,Grand Total,664,Food supply (kcal/capita/day),kcal/capita/day,2999.0,...,1967.0,1948.0,1966.0,2046.0,2041.0,2081.0,2104.0,2107.0,2100.0,2090.0
2,2,'004,Afghanistan,2901,'S2901,Grand Total,674,Protein supply quantity (g/capita/day),g/capita/day,84.91,...,55.24,53.51,53.46,56.0,56.96,57.79,58.14,58.91,58.91,58.25
3,2,'004,Afghanistan,2901,'S2901,Grand Total,684,Fat supply quantity (g/capita/day),g/capita/day,37.51,...,34.95,36.75,31.13,32.09,29.72,30.72,33.88,33.08,33.37,33.52
4,2,'004,Afghanistan,2903,'S2903,Vegetal Products,664,Food supply (kcal/capita/day),kcal/capita/day,2752.0,...,1726.0,1715.0,1762.0,1839.0,1831.0,1871.0,1888.0,1891.0,1883.0,1873.0
5,2,'004,Afghanistan,2903,'S2903,Vegetal Products,674,Protein supply quantity (g/capita/day),g/capita/day,71.38,...,41.9,40.49,42.18,44.55,45.15,46.12,46.03,46.73,46.65,46.03
6,2,'004,Afghanistan,2903,'S2903,Vegetal Products,684,Fat supply quantity (g/capita/day),g/capita/day,19.07,...,17.79,20.14,16.64,17.43,14.67,15.61,18.46,17.65,18.13,18.28
7,2,'004,Afghanistan,2941,'S2941,Animal Products,664,Food supply (kcal/capita/day),kcal/capita/day,247.0,...,241.0,232.0,204.0,207.0,211.0,211.0,216.0,215.0,217.0,216.0
8,2,'004,Afghanistan,2941,'S2941,Animal Products,674,Protein supply quantity (g/capita/day),g/capita/day,13.53,...,13.35,13.02,11.28,11.46,11.8,11.68,12.11,12.18,12.26,12.22
9,2,'004,Afghanistan,2941,'S2941,Animal Products,684,Fat supply quantity (g/capita/day),g/capita/day,18.44,...,17.16,16.62,14.49,14.65,15.05,15.11,15.42,15.43,15.24,15.24


In [21]:
# Load the current food balance sheets (same wide, one-column-per-year layout).
balance = pd.read_csv(
    'raw data/FoodBalanceSheets_E_All_Data/FoodBalanceSheets_E_All_Data_NOFLAG.csv',
    encoding='ISO-8859-1',
)
display(balance.shape)
balance.head(20)

(388754, 20)

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Unit,Y2010,Y2011,Y2012,Y2013,Y2014,Y2015,Y2016,Y2017,Y2018,Y2019,Y2020
0,2,'004,Afghanistan,2501,'S2501,Population,511,Total Population - Both sexes,1000 persons,29186.0,30117.0,31161.0,32270.0,33371.0,34414.0,35383.0,36296.0,37172.0,38041.75,38928.35
1,2,'004,Afghanistan,2501,'S2501,Population,5301,Domestic supply quantity,1000 tonnes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,'004,Afghanistan,2901,'S2901,Grand Total,664,Food supply (kcal/capita/day),kcal/capita/day,2170.0,2152.0,2159.0,2196.0,2265.0,2250.0,2228.0,2303.0,2270.0,2236.0,2240.0
3,2,'004,Afghanistan,2901,'S2901,Grand Total,661,Food supply (kcal),million kcal,23112165.71,23661742.0,24559766.2,25870848.4,27593079.4,28259773.0,28775440.5,30507233.0,30799508.7,31052348.87,31832678.75
4,2,'004,Afghanistan,2901,'S2901,Grand Total,674,Protein supply quantity (g/capita/day),g/capita/day,59.23,58.0,57.82,57.71,60.17,58.45,58.46,59.5,57.66,56.06,56.64
5,2,'004,Afghanistan,2901,'S2901,Grand Total,671,Protein supply quantity (g),g,630965.23,637630.0,657675.02,679729.04,732903.24,734237.8,755060.45,788271.0,782296.46,778402.76,804784.15
6,2,'004,Afghanistan,2901,'S2901,Grand Total,684,Fat supply quantity (g/capita/day),g/capita/day,36.69,34.81,36.53,37.1,41.48,38.28,40.7,40.03,41.47,40.81,38.94
7,2,'004,Afghanistan,2901,'S2901,Grand Total,681,Fat supply quantity (g),g,390803.47,382620.0,415463.5,437020.0,505187.0,480842.0,525590.25,530313.0,562624.62,566696.11,553238.0
8,2,'004,Afghanistan,2903,'S2903,Vegetal Products,664,Food supply (kcal/capita/day),kcal/capita/day,1964.0,1953.0,1955.0,1993.0,2019.0,2038.0,2024.0,2108.0,2081.0,2060.0,2057.0
9,2,'004,Afghanistan,2903,'S2903,Vegetal Products,661,Food supply (kcal),million kcal,20920504.0,21463348.0,22237504.0,23472094.0,24595800.0,25605601.0,26140974.0,27921383.0,28228749.0,28600486.57,29231452.68


The two tables contain slightly different elements, but the common ones that I wish to investigate are:
- Total Population - Both sexes
- Food supply (kcal/capita/day)
- Protein supply quantity (g/capita/day)
- Fat supply quantity (g/capita/day)
- Production
- Import Quantity
- Export Quantity
- Losses
- Food

In [22]:
# Values of the 'Element' column to keep when filtering both food balance tables.
elements = ['Total Population - Both sexes',
            'Food supply (kcal/capita/day)',
            'Protein supply quantity (g/capita/day)',
            'Fat supply quantity (g/capita/day)',
            'Production',
            'Import Quantity',
            'Export Quantity',
            'Losses',
            'Food']

Melting these in the same way as above.

In [23]:
print('Original df shape: ', balance_historic.shape)

# Keep only the rows whose element is one of those selected for the analysis.
balance_historic = balance_historic[balance_historic['Element'].isin(elements)]
print('Filtered df shape: ', balance_historic.shape)

# Reshape from one-column-per-year to long format.
balance_historic_df = pd.melt(balance_historic, id_vars = id_variables,
                              value_vars = [x for x in balance_historic.columns if x.startswith('Y')])

# replace NaNs with 0 as before

balance_historic_df = balance_historic_df.fillna(0)
balance_historic_df['variable'] = (
    balance_historic_df['variable'].str.replace("Y", "", regex=False).astype(int)
)

# Normalise the column names. regex=False makes every replacement an explicit
# literal match, so "(" and ")" are never interpreted as regex syntax and pandas
# no longer emits the FutureWarning about the changing `regex` default.
balance_historic_df.columns = (
    balance_historic_df.columns
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)
balance_historic_df = balance_historic_df.rename(columns={"variable": "year"})

print('Melted df shape: ', balance_historic_df.shape)
display(balance_historic_df.head(30))
balance_historic_df.isna().sum()

Original df shape:  (238418, 62)
Filtered df shape:  (153195, 62)


  balance_historic_df.columns = balance_historic_df.columns.str.replace(" ", "_").str.replace("-", "_").str.replace("(", "").str.replace(")", "")
  balance_historic_df.columns = balance_historic_df.columns.str.replace(" ", "_").str.replace("-", "_").str.replace("(", "").str.replace(")", "")


Melted df shape:  (8119335, 7)


Unnamed: 0,area,item_code_cpc,item,element,unit,year,value
0,Afghanistan,'S2501,Population,Total Population - Both sexes,1000 persons,1961,8954.0
1,Afghanistan,'S2901,Grand Total,Food supply (kcal/capita/day),kcal/capita/day,1961,2999.0
2,Afghanistan,'S2901,Grand Total,Protein supply quantity (g/capita/day),g/capita/day,1961,84.91
3,Afghanistan,'S2901,Grand Total,Fat supply quantity (g/capita/day),g/capita/day,1961,37.51
4,Afghanistan,'S2903,Vegetal Products,Food supply (kcal/capita/day),kcal/capita/day,1961,2752.0
5,Afghanistan,'S2903,Vegetal Products,Protein supply quantity (g/capita/day),g/capita/day,1961,71.38
6,Afghanistan,'S2903,Vegetal Products,Fat supply quantity (g/capita/day),g/capita/day,1961,19.07
7,Afghanistan,'S2941,Animal Products,Food supply (kcal/capita/day),kcal/capita/day,1961,247.0
8,Afghanistan,'S2941,Animal Products,Protein supply quantity (g/capita/day),g/capita/day,1961,13.53
9,Afghanistan,'S2941,Animal Products,Fat supply quantity (g/capita/day),g/capita/day,1961,18.44


area             0
item_code_cpc    0
item             0
element          0
unit             0
year             0
value            0
dtype: int64

As the unit is repeated within the element column, I will tidy it by removing the text between parentheses.

In [24]:
def remove_inside_parentheses(text):
    """Return ``text`` with every parenthesised segment removed.

    The whitespace directly in front of each parenthesised segment is removed
    with it, and the result is stripped, so
    ``'Food supply (kcal/capita/day)'`` becomes ``'Food supply'`` rather than
    ``'Food supply '`` with a dangling space.

    Parameters
    ----------
    text : str
        The label to clean.

    Returns
    -------
    str
        ``text`` without its ``(...)`` segments; text that contains no
        parentheses is returned unchanged apart from surrounding whitespace.
    """
    # \s* swallows the gap before the opening parenthesis; [^)]* stops at the
    # first closing parenthesis, so separate groups are removed independently.
    return re.sub(r'\s*\([^)]*\)', '', text).strip()

# Clean each value of the 'element' column with remove_inside_parentheses
balance_historic_df['element'] = balance_historic_df['element'].map(remove_inside_parentheses)

balance_historic_df.head(30)

Unnamed: 0,area,item_code_cpc,item,element,unit,year,value
0,Afghanistan,'S2501,Population,Total Population - Both sexes,1000 persons,1961,8954.0
1,Afghanistan,'S2901,Grand Total,Food supply,kcal/capita/day,1961,2999.0
2,Afghanistan,'S2901,Grand Total,Protein supply quantity,g/capita/day,1961,84.91
3,Afghanistan,'S2901,Grand Total,Fat supply quantity,g/capita/day,1961,37.51
4,Afghanistan,'S2903,Vegetal Products,Food supply,kcal/capita/day,1961,2752.0
5,Afghanistan,'S2903,Vegetal Products,Protein supply quantity,g/capita/day,1961,71.38
6,Afghanistan,'S2903,Vegetal Products,Fat supply quantity,g/capita/day,1961,19.07
7,Afghanistan,'S2941,Animal Products,Food supply,kcal/capita/day,1961,247.0
8,Afghanistan,'S2941,Animal Products,Protein supply quantity,g/capita/day,1961,13.53
9,Afghanistan,'S2941,Animal Products,Fat supply quantity,g/capita/day,1961,18.44


Now transform the other dataset in the same way.

In [25]:
print('Original df shape: ', balance.shape)

# Keep only the rows whose element is one of those selected for the analysis.
balance = balance[balance['Element'].isin(elements)]
print('Filtered df shape: ', balance.shape)

# Reshape from one-column-per-year to long format.
balance_df = pd.melt(balance, id_vars = id_variables,
                              value_vars = [x for x in balance.columns if x.startswith('Y')])

# replace NaNs with 0 as before

balance_df = balance_df.fillna(0)
balance_df['variable'] = balance_df['variable'].str.replace("Y", "", regex=False).astype(int)

# Normalise the column names. regex=False makes every replacement an explicit
# literal match, so "(" and ")" are never interpreted as regex syntax and pandas
# no longer emits the FutureWarning about the changing `regex` default.
balance_df.columns = (
    balance_df.columns
    .str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
)
balance_df = balance_df.rename(columns={"variable": "year"})

print('Melted df shape: ', balance_df.shape)
display(balance_df.head(30))
balance_df.isna().sum()

Original df shape:  (388754, 20)
Filtered df shape:  (178040, 20)
Melted df shape:  (1958440, 7)


  balance_df.columns = balance_df.columns.str.replace(" ", "_").str.replace("-", "_").str.replace("(", "").str.replace(")", "")
  balance_df.columns = balance_df.columns.str.replace(" ", "_").str.replace("-", "_").str.replace("(", "").str.replace(")", "")


Unnamed: 0,area,item_code_cpc,item,element,unit,year,value
0,Afghanistan,'S2501,Population,Total Population - Both sexes,1000 persons,2010,29186.0
1,Afghanistan,'S2901,Grand Total,Food supply (kcal/capita/day),kcal/capita/day,2010,2170.0
2,Afghanistan,'S2901,Grand Total,Protein supply quantity (g/capita/day),g/capita/day,2010,59.23
3,Afghanistan,'S2901,Grand Total,Fat supply quantity (g/capita/day),g/capita/day,2010,36.69
4,Afghanistan,'S2903,Vegetal Products,Food supply (kcal/capita/day),kcal/capita/day,2010,1964.0
5,Afghanistan,'S2903,Vegetal Products,Protein supply quantity (g/capita/day),g/capita/day,2010,47.42
6,Afghanistan,'S2903,Vegetal Products,Fat supply quantity (g/capita/day),g/capita/day,2010,22.33
7,Afghanistan,'S2941,Animal Products,Food supply (kcal/capita/day),kcal/capita/day,2010,206.0
8,Afghanistan,'S2941,Animal Products,Protein supply quantity (g/capita/day),g/capita/day,2010,11.81
9,Afghanistan,'S2941,Animal Products,Fat supply quantity (g/capita/day),g/capita/day,2010,14.36


area             0
item_code_cpc    0
item             0
element          0
unit             0
year             0
value            0
dtype: int64

In [26]:
# Remove the parenthesised text from the 'element' values of the current table too.
balance_df['element'] = balance_df['element'].apply(remove_inside_parentheses)

In [27]:
# balance_historic_df.to_csv('processed data/balance_historic.csv')

In [28]:
# balance_df.to_csv('processed data/balance.csv')

### API source data

The WHO supplies information on childhood nutrition through an API.

In [29]:
import requests
import json

url = "https://ghoapi.azureedge.net/api/uwgt5"

# A timeout stops the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
print(response.status_code)
# Fail here with a clear HTTP error rather than trying to parse an error page as JSON.
response.raise_for_status()

results = response.json()
# results

200


In [30]:
# Build a DataFrame from the content stored under the 'value' key of the JSON response.
nutrition = pd.DataFrame(results['value'])
display(nutrition.shape)
nutrition.head(10)

(14858, 25)

Unnamed: 0,Id,IndicatorCode,SpatialDimType,SpatialDim,TimeDimType,ParentLocationCode,ParentLocation,Dim1Type,TimeDim,Dim1,...,DataSourceDim,Value,NumericValue,Low,High,Comments,Date,TimeDimensionValue,TimeDimensionBegin,TimeDimensionEnd
0,28481988,uwgt5,COUNTRY,AFG,YEAR,EMR,Eastern Mediterranean,AGEGROUP,2010,YEARS00-01,...,EQ_MICS,No data,,,,Not available,2022-05-26T10:59:00.397+02:00,2010,2010-01-01T00:00:00+01:00,2010-12-31T00:00:00+01:00
1,28481989,uwgt5,COUNTRY,AFG,YEAR,EMR,Eastern Mediterranean,AGEGROUP,2010,YEARS02-05,...,EQ_MICS,No data,,,,Not available,2022-05-26T10:59:00.443+02:00,2010,2010-01-01T00:00:00+01:00,2010-12-31T00:00:00+01:00
2,28482060,uwgt5,COUNTRY,AFG,YEAR,EMR,Eastern Mediterranean,AGEGROUP,2015,YEARS00-01,...,EQ_DHS,No data,,,,Not available,2022-05-26T10:59:02.36+02:00,2015,2015-01-01T00:00:00+01:00,2015-12-31T00:00:00+01:00
3,28482061,uwgt5,COUNTRY,AFG,YEAR,EMR,Eastern Mediterranean,AGEGROUP,2015,YEARS02-05,...,EQ_DHS,No data,,,,Not available,2022-05-26T10:59:02.39+02:00,2015,2015-01-01T00:00:00+01:00,2015-12-31T00:00:00+01:00
4,28482132,uwgt5,COUNTRY,ALB,YEAR,EUR,Europe,AGEGROUP,2002,YEARS00-01,...,EQ_RHS,No data,,,,Not available,2022-05-26T10:59:04.077+02:00,2002,2002-01-01T00:00:00+01:00,2002-12-31T00:00:00+01:00
5,28482133,uwgt5,COUNTRY,ALB,YEAR,EUR,Europe,AGEGROUP,2002,YEARS02-05,...,EQ_RHS,No data,,,,Not available,2022-05-26T10:59:04.123+02:00,2002,2002-01-01T00:00:00+01:00,2002-12-31T00:00:00+01:00
6,28482204,uwgt5,COUNTRY,ALB,YEAR,EUR,Europe,AGEGROUP,2005,YEARS00-01,...,EQ_MICS,8.0 [5.3-11.9],8.03751,5.33767,11.93083,,2022-05-26T10:59:05.7+02:00,2005,2005-01-01T00:00:00+01:00,2005-12-31T00:00:00+01:00
7,28482205,uwgt5,COUNTRY,ALB,YEAR,EUR,Europe,AGEGROUP,2005,YEARS02-05,...,EQ_MICS,5.9 [4.2-8.3],5.91986,4.16721,8.34546,,2022-05-26T10:59:05.733+02:00,2005,2005-01-01T00:00:00+01:00,2005-12-31T00:00:00+01:00
8,28482276,uwgt5,COUNTRY,ALB,YEAR,EUR,Europe,AGEGROUP,2008,YEARS00-01,...,EQ_DHS,7.4 [5.3-10.4],7.44554,5.27792,10.40558,,2022-05-26T10:59:07.103+02:00,2008,2008-01-01T00:00:00+01:00,2008-12-31T00:00:00+01:00
9,28482277,uwgt5,COUNTRY,ALB,YEAR,EUR,Europe,AGEGROUP,2008,YEARS02-05,...,EQ_DHS,5.7 [4.2-7.6],5.68923,4.234,7.6049,,2022-05-26T10:59:07.12+02:00,2008,2008-01-01T00:00:00+01:00,2008-12-31T00:00:00+01:00


In [31]:
# Check the dtype of each column returned by the API.
nutrition.dtypes

Id                      int64
IndicatorCode          object
SpatialDimType         object
SpatialDim             object
TimeDimType            object
ParentLocationCode     object
ParentLocation         object
Dim1Type               object
TimeDim                 int64
Dim1                   object
Dim2Type               object
Dim2                   object
Dim3Type               object
Dim3                   object
DataSourceDimType      object
DataSourceDim          object
Value                  object
NumericValue          float64
Low                   float64
High                  float64
Comments               object
Date                   object
TimeDimensionValue     object
TimeDimensionBegin     object
TimeDimensionEnd       object
dtype: object

In [32]:
# Summary statistics for every column (numeric and non-numeric), transposed so
# that each original column becomes a row.
nutrition.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Id,14858.0,,,,28744784.093754,154223.271233,28481988.0,28606780.25,28741498.5,28878660.75,29016047.0
IndicatorCode,14858.0,1.0,uwgt5,14858.0,,,,,,,
SpatialDimType,14858.0,2.0,COUNTRY,10368.0,,,,,,,
SpatialDim,14858.0,4607.0,PER,456.0,,,,,,,
TimeDimType,14858.0,1.0,YEAR,14858.0,,,,,,,
ParentLocationCode,10368.0,6.0,AFR,4416.0,,,,,,,
ParentLocation,10368.0,6.0,Africa,4416.0,,,,,,,
Dim1Type,10368.0,6.0,WEALTHDECILE,4320.0,,,,,,,
TimeDim,14858.0,,,,2008.441782,7.199164,1991.0,2004.0,2010.0,2014.0,2020.0
Dim1,10368.0,24.0,YEARS00-01,432.0,,,,,,,


In [33]:
# Count how often each Dim1Type value occurs.
nutrition.Dim1Type.value_counts()

WEALTHDECILE         4320
WEALTHQUINTILE       2160
EDUCATIONLEVEL       1296
AGEGROUP              864
RESIDENCEAREATYPE     864
SEX                   864
Name: Dim1Type, dtype: int64

In [34]:
# columns to keep from the API table when building the filtered nutrition frame
columns = ['SpatialDimType','SpatialDim','ParentLocation',
          'Dim1Type','Dim1','NumericValue','TimeDimensionValue']

In [35]:
# Keep the selected columns, then drop every row that lacks either a Dim1
# value or a numeric value.
nutrition_filtered = nutrition[columns].dropna(subset=['Dim1', 'NumericValue'])

# Lower-case the column names and replace spaces with underscores.
nutrition_filtered.columns = nutrition_filtered.columns.str.lower().str.replace(" ", "_")

display(nutrition_filtered.shape)
display(nutrition_filtered.isna().sum())
nutrition_filtered.head()

(4202, 7)

spatialdimtype        0
spatialdim            0
parentlocation        0
dim1type              0
dim1                  0
numericvalue          0
timedimensionvalue    0
dtype: int64

Unnamed: 0,spatialdimtype,spatialdim,parentlocation,dim1type,dim1,numericvalue,timedimensionvalue
6,COUNTRY,ALB,Europe,AGEGROUP,YEARS00-01,8.03751,2005
7,COUNTRY,ALB,Europe,AGEGROUP,YEARS02-05,5.91986,2005
8,COUNTRY,ALB,Europe,AGEGROUP,YEARS00-01,7.44554,2008
9,COUNTRY,ALB,Europe,AGEGROUP,YEARS02-05,5.68923,2008
10,COUNTRY,ALB,Europe,AGEGROUP,YEARS00-01,2.26349,2017


In [36]:
# nutrition_filtered.to_csv('processed data/nutrition.csv')

### The following article also contains a table of famines that I can scrape.

In [37]:
from bs4 import BeautifulSoup

url2 = "https://ourworldindata.org/famines#famines-by-world-region-since-1860"

# A timeout stops the cell from hanging indefinitely if the server never answers.
response = requests.get(url2, timeout=30)
# Fail here with a clear HTTP error rather than parsing an error page later on.
response.raise_for_status()
response.status_code

200

In [38]:
# Parse the downloaded page with the 'html.parser' backend.
soup = BeautifulSoup(response.content, "html.parser")
# soup

In [39]:
# Locate the body of the famine table on the page.
table = soup.find('tbody', class_='row-hover')
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("Could not find a <tbody class='row-hover'> element on the page")

# One list of cell texts per table row. The comprehension keeps its loop
# variables local, so the notebook-level `columns` list (the columns to keep
# for the nutrition table) is no longer overwritten by this cell.
data = [
    [cell.get_text(strip=True) for cell in row.find_all('td')]
    for row in table.find_all('tr')
]
    
# data

In [40]:
# Inspect the <td> cells of the first table row to see which fields a row holds.
table.find_all('tr')[0].find_all('td')

[<td class="column-1">1846–52</td>,
 <td class="column-2">Ireland</td>,
 <td class="column-3">1,000,000</td>,
 <td class="column-4">1,000,000</td>,
 <td class="column-5">1,000,000</td>,
 <td class="column-6">Ó Gráda (2007)</td>]

In [41]:
# Assemble the scraped rows into a DataFrame, naming its six columns.
famines = pd.DataFrame(data, columns=['Date', 'Location', 'ExcessMortality_midpoint', 'ExcessMortality_lower', 'ExcessMortality_upper', 'Source'])
famines

Unnamed: 0,Date,Location,ExcessMortality_midpoint,ExcessMortality_lower,ExcessMortality_upper,Source
0,1846–52,Ireland,1000000,1000000,1000000,Ó Gráda (2007)
1,1860-1,India,2000000,2000000,2000000,Kumar and Raychaudhuri [Eds.] (1983)
2,1863-67,Cape Verde,30000,30000,30000,"Ó Gráda (2009), p. 22"
3,1866-7,India,961043,961043,961043,Kumar and Raychaudhuri [Eds.] (1983)
4,1868,Finland,100000,100000,100000,Ó Gráda (2009) Table 1.1
...,...,...,...,...,...,...
72,1998-2007,Democratic Republic of Congo,3131500,863000,5400000,Coglan et al (2007); 2009/10 Human Security Re...
73,2002,Malawi,1650,300,3000,Devereux (2002)
74,2003-05,Sudan (Darfur),200000,200000,200000,WPF
75,2003-06,Uganda,100000,100000,100000,WPF


The date column is not usable in my analysis in its raw form, so I will normalise the separator and derive a start year and a duration (in years) from it.

In [42]:
# Use a plain hyphen everywhere: replace the en dash ('–') found in some dates.
famines['Date'] = famines['Date'].str.replace('–', '-', regex=False)
famines

Unnamed: 0,Date,Location,ExcessMortality_midpoint,ExcessMortality_lower,ExcessMortality_upper,Source
0,1846-52,Ireland,1000000,1000000,1000000,Ó Gráda (2007)
1,1860-1,India,2000000,2000000,2000000,Kumar and Raychaudhuri [Eds.] (1983)
2,1863-67,Cape Verde,30000,30000,30000,"Ó Gráda (2009), p. 22"
3,1866-7,India,961043,961043,961043,Kumar and Raychaudhuri [Eds.] (1983)
4,1868,Finland,100000,100000,100000,Ó Gráda (2009) Table 1.1
...,...,...,...,...,...,...
72,1998-2007,Democratic Republic of Congo,3131500,863000,5400000,Coglan et al (2007); 2009/10 Human Security Re...
73,2002,Malawi,1650,300,3000,Devereux (2002)
74,2003-05,Sudan (Darfur),200000,200000,200000,WPF
75,2003-06,Uganda,100000,100000,100000,WPF


In [43]:
# For a range ('1846-52') the start is the first four characters; a date
# without a hyphen is taken as-is.
is_range = famines['Date'].str.contains('-')
famines['StartDate'] = famines['Date'].str[:4].where(is_range, famines['Date'])
famines

Unnamed: 0,Date,Location,ExcessMortality_midpoint,ExcessMortality_lower,ExcessMortality_upper,Source,StartDate
0,1846-52,Ireland,1000000,1000000,1000000,Ó Gráda (2007),1846
1,1860-1,India,2000000,2000000,2000000,Kumar and Raychaudhuri [Eds.] (1983),1860
2,1863-67,Cape Verde,30000,30000,30000,"Ó Gráda (2009), p. 22",1863
3,1866-7,India,961043,961043,961043,Kumar and Raychaudhuri [Eds.] (1983),1866
4,1868,Finland,100000,100000,100000,Ó Gráda (2009) Table 1.1,1868
...,...,...,...,...,...,...,...
72,1998-2007,Democratic Republic of Congo,3131500,863000,5400000,Coglan et al (2007); 2009/10 Human Security Re...,1998
73,2002,Malawi,1650,300,3000,Devereux (2002),2002
74,2003-05,Sudan (Darfur),200000,200000,200000,WPF,2003
75,2003-06,Uganda,100000,100000,100000,WPF,2003


In [44]:
# Preview how splitting on '-' separates each date into its components.
famines.Date.str.split('-')

0       [1846, 52]
1        [1860, 1]
2       [1863, 67]
3        [1866, 7]
4           [1868]
          ...     
72    [1998, 2007]
73          [2002]
74      [2003, 05]
75      [2003, 06]
76          [2011]
Name: Date, Length: 77, dtype: object

In [45]:
def famine_duration(date_text):
    """Return the duration in years (minimum 1) described by a date string.

    ``date_text`` is either a single year (``'1868'``) or a range whose end
    year may be abbreviated to its last digits (``'1846-52'``, ``'1860-1'``)
    or written in full (``'1998-2007'``). Both the start and the end year
    count towards the duration, so ``'1846-52'`` gives 7.

    An abbreviated end year that is smaller than the matching digits of the
    start year is read as falling in the next decade/century, so ``'1899-01'``
    means 1899-1901 (3 years) rather than producing a negative duration.
    """
    start_year = int(date_text[:4])  # the opening year
    if '-' not in date_text:
        return 1

    end_text = date_text.split('-')[1].strip()  # the component after the separator
    if len(end_text) >= 4:
        end_year = int(end_text)
    else:
        # Rebuild the full end year from the start year's leading digits.
        scale = 10 ** len(end_text)
        end_year = start_year - start_year % scale + int(end_text)
        if end_year < start_year:
            end_year += scale  # the abbreviated end year rolled over

    return end_year - start_year + 1


duration_list = [famine_duration(date_text) for date_text in famines.Date]

len(duration_list)

77

In [46]:
# Attach the computed durations as a new column (one value per row of famines).
famines['Duration'] = duration_list

famines.head()

Unnamed: 0,Date,Location,ExcessMortality_midpoint,ExcessMortality_lower,ExcessMortality_upper,Source,StartDate,Duration
0,1846-52,Ireland,1000000,1000000,1000000,Ó Gráda (2007),1846,7
1,1860-1,India,2000000,2000000,2000000,Kumar and Raychaudhuri [Eds.] (1983),1860,2
2,1863-67,Cape Verde,30000,30000,30000,"Ó Gráda (2009), p. 22",1863,5
3,1866-7,India,961043,961043,961043,Kumar and Raychaudhuri [Eds.] (1983),1866,2
4,1868,Finland,100000,100000,100000,Ó Gráda (2009) Table 1.1,1868,1


In [47]:
# Count the missing values in each column of the famines table.
famines.isna().sum()

Date                        0
Location                    0
ExcessMortality_midpoint    0
ExcessMortality_lower       0
ExcessMortality_upper       0
Source                      0
StartDate                   0
Duration                    0
dtype: int64

In [48]:
# Lower-case the column names and replace spaces with underscores.
famines.columns = famines.columns.str.lower().str.replace(" ", "_")

In [49]:
# famines.to_csv('processed data/famines.csv')