# Parameters

In [1]:
# Output directory for the garden dataset; passed to
# catalog.Dataset.create_empty() at the end of the notebook.
# NOTE(review): presumably overridden at run time as a notebook parameter - confirm.
dest_dir = "/tmp/ghe_20210701"

# Read data and reference tables

In [1]:
from owid import catalog
import json
from pathlib import Path

In [2]:
# Root directory against which every input path in this notebook is resolved.
# NOTE(review): absolute and machine-specific - the notebook only runs where
# this directory exists.
base_path = Path("/work/etl")

In [64]:
# Open the shared reference dataset; the "sex" and "countries-regions"
# lookup tables are read from it further down.
reference_dir = base_path / "data" / "reference"
reference_dataset = catalog.Dataset(reference_dir.as_posix())

In [65]:
# Reference table of sex codes: indexed by `code`, with a `name` column.
sex = reference_dataset["sex"]

In [66]:
sex

Unnamed: 0_level_0,name
code,Unnamed: 1_level_1
female,Female
male,Male
both,Both sexes


In [6]:
# Reference table of countries and regions, indexed by `code`.
countries = reference_dataset["countries-regions"]

In [7]:
countries.head()

Unnamed: 0_level_0,name,iso_alpha2,iso_alpha3,imf_code,cow_letter,cow_code,unctad_code,marc_code,ncd_code,kansas_code,penn_code,wikidata_uri,wikidata_label,legacy_entity_id,legacy_country_id,members
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AFG,Afghanistan,AF,AFG,512.0,AFG,700.0,AFG,AF,AFGN,AFG,AFG,http://www.wikidata.org/entity/Q889,Afghanistan,15.0,562.0,
ALA,Aland Islands,AX,ALA,,,,,,,,,http://www.wikidata.org/entity/Q5689,Åland,296.0,791.0,
ALB,Albania,AL,ALB,914.0,ALB,339.0,ALB,AA,ALBN,ALB,ALB,http://www.wikidata.org/entity/Q222,Albania,16.0,565.0,
DZA,Algeria,DZ,DZA,612.0,ALG,615.0,ALG,AE,ALGR,DZA,DZA,http://www.wikidata.org/entity/Q262,Algeria,17.0,619.0,
ASM,American Samoa,AS,ASM,859.0,,,,AS,,ASM,ASM,http://www.wikidata.org/entity/Q16641,American Samoa,246.0,571.0,


In [67]:
# Open the meadow-stage GHE dataset stored under data/meadow/who/2021-07-01/ghe.
ghe_meadow_dir = base_path / "data" / "meadow" / "who" / "2021-07-01" / "ghe"
ghe_meadow = catalog.Dataset(ghe_meadow_dir.as_posix())

In [68]:
# Estimates table; its index levels are
# (country_code, year, ghe_cause_title, sex_code, agegroup_code).
ghe_estimates = ghe_meadow["estimates"]

In [69]:
ghe_estimates.shape

(7191900, 9)

In [43]:
ghe_estimates.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,population,deaths,deaths_rate,deaths_100k,daly,daly_rate,daly_100k,causegroup,level
country_code,year,ghe_cause_title,sex_code,agegroup_code,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
GRD,2019,Alcohol use disorders,BTSX,YEARS45-49,6285.0,0.5,8.2e-05,8.2,42.1,0.0066952,669.5,2,3
GRD,2019,Syphilis,MLE,YEARS1-4,3729.0,0.0,1e-08,0.0,0.0,5.9e-07,0.1,1,4
GRD,2019,Cataracts,FMLE,YEARS15-19,3736.0,0.0,0.0,0.0,0.0,0.0,0.0,2,3
GRD,2019,Peptic ulcer disease,BTSX,ALLAges,112002.0,4.7,4.19e-05,4.2,126.1,0.00112614,112.6,2,3
GRD,2019,Drug use disorders,MLE,YEARS85PLUS,366.0,0.0,8.524e-05,8.5,0.4,0.00103158,103.2,2,3


In [70]:
# Distinct country codes present in the GHE estimates index.
country_codes = set(ghe_estimates.index.unique("country_code"))

In [71]:
len(country_codes)

183

In [72]:
# All codes known to the countries-regions reference table.
owid_country_codes = set(countries.index)

In [73]:
len(owid_country_codes)

296

In [74]:
# GHE country codes that are absent from the countries-regions reference
# table. The rest of the notebook performs no country harmonization, so fail
# loudly if any appear instead of silently writing unharmonized codes.
missing_country_codes = country_codes - owid_country_codes
assert not missing_country_codes, (
    "GHE country codes missing from the OWID reference table: "
    f"{sorted(missing_country_codes)}"
)
missing_country_codes

set()

💡 Every country code in GHE already exists in the OWID countries-regions reference table (the set difference above is empty), so no further country harmonization is needed here.

In [75]:
ghe_estimates.index.unique("sex_code")

CategoricalIndex(['BTSX', 'MLE', 'FMLE'], categories=['BTSX', 'FMLE', 'MLE'], ordered=False, dtype='category', name='sex_code')

In [76]:
# Load the GHE -> OWID sex code mapping stored under
# etl/steps/data/garden/who/2021-07-01. The encoding is pinned because open()
# otherwise falls back to the locale's default, while JSON files are expected
# to be UTF-8.
sex_code_mapping_path = (
    base_path / "etl/steps/data/garden/who/2021-07-01/sex_code_mapping.json"
)
with open(sex_code_mapping_path, encoding="utf-8") as f:
    ghe_to_owid_sex_codes = json.load(f)

In [77]:
ghe_to_owid_sex_codes

{'BTSX': 'both', 'MLE': 'male', 'FMLE': 'female'}

In [78]:
# Remember the index levels so they can be restored after sex_code has been
# edited as an ordinary column.
index_names = ghe_estimates.index.names

In [79]:
# Move the index levels into ordinary columns so sex_code can be reassigned.
# Mutates ghe_estimates in place, so this cell is not idempotent: re-running
# it before the index is restored resets the index a second time.
ghe_estimates.reset_index(inplace=True)

In [80]:
# Translate the GHE sex codes into OWID codes using the loaded mapping.
ghe_estimates["sex_code"] = ghe_estimates["sex_code"].cat.rename_categories(
    ghe_to_owid_sex_codes
)

# rename_categories leaves any category that is missing from the mapping
# unchanged, so confirm every resulting code exists in the `sex` reference
# table rather than letting an unmapped code slip through unnoticed.
unknown_sex_codes = set(ghe_estimates["sex_code"].cat.categories) - set(sex.index)
assert not unknown_sex_codes, (
    "Sex codes missing from the OWID reference table: "
    f"{sorted(unknown_sex_codes)}"
)

In [81]:
# Restore the index levels saved in index_names (mutates ghe_estimates in place).
ghe_estimates.set_index(index_names, inplace=True)

In [82]:
ghe_estimates.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,population,deaths,deaths_rate,deaths_100k,daly,daly_rate,daly_100k,causegroup,level
country_code,year,ghe_cause_title,sex_code,agegroup_code,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
GRD,2019,Alcohol use disorders,both,YEARS45-49,6285.0,0.5,8.2e-05,8.2,42.1,0.0066952,669.5,2,3
GRD,2019,Syphilis,male,YEARS1-4,3729.0,0.0,1e-08,0.0,0.0,5.9e-07,0.1,1,4
GRD,2019,Cataracts,female,YEARS15-19,3736.0,0.0,0.0,0.0,0.0,0.0,0.0,2,3
GRD,2019,Peptic ulcer disease,both,ALLAges,112002.0,4.7,4.19e-05,4.2,126.1,0.00112614,112.6,2,3
GRD,2019,Drug use disorders,male,YEARS85PLUS,366.0,0.0,8.524e-05,8.5,0.4,0.00103158,103.2,2,3


In [86]:
# Create a new, empty dataset at dest_dir to hold the harmonized table.
ghe_garden = catalog.Dataset.create_empty(dest_dir)

In [87]:
# Add the estimates table (now carrying OWID sex codes) to the new dataset.
ghe_garden.add(ghe_estimates)

In [88]:
# Human-readable description attached to the dataset. The literal is split
# across lines purely for readability; the pieces concatenate into a single
# one-line string.
ghe_garden.description = (
    "WHO\u2019s Global Health Estimates (GHE) provide the latest available data on "
    "death and disability globally, by region and country, and by age, sex and "
    "cause. The latest updates include global, regional and country trends from "
    "2000 to 2019 inclusive. By providing key insights on mortality and morbidity "
    "trends, these estimates are a powerful tool to support informed "
    "decision-making on health policy and resource allocation."
)

In [89]:
# Save the dataset now that its table and description have been set.
ghe_garden.save()