# 1. Set up the environment

In [1]:
# Import the required libraries
import requests
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
# import seaborn as sns
import json
import random

# 2. Collect data using Web API

In [2]:

# Base query against the MORT_100 endpoint, already filtered to
# Dim1 == 'YEARS0-4' and SpatialDimType == 'COUNTRY' (URL-encoded in the string).
BASE_URL = 'https://ghoapi.azureedge.net/api/MORT_100?$filter=Dim1%20eq%20%27YEARS0-4%27%20and%20SpatialDimType%20eq%20%27COUNTRY%27'

# Years that are queried: 2000 through 2021 inclusive.
YEARS = [year for year in range(2000, 2022)]

# Module-level accumulators; they start empty and are filled by the collect_* functions.
COUNTRIES = {"Country_ID": [], "Country_Name": []}
CAUSES = {"Causes_ID": [], "Causes_Name": []}
NUMBER_OF_DEATHS = {}


In [3]:

def collect_Causes():
    """Fetch the CHILDCAUSE dimension values and register them in the shared state.

    On the first call (while ``CAUSES["Causes_ID"]`` is still empty) every
    returned ``Code``/``Title`` pair is appended to ``CAUSES`` and an empty
    list keyed by the title is created in ``NUMBER_OF_DEATHS``. Later calls
    still perform the request but leave the registries untouched.

    Returns:
        pandas.DataFrame: a frame built from ``CAUSES`` with the columns
        ``Causes_ID`` and ``Causes_Name``.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        requests.Timeout: if the endpoint does not answer within the timeout.
    """
    url = "https://ghoapi.azureedge.net/api/DIMENSION/CHILDCAUSE/DimensionValues"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=60)
    # Fail with the real HTTP error instead of a confusing KeyError on 'value'.
    response.raise_for_status()
    causes_data = response.json()['value']
    causes_df = pd.DataFrame(causes_data).loc[:, ["Code", "Title"]]

    # Populate the registries only once so re-running the cell does not duplicate entries.
    if len(CAUSES["Causes_ID"]) == 0:
        for code, title in zip(causes_df["Code"], causes_df["Title"]):
            CAUSES["Causes_ID"].append(code)
            CAUSES["Causes_Name"].append(title)
            NUMBER_OF_DEATHS[title] = []

    return pd.DataFrame(CAUSES)


In [4]:
# Allow up to 100 characters per column when pandas renders a frame.
pd.set_option('max_colwidth', 100)

# Fetch the cause table, write it to CSV without the index column, and
# leave the frame as the last expression so the notebook displays it.
causes_df = collect_Causes()
causes_df.to_csv(path_or_buf="../../../Data/causes_of_death.csv", index=False)
causes_df

Unnamed: 0,Causes_ID,Causes_Name
0,CH10,Prematurity
1,CH11,Birth asphyxia and birth trauma
2,CH12,Sepsis and other infectious conditions of the newborn
3,CH13,Other Group 1 (postneonatal only)
4,CH15,Congenital anomalies
5,CH16,Other noncommunicable diseases (postneonatal only)
6,CH17,Injuries
7,CH18,Tuberculosis
8,CH19,Other Group 1 and Other noncommunicable (neonatal and under-5 only)
9,CH2,HIV/AIDS


In [5]:

def collect_Countries():
    """Collect the country codes present in the data and look up their names.

    The codes come from one slice of the ``BASE_URL`` query (first entry of
    ``CAUSES["Causes_ID"]``, first entry of ``YEARS``), sorted ascending.
    They are stored in ``COUNTRIES["Country_ID"]`` only if that list is still
    empty. The names are then read from the COUNTRY dimension endpoint and
    matched to each stored code by its ``Code`` value, so every name sits next
    to its own code regardless of the order the endpoint returns. A code with
    no matching entry gets ``None`` as its name.

    ``CAUSES["Causes_ID"]`` must already be populated; otherwise indexing its
    first element raises ``IndexError``.

    Returns:
        pandas.DataFrame: a frame built from ``COUNTRIES`` with the columns
        ``Country_ID`` and ``Country_Name``.

    Raises:
        requests.HTTPError: if either endpoint answers with an error status.
        requests.Timeout: if either endpoint does not answer within the timeout.
    """
    # Step 1: discover which country codes the data actually reports.
    url = BASE_URL + "%20and%20Dim2%20eq%20%27" + CAUSES["Causes_ID"][0] + "%27%20and%20TimeDim%20eq%20" + str(YEARS[0])
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    countries_data = response.json()['value']
    country_codes = sorted(pd.DataFrame(countries_data).loc[:, "SpatialDim"])

    # Register the codes only once so re-running the cell does not duplicate them.
    if len(COUNTRIES["Country_ID"]) == 0:
        COUNTRIES["Country_ID"].extend(country_codes)

    # Step 2: resolve each code to its title through an explicit lookup.
    url = "https://ghoapi.azureedge.net/api/DIMENSION/COUNTRY/DimensionValues"
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    countries_data = response.json()['value']
    countries_df = pd.DataFrame(countries_data).loc[:, ["Code", "Title"]]
    title_by_code = dict(zip(countries_df["Code"], countries_df["Title"]))

    # Keep Country_Name a plain list, one entry per Country_ID, in the same order.
    COUNTRIES["Country_Name"] = [title_by_code.get(code) for code in COUNTRIES["Country_ID"]]

    return pd.DataFrame(COUNTRIES)


In [6]:
# Build the country table, write it to CSV without the index column, and
# leave the frame as the last expression so the notebook displays it.
countries_df = collect_Countries()
countries_df.to_csv(path_or_buf="../../../Data/countries.csv", index=False)
countries_df

Unnamed: 0,Country_ID,Country_Name
0,AFG,Afghanistan
1,AGO,Angola
2,ALB,Albania
3,AND,Andorra
4,ARE,United Arab Emirates
...,...,...
189,WSM,Samoa
190,YEM,Yemen
191,ZAF,South Africa
192,ZMB,Zambia


In [7]:

def collect_data():
    """Download the value of every cause for every year and country.

    For each year in ``YEARS`` and each cause in ``CAUSES`` one request is
    sent to ``BASE_URL`` with the cause and year appended to the filter. The
    returned ``NumericValue`` entries are aligned to ``COUNTRIES["Country_ID"]``
    by their ``SpatialDim`` code; a country absent from a response gets a
    missing value. When a response lacks the ``SpatialDim`` or ``NumericValue``
    column, the whole slice is filled with ``None``.

    All columns of ``NUMBER_OF_DEATHS`` are reset at the start, so the
    function can be called repeatedly without accumulating stale rows.
    Progress is printed once per year.

    Returns:
        pandas.DataFrame: a frame built from ``NUMBER_OF_DEATHS`` with one
        column per cause name plus ``CountryName`` and ``Year``, and one row
        per (year, country) pair.

    Raises:
        requests.HTTPError: if a request answers with an error status.
        requests.Timeout: if a request does not answer within the timeout.
    """
    # Reset every column; otherwise a second call would extend the old cause
    # lists while CountryName/Year start over, giving columns of unequal length.
    for cause_name in CAUSES["Causes_Name"]:
        NUMBER_OF_DEATHS[cause_name] = []
    NUMBER_OF_DEATHS["CountryName"] = []
    NUMBER_OF_DEATHS["Year"] = []

    country_ids = list(COUNTRIES["Country_ID"])
    country_names = list(COUNTRIES["Country_Name"])

    # fill data for each attribute
    for year in YEARS:
        for cause_id, cause_name in zip(CAUSES["Causes_ID"], CAUSES["Causes_Name"]):
            # create url and params
            url = BASE_URL + "%20and%20Dim2%20eq%20%27" + cause_id + "%27%20and%20TimeDim%20eq%20" + str(year)

            # send a single GET request and reuse its JSON content
            response = requests.get(url, timeout=60)
            response.raise_for_status()
            df = pd.DataFrame(response.json()['value'])

            if "SpatialDim" in df.columns and "NumericValue" in df.columns:
                # Align by country code rather than by sort position, so a
                # missing country yields NaN instead of shifting the values.
                values = (
                    df.drop_duplicates(subset="SpatialDim")
                    .set_index("SpatialDim")["NumericValue"]
                    .reindex(country_ids)
                )
                NUMBER_OF_DEATHS[cause_name].extend(values.tolist())
            else:
                NUMBER_OF_DEATHS[cause_name].extend([None] * len(country_ids))

        NUMBER_OF_DEATHS["CountryName"].extend(country_names)
        NUMBER_OF_DEATHS["Year"].extend([year] * len(country_names))

        print("Loading data in", year)

    return pd.DataFrame(NUMBER_OF_DEATHS)
        

In [8]:
# TEST: the collected frame must have one row per (country, year) pair and
# one column per cause plus the "CountryName" and "Year" columns.
# The expected shape is derived from the collected registries instead of being
# hard-coded, so the check keeps working when the API adds a country, year or
# cause. The original hard-coded check expected (4268, 19).
data_number_of_deaths = collect_data()
expected_shape = (
    len(COUNTRIES["Country_ID"]) * len(YEARS),
    len(CAUSES["Causes_Name"]) + 2,
)
assert data_number_of_deaths.shape == expected_shape, (
    f"unexpected shape {data_number_of_deaths.shape}, expected {expected_shape}"
)

Loading data in 2000
Loading data in 2001
Loading data in 2002
Loading data in 2003
Loading data in 2004
Loading data in 2005
Loading data in 2006
Loading data in 2007
Loading data in 2008
Loading data in 2009
Loading data in 2010
Loading data in 2011
Loading data in 2012
Loading data in 2013
Loading data in 2014
Loading data in 2015
Loading data in 2016
Loading data in 2017
Loading data in 2018
Loading data in 2019
Loading data in 2020
Loading data in 2021


In [9]:
# Write the raw collected data to number_of_deaths_raw_data.csv without the
# index column, then leave the frame as the last expression for display.
data_number_of_deaths.to_csv(path_or_buf="../../../Data/number_of_deaths_raw_data.csv", index=False)
data_number_of_deaths

Unnamed: 0,Prematurity,Birth asphyxia and birth trauma,Sepsis and other infectious conditions of the newborn,Other Group 1 (postneonatal only),Congenital anomalies,Other noncommunicable diseases (postneonatal only),Injuries,Tuberculosis,Other Group 1 and Other noncommunicable (neonatal and under-5 only),HIV/AIDS,Diarrhoeal diseases,Pertussis,Tetanus,Measles,Meningitis/encephalitis,Malaria,Acute lower respiratory infections,CountryName,Year
0,19333.41926,11691.37210,4489.50404,,1534.08344,,371.28902,0.0,10579.92644,0.22891,1973.08786,,2331.69796,0.0,1395.33839,0.0,6908.05259,Afghanistan,2000
1,14761.72219,10042.87869,2977.92908,,1405.50824,,130.32201,0.0,3849.14708,30.31433,724.01316,,768.57928,0.0,357.27433,0.0,4267.31161,Angola,2000
2,351.16760,110.05251,17.54192,,106.22332,,4.31283,0.0,30.48989,0.00347,0.77210,,0.09764,0.0,10.36631,0.0,36.97242,Albania,2000
3,3.00000,0.00000,0.00000,,0.00000,,0.00000,0.0,0.00000,0.00000,0.00000,,0.00000,0.0,0.00000,0.0,0.00000,Andorra,2000
4,137.90854,26.85080,2.86905,,75.66284,,1.78464,0.0,66.57905,0.10435,0.00000,,0.00000,0.0,0.15043,0.0,0.09029,United Arab Emirates,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4263,17.45855,5.56010,2.93639,,7.28102,,0.25978,0.0,4.18806,0.00000,0.00000,,0.00000,0.0,0.15554,0.0,1.16055,Samoa,2021
4264,11286.37393,6796.63328,1742.58956,,2227.89226,,229.08594,0.0,3051.97336,0.90444,521.35998,,129.94279,0.0,44.07145,0.0,2523.17301,Yemen,2021
4265,4634.07215,2595.54661,1828.98610,,1315.87236,,166.21530,0.0,1708.85383,36.14888,0.00000,,0.00000,0.0,16.01934,0.0,677.28544,South Africa,2021
4266,6920.62510,4054.12780,912.70245,,1274.53105,,35.09944,0.0,1776.66284,22.93804,134.08604,,23.52546,0.0,108.91078,0.0,1228.79101,Zambia,2021
