In [2]:
import xml.etree.ElementTree as et
import sqlite3

# Location of the raw UNESCO socio-economic export (SDMX-ML 2.0 namespaces below).
data_path = "../data/raw/unesco/unesco_socio_economics.xml"
root = et.parse(data_path).getroot()

# Prefix -> namespace URI map passed to every find()/findall() call.
ns = {
    "generic": "http://www.SDMX.org/resources/SDMXML/schemas/v2_0/generic",
    "common": "http://www.SDMX.org/resources/SDMXML/schemas/v2_0/common",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
    "message": "http://www.SDMX.org/resources/SDMXML/schemas/v2_0/message"
}

dataset = root.find("generic:DataSet", ns)

# One record per <generic:Series>:
#   {"ind": <indicator code>, "local": <location code>, "valores": {<year>: <value>}}
# Every field is kept as the raw string found in the XML.
dados = []

for serie in dataset.findall("generic:Series", ns):
    # The series key is read positionally: first <Value> is the indicator,
    # second <Value> is the location.
    key_values = serie.find("generic:SeriesKey", ns).findall("generic:Value", ns)

    # Observations are also read positionally: the first child carries the
    # year as text, the second child carries the value in its "value" attribute.
    valores_por_ano = {
        observacao[0].text: observacao[1].attrib["value"]
        for observacao in serie.findall("generic:Obs", ns)
    }

    dados.append({
        "ind": key_values[0].attrib["value"],
        "local": key_values[1].attrib["value"],
        "valores": valores_por_ano,
    })

# Quick sanity check: show the first parsed series.
print(dados[0])
    

{'ind': '200101', 'local': 'ABW', 'valores': {'2014': '104', '2015': '104', '2016': '105', '2017': '105', '2018': '106', '2019': '106', '2020': '107'}}


In [3]:
# get description

import io
import requests
import xml.etree.ElementTree as et

url = "http://data.uis.unesco.org/RestSDMX/sdmx.ashx/GetDataStructure/DEMO_DS"

# Fail fast on network problems: without a timeout the cell can hang forever,
# and without raise_for_status() an HTTP error page would be handed to the XML
# parser and surface as a confusing parse/attribute error further down.
response = requests.get(url, timeout=60)
response.raise_for_status()
data = response.content

# Parse the raw bytes so the parser applies the encoding declared by the
# document itself instead of assuming UTF-8.
root = et.parse(io.BytesIO(data))

# Maps each code's "value" attribute to the text of its first description.
code_dict = {}

ns = {
    "gen": "http://www.SDMX.org/resources/SDMXML/schemas/v2_0/structure",
    "message": "http://www.SDMX.org/resources/SDMXML/schemas/v2_0/message" 
}

# Only the first <CodeList> under <CodeLists> is read.
ind_list = root.find("message:CodeLists", ns).find("gen:CodeList", ns)

for ind in ind_list.findall("gen:Code", ns):
    code = ind.attrib["value"]
    desc = ind.find("gen:Description", ns).text

    code_dict[code] = desc

print(code_dict)


{'SP_DYN_TFRT_IN': 'Fertility rate, total (births per woman)', 'DEMO_H1_H1': 'Demographic indicators', 'DEMO_H1': 'Demographic', 'SP_DYN_LE00_IN': 'Life expectancy at birth, total (years)', 'SP_DYN_IMRT_IN': 'Mortality rate, infant (per 1,000 live births)', '200343': 'Population aged 14 years or younger (thousands)', '200144': 'Population aged 15-24 years (thousands)', '200345': 'Population aged 25-64 years (thousands) ', '200151': 'Population aged 65 years or older (thousands)', 'SP_POP_GROW': 'Population growth (annual %)', 'SH_DYN_AIDS_ZS': 'Prevalence of HIV, total (% of population ages 15-49)', 'SP_RUR_TOTL_ZS': 'Rural population (% of total population)', '200101': 'Total population (thousands)', 'NY_GDP_MKTP_CN': 'GDP (current LCU)', 'DEMO_H2_H1': 'Socio-economic indicators', 'DEMO_H2': 'Socio-economic', 'NY_GDP_MKTP_CD': 'GDP (current US$)', 'NY_GDP_DEFL_ZS': 'GDP deflator (base year varies by country)', 'NY_GDP_MKTP_KD_ZG': 'GDP growth (annual %)', 'NY_GDP_PCAP_CD': 'GDP per ca

In [4]:
# Indicators of interest, grouped by theme. The groups are concatenated in
# the order shown, which fixes the order of the resulting `indicators` list.
_health_codes = [
    "SP_DYN_LE00_IN",  # life expectancy
    "SP_DYN_IMRT_IN",  # infant mortality per 1,000 births
]

_economy_codes = [
    "NY_GDP_MKTP_CD",     # GDP (current US$)
    "NY_GDP_PCAP_CD",     # GDP per capita
    "NY_GDP_MKTP_KD_ZG",  # GDP growth (annual %)
]

_population_codes = [
    "200343",          # Population aged 14 years or younger (thousands)
    "200144",          # Population aged 15-24 years (thousands)
    "200345",          # Population aged 25-64 years (thousands)
    "200151",          # Population aged 65 years or older (thousands)
    "SP_RUR_TOTL_ZS",  # Rural population (% of total population)
    "200101",          # Total population (thousands)
]

indicators = _health_codes + _economy_codes + _population_codes

In [5]:
# Accumulators declared by this cell. Nothing below fills or reads them; they
# are kept only so that any other cell relying on these names still finds them.
# NOTE(review): they look like leftovers - confirm and delete if unused.
ind_columns = {code: [] for code in indicators}

year = []
ind_code = []
indicator = []
local = []
value = []

# Build the lookup tree: tree_data[location][indicator][year] -> value,
# with the year converted to int and the value converted to float.
tree_data = {}

for entry in dados:
    # Skip every series whose indicator is not in the list of interest.
    if entry["ind"] not in indicators:
        continue

    by_year = tree_data.setdefault(entry["local"], {}).setdefault(entry["ind"], {})

    # Iterate over (key, value) pairs directly. Looking the value up again via
    # str(int(key)) fails for keys that are not canonical integer strings, and
    # using `year` as the loop variable would overwrite the `year` list above.
    for raw_year, raw_value in entry["valores"].items():
        year_num = int(raw_year)
        # The first value seen for a (location, indicator, year) wins.
        if year_num not in by_year:
            by_year[year_num] = float(raw_value)

        
#print(tree_data)
    


In [6]:
#make tables
import pandas as pd
import os

tgt_path = "../data/processed/unesco/unesco_processed_data"

# Create the output directory if needed. exist_ok=True tolerates an existing
# directory but still raises if the path exists and is not a directory.
os.makedirs(tgt_path, exist_ok=True)

# Years exported, one CSV per year (2015 through 2019 inclusive).
years = list(range(2015, 2020))

# One column-oriented table per year: a "Local" column followed by one column
# per indicator, in the order given by `indicators`.
tables = [{"Local": [], **{ind: [] for ind in indicators}} for _ in years]

for table, y in zip(tables, years):
    for loc, by_indicator in tree_data.items():
        table["Local"].append(loc)
        for ind in indicators:
            # A missing indicator or year becomes an empty cell. Explicit
            # lookups with defaults replace a bare `except:` so that genuine
            # errors are no longer silently turned into blanks.
            table[ind].append(by_indicator.get(ind, {}).get(y, ""))

# Write one CSV per year.
for y, table in zip(years, tables):
    df = pd.DataFrame(table)
    name = "unesco_socio_eco_{}.csv".format(y)
    df.to_csv(os.path.join(tgt_path, name), index=False)

# Write the code -> description lookup table alongside the yearly files.
desc = {
    "codigo": list(code_dict.keys()),
    "descricao": list(code_dict.values()),
}

df = pd.DataFrame(desc)
df.to_csv(os.path.join(tgt_path, "unesco_descricao.csv"), index=False)