In [1]:
# Import libraries
import json
import os

import pandas as pd

In [2]:
# Datasets to import
# Tested: May, 22th
URL_BASE = "http://www.deis.cl/wp-content/uploads"
datasets = [
    {
        "source_link": "/2015/05/6.-Defunciones-y-mortalidad-infantil-y-sus-componentes-por-regi%C3%B3n-y-comuna-de-residencia.-Chile-2012.xlsx",
        "year": 2012
    },
    {
        "source_link": "/2016/08/6.-Defunciones-y-mortalidad-infantil-y-sus-componentes-por-regi%C3%B3n-y-comuna-de-residencia.-Chile-2013.xlsx",
        "year": 2013
    },
    {
        "source_link": "/2016/08/6.-Defunciones-y-mortalidad-infantil-y-sus-componentes-por-regi%C3%B3n-y-comuna-de-residencia.-Chile-2014.xlsx",
        "year": 2014
    }
]

# Read Comunas
URL = "https://raw.githubusercontent.com/datachile/datachile-etl/master/official_ids/2017_06_27_comunas_datachile_fixed.csv"

In [3]:
def search_comuna_id(name):
    name = name.upper().strip().replace(u"\xa0", u" ")
    if name in comuna_datachile_id:
        return comuna_datachile_id[name]
    else:
        return comuna_thesauro_id[name]
    

def get_values(d):
    return [ value for key, value in d.items() ]

In [4]:
geo_datachile_id = pd.read_csv(URL)

comuna_datachile_id = dict(
    zip([elm.upper() for elm in geo_datachile_id["comuna_name"]], geo_datachile_id["comuna_datachile_id"])
)

age_range = pd.read_csv("https://docs.google.com/spreadsheets/d/" + 
                   "152kELSLYe4oiCiH7-Wmt-FWvSYavT0LUQt1C_qXnXVM" +
                   "/export?gid=0&format=csv"
                  )

age_range.to_csv("data/age_range_id.csv", index=False)

formatted_df = pd.melt(age_range, id_vars=["id"], value_vars=["es"])
q = formatted_df.to_dict()

age_range_ids = dict(zip(get_values(q["value"]), get_values(q["id"])))

thesauro_comunas = pd.read_csv("https://docs.google.com/spreadsheets/d/" + 
                   "1ccZd25Q5BfeL2w-KCbe1k7ZJIuskJa0gwoTZsillBGk" +
                   "/export?gid=0&format=csv"
                  )

comuna_thesauro_id = dict(zip([elm.upper() for elm in thesauro_comunas["nombre"]], thesauro_comunas["id"]))

In [5]:
tidy = []

for dataset in datasets:
    data = pd.read_excel(URL_BASE + dataset["source_link"])
    
    # Read title/subtitle of Excel
    title = [ str(item).strip() for item in data.iloc[2] ]
    subtitle = [ str(item).strip() for item in data.iloc[3] ]

    # Preprocess excel
    output = []
    for i in range(4, len(data) - 2):
        frame = data.iloc[i]
        tuples = list(zip(subtitle, frame))
        geo = { "comuna": str(frame[1]).strip() } if str(frame[0]).strip() == "nan" else { "country": str(frame[0]).strip() }  if str(frame[0]).strip() == "Total País" else { "region": str(frame[0]).strip() } 

        tuples = list(filter(lambda x: x[0] != "nan", tuples))

        age_range = ""

        query = []
        for key, tup in enumerate(tuples):
            if tup[0] != "Tasa*":
                age_range = tup[0]
                count = tup[1]

            if key % 2 != 0:
                item = {
                    "age_range": age_range,
                    "count": count,
                    "rate": tup[1]
                }
                query.append(item)

        tuples = {**geo, "children": query}
        output.append(tuples)

    # Process data
    country_data = {}
    region_data = {}
    for key, item in enumerate(output):
        if "country" in item:
            country_data = { d["age_range"]: d["rate"] for d in item["children"] }

        if "region" in item:
            region_data = { d["age_range"]: d["rate"] for d in item["children"] }

        if "comuna" in item:
            for d in item["children"]:
                out = {
                    "comuna_id": search_comuna_id(item["comuna"]),
                    "year": dataset["year"],
                    #"age_range": d["age_range"],
                    "age_range_id": age_range_ids[d["age_range"]],
                    "rate_country": country_data[d["age_range"]],
                    "rate_region": region_data[d["age_range"]],
                    "rate_comuna": 0 if d["rate"] == "-" else d["rate"],
                    "count": 0 if d["count"] == "-" else d["count"]
                }
                tidy.append(out)

In [6]:
# Save tidy in /data/ folder
pd.DataFrame(tidy).to_csv("data/under_one.csv", index=False)