# Census 2017 - Population (Ingestion)

In [None]:
import sys
import configparser
config = configparser.ConfigParser()
# Read the shared settings through a context manager so the file handle is
# closed as soon as parsing finishes.
with open("../../settings.ini") as settings_file:
    config.read_file(settings_file)

# Make the project's helper modules importable from the configured directory.
sys.path.insert(0, config.get('PATHS','libs_path'))
# Connection string later handed to sqlalchemy.create_engine.
engine_path = config.get('DATABASE','engine_path')

In [None]:
# Locations passed to download_file: the remote folder to fetch from and the
# local directory used for the downloaded copy.
remote_path = "/"
local_path = "../data_final/"

In [None]:
import json

import pandas as pd
from sqlalchemy import create_engine, text

# from local file postgres.py
import postgres
# from local file commons.py
from commons import inline_table_xml, inline_dimension_xml, download_file, download_zip_file, extract_zip_file

In [None]:
# Fetch the census population file via the commons helper; the result is used
# as a DataFrame by the cells below.
df1 = download_file(
    remote_path,
    local_path,
    "population_census.csv",
)

In [None]:
# Preview the first rows of the downloaded data as a quick sanity check.
df1.head()

In [None]:
# Rename the comuna key to the column name the fact table is indexed on.
column_renames = {"comuna_datachile_id": "comuna_id"}
df1 = df1.rename(columns=column_renames)

In [None]:
# Create a SQLAlchemy engine from the connection string read from settings.ini
# and wrap it in the project's Postgres driver (local postgres.py).
engine = create_engine(engine_path)
db = postgres.PostgresDriver(engine)
# Load the population DataFrame through the driver.
# NOTE(review): the arguments look like (dataframe, schema, table), matching the
# census.fact_population_census table indexed afterwards — confirm in postgres.py.
db.to_sql(df1, "census", "fact_population_census")

In [None]:
# Index the fact table on comuna_id.
# Engine.execute() was deprecated in SQLAlchemy 1.4 and removed in 2.0, so the
# DDL runs on an explicit connection; engine.begin() commits when the block
# exits without error and rolls back otherwise.
with engine.begin() as connection:
    connection.execute(text("""
CREATE INDEX fact_population_census_index 
ON census.fact_population_census (comuna_id)
"""))

## Inline Tables

In [None]:
# Read questionnaire file
questionnaire = pd.ExcelFile("https://docs.google.com/spreadsheets/d/e/2PACX-1vQ4xZxDpyDY4NursNbrsWlCqjREdmBbfC1EMlz4UGQe7M8wMA7Mqw8tZbAcBdkjgBzloyQdcnhiv10C/pub?output=xlsx")
labels = pd.read_excel(questionnaire, "Labels")

# For each column of the fact table, look up its sheet id in the "Labels" sheet
# and print an inline dimension for the ids that start with "_" or "P".
for label in list(df1):
    # .iloc[0] raises IndexError when a column has no row in the Labels sheet.
    q = labels[labels["label"] == label].iloc[0]["id"]
    # startswith() tolerates an empty id, and the isinstance guard skips
    # non-text ids (e.g. NaN or numbers) that cannot name a sheet prefix.
    if isinstance(q, str) and q.startswith(("_", "P")):
        df = pd.read_excel(questionnaire, q)
        print(inline_dimension_xml(df, label, "id", "es", label))

In [None]:
# Inline dimension: country of residence five years ago.
df = pd.read_csv("datachile_census_country_id.csv", index_col = 0)
# Keep the first row of every country code, drop rows missing a code or a
# name, and store the code as an integer. Working on whole rows in one chain
# avoids assigning into a column subset and does not depend on index alignment.
df = (
    df[["my_country_code", "my_country_name"]]
    .drop_duplicates(subset="my_country_code")
    .dropna()
    .astype({"my_country_code": "int"})
)
# Positional override of the name in the eighth remaining row.
# NOTE(review): presumably that row is the "unspecified country" code — confirm against the CSV.
df.iloc[7,1] = "País no especificado"

print(inline_dimension_xml(df, "Residence Country 5 Years Ago", "my_country_code", "my_country_name", "residence_country_5_years_ago"))

In [None]:
# Inline dimension: comuna of residence five years ago.
df = pd.read_csv("datachile_comunas.csv")
df = df[["comuna_datachile_id", "comuna_name"]]
# Add the catch-all member with id 999 for an unspecified comuna.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and also works on older versions.
unspecified_comuna = pd.DataFrame(
    [{"comuna_datachile_id": 999, "comuna_name": "Comuna no especificada"}]
)
df = pd.concat([df, unspecified_comuna], ignore_index=True)

print(inline_dimension_xml(df, "Residence Comuna 5 Years Ago", "comuna_datachile_id", "comuna_name", "residence_comuna_5_years_ago"))

In [None]:
# Inline dimension: economic activity.
econ = pd.read_csv("economic_activity.csv", sep=";")
# Expose the row index as a numeric id column.
econ["num_id"] = econ.index

# Rows at positions 22 and 23 are overwritten with the special codes 98 and 99.
# NOTE(review): assumes the CSV has "id" and "es" columns and at least 24 rows — confirm.
special_rows = {
    22: {"id": "98", "es": "No aplica", "num_id": "98"},
    23: {"id": "99", "es": "Missing", "num_id": "99"},
}
for position, values in special_rows.items():
    econ.iloc[position] = pd.Series(values)

print(inline_dimension_xml(econ, "Economic Activity", "num_id", "es", "economic_activity"))