# CENSO 2017
## Sección Población

In [None]:
import numpy as np
import pandas as pd
import os
from IPython.display import display
from google_drive_downloader import GoogleDriveDownloader as gdd
# pip install googledrivedownloader

# Let notebook output show up to 200 columns before pandas truncates it.
pd.set_option("display.max_columns", 200)

In [None]:
def _download_data(download=True):
    """Fetch the zipped Census 2017 person microdata from Google Drive.

    The archive is saved to ``./data_temp/Microdato_Censo2017-Personas.zip``
    and unzipped. Nothing is downloaded when ``download`` is falsy.

    Args:
        download: Whether to actually perform the download.

    Returns:
        Always ``True``.
    """
    if not download:
        return True

    gdd.download_file_from_google_drive(
        file_id="17wl7P6G_xVY54htNy8tfffcTDTqvzDvD",
        dest_path="./data_temp/Microdato_Censo2017-Personas.zip",
        unzip=True,
    )
    return True

In [None]:
# The download is skipped here: the extracted CSV must already be in ./data_temp.
_download_data(download=False)

# Read the microdata in chunks of 1,000 rows and stitch the chunks into one frame.
reader = pd.read_csv(
    "./data_temp/Microdato_Censo2017-Personas.csv",
    sep=";",
    low_memory=False,
    chunksize=10**3,
)
df = pd.concat(reader, ignore_index=True)

# Uncomment the next two lines to load only the first 200,000 rows
#reader = pd.read_csv("./data_temp/Microdato_Censo2017-Personas.csv", sep=";", low_memory = False, iterator = True)
#df = reader.get_chunk(2*10**5)

In [None]:
# Question columns are the ones named "P<digit>...", e.g. "P08" or "P15A".
# Slices are used instead of indexing so a one-character column name cannot
# raise IndexError.
_questions = [col for col in df.columns if col[:1] == "P" and col[1:2].isdigit()]

# Question columns that are dropped instead of being used as grouping keys.
_questions_to_remove = [
    "P07", "P10PAIS_GRUPO", "P11PAIS_GRUPO",
    "P12A_TRAMO", "P12PAIS_GRUPO",
    "P15A", "P16A_GRUPO",
    "P17", "P21A", "P21M",
]

# Columns removed from the microdata before grouping.
_drop = ["REGION", "PROVINCIA", "ZC_LOC", "ID_ZONA_LOC", "NVIV", "NHOGAR", "PERSONAN"] + _questions_to_remove

# Grouping keys. The remaining question columns keep the order they have in
# `df`; a set difference would yield an arbitrary order that can change from
# run to run (string hashing is randomised), and with it the column order of
# the exported file.
_groupby = ["COMUNA", "AREA", "DC", "ESCOLARIDAD"] + [
    col for col in _questions if col not in _questions_to_remove
]

In [None]:
# Count how many people fall into every combination of the grouping columns;
# the count ends up in the "cant_per" column.
q = df.drop(columns=_drop).groupby(_groupby)
_ = q.size().reset_index(name="cant_per")

In [None]:
# Published questionnaire workbook; its "Labels" sheet is loaded into `labels`.
questionnaire = pd.ExcelFile(
    "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ4xZxDpyDY4NursNbrsWlCqjREdmBbfC1EMlz4UGQe7M8wMA7Mqw8tZbAcBdkjgBzloyQdcnhiv10C/pub?output=xlsx"
)
labels = questionnaire.parse("Labels")

In [None]:
# Rename the census column codes to the names given in the "Labels" sheet.
# The id -> label lookup is built once instead of filtering `labels` for every
# key; when an id is repeated the first row wins, as before.
_label_by_id = (
    labels.drop_duplicates(subset="id").set_index("id")["label"].to_dict()
)

# Fail with the offending names rather than an opaque IndexError.
_missing_labels = [key for key in _groupby if key not in _label_by_id]
if _missing_labels:
    raise KeyError(
        "No label defined for column(s): {}".format(_missing_labels)
    )

_ = _.rename(columns={key: _label_by_id[key] for key in _groupby})

In [None]:
# Comuna lookup table: drop the columns that are not needed, fill missing
# values with 0, store the census comuna id as an integer and expose it as
# "comuna_id".
geo_datachile_id = (
    pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQNfVYuzDbyPvvZkaY61MJBZ9nVkZqxD0RlsBarAv8fyqHzjdyhCQ1FPTLayNPhzR_7yUHTmR94pIl3/pub?gid=0&single=true&output=csv")
    .drop(columns=["region_id", "region_name", "comuna_tax_office_id"])
    .fillna(0)
    .astype({"comuna_census_id": "int"})
    .rename(columns={"comuna_census_id": "comuna_id"})
)

In [None]:
# Attach the comuna lookup columns (inner join on "comuna_id"), then discard
# the join key and the comuna name.
_ = _.merge(geo_datachile_id, how="inner", on="comuna_id").drop(
    columns=["comuna_id", "comuna_name"]
)

In [None]:
# Convert the sex codes to the DataChile format:
# codes 1 and 2 are swapped (census 1 = Hombre becomes DataChile 2 = Hombre).

sex_recode = {1: 2, 2: 1}  # swap the two codes; any other value maps to NaN
_["sex"] = _["sex"].map(sex_recode)

In [None]:
# Collapse the two "native_list" answers into a single code per row:
#   * an "other" code strictly between 20 and 98 is used as is,
#   * an "other" code of 3, 4, 5 or 6 is shifted by 10,
#   * otherwise the main "native_list" code is kept.
aboriginal_people = []
for main_code, other_code in zip(_["native_list"], _["native_list_other"]):
    if 20 < other_code < 98:
        aboriginal_people.append(other_code)
    elif other_code in (3, 4, 5, 6):
        aboriginal_people.append(10 + other_code)
    else:
        aboriginal_people.append(main_code)

_["aboriginal_people"] = aboriginal_people

In [None]:
# The two source columns are no longer needed once "aboriginal_people" exists.
_ = _.drop(["native_list", "native_list_other"], axis="columns")

In [None]:
# Replace the census country ids with DataChile country codes.
country_cols = ["residence_country", "residence_country_5_years_ago", "birth_country"]
geo_countries = pd.read_csv("datachile_census_country_id.csv")

for country_col in country_cols:
    # Look up the DataChile code for this column's census id.
    _ = _.merge(
        geo_countries,
        how="left",
        left_on=country_col,
        right_on="my_census_id",
    )

    # Ids without a match in the lookup table get the code 256.
    datachile_codes = _["my_country_code"].fillna(256)
    _[country_col] = pd.to_numeric(datachile_codes, downcast="integer")

    # Remove the helper columns brought in by the merge, plus any column whose
    # name starts with "Unnamed".
    _ = _.drop(columns=["my_census_id", "my_country_name", "my_country_code"])
    is_unnamed = _.columns.str.contains('^Unnamed')
    _ = _.loc[:, ~is_unnamed]

# Values that cannot be parsed as numbers become NaN.
_["comuna_customs_id"] = pd.to_numeric(
    _["comuna_customs_id"], errors="coerce", downcast="integer"
)

In [None]:
# Change Census comuna IDs for Datachile IDs
comuna_cols = ["residence_comuna", "residence_comuna_5_years_ago", "birth_comuna"]
geo_comunas = pd.read_csv("datachile_comunas.csv")

# Move "economy_activity" to a new "economic_activity" column at the end of
# the frame. This used to happen implicitly: the column was copied, and the
# original was then removed by a `columns.str.contains('my_')` filter because
# "economy_activity" happens to contain the substring "my_".
_["economic_activity"] = _.pop("economy_activity")

# Only the join key and the DataChile id are needed from the lookup table, so
# no other lookup column is ever added to the data.
comuna_lookup = geo_comunas[["my_comuna_customs_id", "my_comuna_datachile_id"]]

for cc in comuna_cols:
    _ = _.merge(comuna_lookup, how="left", left_on=cc, right_on="my_comuna_customs_id")
    # Ids without a match in the lookup table get the code 999.
    _[cc] = pd.to_numeric(_["my_comuna_datachile_id"].fillna(999), downcast="integer")
    # Drop exactly the helper columns added by the merge; a substring filter
    # on "my_" would also remove unrelated columns.
    _ = _.drop(columns=["my_comuna_customs_id", "my_comuna_datachile_id"])

In [None]:
# Change Economic Activity to numeric values
# Lookup table of economic activities; the code below uses its "id" and "es"
# columns.
econ = pd.read_csv("economic_activity.csv", sep = ";")
# The numeric id of an activity is its row index in the lookup table.
econ["num_id"] = econ.index
# Overwrite the rows at positions 22 and 23 with the special codes
# 98 ("No aplica") and 99 ("Missing"); the table must already have at least
# 24 rows for this to work.
# NOTE(review): these two num_id values are strings, while the index-derived
# ones above are presumably integers - confirm that the mix is intended.
econ.iloc[22] = pd.Series({"id": "98", "es": "No aplica", "num_id": "98"})
econ.iloc[23] = pd.Series({"id": "99", "es": "Missing", "num_id": "99"})

# Remove spaces from the census codes before matching them against econ["id"].
_["economic_activity"] = _["economic_activity"].str.replace(" ","")
# Left join: codes without a match in `econ` end up with a missing num_id.
_ = _.merge(econ, how = "left", left_on = "economic_activity", right_on = "id")
# Replace the original code with the numeric id and drop the helper columns.
_["economic_activity"] = _["num_id"]
_ = _.drop(columns = ["id", "es", "num_id"])

In [None]:
# Save the result. The output directory is created first so the export does
# not fail at the very end of the pipeline when "./data_final" is missing.
os.makedirs("./data_final", exist_ok=True)
_.to_csv("./data_final/population_census.csv", index=False)