# CENSO 2017
## Sección Población

In [15]:
import numpy as np
import pandas as pd
import os
from google_drive_downloader import GoogleDriveDownloader as gdd
# pip install googledrivedownloader

pd.options.display.max_columns = 200

In [16]:
def _download_data(download=True):
    if download:
        gdd.download_file_from_google_drive(
            file_id="17wl7P6G_xVY54htNy8tfffcTDTqvzDvD",
            dest_path="./data_temp/Microdato_Censo2017-Personas.zip",
            unzip=True
        )
    return True

In [17]:
_download_data(False)
reader = pd.read_csv("./data_temp/Microdato_Censo2017-Personas.csv", sep=";", low_memory=False, chunksize=10**3)
df = pd.concat([x for x in reader], ignore_index=True)

#reader = pd.read_csv("./data_temp/Microdato_Censo2017-Personas.csv", sep=";", low_memory = False, iterator = True)
#df = reader.get_chunk(2*10**5)

In [18]:
_questions = list(filter(lambda x: x[0] == "P" and x[1].isdigit(), list(df)))
_questions_to_remove = [
    "P07", "P10PAIS_GRUPO", "P11PAIS_GRUPO",
    "P12A_TRAMO", "P12PAIS_GRUPO", 
    "P15A", "P16A_GRUPO", 
    "P17", "P21A", "P21M"
]

_drop = ["REGION", "PROVINCIA", "ZC_LOC", "ID_ZONA_LOC", "NVIV", "NHOGAR", "PERSONAN"] + _questions_to_remove
_groupby = ["COMUNA", "AREA", "DC", "ESCOLARIDAD"] + list(set(_questions) - set(_questions_to_remove))

In [19]:
q = df.drop(columns=_drop).groupby(_groupby)
_ = pd.DataFrame({
    "cant_per" : q.size()
}).reset_index()

In [20]:
questionnaire = pd.ExcelFile("https://docs.google.com/spreadsheets/d/e/2PACX-1vQ4xZxDpyDY4NursNbrsWlCqjREdmBbfC1EMlz4UGQe7M8wMA7Mqw8tZbAcBdkjgBzloyQdcnhiv10C/pub?output=xlsx")
labels = pd.read_excel(questionnaire, "Labels")

In [21]:
_ = _.rename(columns = { 
    key: labels[labels["id"] == key].iloc[0]["label"] for key in _groupby
})

In [22]:
geo_datachile_id = pd.read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQNfVYuzDbyPvvZkaY61MJBZ9nVkZqxD0RlsBarAv8fyqHzjdyhCQ1FPTLayNPhzR_7yUHTmR94pIl3/pub?gid=0&single=true&output=csv")
geo_datachile_id = geo_datachile_id.drop(columns=["region_id", "region_name", "comuna_tax_office_id"])

geo_datachile_id = geo_datachile_id.fillna(0).astype({"comuna_census_id": "int"})
geo_datachile_id = geo_datachile_id.rename(columns={"comuna_census_id": "comuna_id"})

In [23]:
_ = pd.merge(_, geo_datachile_id, on="comuna_id", how="inner")
_ = _.drop(columns=["comuna_id", "comuna_name"])

In [24]:
 # Convert Sex to Datachile Format
# Change Sex (1 Hombre) to (2 Hombre)

_["sex"] = _["sex"].map({1: 2, 2: 1})

In [25]:
aboriginal_people = []
for (a, b) in list(zip(_["native_list"],_["native_list_other"])):
    
    aboriginal_people_id = a

    if b > 20 and b < 98:
        aboriginal_people_id = b
    elif b in [3, 4, 5, 6]:
        aboriginal_people_id = 10 + b
    
    aboriginal_people.append(aboriginal_people_id)

_["aboriginal_people"] = aboriginal_people

In [26]:
# Unused cols
_ = _.drop(columns=["native_list", "native_list_other"])

In [30]:
# Change Census country IDs for Datachile IDs
country_cols = ["residence_country", "residence_country_5_years_ago", "birth_country"]
geo_countries = pd.read_csv("datachile_census_country_id.csv")

_ = _.merge(geo_countries, how = "left", left_on = "birth_country", right_on = "census_country_code")
print(_.head())
#for cc in country_cols:
#    _ = _.merge(geo_countries, how = "left", left_on = cc, right_on = "census_country_code")
#    _[cc] = pd.to_numeric(_["country_code"].fillna(256), downcast = "integer")
#    _ = _.drop(columns = ["census_country_code", "country_name", "country_code"])
    
_["comuna_customs_id"] = pd.to_numeric(_["comuna_customs_id"], downcast = "integer", errors = "coerce")

   area_id  district_id  scholarship  birth_place  residence_country  \
0        1            1            0            1                998   
1        1            1            0            1                998   
2        1            1            0            1                998   
3        1            1            0            1                998   
4        1            1            0            1                998   

   highest_course_approved  children_born_alive  age  residence_comuna  \
0                        0                    0   67                98   
1                        0                   98    0                98   
2                        0                   98    0                98   
3                        0                   98    0                98   
4                        0                   98    1                98   

   residence_5_years_ago  residence_comuna_5_years_ago  habitual_residence  \
0                      2                    

In [None]:
# Save changes
pd.DataFrame(_).to_csv("./data_final/population_census.csv", index=False)

In [None]:
_ = pd.read_csv("./data_final/population_census.csv")
_.head(10)