In [792]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import scipy
import geopandas as gpd
import xarray as xr
import sparse
from itertools import product, combinations
from pathlib import Path
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib

pd.options.display.max_rows = 500
#pd.options.display.max_columns = 4000

import sys
sys.path.append('../src/')

# import categories as cats
# import constraints

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from extended_survey import process_people_df, process_places_df
from census import process_census
from constraints import get_ind_const, get_viv_const

from setup_lin_system import setup_ls, check_solvable, find_conf_const

In [3]:
constraints_ind = get_ind_const()
constraints_viv = get_viv_const()

In [4]:
# Input locations, all relative to the notebook's working directory.
# Both survey CSVs live in the same folder, so their paths are derived from
# survey_dir instead of repeating the directory literal.
survey_dir = Path('../data/cuestionario_ampliado/Censo2020_CA_nl_csv/')
personas_path = survey_dir / 'Personas19.CSV'
viviendas_path = survey_dir / 'Viviendas19.CSV'

# Census tabulations passed to process_census.
census_iter_path = Path('../data/census_loc/ITER_19CSV20.csv')
census_resageburb_path = Path('../data/census_ageb_manz/RESAGEBURB_19CSV20.csv')

In [5]:
%%time
viviendas_cat = process_places_df(viviendas_path)
viviendas_full = pd.concat(viviendas_cat.values())

CPU times: user 708 ms, sys: 102 ms, total: 810 ms
Wall time: 810 ms


In [6]:
%%time
personas_cat = process_people_df(personas_path)
personas_full = pd.concat(personas_cat.values())

CPU times: user 9.97 s, sys: 476 ms, total: 10.4 s
Wall time: 10.5 s


In [7]:
%%time
(
    df_mun, df_mun_min, df_mun_max,
    df_loc, df_loc_min, df_loc_max,
    df_agebs, df_agebs_min, df_agebs_max
) = process_census(census_iter_path, census_resageburb_path)

CPU times: user 10.8 s, sys: 1.24 s, total: 12.1 s
Wall time: 12.1 s


In [8]:
#%%time
#XWC_dict = setup_ls(personas_cat, df_mun, constraints_ind)

In [962]:
from setup_lin_system import get_X, get_X_I, get_W, find_zero_nozero_const, find_nonzero_zero_const, get_X_I_fast, get_matrices, fill_zero_h

In [10]:
from census import locate_collective

In [11]:
df_mun_loc = locate_collective(df_mun, df_loc)

In [12]:
for mun, n_p, n_v, df in df_mun_loc:
    print(mun, n_p, n_v, df.index.values)

7 8.0 1.0 [ 29 216 219 221 269 344]
9 1873.0 1.0 [620]
31 10.0 1.0 [160 298]
49 4.0 1.0 [ 22  57  58  67  77 127 128 138 139 149 180 196 197 206 209 233 236 254
 272 278 289 295 296]


In [415]:
personas_census_controlled = [
    'SEXO', 'EDAD', 'ENT_PAIS_NAC', 'AFRODES', 'DHSERSAL', 'RELIGION', 'DIS',
    'HLENGUA', 'ASISTEN', 'NIVACAD', 'ALFABET', 'ENT_PAIS_RES_5A', 'SITUA_CONYUGAL',
    'CONACT',
]

# Person-level survey columns listed separately from personas_census_controlled.
# The MED_TRASLADO_* names are one column per transport mode (school / work).
personas_extra = [
    # Household related
    'CLAVIVP', 'PARENTESCO',

    # Work related
    'INGTRMEN', 'OCUPACION_C', 'SITTRA', 'HORTRA', 'ACTIVIDADES_C',
    'AGUINALDO', 'VACACIONES', 'SERVICIO_MEDICO', 'UTILIDADES', 'INCAP_SUELDO', 'SAR_AFORE', 'CREDITO_VIVIENDA',

    # Misc
    'NACIONALIDAD', 'SERSALUD', 'ELENGUA', 'PERTE_INDIGENA',

    # Transport related
    'ENT_PAIS_ASI',
    'MED_TRASLADO_ESC_Automóvil o camioneta',
    'MED_TRASLADO_ESC_Bicicleta',
    'MED_TRASLADO_ESC_Blanco por pase',
    'MED_TRASLADO_ESC_Caminando',
    'MED_TRASLADO_ESC_Camión, autobús, combi, colectivo',
    'MED_TRASLADO_ESC_Metro, tren ligero, tren suburbano',
    'MED_TRASLADO_ESC_Metrobús (autobús en carril confinado)',
    'MED_TRASLADO_ESC_Motocicleta o motoneta',
    'MED_TRASLADO_ESC_No especificado',
    'MED_TRASLADO_ESC_Otro',
    'MED_TRASLADO_ESC_Taxi (App Internet)',
    'MED_TRASLADO_ESC_Taxi (sitio, calle, otro)',
    'MED_TRASLADO_ESC_Transporte escolar',

    'ENT_PAIS_TRAB',
    'MED_TRASLADO_TRAB_Automóvil o camioneta',
    'MED_TRASLADO_TRAB_Bicicleta',
    'MED_TRASLADO_TRAB_Blanco por pase',
    'MED_TRASLADO_TRAB_Caminando',
    'MED_TRASLADO_TRAB_Camión, autobús, combi, colectivo',
    'MED_TRASLADO_TRAB_Metro, tren ligero, tren suburbano',
    'MED_TRASLADO_TRAB_Metrobús (autobús en carril confinado)',
    'MED_TRASLADO_TRAB_Motocicleta o motoneta',
    'MED_TRASLADO_TRAB_No especificado',
    'MED_TRASLADO_TRAB_Otro',
    'MED_TRASLADO_TRAB_Taxi (App Internet)',
    'MED_TRASLADO_TRAB_Taxi (sitio, calle, otro)',
    'MED_TRASLADO_TRAB_Transporte escolar',
]

# This should change when cloning across municipalities
# Person-level columns tied to a location; the trailing notes sketch how each
# one is meant to be rewritten when a record is reused in another municipality.
spatial_cols_per = [
    # NOTE(review): the rule on the next line mentions MUN_RES_5A although it is
    # attached to MUN_ASI, and it is identical to the MUN_RES_5A rule further
    # down — looks copy-pasted; confirm the intended condition for MUN_ASI.
    # NOTE(review): "BBP" here vs "BPP" on TIE_TRASLADO_ESCU — presumably both
    # mean 'Blanco por pase'; confirm and unify the spelling.
    'MUN_ASI', # If MUN_RES_5A == MUN, change to MUN, else NE. Keep BBP, NE, OtroPais, OtraEnt
    'TIE_TRASLADO_ESCU', # Keep BPP, no se traslada, NE, if not change to NE for imputation

    # NOTE(review): no rewrite rule is written for the two work-related columns;
    # presumably they mirror the school-related rules above — confirm.
    'MUN_TRAB',
    'TIE_TRASLADO_TRAB',
    
    'MUN_RES_5A', # If MUN_RES_5A == MUN, change to MUN, else NE. Keep BBP, NE, OtroPais, OtraEnt
    'MUN', # Change to current mun
]

In [414]:
viviendas_census_controlled = [
    'PISOS', 'ELECTRICIDAD', 'AGUA_ENTUBADA', 'ABA_AGUA_ENTU',
    'TINACO', 'CISTERNA', 'SERSAN', 'CONAGUA', 'DRENAJE',
    
    'REFRIGERADOR', 'LAVADORA', 'HORNO',
    'RADIO', 'TELEVISOR', 'COMPUTADORA', 'TELEFONO', 'CELULAR',
    'INTERNET', 'SERV_TV_PAGA', 'SERV_PEL_PAGA', 'CON_VJUEGOS',
    
    'AUTOPROP', 'MOTOCICLETA', 'BICICLETA',
    
    'CUADORM', 'TOTCUART',
    
    'JEFE_SEXO'
]

viviendas_extra = [
    'CLAVIVP', 'NUMPERS', 'TIPOHOG', 'JEFE_EDAD',
    
    'INGTRHOG',
    
    'PAREDES', 'TECHOS', 'COCINA', 'LUG_COC', 'COMBUSTIBLE',
    'ESTUFA', 'FOCOS', 'FOCOS_AHORRA', 'ABA_AGUA_NO_ENTU', 'BOMBA_AGUA',
    'REGADERA', 'BOILER', 'CALENTADOR_SOLAR', 'AIRE_ACON', 'PANEL_SOLAR',
    'USOEXC', 'SEPARACION1', 'SEPARACION2', 'SEPARACION3', 'SEPARACION4', 
    'DESTINO_BAS', 'TENENCIA', 'ESCRITURAS', 'FORMA_ADQUI', 
    
    'FINANCIAMIENTO1', 'FINANCIAMIENTO2', 'FINANCIAMIENTO3', 'DEUDA',
    
    'MCONMIG', 'MNUMPERS',
    
    'INGR_PEROTROPAIS', 'INGR_PERDENTPAIS', 'INGR_AYUGOB', 'INGR_JUBPEN',
    
    'ALIMENTACION', 'ALIM_ADL1', 'ALIM_ADL2',
    'ING_ALIM_ADL1', 'ING_ALIM_ADL2', 'ING_ALIM_ADL3'
]

# This should change when cloning across municipalities
spatial_cols_viv = [
    'MUN'
]

In [None]:
# Now, what about conflicting constraints?
# We can use the nnls solution to add households sparsely
# But the weights?

# Sketch of the solution...
# 1. Find conflicting constraints
# 2. Identify households and people dataframes among all municipalities as in the zero constraints solution
# 3. Add an identifier column, extra = 1
# 4. Merge with households involved in the constraints already in mun_dest
# 5. Find the nnls solution.
# 6. Keep the non-zero rows of new households.
# 7. Add each a weight of 1 (extend sample). Make sure there are few additions.
# 8. Assert the conflicting constraints are no more. If this is not the case, the nnls solution is no good.

In [1161]:
%%time
XWC_dict = setup_ls(personas_cat, viviendas_cat, df_mun, constraints_ind, constraints_viv)

Building initial dict ...
    Abasolo ...Done.
    Agualeguas ...Done.
    Los Aldamas ...Done.
    Allende ...Done.
    Anáhuac ...Done.
    Apodaca ...Done.
    Aramberri ...Done.
    Bustamante ...Done.
    Cadereyta Jiménez ...Done.
    El Carmen ...Done.
    Cerralvo ...Done.
    Ciénega de Flores ...Done.
    China ...Done.
    Doctor Arroyo ...Done.
    Doctor Coss ...Done.
    Doctor González ...Done.
    Galeana ...Done.
    García ...Done.
    San Pedro Garza García ...Done.
    General Bravo ...Done.
    General Escobedo ...Done.
    General Terán ...Done.
    General Treviño ...Done.
    General Zaragoza ...Done.
    General Zuazua ...Done.
    Guadalupe ...Done.
    Los Herreras ...Done.
    Higueras ...Done.
    Hualahuises ...Done.
    Iturbide ...Done.
    Juárez ...Done.
    Lampazos de Naranjo ...Done.
    Linares ...Done.
    Marín ...Done.
    Melchor Ocampo ...Done.
    Mier y Noriega ...Done.
    Mina ...Done.
    Montemorelos ...Done.
    Monterrey ...Done.
    P

In [1169]:
len(XWC_dict['Los Aldamas']['Y'])

534

In [1163]:
for mun in personas_cat.keys():
    print(mun, XWC_dict[mun]['conf_consts'])

Abasolo ['VPH_2YMASD', 'VPH_PISODT', 'VPH_PISOTI', 'VPH_1DOR']
Agualeguas []
Los Aldamas ['PCATOLICA', 'PEA_F', 'PRO_CRIEVA', 'P_12YMAS', 'PRELIG_NE', 'PSIN_RELIG', 'PE_INAC_F', 'POBTOT', 'P_12YMAS_M']
Allende ['VPH_3YMASC', 'VPH_NODREN', 'VPH_AGUAFV', 'VPH_AGUADV', 'VPH_C_ELEC', 'VPH_DRENAJ', 'VPH_1CUART', 'VPH_2CUART', 'VPH_S_ELEC']
Anáhuac ['VPH_3YMASC', 'VPH_1CUART', 'VPH_NODREN', 'VPH_AGUAFV', 'VPH_PISODT', 'VPH_PISOTI', 'VPH_AGUADV', 'VPH_2YMASD', 'VPH_C_ELEC', 'VPH_DRENAJ', 'VPH_1DOR', 'VPH_2CUART', 'VPH_S_ELEC']
Apodaca ['VPH_EXCSA', 'VPH_NDEAED', 'VPH_AGUAFV', 'VPH_PISODT', 'VPH_PISOTI', 'VPH_AGUADV', 'VPH_LETR', 'VPH_C_ELEC', 'VPH_DRENAJ', 'VPH_DSADMA', 'VPH_S_ELEC']
Aramberri ['P3YM_HLI', 'P3HLI_NE_F', 'P3HLI_HE', 'P3HLI_HE_F', 'P3YM_HLI_F', 'P5HLI_NE', 'P5_HLI_HE']
Bustamante ['PCATOLICA', 'PRO_CRIEVA', 'P_0A2', 'P12YM_CASA', 'P_0A2_F', 'P_12YMAS', 'P_3YMAS_M', 'PSIN_RELIG', 'P12YM_SEPA', 'PRELIG_NE', 'POTRAS_REL', 'P_3YMAS', 'POBFEM', 'POBTOT', 'P12YM_SOLT']
Cadereyta Jimé

## Different type of solutions

In [16]:
import scipy.linalg as spl
from scipy.optimize import nnls

In [11]:
mun = 'Cerralvo'
X2 = XWC_dict[mun]['X']
W2 = XWC_dict[mun]['W']
C2 = XWC_dict[mun]['C']

check_solvable(W2, C2)

True

In [106]:
# Display the shapes of W and W2 side by side.
# NOTE(review): W is not assigned in any visible cell (W2 is, for 'Cerralvo') —
# presumably leftover kernel state; confirm where it comes from.
W.shape, W2.shape

((97, 2882), (183, 2882))

In [90]:
# Non-negative least squares solution of W x = C.
# NOTE(review): W and C are not assigned in any visible cell (only W2 / C2 are,
# for 'Cerralvo') — presumably leftover kernel state; confirm their origin.

# nnls returns the solution vector and the residual norm (bound to err, not used below).
X_sol_nn, err = nnls(W, C)
# Displayed pair: (L1 norm of the residual W x - C, L2 norm of the solution).
np.linalg.norm(W.values @ X_sol_nn - C.values, ord=1), np.linalg.norm(X_sol_nn)

(6.67119692820961e-11, 1280.5252064123974)

In [91]:
X_sol_nn2, err = nnls(W2, C2)
np.linalg.norm(W2.values @ X_sol_nn2 - C2.values, ord=1), np.linalg.norm(X_sol_nn2)

(9.615463980594541e-11, 1091.7601945594413)

In [100]:
# Least squares solution of W x = C through the pseudo-inverse of W.
# NOTE(review): W and C are not assigned in any visible cell — presumably
# leftover kernel state; confirm their origin.
# NOTE(review): W_inv is reused for the pseudo-inverse of W2 in a later cell, so
# its meaning depends on which cell ran last.

W_inv = spl.pinv(W)
X_sol = W_inv @ C.values
# Displayed pair: (L1 norm of the residual W x - C, L2 norm of the solution).
np.linalg.norm(W.values @ X_sol - C.values, ord=1), np.linalg.norm(X_sol)

(2.957613265408554e-09, 174.80568107488986)

In [104]:
# Least squares solution

W_inv = spl.pinv(W2)
X_sol2 = W_inv @ C2.values
np.linalg.norm(W2.values @ X_sol2 - C2.values, ord=1), np.linalg.norm(X_sol2)

(1.2072849386868256e-09, 174.805681074888)

In [105]:
np.linalg.norm(X_sol2 - X_sol)

5.845229258204539e-10

In [518]:
from ipf import ipf_classic_numba

In [107]:
%%time
# The IPF solution, started from the FACTOR column of X.
# NOTE(review): X, W and C are not assigned in any visible cell (X2 / W2 / C2
# are) — presumably leftover kernel state; confirm their origin.
x = X.FACTOR.values.astype(float)
# NOTE(review): max_iters is passed as a float (1e4); confirm that
# ipf_classic_numba accepts a non-integer iteration cap.
x_ipf, err = ipf_classic_numba(x, W.values.astype(float), C.values, tol=1e-1, max_iters=1e4)
# Displayed pair: (err returned by the IPF routine, L2 norm of the fitted vector).
err, np.linalg.norm(x_ipf)

CPU times: user 7.97 s, sys: 791 ms, total: 8.76 s
Wall time: 6.36 s


(44.4800477607044, 205.72287053844428)

In [115]:
%%time
# The IPF solution
x = X2.FACTOR.values.astype(float)
x_ipf2, err = ipf_classic_numba(x, W2.values.astype(float), C2.values, tol=1e-3, max_iters=1e4)
err, np.linalg.norm(x_ipf2)

CPU times: user 1.21 s, sys: 200 ms, total: 1.41 s
Wall time: 1.15 s


(0.0009983262271617832, 206.0888458338842)

In [681]:
# The max ent solution

### OD Survey

In [96]:
od_df = pd.read_csv('../data/OD_Survey/datos_limpios_tiempos.csv', low_memory=False)

In [101]:
od_df.head()

Unnamed: 0,ID-HOGAR,H-P,H-P-V,Latitud,Longitud,FechaHoraEnc,NumVisita,TipoEnc,RealizoEnc,Encuestador,Supervisor,Cod_MunDomicilio,ColDom,CalleDom,NExtDom,NIntDom,RefDom,CPDom,Punto_zona,Cod_EdoDomicilio,LineaTelef,VHAuto,VHMoto,VHPickup,VHCamion,VHBici,VHPatineta,VHPatines,VHScooter,VHOtro,CHBaños,CHDormitorios,Internet,Hab14masTrabajo,HabitantesTotal,HbitantesMayor6,HbitantesMenor5,TodosEstan,NunHabitante,Género,Edad,RelaciónHogar,RelaciónHogar_O,Discapacidad,Discapacidad_O,Estudios,Estudios_O,Ocupacion,Ocupacion_O,SectorEconom,SectorEconom_O,ViajeAyer,Num_Viaje,Lugar_Or,Cod_MunOri,ColOri,RefOri,CalleOri,Esquina_Ori,Cruce_Ori,OtroEstadoOri,OtroEstadoOri_O,CodOri,Cod_EdoOri,Macrozona Origen,ZonaOri,Hora Inicio V,LugarDest,Cod_MunDest,ColDest,RefDest,CalleDest,Esquina_Dest,Cruce_Dest,OtroEstadoDest,OtroEstadoDest_O,CodDest,Cod_EdoDest,Macrozona Destino,ZonaDest,Hora Término Viaje,Cod_IDEdoDest,Cod_IDMunDest,Cod_IDLocDest,Cod_LocDest,Cod_IDColDest,Cod_ColDest,Cod_IDRefDest,Cod_RefDest,Tiempo Tot de 
Viaje,Motivo,Motivo_O,M1_TipoTransp,M1_Transp_O,M1Tpo_Caminata,M1N_Ruta,M1_HHTpoParada,M1_MMTpoParada,M1_HHTpoAbordo,M1_HHTpoAbordo_O,M1_MMTpoAbordo,M1_Pago,M2_Transp,M2_Transp_O,M2_TipoTransp,M2Tpo_Caminata,M2N_Ruta,M2_HHTpoParada,M2_MMTpoParada,M2_TpoTranspordo,M2_HHTpoAbordo,M2_HHTpoAbordo_O,M2_MMTpoAbordo,M2_Pago,M3_Transp,M3_Transp_O,M3_TipoTransp,M3Tpo_Caminata,M3N_Ruta,M3_HHTpoParada,M3_MMTpoParada,M3_TpoTranspordo,M3_HHTpoAbordo,M3_HHTpoAbordo_O,M3_MMTpoAbordo,M3_Pago,M4_Transp,M4_Transp_O,M4_TipoTransp,M4Tpo_Caminata,M4N_Ruta,M4_HHTpoParada,M4_MMTpoParada,M4_TpoTranspordo,M4_HHTpoAbordo,M4_HHTpoAbordo_O,M4_MMTpoAbordo,M4_Pago,M5_Transp,M5_Transp_O,M5_TipoTransp,M5Tpo_Caminata,M5N_Ruta,M5_HHTpoParada,M5_MMTpoParada,M5_TpoTranspordo,M5_HHTpoAbordo,M5_HHTpoAbordo_O,M5_MMTpoAbordo,M5_Pago,M6_Transp,M6_Transp_O,M6_TipoTransp,M6Tpo_Caminata,M6N_Ruta,M6_HHTpoParada,M6_MMTpoParada,M6_TpoTranspordo,M6_HHTpoAbordo,M6_HHTpoAbordo_O,M6_MMTpoAbordo,M6_Pago,M7_Transp,M7_Transp_O,M7_TipoTransp,M7Tpo_Caminata,M7N_Ruta,M7_HHTpoParada,M7_MMTpoParada,M7_TpoTranspordo,M7_HHTpoAbordo,M7_HHTpoAbordo_O,M7_MMTpoAbordo,M7_Pago,M8_Transp,M8_Transp_O,M8_TipoTransp,M8Tpo_Caminata,M8N_Ruta,M8_HHTpoParada,M8_MMTpoParada,M8_TpoTranspordo,M8_HHTpoAbordo,M8_HHTpoAbordo_O,M8_MMTpoAbordo,M8_Pago,TipoEstacionamiento,TpoBusqueda,TpoEstacionadoHH,TpoEstacionadoMM,CostoEstacionamiento,Obs_Encuestador,Obs_ENCUESTA,FE,Modo Agrupado,motivos,genero,estudios,disc,origen,Tiempo,tiempo_s,tiempo_m,tiempo_h
0,23853a-20,23853a-20/2,23853a-20/2-2,25.765088,-100.406374,2019-09-24T00:00:00Z,Visita 1,HABITANTE,,Lucia Hernández Mondragón,Patricia Baena,Monterrey,Barrio Estrella Elit,lucero,112,,,64102.0,86,Nuevo Leon,Si,2.0,,,,,,,,,1.0,2.0,Si,4.0,4.0,4.0,,,2,Mujer,40,Madre/Esposa,,Ninguna,,Sin Instrucción,,Ama de casa,,Otro,,Sí,2,El DESTINO de viaje inmediato anterior,Monterrey,,,,,,,,10358.0,Nuevo Leon,19.0,76,1899-12-31T18:50:00Z,El HOGAR,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T19:00:00Z,19.0,39.0,,,34979.0,BARRIO ESTRELLA,,,1899-12-31T00:10:00Z,regreso a casa,,Vehículo Particular,,0.0,,0.0,0,0,0.0,10,,No utilizó otro medio de transporte,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,"Propio (Casa, oficina)",,11.0,,,,,1282.883061,automovil,3,F,Bajo,No,NA+NA,,600,10,0.166667
1,23853a-20,23853a-20/4,23853a-20/4-2,25.765088,-100.406374,2019-09-24T00:00:00Z,Visita 1,VIAJE,,Lucia Hernández Mondragón,Patricia Baena,Monterrey,Barrio Estrella Elit,lucero,112,,,64102.0,86,Nuevo Leon,Si,2.0,,,,,,,,,1.0,2.0,Si,4.0,4.0,4.0,,,4,Hombre,7,Hijo(a),,Ninguna,,Primaria o Secundaria,,Estudiante,,Otro,,Sí,2,El DESTINO de viaje inmediato anterior,Monterrey,,,,,,,,10364.0,Nuevo Leon,17.0,86,1899-12-31T12:35:00Z,El HOGAR,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T12:45:00Z,19.0,39.0,,,34979.0,BARRIO ESTRELLA,,,1899-12-31T00:10:00Z,regreso a casa,,Otro Modo,,0.0,,0.0,0,0,0.0,10,,No utilizó otro medio de transporte,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,1282.883061,automovil,3,H,Bajo,No,NA+NA,,600,10,0.166667
2,23853a-20,23853a-20/4,23853a-20/4-1,25.765088,-100.406374,2019-09-24T00:00:00Z,Visita 1,VIAJE,,Lucia Hernández Mondragón,Patricia Baena,Monterrey,Barrio Estrella Elit,lucero,112,,,64102.0,86,Nuevo Leon,Si,2.0,,,,,,,,,1.0,2.0,Si,4.0,4.0,4.0,,,4,Hombre,7,Hijo(a),,Ninguna,,Primaria o Secundaria,,Estudiante,,Otro,,Sí,1,El HOGAR,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T07:40:00Z,En OTRO LUGAR,Monterrey,barrio estrella elite,esc Prim Guadalupe chapa,,,,,,10364.0,Nuevo Leon,17.0,86,1899-12-31T07:50:00Z,19.0,39.0,,,,BARRIO ESTRELLA,,,1899-12-31T00:10:00Z,estudio,,Otro Modo,,0.0,,0.0,0,0,0.0,10,,No utilizó otro medio de transporte,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,999.649138,automovil,2,H,Bajo,No,NA+NA,,600,10,0.166667
3,23853a-20,23853a-20/3,23853a-20/3-2,25.765088,-100.406374,2019-09-24T00:00:00Z,Visita 1,VIAJE,,Lucia Hernández Mondragón,Patricia Baena,Monterrey,Barrio Estrella Elit,lucero,112,,,64102.0,86,Nuevo Leon,Si,2.0,,,,,,,,,1.0,2.0,Si,4.0,4.0,4.0,,,3,Hombre,19,Hijo(a),,Ninguna,,Licenciatura,,Estudiante,,Otro,,Sí,2,El DESTINO de viaje inmediato anterior,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T14:15:00Z,El HOGAR,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T15:00:00Z,19.0,39.0,,,34979.0,BARRIO ESTRELLA,,,1899-12-31T00:45:00Z,regreso a casa,,Otro Modo,,0.0,,0.0,0,0,0.0,45,,No utilizó otro medio de transporte,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,1282.883061,automovil,3,H,Alto,No,NA+NA,,2700,45,0.75
4,23853a-20,23853a-20/3,23853a-20/3-1,25.765088,-100.406374,2019-09-24T00:00:00Z,Visita 1,VIAJE,,Lucia Hernández Mondragón,Patricia Baena,Monterrey,Barrio Estrella Elit,lucero,112,,,64102.0,86,Nuevo Leon,Si,2.0,,,,,,,,,1.0,2.0,Si,4.0,4.0,4.0,,,3,Hombre,19,Hijo(a),,Ninguna,,Licenciatura,,Estudiante,,Otro,,Sí,1,El HOGAR,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T10:15:00Z,En OTRO LUGAR,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T11:00:00Z,19.0,39.0,,,34979.0,BARRIO ESTRELLA,,,1899-12-31T00:45:00Z,estudio,,Otro Modo,,0.0,,0.0,0,0,0.0,45,,No utilizó otro medio de transporte,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,"Propio (Casa, oficina)",,11.0,,,Pasan por el un amigo,,1282.883061,automovil,2,H,Alto,No,NA+NA,,2700,45,0.75


### Determining household structure

In [17]:
# NOTE(review): `personas` is not assigned in any visible cell (personas_cat and
# personas_full are) — presumably leftover kernel state; confirm which frame is meant.
personas.columns

Index(['ID_PERSONA', 'ID_VIV', 'FACTOR', 'MUN', 'CLAVIVP', 'SEXO', 'EDAD',
       'ENT_PAIS_NAC', 'PARENTESCO', 'NACIONALIDAD', 'SERSALUD', 'AFRODES',
       'DHSERSAL', 'RELIGION', 'DIS', 'HLENGUA', 'ELENGUA', 'PERTE_INDIGENA',
       'ASISTEN', 'ENT_PAIS_ASI', 'MUN_ASI', 'TIE_TRASLADO_ESCU',
       'MED_TRASLADO_ESC_Automóvil o camioneta', 'MED_TRASLADO_ESC_Bicicleta',
       'MED_TRASLADO_ESC_Blanco por pase', 'MED_TRASLADO_ESC_Caminando',
       'MED_TRASLADO_ESC_Camión, autobús, combi, colectivo',
       'MED_TRASLADO_ESC_Metro, tren ligero, tren suburbano',
       'MED_TRASLADO_ESC_Metrobús (autobús en carril confinado)',
       'MED_TRASLADO_ESC_Motocicleta o motoneta',
       'MED_TRASLADO_ESC_No especificado', 'MED_TRASLADO_ESC_Otro',
       'MED_TRASLADO_ESC_Taxi (App Internet)',
       'MED_TRASLADO_ESC_Taxi (sitio, calle, otro)',
       'MED_TRASLADO_ESC_Transporte escolar', 'NIVACAD', 'ALFABET',
       'ENT_PAIS_RES_5A', 'MUN_RES_5A', 'SITUA_CONYUGAL', 'CONACT',
       'OC

In [27]:
# Map each household id (ID_VIV) to the rows of its members, keeping only the
# identifier, sex, age and relationship columns.
member_cols = ['ID_PERSONA', 'SEXO', 'EDAD', 'PARENTESCO']
viv_per_dict = {
    viv_key: members[member_cols]
    for viv_key, members in personas_full.groupby('ID_VIV')
}

In [52]:
# Distinct household signatures: for every household, the tuple of PARENTESCO
# value counts in the order value_counts() returns them.
# NOTE(review): the tuple keeps only the counts, not the relationship labels, so
# households with different relationships but equal counts share a signature —
# confirm this is intended.
household_set = {
    tuple(members.PARENTESCO.value_counts())
    for members in viv_per_dict.values()
}

In [53]:
len(household_set)

189

In [61]:
viv_per_dict[190010000001]

Unnamed: 0,ID_PERSONA,SEXO,EDAD,PARENTESCO
0,19001000000100001,F,8-11,Hija(o)
1,19001000000100002,M,25-49,Esposa(o)
2,19001000000100003,M,0-2,Hija(o)
3,19001000000100004,F,6-7,Hija(o)
4,19001000000100005,F,25-49,Jefa(e)


In [None]:
# number of workers
# child by age

In [62]:
# Frequency of each PARENTESCO value.
# NOTE(review): `personas` is not assigned in any visible cell (personas_cat and
# personas_full are) — presumably leftover kernel state; confirm which frame is meant.
personas.PARENTESCO.value_counts()

Hija(o)                                        1149
Jefa(e)                                         791
Esposa(o)                                       535
Nieta(o)                                        252
Nuera o yerno                                    69
Sobrina(o)                                       36
Hermana(o)                                       28
Madre o padre                                    18
Hijastra(o)                                      14
Cuñada(o)                                        12
Bisnieta(o) o tataranieta(o)                      7
Otros familiares                                  7
Suegra(o)                                         6
Parentesco no especificado                        4
Trabajador(a) doméstico(a)                        2
Sin parentesco                                    2
Hija(o) adoptiva(o)                               1
Esposa(o) de nieto(a)                             1
Concubina(o) o unión libre                        1
Madrastra o 

### Seed from group counts

In [None]:
# Seed for whole met area

In [None]:
# Seed per municipality

In [None]:
# Comparing seeds from municipality and whole area

### Seed using bayesian networks

### Comparing both seeds

# Reconstrucción de microdatos para el censo 2020, nivel AGEB

El objetivo es obtener una base de microdatos para cada AGEB del censo 2020 consistente con los tabulados oficiales.
Para tal efecto, vamos a probar diferentes metodologías, comenzando con una estrategia de reconstrucción utilizando métodos de programación lineal.

Para tener una idea clara de la implementación, tomemos una AGEB específica de la zona metropolitana de Monterrey.

A nivel municipio o demarcación territorial, localidad y AGEB, cualquier indicador con menos de tres unidades aparece con asterisco
a excepción de las variables Población total (POBTOT), Total de viviendas (VIVTOT) y Total de viviendas habitadas (TVIVHAB).

Para comenzar, vamos a restringir nuestros objetivos a dos variables: edad y sexo. Usemos una AGEB sin datos faltantes; más adelante ajustaremos la implementación para contemplar los datos redactados (*).

Probaremos con dos AGEBs: una será la de menor población con datos completos para las columnas que se refieren a población y sexo; la segunda, la de mayor población.

Las columnas de población en la tabla de microdatos serán discretizadas en el menor número de intervalos compatibles con las categorías del censo que puedan aprovechar las restricciones.
La ventaja de no usar una única columna de edad con valor entero es la disminución de tamaño del espacio de soluciones.
Imputar una edad específica puede hacerse en una etapa de procesamiento posterior.
Para no contar dos veces soluciones que únicamente difieren en el ordenamiento de las filas, agregaremos restricciones de ordenamiento para los valores de edad.

Las categorias posibles de obtener del censo son las siguientes y las codificamos como enteros:
- 0-2: 0
- 3-4: 1
- 5: 2
- 6-7: 3
- 8-11: 4
- 12-14: 5
- 15-17: 6
- 18-24: 7
- 25-59: 8
- 60-64: 9
- 65-130: 10

Tener esta codificación en mente es importante al momento de definir las restricciones.

Al momento de crear los objetivos, usaremos un arreglo de variables en 2D, la tabla de microdatos.
Esto debe ser compatible con OR-Tools, por lo que será implementado como una lista de listas.
En este problema restringido a edad y sexo, necesitamos dos columnas: SEXO y EDAD.

In [40]:
def series_to_dict(series):
    """Convert a Series to a plain dict, casting non-NaN floats to int.

    Parameters
    ----------
    series : pandas.Series
        Any object exposing ``to_dict()``; the returned mapping is processed.

    Returns
    -------
    dict
        Same keys, in the same order, as ``series.to_dict()``. A value that is
        a ``float`` and not NaN is replaced by ``int(value)`` (truncation
        toward zero); NaN and every non-float value are kept unchanged.

    Raises
    ------
    OverflowError
        If a value is an infinite float, since ``int()`` cannot convert it.
    """
    # Single pass over the items; no intermediate key/value lists are needed.
    return {
        key: int(val) if isinstance(val, float) and not np.isnan(val) else val
        for key, val in series.to_dict().items()
    }

## Codificando directamente la tabla de contingencia

In [43]:
from ortools.sat.python import cp_model
from itertools import product

In [44]:
class VarArraySolutionPrinter(cp_model.CpSolverSolutionCallback):
    """Solution callback that counts the solutions it is notified about.

    The variables passed to the constructor are stored, but per-solution
    printing of their values is disabled, so only the counter is maintained.
    """

    def __init__(self, variables):
        super().__init__()
        self.__solution_count = 0
        self.__variables = variables

    def on_solution_callback(self):
        """Record one more solution."""
        # Printing every stored variable via self.Value(v) is intentionally
        # left out here; only the running total is updated.
        self.__solution_count = self.__solution_count + 1

    def solution_count(self):
        """Return how many times on_solution_callback has run."""
        return self.__solution_count

In [171]:
def create_variables(model, dimensions, N):
    """Create one integer variable per combination of dimension classes.

    Parameters
    ----------
    model : object exposing ``NewIntVar(lb, ub, name)``
        The model the variables are registered on.
    dimensions : dict
        Maps each dimension name to an iterable of its classes; combinations
        follow the dict's value order.
    N : int
        Upper bound of every variable (the lower bound is 0).

    Returns
    -------
    dict
        Maps each combination tuple to the variable created for it; the
        variable's name is the string form of that tuple.
    """
    return {
        cell: model.NewIntVar(0, N, f'{cell}')
        for cell in product(*dimensions.values())
    }

In [173]:
def add_constraint(model, ageb, dimensions, variables, c_name, c_dict, verbose=False):
    """Add to the model the equality tying a group of variables to one total.

    The variables summed are those whose key uses, for every dimension named
    in ``c_dict``, one of the classes listed there, and any class for the
    remaining dimensions. Their sum is constrained to equal ``ageb[c_name]``.

    Parameters
    ----------
    model : object exposing ``Add(expr)``
    ageb : mapping
        Holds the target total under ``c_name``.
    dimensions : dict
        Dimension name -> iterable of all its classes.
    variables : dict
        Combination tuple (one class per dimension, in ``dimensions`` order)
        -> model variable.
    c_name : str
        Key of the total in ``ageb``.
    c_dict : dict
        Dimension name -> classes this constraint is restricted to.
    verbose : bool, default False
        When true, print the constraint as a sum of variable names.

    Raises
    ------
    AssertionError
        If ``c_name`` is missing from ``ageb``, or ``c_dict`` refers to an
        unknown dimension or class.
    """
    assert c_name in ageb.keys()

    # Every restricted dimension, and each class listed for it, must be known.
    for dim_name, wanted in c_dict.items():
        assert dim_name in dimensions.keys()
        for label in wanted:
            assert label in dimensions[dim_name]

    # Per dimension: the restricted classes when the constraint names that
    # dimension, otherwise the full list of classes.
    restricted = {
        dim_name: c_dict[dim_name] if dim_name in c_dict.keys() else all_labels
        for dim_name, all_labels in dimensions.items()
    }

    # Variables taking part in this constraint.
    c_vars = [variables[cell] for cell in product(*restricted.values())]

    if verbose:
        lhs = ' + '.join([v.Name() for v in c_vars])
        print(lhs + f' = {c_name}')

    model.Add(sum(c_vars) == ageb[c_name])

In [174]:
# Build the CP-SAT model: one integer variable per combination of dimension
# classes, bounded by the AGEB total population, plus one equality per entry
# of `constraints`.
# NOTE(review): `dimensions`, `ageb` and `constraints` are not defined in any
# visible cell — confirm they are created earlier in the notebook.
model = cp_model.CpModel()

variables = create_variables(model, dimensions, ageb['POBTOT'])

for c_name, c_dict in constraints.items():
    add_constraint(model, ageb, dimensions, variables, c_name, c_dict, verbose=False)

solver = cp_model.CpSolver()
# Hand the callback the variables created in this cell; no `x` is defined here,
# so referring to it would depend on stale kernel state.
solution_printer = VarArraySolutionPrinter(list(variables.values()))
# Ask the solver to report every feasible solution, not just the first one.
solver.parameters.enumerate_all_solutions = True

status = solver.Solve(model, solution_printer)
print(f'Found {solution_printer.solution_count()} solutions. Status: {status}')

Found 613872 solutions. Status: 4
