In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import xarray as xr
import sparse
from itertools import product, combinations
from pathlib import Path
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib

pd.options.display.max_rows = 500
#pd.options.display.max_columns = 4000

import sys
sys.path.append('../src/')

# import categories as cats
# import constraints

In [2]:
from extended_survey import process_people_df
from census import process_census
from constraints import get_ind_const

from setup_lin_system import setup_ls, check_solvable, find_conf_const

In [3]:
# Constraint definitions consumed by setup_ls further down.
# NOTE(review): presumably these are the individual-level (person) constraints,
# judging by the helper's name — confirm in src/constraints.py.
constraints_ind = get_ind_const()

In [4]:
# Input locations, all relative to the notebook's working directory.
# personas_path is derived from survey_dir so the survey folder is spelled
# out only once (it used to be duplicated in both literals).
survey_dir = Path('../data/cuestionario_ampliado/Censo2020_CA_nl_csv/')
personas_path = survey_dir / 'Personas19.CSV'
census_iter_path = Path('../data/census_loc/ITER_19CSV20.csv')
census_resageburb_path = Path('../data/census_ageb_manz/RESAGEBURB_19CSV20.csv')

In [5]:
%%time
# Load and preprocess the extended-questionnaire person records.
# NOTE(review): the "_cat" suffix suggests the columns come back categorised —
# confirm against process_people_df in src/extended_survey.py.
personas_cat = process_people_df(personas_path)

CPU times: user 9.44 s, sys: 444 ms, total: 9.89 s
Wall time: 9.9 s


In [6]:
%%time
# process_census returns nine tables: a base table plus *_min / *_max
# companions for each of three levels (mun, loc, agebs).
# NOTE(review): judging by the names these are municipality, locality and AGEB
# tables, and the _min/_max variants presumably bound the redacted ('*')
# census counts — confirm in src/census.py.
(
    df_mun, df_mun_min, df_mun_max,
    df_loc, df_loc_min, df_loc_max,
    df_agebs, df_agebs_min, df_agebs_max
) = process_census(census_iter_path, census_resageburb_path)

CPU times: user 10.5 s, sys: 1.15 s, total: 11.7 s
Wall time: 11.7 s


In [59]:
%%time
# Build the linear systems from the survey records, the municipality-level
# census table and the constraint definitions. The result is indexed by
# municipality name and each entry exposes the keys 'X', 'W' and 'C'
# (see the 'Cerralvo' lookup further down).
XWC_dict = setup_ls(personas_cat, df_mun, constraints_ind)

Cerralvo
    X has 2879 entries.
    Filling zeroes ...
    X has 2880 entries.
    Solving conflicts ...
    X has 2883 entries.
CPU times: user 23.4 s, sys: 737 ms, total: 24.2 s
Wall time: 20.8 s


## Different types of solutions

In [16]:
import scipy.linalg as spl
from scipy.optimize import nnls

In [103]:
# Pull the X / W / C pieces of one municipality's system out of XWC_dict
# and display whether check_solvable accepts that (W, C) pair.
mun = 'Cerralvo'
X2, W2, C2 = (XWC_dict[mun][part] for part in ('X', 'W', 'C'))

check_solvable(W2, C2)

True

In [106]:
# Compare the dimensions of the two constraint matrices.
# NOTE(review): `W` (and `C`, `X` used in later cells) is not assigned in any
# visible cell — it only exists as leftover kernel state, so this cell fails
# after Restart & Run All. Confirm where W was meant to come from.
W.shape, W2.shape

((97, 2882), (183, 2882))

In [90]:
# Non negative least squares solution
# scipy's nnls minimises ||W x - C||_2 subject to x >= 0 and returns
# (solution, residual norm); the second value is bound to `err` but not used.
# The displayed tuple is (L1 norm of the residual W x - C, L2 norm of x).
# NOTE(review): `W` and `C` are not defined in any visible cell — confirm origin.

X_sol_nn, err = nnls(W, C)
np.linalg.norm(W.values @ X_sol_nn - C.values, ord=1), np.linalg.norm(X_sol_nn)

(6.67119692820961e-11, 1280.5252064123974)

In [91]:
# Non-negative least squares on the (W2, C2) system; same call and same
# diagnostics as the NNLS cell for (W, C): the displayed tuple is
# (L1 norm of the residual W2 x - C2, L2 norm of x). `err` is overwritten here.
X_sol_nn2, err = nnls(W2, C2)
np.linalg.norm(W2.values @ X_sol_nn2 - C2.values, ord=1), np.linalg.norm(X_sol_nn2)

(9.615463980594541e-11, 1091.7601945594413)

In [100]:
# Least squares solution
# Unconstrained solve through the Moore-Penrose pseudo-inverse of W, which
# yields the minimum-norm least-squares solution (entries may be negative).
# The displayed tuple is (L1 norm of the residual W x - C, L2 norm of x).
# NOTE(review): `W` and `C` are not defined in any visible cell — confirm origin.

W_inv = spl.pinv(W)
X_sol = W_inv @ C.values
np.linalg.norm(W.values @ X_sol - C.values, ord=1), np.linalg.norm(X_sol)

(2.957613265408554e-09, 174.80568107488986)

In [104]:
# Least squares solution
# Same pseudo-inverse solve, now for the (W2, C2) system.
# NOTE(review): this rebinds W_inv, so after this cell W_inv no longer refers
# to the pseudo-inverse of W — re-running cells out of order mixes the two.

W_inv = spl.pinv(W2)
X_sol2 = W_inv @ C2.values
np.linalg.norm(W2.values @ X_sol2 - C2.values, ord=1), np.linalg.norm(X_sol2)

(1.2072849386868256e-09, 174.805681074888)

In [105]:
# Euclidean distance between the two pseudo-inverse solutions; a value near
# zero means both systems lead to the same minimum-norm solution.
np.linalg.norm(X_sol2 - X_sol)

5.845229258204539e-10

In [74]:
from ipf import ipf_classic_numba

In [107]:
%%time
# The IPF solution
# The starting vector is the FACTOR column of X cast to float; W is cast to
# float as well before being handed to the numba routine.
# NOTE(review): `X`, `W` and `C` are not defined in any visible cell.
# NOTE(review): max_iters is passed as the float 1e4 — confirm that
# ipf_classic_numba accepts a float iteration limit.
# NOTE(review): the recorded run returned err ~ 44.5, far above tol=1e-1, so
# it presumably stopped at max_iters without converging — confirm.
x = X.FACTOR.values.astype(float)
x_ipf, err = ipf_classic_numba(x, W.values.astype(float), C.values, tol=1e-1, max_iters=1e4)
err, np.linalg.norm(x_ipf)

CPU times: user 7.97 s, sys: 791 ms, total: 8.76 s
Wall time: 6.36 s


(44.4800477607044, 205.72287053844428)

In [115]:
%%time
# The IPF solution
# Same routine on the (X2, W2, C2) system, with a tighter tolerance (1e-3).
# The starting vector is the FACTOR column of X2 cast to float.
# NOTE(review): `x` and `err` are rebound here, overwriting the values left
# by the IPF cell for (X, W, C).
x = X2.FACTOR.values.astype(float)
x_ipf2, err = ipf_classic_numba(x, W2.values.astype(float), C2.values, tol=1e-3, max_iters=1e4)
err, np.linalg.norm(x_ipf2)

CPU times: user 1.21 s, sys: 200 ms, total: 1.41 s
Wall time: 1.15 s


(0.0009983262271617832, 206.0888458338842)

In [681]:
# The max ent solution

### OD Survey

In [96]:
# Load the cleaned origin-destination survey. low_memory=False makes pandas
# parse the file in a single pass instead of chunk by chunk.
# The file carries a saved index as its first column, which surfaces as
# 'Unnamed: 0' in the head() output below.
od_df = pd.read_csv('../data/OD_Survey/datos_limpios_tiempos.csv', low_memory=False)

In [101]:
# Preview the first rows of the OD survey.
# NOTE(review): the recorded output exposes interviewer and supervisor names,
# street addresses and household coordinates — clear or redact this output
# before sharing or committing the notebook.
od_df.head()

Unnamed: 0,ID-HOGAR,H-P,H-P-V,Latitud,Longitud,FechaHoraEnc,NumVisita,TipoEnc,RealizoEnc,Encuestador,Supervisor,Cod_MunDomicilio,ColDom,CalleDom,NExtDom,NIntDom,RefDom,CPDom,Punto_zona,Cod_EdoDomicilio,LineaTelef,VHAuto,VHMoto,VHPickup,VHCamion,VHBici,VHPatineta,VHPatines,VHScooter,VHOtro,CHBaños,CHDormitorios,Internet,Hab14masTrabajo,HabitantesTotal,HbitantesMayor6,HbitantesMenor5,TodosEstan,NunHabitante,Género,Edad,RelaciónHogar,RelaciónHogar_O,Discapacidad,Discapacidad_O,Estudios,Estudios_O,Ocupacion,Ocupacion_O,SectorEconom,SectorEconom_O,ViajeAyer,Num_Viaje,Lugar_Or,Cod_MunOri,ColOri,RefOri,CalleOri,Esquina_Ori,Cruce_Ori,OtroEstadoOri,OtroEstadoOri_O,CodOri,Cod_EdoOri,Macrozona Origen,ZonaOri,Hora Inicio V,LugarDest,Cod_MunDest,ColDest,RefDest,CalleDest,Esquina_Dest,Cruce_Dest,OtroEstadoDest,OtroEstadoDest_O,CodDest,Cod_EdoDest,Macrozona Destino,ZonaDest,Hora Término Viaje,Cod_IDEdoDest,Cod_IDMunDest,Cod_IDLocDest,Cod_LocDest,Cod_IDColDest,Cod_ColDest,Cod_IDRefDest,Cod_RefDest,Tiempo Tot de 
Viaje,Motivo,Motivo_O,M1_TipoTransp,M1_Transp_O,M1Tpo_Caminata,M1N_Ruta,M1_HHTpoParada,M1_MMTpoParada,M1_HHTpoAbordo,M1_HHTpoAbordo_O,M1_MMTpoAbordo,M1_Pago,M2_Transp,M2_Transp_O,M2_TipoTransp,M2Tpo_Caminata,M2N_Ruta,M2_HHTpoParada,M2_MMTpoParada,M2_TpoTranspordo,M2_HHTpoAbordo,M2_HHTpoAbordo_O,M2_MMTpoAbordo,M2_Pago,M3_Transp,M3_Transp_O,M3_TipoTransp,M3Tpo_Caminata,M3N_Ruta,M3_HHTpoParada,M3_MMTpoParada,M3_TpoTranspordo,M3_HHTpoAbordo,M3_HHTpoAbordo_O,M3_MMTpoAbordo,M3_Pago,M4_Transp,M4_Transp_O,M4_TipoTransp,M4Tpo_Caminata,M4N_Ruta,M4_HHTpoParada,M4_MMTpoParada,M4_TpoTranspordo,M4_HHTpoAbordo,M4_HHTpoAbordo_O,M4_MMTpoAbordo,M4_Pago,M5_Transp,M5_Transp_O,M5_TipoTransp,M5Tpo_Caminata,M5N_Ruta,M5_HHTpoParada,M5_MMTpoParada,M5_TpoTranspordo,M5_HHTpoAbordo,M5_HHTpoAbordo_O,M5_MMTpoAbordo,M5_Pago,M6_Transp,M6_Transp_O,M6_TipoTransp,M6Tpo_Caminata,M6N_Ruta,M6_HHTpoParada,M6_MMTpoParada,M6_TpoTranspordo,M6_HHTpoAbordo,M6_HHTpoAbordo_O,M6_MMTpoAbordo,M6_Pago,M7_Transp,M7_Transp_O,M7_TipoTransp,M7Tpo_Caminata,M7N_Ruta,M7_HHTpoParada,M7_MMTpoParada,M7_TpoTranspordo,M7_HHTpoAbordo,M7_HHTpoAbordo_O,M7_MMTpoAbordo,M7_Pago,M8_Transp,M8_Transp_O,M8_TipoTransp,M8Tpo_Caminata,M8N_Ruta,M8_HHTpoParada,M8_MMTpoParada,M8_TpoTranspordo,M8_HHTpoAbordo,M8_HHTpoAbordo_O,M8_MMTpoAbordo,M8_Pago,TipoEstacionamiento,TpoBusqueda,TpoEstacionadoHH,TpoEstacionadoMM,CostoEstacionamiento,Obs_Encuestador,Obs_ENCUESTA,FE,Modo Agrupado,motivos,genero,estudios,disc,origen,Tiempo,tiempo_s,tiempo_m,tiempo_h
0,23853a-20,23853a-20/2,23853a-20/2-2,25.765088,-100.406374,2019-09-24T00:00:00Z,Visita 1,HABITANTE,,Lucia Hernández Mondragón,Patricia Baena,Monterrey,Barrio Estrella Elit,lucero,112,,,64102.0,86,Nuevo Leon,Si,2.0,,,,,,,,,1.0,2.0,Si,4.0,4.0,4.0,,,2,Mujer,40,Madre/Esposa,,Ninguna,,Sin Instrucción,,Ama de casa,,Otro,,Sí,2,El DESTINO de viaje inmediato anterior,Monterrey,,,,,,,,10358.0,Nuevo Leon,19.0,76,1899-12-31T18:50:00Z,El HOGAR,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T19:00:00Z,19.0,39.0,,,34979.0,BARRIO ESTRELLA,,,1899-12-31T00:10:00Z,regreso a casa,,Vehículo Particular,,0.0,,0.0,0,0,0.0,10,,No utilizó otro medio de transporte,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,"Propio (Casa, oficina)",,11.0,,,,,1282.883061,automovil,3,F,Bajo,No,NA+NA,,600,10,0.166667
1,23853a-20,23853a-20/4,23853a-20/4-2,25.765088,-100.406374,2019-09-24T00:00:00Z,Visita 1,VIAJE,,Lucia Hernández Mondragón,Patricia Baena,Monterrey,Barrio Estrella Elit,lucero,112,,,64102.0,86,Nuevo Leon,Si,2.0,,,,,,,,,1.0,2.0,Si,4.0,4.0,4.0,,,4,Hombre,7,Hijo(a),,Ninguna,,Primaria o Secundaria,,Estudiante,,Otro,,Sí,2,El DESTINO de viaje inmediato anterior,Monterrey,,,,,,,,10364.0,Nuevo Leon,17.0,86,1899-12-31T12:35:00Z,El HOGAR,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T12:45:00Z,19.0,39.0,,,34979.0,BARRIO ESTRELLA,,,1899-12-31T00:10:00Z,regreso a casa,,Otro Modo,,0.0,,0.0,0,0,0.0,10,,No utilizó otro medio de transporte,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,1282.883061,automovil,3,H,Bajo,No,NA+NA,,600,10,0.166667
2,23853a-20,23853a-20/4,23853a-20/4-1,25.765088,-100.406374,2019-09-24T00:00:00Z,Visita 1,VIAJE,,Lucia Hernández Mondragón,Patricia Baena,Monterrey,Barrio Estrella Elit,lucero,112,,,64102.0,86,Nuevo Leon,Si,2.0,,,,,,,,,1.0,2.0,Si,4.0,4.0,4.0,,,4,Hombre,7,Hijo(a),,Ninguna,,Primaria o Secundaria,,Estudiante,,Otro,,Sí,1,El HOGAR,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T07:40:00Z,En OTRO LUGAR,Monterrey,barrio estrella elite,esc Prim Guadalupe chapa,,,,,,10364.0,Nuevo Leon,17.0,86,1899-12-31T07:50:00Z,19.0,39.0,,,,BARRIO ESTRELLA,,,1899-12-31T00:10:00Z,estudio,,Otro Modo,,0.0,,0.0,0,0,0.0,10,,No utilizó otro medio de transporte,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,999.649138,automovil,2,H,Bajo,No,NA+NA,,600,10,0.166667
3,23853a-20,23853a-20/3,23853a-20/3-2,25.765088,-100.406374,2019-09-24T00:00:00Z,Visita 1,VIAJE,,Lucia Hernández Mondragón,Patricia Baena,Monterrey,Barrio Estrella Elit,lucero,112,,,64102.0,86,Nuevo Leon,Si,2.0,,,,,,,,,1.0,2.0,Si,4.0,4.0,4.0,,,3,Hombre,19,Hijo(a),,Ninguna,,Licenciatura,,Estudiante,,Otro,,Sí,2,El DESTINO de viaje inmediato anterior,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T14:15:00Z,El HOGAR,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T15:00:00Z,19.0,39.0,,,34979.0,BARRIO ESTRELLA,,,1899-12-31T00:45:00Z,regreso a casa,,Otro Modo,,0.0,,0.0,0,0,0.0,45,,No utilizó otro medio de transporte,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,1282.883061,automovil,3,H,Alto,No,NA+NA,,2700,45,0.75
4,23853a-20,23853a-20/3,23853a-20/3-1,25.765088,-100.406374,2019-09-24T00:00:00Z,Visita 1,VIAJE,,Lucia Hernández Mondragón,Patricia Baena,Monterrey,Barrio Estrella Elit,lucero,112,,,64102.0,86,Nuevo Leon,Si,2.0,,,,,,,,,1.0,2.0,Si,4.0,4.0,4.0,,,3,Hombre,19,Hijo(a),,Ninguna,,Licenciatura,,Estudiante,,Otro,,Sí,1,El HOGAR,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T10:15:00Z,En OTRO LUGAR,Monterrey,,,,,,,,2249.0,Nuevo Leon,17.0,86,1899-12-31T11:00:00Z,19.0,39.0,,,34979.0,BARRIO ESTRELLA,,,1899-12-31T00:45:00Z,estudio,,Otro Modo,,0.0,,0.0,0,0,0.0,45,,No utilizó otro medio de transporte,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,"Propio (Casa, oficina)",,11.0,,,Pasan por el un amigo,,1282.883061,automovil,2,H,Alto,No,NA+NA,,2700,45,0.75


### Determining household structure

In [147]:
# Household lookup: one entry per ID_VIV holding the rows of its members,
# restricted to the identification, sex, age and kinship columns.
# NOTE(review): `personas` is not created in any visible cell (only
# `personas_cat` is) — confirm which frame is meant.
viv_per_dict = {
    id_viv: members[['ID_PERSONA', 'NUMPER', 'SEXO', 'EDAD', 'PARENTESCO', 'IDENT_MADRE', 'IDENT_PADRE']]
    for id_viv, members in personas.groupby('ID_VIV')
}

In [148]:
# Inspect one household, with its members ordered by NUMPER.
viv_per_dict[190010000001].sort_values('NUMPER')

Unnamed: 0,ID_PERSONA,NUMPER,SEXO,EDAD,PARENTESCO,IDENT_MADRE,IDENT_PADRE
4,19001000000100005,1,3,30,101,96,96
1,19001000000100002,2,1,36,201,96,96
0,19001000000100001,3,3,9,301,1,96
3,19001000000100004,4,3,6,301,1,96
2,19001000000100003,5,1,1,301,1,2


### Seed from group counts

In [None]:
# Seed for whole met area

In [None]:
# Seed per municipality

In [None]:
# Comparing seeds from municipality and whole area

### Seed using bayesian networks

### Comparing both seeds

# Reconstrucción de microdatos para el censo 2020, nivel AGEB

El objetivo es obtener una base de microdatos para cada AGEB del censo 2020 consistente con los tabulados oficiales.
Para tal efecto, vamos a probar diferentes metodologías, comenzando con una estrategia de reconstrucción utilizando métodos de programación lineal.

Para tener una idea clara de la implementación, tomemos una AGEB específica de la zona metropolitana de Monterrey.

A nivel municipio o demarcación territorial, localidad y AGEB, cualquier indicador con menos de tres unidades aparece con asterisco
a excepción de las variables Población total (POBTOT), Total de viviendas (VIVTOT) y Total de viviendas habitadas (TVIVHAB).

Para comenzar vamos a restringir nuestros objetivos a dos variables: edad y sexo. Usemos una AGEB sin datos faltantes; más adelante ajustaremos la implementación para contemplar los datos redactados (*).

Probaremos con dos AGEBs: una será la de menor población con datos completos para las columnas que se refieren a población y sexo; la segunda, la de mayor población.

Las columnas de población en la tabla de microdatos serán discretizadas en el menor número de intervalos compatibles con las categorías del censo que puedan aprovechar las restricciones.
La ventaja de no usar una única columna de edad con valor entero es la disminución del tamaño del espacio de soluciones.
Imputar una edad específica puede hacerse en una etapa de procesamiento posterior.
Para no contar dos veces soluciones que únicamente difieren en el ordenamiento de las filas, agregaremos restricciones de ordenamiento para los valores de edad.

Las categorías que es posible obtener del censo son las siguientes, y las codificamos como enteros:
- 0-2: 0
- 3-4: 1
- 5: 2
- 6-7: 3
- 8-11: 4
- 12-14: 5
- 15-17: 6
- 18-24: 7
- 25-59: 8
- 60-64: 9
- 65-130: 10

Tener esta codificación en mente es importante al momento de definir las restricciones.

Al momento de crear los objetivos, usaremos un arreglo de variables en 2D, la tabla de microdatos.
Esto debe ser compatible con OR-Tools, por lo que será implementado como una lista de listas.
En este problema restringido a edad y sexo, necesitamos dos columnas: SEXO y EDAD.

In [40]:
def series_to_dict(series):
    """Convert a pandas Series to a plain dict, turning finite floats into ints.

    Parameters
    ----------
    series : pandas.Series
        Series whose index becomes the dict keys.

    Returns
    -------
    dict
        Same keys, in the same order, as ``series.to_dict()``. Every finite
        float value is replaced by ``int(value)`` (truncating toward zero);
        NaN, infinities and non-float values are returned untouched.
    """
    def _as_int_if_finite(value):
        # Only finite floats are cast: int(nan) raises ValueError and
        # int(inf) raises OverflowError, so both are passed through as-is.
        if isinstance(value, float) and np.isfinite(value):
            return int(value)
        return value

    return {key: _as_int_if_finite(value) for key, value in series.to_dict().items()}

## Codificando directamente la tabla de contingencia

In [43]:
from ortools.sat.python import cp_model
from itertools import product

In [44]:
class VarArraySolutionPrinter(cp_model.CpSolverSolutionCallback):
    """Count the solutions reported by the solver, optionally printing each one.

    Parameters
    ----------
    variables : iterable
        Model variables whose values are printed when ``verbose`` is true.
    verbose : bool, default False
        When true, every solution is printed as space-separated ``name=value``
        pairs on one line. Left off by default because a full enumeration can
        report hundreds of thousands of solutions.
    """

    def __init__(self, variables, verbose=False):
        cp_model.CpSolverSolutionCallback.__init__(self)
        self.__variables = variables
        self.__verbose = verbose
        self.__solution_count = 0

    def on_solution_callback(self):
        """Register one more solution; print it when verbose output is enabled."""
        self.__solution_count += 1
        if self.__verbose:
            for v in self.__variables:
                print('%s=%i' % (v, self.Value(v)), end=' ')
            print()

    def solution_count(self):
        """Return how many times the callback has been invoked so far."""
        return self.__solution_count

In [171]:
def create_variables(model, dimensions, N):
    """Create one integer variable per cell of the contingency table.

    Parameters
    ----------
    model : object exposing ``NewIntVar(lb, ub, name)``
        Model in which the variables are created.
    dimensions : dict
        Maps each dimension name to the iterable of its classes. The cells
        are the Cartesian product of those iterables, in dict order.
    N : int
        Upper bound of every variable (the lower bound is 0).

    Returns
    -------
    dict
        Maps each cell (a tuple with one class per dimension) to its
        variable; the variable is named after the tuple's string form.
    """
    return {
        cell: model.NewIntVar(0, N, str(cell))
        for cell in product(*dimensions.values())
    }

In [173]:
def add_constraint(model, ageb, dimensions, variables, c_name, c_dict, verbose=False):
    """Add the equality "sum of the selected cells == ageb[c_name]" to the model.

    Parameters
    ----------
    model : object exposing ``Add(constraint)``
        Model that receives the constraint.
    ageb : mapping
        Target totals; must contain ``c_name``.
    dimensions : dict
        Maps each dimension name to the full list of its classes.
    variables : dict
        Maps each cell tuple (one class per dimension, in ``dimensions``
        order) to its model variable.
    c_name : hashable
        Key of the target total in ``ageb``.
    c_dict : dict
        Maps a subset of the dimension names to the classes that take part
        in the constraint; dimensions left out contribute all their classes.
    verbose : bool, default False
        When true, print the constraint as a sum of variable names.

    Raises
    ------
    AssertionError
        If ``c_name`` is missing from ``ageb``, or ``c_dict`` names an
        unknown dimension or class.
    """
    assert c_name in ageb.keys()

    # Validate the selection one dimension at a time: name first, then classes.
    for dim_name, chosen in c_dict.items():
        assert dim_name in dimensions.keys()
        for cls in chosen:
            assert cls in dimensions[dim_name]

    # Restricted dimensions use the selected classes; the rest keep them all.
    selected = {
        dim_name: (c_dict[dim_name] if dim_name in c_dict.keys() else classes)
        for dim_name, classes in dimensions.items()
    }

    # Every cell in the product of the (possibly restricted) dimensions.
    c_vars = [variables[cell] for cell in product(*selected.values())]

    if verbose:
        print(' + '.join([v.Name() for v in c_vars]) + f' = {c_name}')

    model.Add(sum(c_vars) == ageb[c_name])

In [174]:
# Build the model: one integer variable per contingency-table cell, each
# bounded by the AGEB's total population, plus one equality per constraint.
# NOTE(review): `dimensions`, `ageb` and `constraints` are not defined in any
# visible cell — confirm where they are created.
model = cp_model.CpModel()

variables = create_variables(model, dimensions, ageb['POBTOT'])

for c_name, c_dict in constraints.items():
    add_constraint(model, ageb, dimensions, variables, c_name, c_dict, verbose=False)

solver = cp_model.CpSolver()
# Fixed: the callback used to be built from `x.values()`, but the variable
# dict created above is `variables`; `x` only existed as stale kernel state.
solution_printer = VarArraySolutionPrinter(list(variables.values()))
# Enumerate every feasible table instead of stopping at the first one.
solver.parameters.enumerate_all_solutions = True

status = solver.Solve(model, solution_printer)
# `status` is the raw solver status code (4 is OPTIMAL in CP-SAT);
# solver.StatusName(status) gives the readable name.
print(f'Found {solution_printer.solution_count()} solutions. Status: {status}')

Found 613872 solutions. Status: 4
