In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import scipy
import geopandas as gpd
import xarray as xr
import sparse
from itertools import product, combinations
from pathlib import Path
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib
from collections import defaultdict

pd.options.display.max_rows = 500
#pd.options.display.max_columns = 4000

import sys
sys.path.append('../src/')

from extended_survey import process_people_df, process_places_df, categorize_p, categorize_v
from census import process_census
from constraints import get_ind_const, get_viv_const

survey_dir = Path('../data/cuestionario_ampliado/Censo2020_CA_nl_csv/')
personas_path = Path('../data/cuestionario_ampliado/Censo2020_CA_nl_csv/Personas19.CSV')
viviendas_path = Path('../data/cuestionario_ampliado/Censo2020_CA_nl_csv/Viviendas19.CSV')
census_iter_path = Path('../data/census_loc/ITER_19CSV20.csv')
census_resageburb_path = Path('../data/census_ageb_manz/RESAGEBURB_19CSV20.csv')
output_path = Path('../output/')

In [4]:
personas = process_people_df(personas_path)
viviendas = process_places_df(viviendas_path)

pcat = personas[[
    'ID_PERSONA', 'ID_VIV', 'FACTOR', 'MUN',
    'SEXO', 'EDAD',
    # 'ENT_PAIS_NAC',
    # 'AFRODES',
    'DHSERSAL1', 'DHSERSAL2', 'RELIGION',
    # 'DIS_VER', 'DIS_OIR', 'DIS_CAMINAR', 'DIS',
    # 'DIS_RECORDAR', 'DIS_BANARSE', 'DIS_HABLAR', 'DIS_MENTAL',
    # 'HLENGUA',
    # 'HESPANOL',  # Global seed zero problem 
    'ASISTEN', 'NIVACAD', 'ESCOLARI', 'ALFABET',
    # 'ENT_PAIS_RES_5A',
    'SITUA_CONYUGAL', 'CONACT',
    'INGTRMEN', 'HORTRA'
]].copy()
vcat = viviendas.copy()

# Look for viviendas with NA
na_vivs_v = vcat.ID_VIV[vcat.isna().T.sum() > 0].to_list()
na_vivs_p = pcat.ID_VIV[pcat.isna().T.sum() > 0].to_list()
na_vivs = set(na_vivs_v + na_vivs_p)

# Drop NA before categorizing
pcat = pcat[~pcat.ID_VIV.isin(na_vivs)].reset_index(drop=True)
vcat = vcat[~vcat.ID_VIV.isin(na_vivs)].reset_index(drop=True)

pcat = categorize_p(pcat)
vcat = categorize_v(vcat)

# Dopr NA again after discretizing
na_vivs = pcat.ID_VIV[pcat.isna().T.sum() > 0].to_list()
pcat = pcat[~pcat.ID_VIV.isin(na_vivs)].reset_index(drop=True)
vcat = vcat[~vcat.ID_VIV.isin(na_vivs)].reset_index(drop=True)

assert pcat.isna().sum().sum() == 0
assert vcat.isna().sum().sum() == 0

# Leave only constrained columns
pcat = pcat[[
    'ID_PERSONA', 'ID_VIV', 'FACTOR',
    'MUN',
    'SEXO', 'EDAD',
    # 'ENT_PAIS_NAC',
    # 'AFRODES',
    #'RELIGION',
    # 'DIS_VER', 'DIS_OIR', 'DIS_CAMINAR',
    # 'DIS_RECORDAR', 'DIS_BANARSE', 'DIS_HABLAR', 'DIS_MENTAL',
    # 'DIS_CON', 'DIS_LIMI',
    # 'HLENGUA',
    # 'HESPANOL',
    'ASISTEN',  'EDUC',
    # 'ALFABET',
    # 'ENT_PAIS_RES_5A',
    'SITUA_CONYUGAL',
    'CONACT',
    # 'DHSERSAL_IMSS', 'DHSERSAL_ISSSTE', 'DHSERSAL_ISSSTE_E', 'DHSERSAL_P_D_M',
    # 'DHSERSAL_Popular_NGenración_SBienestar',
    # 'DHSERSAL_IMSS_Prospera/Bienestar',
    'DHSERSAL_Privado', 'DHSERSAL_Otro',
    'DHSERSAL_No afiliado', 'DHSERSAL_PUB', 'DHSERSAL_AFIL',
]]

vcat = vcat[[
    'ID_VIV', 'FACTOR', 'MUN', 'NUMPERS',
    'CLAVIVP',
    # 'PISOS',
    'CUADORM', 'TOTCUART',
    # 'ELECTRICIDAD', 'AGUA_ENTUBADA',
    # 'ABA_AGUA_ENTU',
    # 'TINACO', 'CISTERNA',
    # 'SERSAN',
    # 'CONAGUA',
    # 'DRENAJE',
    'REFRIGERADOR', 'LAVADORA', 'HORNO',
    'AUTOPROP', 'MOTOCICLETA', 'BICICLETA', 'RADIO', 'TELEVISOR',
    'COMPUTADORA', 'TELEFONO', 'CELULAR', 'INTERNET', 'SERV_TV_PAGA',
    'SERV_PEL_PAGA', 'CON_VJUEGOS',
    'JEFE_SEXO'
]]

print(pcat.shape[0]/personas.shape[0])
print(vcat.shape[0]/viviendas.shape[0])

0.95276327279856
0.9579107969282409


In [5]:
constraints_ind = get_ind_const()
constraints_viv = get_viv_const()

In [6]:
(
    df_mun, df_loc,
    df_agebs, df_agebs_min, df_agebs_max
) = process_census(census_iter_path, census_resageburb_path)

In [166]:
df_loc

Unnamed: 0_level_0,Unnamed: 1_level_0,POBTOT,POBFEM,POBMAS,P_0A2,P_0A2_F,P_0A2_M,P_3YMAS,P_3YMAS_F,P_3YMAS_M,P_5YMAS,...,PNOCUPA_M,PNOCUPA_F,P8YM_AN,P8YM_AN_M,P8YM_AN_F,P6A14NOA,P6A14NOAF,P6A14NOAM,POBCOL,TOTCOL
MUN,LOC,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1,1992,1032.0,960.0,108.0,54.0,54.0,1882.0,977.0,905.0,1814.0,...,264.0,613.0,36.0,16.0,20.0,27.0,18.0,9.0,0.0,0.0
1,8,7,,,,,,,,,,...,,,,,,,,,,
1,11,3,,,,,,,,,,...,,,,,,,,,,
1,12,3,,,,,,,,,,...,,,,,,,,,,
1,15,2,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51,161,3,,,,,,,,,,...,,,,,,,,,,
51,163,1,,,,,,,,,,...,,,,,,,,,,
51,175,6,,,,,,,,,,...,,,,,,,,,,
51,185,2,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Build matrices

In [8]:
from setup_lin_system import make_init_system, get_W

In [9]:
X, I, J, L, W, Up, Uh, U, C, Y = make_init_system(pcat, vcat, constraints_ind, constraints_viv, df_mun)

In [10]:
X.shape[0]/pcat.shape[0]

0.10308545561676379

In [11]:
C.shape

(51, 63)

In [9]:
mun_list = X.MUN.unique()
const_zeroprob_list = []
for mun in mun_list:
    mun_mask = Y.MUN == mun
    U_mun = U.loc[:, mun_mask]
    const_zeroprob_list.extend(U_mun.index[U_mun.T.sum() == 0].to_list())
set(const_zeroprob_list)

set()

In [12]:
from ortools.sat.python import cp_model

In [107]:
mun = 'Monterrey'
N_mun = int(df_mun.loc[mun, 'TVIVHAB'])
C_mun = C.loc[mun].astype(int).to_dict()

Y_mun = Y.index[Y[mun] > 0]
U_mun = U.loc[:, Y_mun]

model = cp_model.CpModel()

y_mun = {}
for var_id in Y_mun:
    y_mun[var_id] = model.NewIntVar(0, N_mun, f'{var_id}')

# Add constraints
for c_name, c_val in C_mun.items():
    coefficients = U_mun.loc[c_name][U_mun.loc[c_name] > 0]
    expressions = [y_mun[vid] for vid in coefficients.index]
    # if c_name == 'TVIVHAB': continue
    if c_name in ['POBTOT', 'TVIVHAB']:
        model.Add(cp_model.LinearExpr.WeightedSum(expressions, coefficients.values) == c_val)
    else:
        model.Add(cp_model.LinearExpr.WeightedSum(expressions, coefficients.values) >= c_val)

# Extract localities for mun
mun_dict = {mun: i+1 for i, mun in enumerate(df_mun.index)}

C_locs = df_loc.loc[mun_dict[mun], C.columns]
y_loc = {}
for i, loc in enumerate(C_locs.index):
    #if i != 12: continue
    C_loc = C_locs.loc[loc].fillna(0).astype(int).to_dict()
    N_loc = C_loc['TVIVHAB']
    assert N_loc > 0
    
    y_loc[loc] = {}
    for var_id in Y_mun:
        y_loc[loc][var_id] = model.NewIntVar(0, N_loc, f'{loc}_{var_id}')
        
    for c_name, c_val in C_loc.items():
        coefficients = U_mun.loc[c_name][U_mun.loc[c_name] > 0]
        expressions = [y_loc[loc][vid] for vid in coefficients.index]
        # if c_name == 'TVIVHAB': continue
        if c_name in ['TVIVHAB', 'POBTOT']:
             model.Add(cp_model.LinearExpr.WeightedSum(expressions, coefficients.values) == c_val)
        elif c_val > 0:
            model.Add(cp_model.LinearExpr.WeightedSum(expressions, coefficients.values) >= int(0.95*c_val))
            
# Hierarchical constraints
for var_id, var_mun in y_mun.items():
    var_locs = []
    for loc_dict in y_loc.values():
        var_locs.append(loc_dict[var_id])
    model.Add(sum(var_locs) == var_mun)
            
print(model.ModelStats())

solver = cp_model.CpSolver()
status = solver.Solve(model)
if status == cp_model.OPTIMAL or status == cp_model.FEASIBLE:
    print('Solution found.')
else:
    print("No solution found.")

satisfaction model '': (model_fingerprint: 0xa72100e55f990362)
#Variables: 38'352
  - 14'382 Booleans in [0,1]
  - 4'794 in [0,2]
  - 4'794 in [0,3]
  - 4'794 in [0,5]
  - 4'794 in [0,329082]
  - 4'794 in [0,329095]
#kLinearN: 5'011 (#terms: 583'371)
Solution found.


In [108]:
ysol = {}
for vid, yvar in y_mun.items():
    ysol[vid] = solver.Value(yvar)

In [109]:
np.abs(U_mun @ pd.Series(ysol) - C.loc[mun]).sum()

156696.0

In [112]:
np.mean(np.abs(U_mun @ pd.Series(ysol) - C.loc[mun]))/C.loc[mun].POBTOT

0.002176072748621686

In [105]:
np.abs(U_mun @ Y.loc[Y_mun, mun] - C.loc[mun]).sum()

1471593.0

In [106]:
np.abs(U_mun @ Y.loc[Y_mun, mun]*C.loc[mun].TVIVHAB/Y.loc[Y_mun, mun].sum()- C.loc[mun]).sum()

443863.98689009657

In [133]:
y_loc.keys()

dict_keys([1, 36, 41, 42, 46, 52, 53])

In [142]:
df_agebs_mun = df_agebs_min.loc[mun_dict[mun]].reset_index()

y_agebs = {}
for loc, loc_dict in y_loc.items():
    if loc not in df_agebs_mun.LOC.unique(): continue
    y_agebs[loc] = {}
    df_agebs_loc = df_agebs_min.loc[mun_dict[mun], loc].reset_index()
    for ageb in df_agebs_loc.AGEB:
        y_agebs[loc][ageb] = {}
        for varid in loc_dict:
            y_agebs[loc][ageb][varid] = model.NewIntVar(0, N_loc, f'{ageb}_{loc}_{var_id}')

In [143]:
print(model.ModelStats())

satisfaction model '': (model_fingerprint: 0xc3394c452fce0818)
#Variables: 2'329'884
  - 2'305'914 Booleans in [0,1]
  - 4'794 in [0,2]
  - 4'794 in [0,3]
  - 4'794 in [0,5]
  - 4'794 in [0,329082]
  - 4'794 in [0,329095]
#kLinearN: 5'011 (#terms: 583'371)


In [403]:
U.T.duplicated().sum()

44119