In [1]:
%load_ext autoreload
%autoreload 2

import gurobipy as gp
from gurobipy import GRB
import numpy as np
import scipy.sparse as sp
from pathlib import Path
import pandas as pd
import geopandas as gpd

import sys
sys.path.append('../src/')

In [2]:
from extended_survey import process_people_df, process_places_df, categorize_p, categorize_v

# Locations of the person-level and dwelling-level survey files
personas_path = Path('../data/cuestionario_ampliado/Censo2020_CA_nl_csv/Personas19.CSV')
viviendas_path = Path('../data/cuestionario_ampliado/Censo2020_CA_nl_csv/Viviendas19.CSV')

# Load survey data
personas = process_people_df(personas_path)
viviendas = process_places_df(viviendas_path)

# Select the subset of person columns to control for.
# They are selected BEFORE dropping NaN so that missing values in columns we
# do not use cannot discard a household.
# Alternative: impute NaN instead of dropping (e.g. with missForest).
pcat = personas[[
    'ID_PERSONA', 'ID_VIV', 'FACTOR', 'MUN',
    'SEXO', 'EDAD',
    'DHSERSAL1', 'DHSERSAL2', 'RELIGION',
    'ASISTEN', 'NIVACAD', 'ESCOLARI', 'ALFABET',
    'SITUA_CONYUGAL', 'CONACT',
    'INGTRMEN', 'HORTRA'
]].copy()
vcat = viviendas.copy()

# Drop NA values on both surveys.
# A household is discarded from BOTH tables as soon as any of its rows, in
# either the dwelling table or the people table, has a missing value.
na_vivs_v = vcat.loc[vcat.isna().any(axis=1), 'ID_VIV'].to_list()
na_vivs_p = pcat.loc[pcat.isna().any(axis=1), 'ID_VIV'].to_list()
na_vivs = set(na_vivs_v) | set(na_vivs_p)

# Drop NA before categorizing
pcat = pcat[~pcat.ID_VIV.isin(na_vivs)].reset_index(drop=True)
vcat = vcat[~vcat.ID_VIV.isin(na_vivs)].reset_index(drop=True)

# Categorize columns
pcat = categorize_p(pcat)
vcat = categorize_v(vcat)

# Categorization must not introduce new missing values
assert pcat.isna().sum().sum() == 0
assert vcat.isna().sum().sum() == 0

# Leave only categorized columns: drop the raw person columns listed below
# (presumably superseded by the output of categorize_p — TODO confirm)
pcat = pcat.drop(columns=[
    'DHSERSAL1', 'DHSERSAL2',
    'NIVACAD', 'ESCOLARI',
    'INGTRMEN', 'HORTRA',
    'DHSERSAL_IMSS', 'DHSERSAL_ISSSTE', 'DHSERSAL_ISSSTE_E',
    'DHSERSAL_P_D_M', 'DHSERSAL_Popular_NGenración_SBienestar',
    'DHSERSAL_IMSS_Prospera/Bienestar'
])

# Dwelling columns kept for the rest of the notebook
vcat = vcat[[
    'ID_VIV', 'FACTOR', 'MUN', 'NUMPERS',
    'CLAVIVP',
    'CUADORM', 'TOTCUART',
    'REFRIGERADOR', 'LAVADORA', 'HORNO',
    'AUTOPROP', 'MOTOCICLETA', 'BICICLETA', 'RADIO', 'TELEVISOR',
    'COMPUTADORA', 'TELEFONO', 'CELULAR', 'INTERNET', 'SERV_TV_PAGA',
    'SERV_PEL_PAGA', 'CON_VJUEGOS',
    'JEFE_SEXO'
]]

# Share of rows that survived the NA filter (both values are percentages;
# the second one was previously printed without its '%' sign)
print(
    f'We keep {pcat.shape[0]/personas.shape[0]*100}% '
    f'and {vcat.shape[0]/viviendas.shape[0]*100}% '
    'of the people and household datasets respectively.'
)

We keep 95.276327279856% and 95.79107969282408 of the people and household datasets respectively.


In [3]:
from constraints import get_ind_const, get_viv_const

# Fetch the person-level and household-level constraint collections
constraints_ind = get_ind_const()
constraints_viv = get_viv_const()

# Report how many constraints exist at each level
print(
    f'We have a total of {len(constraints_ind)} people level constraints '
    f'and {len(constraints_viv)} of household level constraints.'
)

We have a total of 58 people level constraints and 5 of household level constraints.


In [4]:
from census import process_census

# Census input files: the ITER table and the RESAGEBURB table
census_iter_path = Path('../data/census_loc/ITER_19CSV20.csv')
census_resageburb_path = Path('../data/census_ageb_manz/RESAGEBURB_19CSV20.csv')

# process_census returns three frames; judging by their names they hold
# municipality-, locality- and AGEB-level data — TODO confirm
df_mun, df_loc, df_agebs = process_census(census_iter_path, census_resageburb_path)

In [5]:
# Assemble the matrices of the linear system from the cleaned survey tables,
# the constraint definitions and the municipality-level census frame
from setup_lin_system import make_init_system, get_W

X, I, J, L, W, Up, Uh, U, C, Y = make_init_system(
    pcat, vcat, constraints_ind, constraints_viv, df_mun
)
U.columns.name = 'ID_VIV'

# Sanity check, one municipality at a time: no row of U may sum to zero over
# the households sampled in that municipality (presumably rows of U are
# constraints, so a zero row could never be matched — TODO confirm)
mun_list = X.MUN.unique()
const_zeroprob_list = []
for mun in mun_list:
    mun_mask = Y.MUN == mun
    U_mun = U.loc[:, mun_mask]
    row_totals = U_mun.sum(axis=1)
    const_zeroprob_list += row_totals.index[row_totals == 0].to_list()
assert not const_zeroprob_list, set(const_zeroprob_list)

In [6]:
from taz import load_taz

# Read the TAZ layer for the municipalities in mun_list.
# The result is indexed by municipality name further down (taz_dict[mun]).
# NOTE(review): unlike the other inputs this path is relative to the working
# directory, not to ../data — confirm that is intended.
taz_path = 'taz_census.gpkg'
taz_dict = load_taz(taz_path, mun_list)

In [7]:
# Reduce matrices: compress the households into distinct prototypes.
# `h_to_y` is used later to map each prototype back to its original
# household ids (its rows expose ID_VIV, Survey and rep).
# NOTE(review): `Uh` rebinds the name already returned by make_init_system;
# from this cell on it refers to the reduced matrix.
from miqp import reduce_matrices

Uh, Yh, h_to_y = reduce_matrices(U, Y)

94177 orignal households compressed into 72991 distinct prototypes.


In [None]:
# Pick one municipality and assemble the inputs for its TAZ-level models
# (no solver model is created in this cell)
mun = 'Cadereyta Jiménez'
taz_gdf = taz_dict[mun]

# Municipality-level constraint values as a {constraint name: int} dictionary
C_mun = C.loc[mun].astype(int).to_dict()

# The same constraints per TAZ, indexed by ZONA; missing values count as 0
C_taz_all = (
    taz_gdf
    .set_index('ZONA')[list(C_mun)]
    .fillna(0)
    .astype(int)
)

# Keep the entries of Yh with a positive value for this municipality and the
# matching columns of the constraint weight matrix
Y_mun = Yh.loc[Yh[mun] > 0, mun]
U_mun = Uh.loc[:, Y_mun.index]
# No two selected columns may be identical
assert not U_mun.T.duplicated().any()

n_taz, n_const = C_taz_all.shape
print(
    f'{mun} has {n_taz} taz and {C_mun["TVIVHAB"]} households '
    f'and {Y_mun.shape[0]} respondents and {n_const} constraints.'
)

Cadereyta Jiménez has 35 taz and 37366 households and 1592 respondents and 63 constraints.


In [None]:
from miqp import solve_gurobi_taz

In [458]:
# Constraints included in this experiment. Only these four are active; the
# remaining candidate names can be listed with `C_taz_all.columns` instead of
# keeping them here as commented-out toggles.
cols = [
    'TVIVHAB',
    'POBMAS',
    'P18YM_PB',
    'PNOCUPA_M',
]

# Select the TAZ to solve BEFORE using its constraint vector. Previously
# `C_taz` was only defined by a cell further down the notebook, so this cell
# raised NameError on a fresh kernel (Restart & Run All).
# NOTE(review): the recorded output mentions another municipality
# ("MUN_Apodaca_TAZ_811"); confirm that TAZ 811 exists for the `mun` selected
# above.
taz = 811
C_taz = C_taz_all.loc[taz]

# Solve for this TAZ using only the selected constraints
model = solve_gurobi_taz(U_mun.loc[cols], Y_mun, C_taz.loc[cols], obj_type='L2', jitter=False, save=False)

MUN_Apodaca_TAZ_811


In [391]:
# Solve a single TAZ (id hard-coded) with obj_type='L2', restricted to the
# constraints listed in `cols`.
# NOTE(review): this cell's execution count (391) is lower than that of the
# cell above it (458), so the notebook was run out of order; confirm it still
# works top to bottom.
taz = 811
C_taz = C_taz_all.loc[taz]
model = solve_gurobi_taz(U_mun.loc[cols], Y_mun, C_taz.loc[cols], obj_type='L2', jitter=False, save=False)

MUN_Apodaca_TAZ_811


In [430]:
# Compute an Irreducible Inconsistent Subsystem (IIS) for `model`, i.e. a
# minimal set of mutually conflicting constraints.
# Presumably `model` is a gurobipy Model that turned out infeasible — TODO confirm.
model.computeIIS()

In [431]:
# Print every constraint flagged as part of the IIS as "<name><sense>"
for c in model.getConstrs():
    if not c.IISConstr:
        continue
    print(f'\t{c.constrname}{c.Sense}')


	TVIVHAB=
	POBMAS>
	P18YM_PB>
	PNOCUPA_M>


In [18]:
# Build the municipality frame from the stored solutions: start from Y_mun
# and add one zero-initialised column per TAZ (ZONA value)
Y_taz = pd.DataFrame(Y_mun)
Y_taz[taz_gdf.ZONA.values.tolist()] = 0

# NOTE(review): the loop variable rebinds the notebook-level name `taz`.
for taz in C_taz_all.index:
    # TAZ whose TVIVHAB is below 1 keep their all-zero column
    if C_taz_all.loc[taz, 'TVIVHAB'] >= 1:
        # NOTE(review): unpickling can execute arbitrary code; only load files
        # written by this project. A missing file raises FileNotFoundError.
        sol_df = pd.read_pickle(f'MUN_{mun}_TAZ_{taz}_gsols.pkl')
        # First stored row, without its objective value
        Y_taz.loc[:, taz] = sol_df.iloc[0].drop('obj_val').values

In [124]:
# Inspect the h_to_y entry of one row of Y_taz (index label 54643 is
# hard-coded). The displayed fields are ID_VIV, Survey and rep.
hy = h_to_y.loc[Y_taz.index].loc[54643]
hy

ID_VIV    [190390000057, 190390000165, 190390000294, 190...
Survey    [57, 56, 70, 70, 70, 61, 60, 83, 83, 66, 66, 6...
rep                                                     102
Name: 54643, dtype: object

In [131]:
# Zero-filled integer frame: one row per household id stored in `hy` and one
# column per column of Y_taz other than the municipality column
pd.DataFrame(
    np.zeros((hy.rep, Y_taz.shape[1] - 1), dtype=int),
    index=hy.ID_VIV,
    columns=Y_taz.columns.drop(mun),
)

Unnamed: 0,-10,1,2,3,4,5,6,7,8,9,...,79,80,81,82,83,84,85,86,87,88
190390000057,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190390000165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190390000294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190390000298,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190390000307,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190390004929,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190390004933,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190390005116,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
190390005119,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [160]:
# Shape of `totals_sub` values after adding a leading axis of length 1.
# NOTE(review): `totals_sub` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel — define it or delete the cell.
totals_sub.values[None, :].shape

(1, 89)

In [52]:
# Create data frame for mun: one row per original household id.
# Flatten the per-row ID_VIV lists of h_to_y into a single sorted index
# (a plain generator replaces the side-effect-only list comprehension).
idx = pd.Index(sorted(
    id_viv
    for id_list in h_to_y.loc[Y_taz.index].ID_VIV
    for id_viv in id_list
))

# Every column of Y_taz except the municipality column
taz_cols = Y_taz.columns.drop(mun).tolist()

# Zero-initialised integer frame with the TAZ columns plus a 'fraction' column
# (Y_taz.shape[1] == len(taz_cols) + 1, hence the matching column count)
Y_exp = pd.DataFrame(
    np.zeros((len(idx), Y_taz.shape[1]), dtype=int),
    index=idx,
    columns=taz_cols + ['fraction'],
)
# 'fraction' holds non-integer values once filled, so store it as float
Y_exp['fraction'] = Y_exp['fraction'].astype(float)
Y_exp

Unnamed: 0,-10,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,fraction
190390000001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
190390000002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
190390000003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
190390000004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
190390000005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190390005155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
190390005156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
190390005157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
190390005158,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [56]:
# Copy each row of Y_taz onto every household id that h_to_y lists for it.
# NOTE(review): every listed household receives the row's full TAZ values;
# 'fraction' is filled from the `Survey` field of h_to_y — confirm that field
# holds the intended per-household share.
for id1, ytaz in Y_taz.iterrows():
    proto = h_to_y.loc[id1]
    id_viv, weights = proto.ID_VIV, proto.Survey
    Y_exp.loc[id_viv, taz_cols] = ytaz.loc[taz_cols].values
    Y_exp.loc[id_viv, 'fraction'] = weights
Y_exp

Unnamed: 0,-10,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,fraction
190390000001,0,1,0,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1.000000
190390000002,0,1,0,0,0,1,2,1,0,1,...,3,2,1,1,1,1,0,0,0,1.000000
190390000003,0,1,0,0,0,1,2,1,0,1,...,2,0,0,0,0,0,0,0,0,1.000000
190390000004,0,1,0,0,0,1,1,1,0,1,...,2,1,0,0,0,0,1,0,0,1.000000
190390000005,0,0,0,0,0,0,0,0,0,0,...,2,3,1,1,1,1,1,2,0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190390005155,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,3,0,4,1.000000
190390005156,0,0,0,0,0,0,1,0,0,0,...,0,0,1,1,0,0,0,4,0,1.000000
190390005157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1.000000
190390005158,1,4,4,3,1,7,9,4,0,3,...,14,16,10,10,12,12,15,20,0,0.004662


In [93]:
# Side by side per household: the `Survey` column of Y, the row total over
# all TAZ columns, and the stored fraction
pd.concat(
    [
        Y.loc[Y_exp.index, 'Survey'],
        Y_exp[taz_cols].sum(axis=1),
        Y_exp.fraction,
    ],
    axis=1,
)

Unnamed: 0,Survey,0,fraction
190390000001,68,20,1.000000
190390000002,68,47,1.000000
190390000003,68,39,1.000000
190390000004,68,44,1.000000
190390000005,68,25,1.000000
...,...,...,...
190390005155,2,78,1.000000
190390005156,2,8,1.000000
190390005157,2,3,1.000000
190390005158,2,441,0.004662


In [429]:
# Recover the original household ids and attributes.
# Repeated households split their weight according to their share in the survey.


In [398]:
# Recover people list using household ids
# (spot check: `Survey` values of two hard-coded household ids)
Y.loc[[190390003574, 190390004400]].Survey

ID_VIV
190390003574    65
190390004400    60
Name: Survey, dtype: int64

In [352]:
# Merge the locality and AGEB census frames, with imputation switched off.
# NOTE(review): `merge_loc_agebs` is never imported in the cells visible here;
# add the import so this cell runs on a fresh kernel.
df_loc_agebs = merge_loc_agebs(df_mun, df_loc, df_agebs, impute=False)

In [348]:
# NOTE(review): neither `load_marco_geo` nor `marco_geo_path` is defined in
# the cells visible here; import/define them so this cell runs on a fresh kernel.
mg = load_marco_geo(marco_geo_path, df_mun, df_loc_agebs)

In [351]:
# Check that `mg` reproduces df_loc_agebs on their shared columns.
# Element-wise `==` treats NaN as unequal, so a True result also implies that
# the compared columns contain no NaN.
np.all(mg[df_loc_agebs.columns] == df_loc_agebs)

True