In [176]:
%load_ext autoreload
%autoreload 2

import gurobipy as gp
from gurobipy import GRB
import numpy as np
import scipy.sparse as sp
from pathlib import Path
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

import sys
sys.path.append('../src/')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from extended_survey import process_people_df, process_places_df, categorize_p, categorize_v

# Locations of the person-level and dwelling-level survey CSV files.
personas_path = Path('../data/cuestionario_ampliado/Censo2020_CA_nl_csv/Personas19.CSV')
viviendas_path = Path('../data/cuestionario_ampliado/Censo2020_CA_nl_csv/Viviendas19.CSV')

# Read both tables through the project loaders.
personas = process_people_df(personas_path)
viviendas = process_places_df(viviendas_path)

# Person-level columns to control for. The subset is taken *before* rows with
# NaN are dropped from the survey; the alternative would be to impute the
# missing values (e.g. with missforest).
# Deliberately left out for now: ENT_PAIS_NAC, AFRODES, DIS_VER, DIS_OIR,
# DIS_CAMINAR, DIS, DIS_RECORDAR, DIS_BANARSE, DIS_HABLAR, DIS_MENTAL,
# HLENGUA, HESPANOL (global seed zero problem) and ENT_PAIS_RES_5A.
person_columns = [
    # identifiers, expansion factor and municipality
    'ID_PERSONA', 'ID_VIV', 'FACTOR', 'MUN',
    'SEXO', 'EDAD',
    'DHSERSAL1', 'DHSERSAL2', 'RELIGION',
    'ASISTEN', 'NIVACAD', 'ESCOLARI', 'ALFABET',
    'SITUA_CONYUGAL', 'CONACT',
    'INGTRMEN', 'HORTRA',
]
pcat = personas[person_columns].copy()

# The dwelling table is carried over whole; it is narrowed after categorizing.
vcat = viviendas.copy()

# Drop NA values on both surveys.
#
# A dwelling is discarded from BOTH tables when its own row in `vcat`, or the
# row of any person in `pcat` that points to it through ID_VIV, has at least
# one missing value.

# Row-wise "any missing value" test. `isna().any(axis=1)` gives the same mask
# as the former `isna().T.sum() > 0` without transposing the whole frame.
na_vivs_v = vcat.ID_VIV[vcat.isna().any(axis=1)].to_list()
na_vivs_p = pcat.ID_VIV[pcat.isna().any(axis=1)].to_list()
na_vivs = set(na_vivs_v + na_vivs_p)

# Drop NA before categorizing, and renumber the rows of both tables.
pcat = pcat[~pcat.ID_VIV.isin(na_vivs)].reset_index(drop=True)
vcat = vcat[~vcat.ID_VIV.isin(na_vivs)].reset_index(drop=True)

# Turn the raw survey codes into categorical columns (project helpers).
pcat = categorize_p(pcat)
vcat = categorize_v(vcat)

# Categorizing must not introduce any missing value.
assert not pcat.isna().to_numpy().any()
assert not vcat.isna().to_numpy().any()

# Person table: remove the listed source and intermediate columns so that
# only categorized columns remain.
person_columns_to_drop = [
    'DHSERSAL1', 'DHSERSAL2',
    'NIVACAD', 'ESCOLARI',
    'INGTRMEN', 'HORTRA',
    'DHSERSAL_IMSS', 'DHSERSAL_ISSSTE', 'DHSERSAL_ISSSTE_E',
    'DHSERSAL_P_D_M', 'DHSERSAL_Popular_NGenración_SBienestar',
    'DHSERSAL_IMSS_Prospera/Bienestar',
]
pcat = pcat.drop(columns=person_columns_to_drop)

# Dwelling table: keep an explicit whitelist.
# Currently excluded: PISOS, ELECTRICIDAD, AGUA_ENTUBADA, ABA_AGUA_ENTU,
# TINACO, CISTERNA, SERSAN, CONAGUA and DRENAJE.
dwelling_columns = [
    'ID_VIV', 'FACTOR', 'MUN', 'NUMPERS',
    'CLAVIVP',
    'CUADORM', 'TOTCUART',
    'REFRIGERADOR', 'LAVADORA', 'HORNO',
    'AUTOPROP', 'MOTOCICLETA', 'BICICLETA', 'RADIO', 'TELEVISOR',
    'COMPUTADORA', 'TELEFONO', 'CELULAR', 'INTERNET', 'SERV_TV_PAGA',
    'SERV_PEL_PAGA', 'CON_VJUEGOS',
    'JEFE_SEXO',
]
vcat = vcat[dwelling_columns]

# Share of the original survey rows that survived the NA filter.
print(f'We keep {pcat.shape[0]/personas.shape[0]*100}% and {vcat.shape[0]/viviendas.shape[0]*100} of the people and household datasets respectively.')

We keep 95.276327279856% and 95.79107969282408 of the people and household datasets respectively.


In [3]:
from constraints import get_ind_const, get_viv_const

# Load constraints
# Both getters come from the project's `constraints` module and take no
# arguments; each result only needs to support len() for the report below.
constraints_ind = get_ind_const()
constraints_viv = get_viv_const()

# Report how many constraints were loaded at the person and household level.
print(f'We have a total of {len(constraints_ind)} people level constraints and {len(constraints_viv)} of household level constraints.')

We have a total of 58 people level constraints and 5 of household level constraints.


In [4]:
from census import process_census

# Census input files (ITER and RESAGEBURB extracts).
census_iter_path = Path('../data/census_loc/ITER_19CSV20.csv')
census_resageburb_path = Path('../data/census_ageb_manz/RESAGEBURB_19CSV20.csv')

# process_census returns three objects, unpacked here in order.
# Presumably municipality, locality and AGEB level tables - TODO confirm.
df_mun, df_loc, df_agebs = process_census(census_iter_path, census_resageburb_path)

In [5]:
# Build population matrices
from setup_lin_system import make_init_system, get_W

(X, I, J, L, Up, Uh, U, Yp, Yh, Y, C) = make_init_system(
    pcat, vcat, constraints_ind, constraints_viv, df_mun
)

# Per municipality, look for rows of U that sum to zero over the columns
# selected by that municipality's mask on Y.MUN. Any such row label is
# collected, and the cell fails if at least one is found.
# NOTE: `mun`, `mun_mask` and `U_mun` remain defined after the loop.
mun_list = X.MUN.unique()
const_zeroprob_list = []
for mun in mun_list:
    mun_mask = Y.MUN == mun
    U_mun = U.loc[:, mun_mask]
    const_zeroprob_list += U_mun.index[U_mun.T.sum() == 0].to_list()

# The assertion message lists the offending row labels.
assert not set(const_zeroprob_list), set(const_zeroprob_list)

In [6]:
from taz import load_taz

# Load the TAZ layer for the municipalities in `mun_list`.
# The path is relative to the notebook's working directory.
taz_path = 'taz_census.gpkg'
taz_dict = load_taz(taz_path, mun_list)

In [7]:
# Consistency check on every TAZ table, ignoring rows with missing values:
# each total must equal the sum of its two parts (presumably the household
# "HOG" and collective "COL" components - TODO confirm).
# NOTE: `df` is only rebound locally; the frames stored in taz_dict keep
# their NaN rows.
for df in taz_dict.values():
    df = df.dropna()
    assert (df.POBTOT == df.POBHOG + df.POBCOL).all()
    assert (df.TVIVHAB == df.TOTHOG + df.TOTCOL).all()

In [108]:
from miqp import solve_gurobi_taz, relax_model, solve_gb

In [9]:
import tqdm.notebook as tqdm

In [None]:
%%time

# Solve every (municipality, zone) pair and keep one summary tuple per solved
# zone:
#   (mun, taz, Status, Runtime, SolCount, POBTOT, POBHOG, POBCOL, TOTHOG)
# Later cells index these tuples by position, so the layout must not change.
# NOTE: `mun` and `taz` stay defined after the loop and are read further down.
r_list = []
for mun in tqdm.tqdm(taz_dict.keys(), desc='MUN', position=0):
    for taz in tqdm.tqdm(taz_dict[mun].ZONA, desc='TAZ', position=1, leave=False):
        sol_df, model = solve_gb(mun, taz, taz_dict, Y, U, C, obj_type='L2', save=True)
        if model is None:
            # No model came back for this zone; nothing to record.
            continue

        # Look the zone's row up once; .item() requires exactly one match.
        zone_row = taz_dict[mun].query(f'ZONA=={taz}')
        zone_totals = tuple(
            zone_row[col].item()
            for col in ('POBTOT', 'POBHOG', 'POBCOL', 'TOTHOG')
        )
        r_list.append(
            (mun, taz, model.Status, model.Runtime, model.SolCount, *zone_totals)
        )

        # Release the model right away; only plain values are kept in r_list.
        model.dispose()

MUN:   0%|          | 0/18 [00:00<?, ?it/s]

TAZ:   0%|          | 0/4 [00:00<?, ?it/s]

TAZ:   0%|          | 0/123 [00:00<?, ?it/s]

TAZ:   0%|          | 0/35 [00:00<?, ?it/s]

TAZ:   0%|          | 0/16 [00:00<?, ?it/s]

TAZ:   0%|          | 0/12 [00:00<?, ?it/s]

TAZ:   0%|          | 0/51 [00:00<?, ?it/s]

TAZ:   0%|          | 0/22 [00:00<?, ?it/s]

TAZ:   0%|          | 0/79 [00:00<?, ?it/s]

TAZ:   0%|          | 0/20 [00:00<?, ?it/s]

TAZ:   0%|          | 0/82 [00:00<?, ?it/s]

Wall time: 10h 3min 24s

In [156]:
# Release any solver model still referenced by the result records.
#
# The solve loop stores a plain number (TOTHOG) as the last field of each
# tuple and disposes every model itself, so calling `.dispose()`
# unconditionally on `rr[-1]` raises AttributeError. Only objects that
# actually expose a callable `dispose` are released here, which keeps the cell
# working for older result lists that carried the model in the last position.
for rr in r_list:
    dispose = getattr(rr[-1], 'dispose', None)
    if callable(dispose):
        dispose()

In [157]:
# Split the result tuples into the solver status (field 2) and the number of
# solutions found (field 4).
s_list = [status for _, _, status, *_ in r_list]
n_list = [sol_count for _, _, _, _, sol_count, *_ in r_list]

In [172]:
# Frequency of each solver status across all solved zones.
# (In Gurobi's status codes, 2 is OPTIMAL and 9 is TIME_LIMIT.)
# The top-level `pd.value_counts` is deprecated in pandas 2.1+ and emits a
# FutureWarning; the Series method is the supported equivalent.
pd.Series(s_list).value_counts()

  pd.value_counts(s_list)
  pd.value_counts(s_list)


9    415
2      7
Name: count, dtype: int64

In [184]:
# Re-solve a single zone (Monterrey, zone 88) by hand, without saving.
# NOTE(review): obj_type is 0 here but 'L2' in the batch run - confirm that
# solve_gb accepts both forms.
sol_df, model = solve_gb(
    'Monterrey', 88, taz_dict, Y, U, C,
    obj_type=0,
    save=False,
)


Interrupt request received


In [186]:
# Show the current value of the JSONSolDetail parameter on the model returned
# by the preceding single-zone solve.
model.Params.JSONSolDetail

0

In [None]:
# Build the per-zone table for one municipality from the saved solutions.
# NOTE(review): Y_mun, taz_gdf and C_taz_all are not defined in any visible
# cell, and `mun` is whatever an earlier loop left behind - define them
# explicitly before running this cell on a fresh kernel.
Y_taz = pd.DataFrame(Y_mun)

# One column per zone, initialised to zero.
Y_taz[taz_gdf.ZONA.values.tolist()] = 0

for taz in C_taz_all.index:
    # Zones whose TVIVHAB is below 1 are skipped and keep their zero column.
    if not (C_taz_all.loc[taz].TVIVHAB < 1):
        # Pickle files are only safe to load when they were produced locally.
        sol_df = pd.read_pickle(f'MUN_{mun}_TAZ_{taz}_gsols.pkl')
        # The first row is taken as the solution to use; 'obj_val' is dropped
        # and the remaining values fill the zone's column.
        Y_taz.loc[:, taz] = sol_df.iloc[0].drop('obj_val').values

In [None]:
# Create data frame for mun
# Flatten the per-row collections of dwelling ids stored in h_to_y.ID_VIV
# (restricted to the rows present in Y_taz) into one sorted index.
# A generator replaces the former list comprehension that was run only for its
# `idx.extend(...)` side effect.
idx = pd.Index(sorted(
    viv_id
    for viv_ids in h_to_y.loc[Y_taz.index].ID_VIV
    for viv_id in viv_ids
))

# Zero-filled integer frame with one row per dwelling id in `idx`.
# NOTE(review): the original expression ended in a dangling "-" (a syntax
# error), so the intended operand is unknown. It is assumed here that the
# width should be the number of zone columns, i.e. the columns of Y_taz minus
# those inherited from Y_mun - TODO confirm.
Y_exp = pd.DataFrame(
    np.zeros(
        (len(idx), Y_taz.shape[1] - pd.DataFrame(Y_mun).shape[1]),
        dtype=int,
    )
)

In [None]:
# Inspect a single entry (label 53259) of h_to_y, restricted to the rows in
# Y_taz: its dwelling ids and its 'Survey' field.
hh_row = h_to_y.loc[Y_taz.index].loc[53259]
id_viv, weights = hh_row.ID_VIV, hh_row.Survey
id_viv, weights

In [None]:
# Sorted index of every dwelling id listed in h_to_y.ID_VIV for the rows
# present in Y_taz. A flattening generator replaces the former list
# comprehension that was run only for its `idx.extend(...)` side effect.
idx = pd.Index(sorted(
    viv_id
    for viv_ids in h_to_y.loc[Y_taz.index].ID_VIV
    for viv_id in viv_ids
))
idx

In [None]:
# Recover people list using household ids: look up the 'Survey' entries of Y
# for two example household ids.
household_ids = [190390003574, 190390004400]
Y.loc[household_ids].Survey