Header for people table

- HouseholdId
- PersonNumber
- Age
- Sex
- License
- TransitPass
- EmploymentStatus
- Occupation
- FreeParking
- StudentStatus
- EmploymentZone
- SchoolZone
- ExpansionFactor

Also use ZONA 3 for taz zones!

The latest synthetic population is the one without the large households!

In [1]:
%load_ext autoreload
%autoreload 2

import gurobipy as gp
from gurobipy import GRB
import numpy as np
import scipy.sparse as sp
from pathlib import Path
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

import sys
sys.path.append('../src/')

In [2]:
from extended_survey import process_people_df, process_places_df, categorize_p, categorize_v

# Locations of the two survey CSV extracts (people and dwellings)
personas_path, viviendas_path = (
    Path('../data/cuestionario_ampliado/Censo2020_CA_nl_csv/Personas19.CSV'),
    Path('../data/cuestionario_ampliado/Censo2020_CA_nl_csv/Viviendas19.CSV'),
)

# Parse each table with its project loader from `extended_survey`
personas = process_people_df(personas_path)
viviendas = process_places_df(viviendas_path)

# Person-level categorical columns to control for.
# They are selected *before* dropping NaN rows from the survey, so that only
# missing values in these columns cause a row to be discarded.
# An alternative would be to impute the NaNs (e.g. with missforest).
person_cols = [
    'ID_PERSONA', 'ID_VIV', 'FACTOR', 'MUN',
    'SEXO', 'EDAD',
    # 'ENT_PAIS_NAC',
    # 'AFRODES',
    'DHSERSAL1', 'DHSERSAL2', 'RELIGION',
    # 'DIS_VER', 'DIS_OIR', 'DIS_CAMINAR', 'DIS',
    # 'DIS_RECORDAR', 'DIS_BANARSE', 'DIS_HABLAR', 'DIS_MENTAL',
    # 'HLENGUA',
    # 'HESPANOL',  # Global seed zero problem
    'ASISTEN', 'NIVACAD', 'ESCOLARI', 'ALFABET',
    # 'ENT_PAIS_RES_5A',
    'SITUA_CONYUGAL', 'CONACT',
    'INGTRMEN', 'HORTRA',
]
pcat = personas[person_cols].copy()

# The dwellings table is carried over whole at this stage
vcat = viviendas.copy()

# Remove, from BOTH tables, every dwelling that has a missing value in either one

# Flag rows holding at least one NaN, then collect the dwelling ids they belong to
rows_with_na_v = vcat.isna().any(axis=1)
rows_with_na_p = pcat.isna().any(axis=1)
na_vivs_v = vcat.loc[rows_with_na_v, 'ID_VIV'].to_list()
na_vivs_p = pcat.loc[rows_with_na_p, 'ID_VIV'].to_list()
na_vivs = set(na_vivs_v) | set(na_vivs_p)

# Filtering happens before categorization
pcat = pcat.loc[~pcat.ID_VIV.isin(na_vivs)].reset_index(drop=True)
vcat = vcat.loc[~vcat.ID_VIV.isin(na_vivs)].reset_index(drop=True)

# Apply the project categorizers to each table
pcat = categorize_p(pcat)
vcat = categorize_v(vcat)

# Neither table may contain a missing value after categorization
assert not pcat.isna().any().any()
assert not vcat.isna().any().any()

# Person table: discard the listed columns so only the categorized ones remain
person_cols_to_drop = [
    'DHSERSAL1', 'DHSERSAL2',
    'NIVACAD', 'ESCOLARI',
    'INGTRMEN', 'HORTRA',
    'DHSERSAL_IMSS', 'DHSERSAL_ISSSTE', 'DHSERSAL_ISSSTE_E',
    'DHSERSAL_P_D_M', 'DHSERSAL_Popular_NGenración_SBienestar',
    'DHSERSAL_IMSS_Prospera/Bienestar',
]
pcat = pcat.drop(columns=person_cols_to_drop)

# Dwelling table: keep an explicit whitelist (commented entries are excluded on purpose)
dwelling_cols = [
    'ID_VIV', 'FACTOR', 'MUN', 'NUMPERS',
    'CLAVIVP',
    # 'PISOS',
    'CUADORM', 'TOTCUART',
    # 'ELECTRICIDAD', 'AGUA_ENTUBADA',
    # 'ABA_AGUA_ENTU',
    # 'TINACO', 'CISTERNA',
    # 'SERSAN',
    # 'CONAGUA',
    # 'DRENAJE',
    'REFRIGERADOR', 'LAVADORA', 'HORNO',
    'AUTOPROP', 'MOTOCICLETA', 'BICICLETA', 'RADIO', 'TELEVISOR',
    'COMPUTADORA', 'TELEFONO', 'CELULAR', 'INTERNET', 'SERV_TV_PAGA',
    'SERV_PEL_PAGA', 'CON_VJUEGOS',
    'JEFE_SEXO',
]
vcat = vcat[dwelling_cols]

# Report what share of each raw survey table survived the NaN filtering.
# Fix: the household figure was printed without its '%' sign; both figures are
# now rounded to two decimals for readability.
people_kept_pct = pcat.shape[0] / personas.shape[0] * 100
households_kept_pct = vcat.shape[0] / viviendas.shape[0] * 100
print(
    f'We keep {people_kept_pct:.2f}% and {households_kept_pct:.2f}% '
    'of the people and household datasets respectively.'
)

We keep 95.13226758864869% and 95.63444031938158 of the people and household datasets respectively.


  personas.INGTRMEN.replace(BPP, 2e6),
  ).replace("No especificado", np.nan)
  personas.HORTRA.replace(BPP, 2e6),
  ).replace("No especificado", np.nan)
  viviendas.INGTRHOG.replace(BPP, 2e6),
  ).replace("No especificado", np.nan)


In [3]:
# Optional step: discard households with more than 10 members from both tables
household_size = vcat.NUMPERS.astype(int)
large_id_viv = vcat.loc[household_size > 10, 'ID_VIV']

# Keep only rows whose dwelling id is not among the large households
vcat = vcat[~vcat.ID_VIV.isin(large_id_viv)].reset_index(drop=True)
pcat = pcat[~pcat.ID_VIV.isin(large_id_viv)].reset_index(drop=True)

In [4]:
from constraints import get_ind_const, get_viv_const

# Fetch the person-level and household-level constraint collections
constraints_ind, constraints_viv = get_ind_const(), get_viv_const()

# Report how many constraints exist at each level
n_ind, n_viv = len(constraints_ind), len(constraints_viv)
print(f'We have a total of {n_ind} people level constraints and {n_viv} of household level constraints.')

We have a total of 58 people level constraints and 5 of household level constraints.


In [5]:
from census import process_census

# Census inputs: the ITER file and the RESAGEBURB file
census_iter_path, census_resageburb_path = (
    Path('../data/census_loc/ITER_19CSV20.csv'),
    Path('../data/census_ageb_manz/RESAGEBURB_19CSV20.csv'),
)

# process_census returns three frames, unpacked here as df_mun, df_loc, df_agebs
df_mun, df_loc, df_agebs = process_census(census_iter_path, census_resageburb_path)

In [6]:
# Build population matrices
from setup_lin_system import make_init_system, get_W

# Assemble the initial linear system from the survey tables, constraints and census frame
X, I, J, L, Up, Uh, U, Yp, Yh, Y, C = make_init_system(
    pcat, vcat, constraints_ind, constraints_viv, df_mun
)

# Sanity check: within every municipality, no row of U may sum to zero over
# the columns selected by that municipality's mask on Y.MUN.
mun_list = X.MUN.unique()
const_zeroprob_list = []
for mun in mun_list:
    U_mun = U.loc[:, Y.MUN == mun]
    empty_rows = U_mun.sum(axis=1) == 0
    const_zeroprob_list += U_mun.index[empty_rows].to_list()

zero_prob_constraints = set(const_zeroprob_list)
assert len(zero_prob_constraints) == 0, zero_prob_constraints

In [7]:
from taz import load_taz

# Read the TAZ layer for the municipalities found in the survey.
# The path is relative to the notebook's working directory.
taz_path = 'taz_census.gpkg'
taz_dict = load_taz(taz_path, mun_list)

In [10]:
# Display the POBTOT and POBCOL entries of the "Monterrey" row of the municipality frame
df_mun.loc["Monterrey", ["POBTOT", "POBCOL"]]

POBTOT    1142994.0
POBCOL       4127.0
Name: Monterrey, dtype: float64

In [8]:
# For every TAZ table, restricted to rows without missing values, verify that
# population and dwelling totals split exactly into their two components.
for taz_df in taz_dict.values():
    complete = taz_df.dropna()
    assert (complete.POBTOT == complete.POBHOG + complete.POBCOL).all()
    assert (complete.TVIVHAB == complete.TOTHOG + complete.TOTCOL).all()

In [9]:
# Accumulate selected column totals over all TAZ tables, skipping zone -10
# and zones numbered 900 or above.
total_cols = ["POBTOT", "P3A5_NOA", "P6A14NOA", "P15A17A", "P18A24A"]
tot = 0
for c in taz_dict.values():
    in_scope = (c.ZONA != -10) & (c.ZONA < 900)
    tot = tot + c.loc[in_scope, total_cols].sum()
tot

POBTOT      5004945.0
P3A5_NOA      91416.0
P6A14NOA      32431.0
P15A17A      174083.0
P18A24A      212819.0
dtype: float64

In [15]:
# Display the POBHOG and POBCOL entries of the "Monterrey" row of C
C.loc["Monterrey", ["POBHOG", "POBCOL"]]

POBHOG    1138867.0
POBCOL       4127.0
Name: Monterrey, dtype: float64

In [11]:
# Preview the population / age-band / attendance columns of one TAZ table.
# The original cell read `c`, a loop variable left over from another cell, so
# it failed on a fresh kernel unless that loop had run first. Select the table
# explicitly instead: the last entry of taz_dict, which is what `c` held after
# that loop finished.
last_mun = list(taz_dict)[-1]
preview_cols = [
    "POBHOG", "P_0A2", "P_3A5", "P_6A11", "P_8A14", "P_12A14", "P_15A17",
    "P_18A24", "P_60YMAS", "P3A5_NOA", "P6A14NOA", "P15A17A", "P18A24A",
]
taz_dict[last_mun][preview_cols].head()

Unnamed: 0,POBHOG,P_0A2,P_3A5,P_6A11,P_8A14,P_12A14,P_15A17,P_18A24,P_60YMAS,P3A5_NOA,P6A14NOA,P15A17A,P18A24A
0,3165,145,180,297,313,113,125,287,561,86,39,39,33
1,1470,29,45,89,129,63,83,226,264,9,0,73,147
2,716,30,43,62,90,46,30,75,77,16,3,24,39
3,870,39,52,82,98,48,55,79,167,18,3,41,28
4,1806,47,75,152,169,74,100,224,313,30,16,76,94


In [12]:
from miqp import solve_gurobi_taz, relax_model, solve_gb, relax_model

In [13]:
import tqdm.notebook as tqdm

In [None]:
%%time
# Solve one zone in isolation: municipality 'Monterrey', TAZ 87.
# `mun` and `taz` are notebook globals that later cells read as well.
mun = 'Monterrey'
taz = 87
# NOTE(review): save=False / force=True presumably skip writing results to disk and
# force a re-solve even if cached output exists — confirm against miqp.solve_gb.
sol_df, model = solve_gb(mun, taz, taz_dict, Y, U, C, obj_type='L1', save=False, force=True)

In [None]:
# Run the project's relax_model helper on the model returned by solve_gb, with verbose output.
# Requires `model` from the single-zone solve cell.
relax_model(model, verbose=True)

In [167]:
# Keep the result of model.presolve() in `p` so the following cells can inspect it.
# NOTE(review): presumably `model` is a gurobipy Model and `p` its presolved copy — confirm.
p = model.presolve()

In [168]:
# Print statistics of the presolved model held in `p`
p.printStats()

In [138]:
# Write the presolved model to "model.lp" in the current working directory
p.write("model.lp")

In [83]:
# Municipality-level constraint values as a {name: int} mapping
C_mun = C.loc[mun].astype(int).to_dict()

# Same constraint columns for the selected zone: index the municipality's TAZ
# table by ZONA, fill gaps with 0, cast to int and pull out the row for `taz`.
taz_by_zone = taz_dict[mun].set_index('ZONA')
C_taz = taz_by_zone[C_mun.keys()].fillna(0).astype(int).loc[taz].copy()

In [158]:
# Display the TOTHOG entry of the selected zone's constraint row
C_taz.TOTHOG

11551

In [159]:
# Display the household total and the age-band / attendance entries for the selected zone
C_taz[["TOTHOG", "P_3A5", "P_6A11", "P_12A14", "P_15A17", "P_18A24", "P_60YMAS", "P3A5_NOA", "P6A14NOA", "P15A17A", "P18A24A"]]

TOTHOG      11551
P_3A5        3053
P_6A11       5429
P_12A14      3176
P_15A17      3275
P_18A24      7292
P_60YMAS     2024
P3A5_NOA     1280
P6A14NOA      356
P15A17A      2141
P18A24A      1735
Name: 87, dtype: int64

In [78]:
# Load the synthetic households, then the synthetic people joined to them on HouseholdId
hous = pd.read_csv("../output/synth_households.csv")
pop = pd.read_csv("../output/synth_people.csv").merge(hous, on="HouseholdId")

In [96]:
# Sum the person-side expansion factor for people in household zone 87, aged 6 to 14,
# whose StudentStatus is 'O'. The `_x` suffix comes from the people/households merge,
# where ExpansionFactor exists on both sides.
pop.query("HouseholdZone == 87 & StudentStatus == 'O' & 6 <= Age <= 14").ExpansionFactor_x.sum()

550

In [14]:
# Constraint columns of interest for the zone totals
taz_cols = [
    "ZONA", "POBHOG", "P_3A5", "P_6A11", "P_12A14", "P_15A17", "P_18A24",
    "P_60YMAS", "P3A5_NOA", "P6A14NOA", "P15A17A", "P18A24A",
]

# One slice per municipality, restricted to zones other than -10 and below 900
taz_c_list = []
for mun, tazs_df in taz_dict.items():
    valid_zones = tazs_df.query("ZONA != -10 and ZONA < 900")
    taz_c_list.append(valid_zones[taz_cols].copy())

# Stack the slices, drop incomplete rows and derive the combined columns:
#   P_6A14  = P_6A11 + P_12A14
#   P_3A5A  = P_3A5  - P3A5_NOA
#   P_6A14A = P_6A14 - P6A14NOA
taz_c_df = pd.concat(taz_c_list).dropna()
taz_c_df = taz_c_df.assign(P_6A14=taz_c_df.P_6A11 + taz_c_df.P_12A14)
taz_c_df = taz_c_df.assign(
    P_3A5A=taz_c_df.P_3A5 - taz_c_df.P3A5_NOA,
    P_6A14A=taz_c_df.P_6A14 - taz_c_df.P6A14NOA,
).set_index("ZONA")

In [15]:
# Column totals of the per-zone constraint table
taz_c_df.sum()

POBHOG      4991029.0
P_3A5        248351.0
P_6A11       468173.0
P_12A14      234474.0
P_15A17      235840.0
P_18A24      608668.0
P_60YMAS     569263.0
P3A5_NOA      91416.0
P6A14NOA      32431.0
P15A17A      174083.0
P18A24A      212819.0
P_6A14       702647.0
P_3A5A       156935.0
P_6A14A      670216.0
dtype: float64

In [None]:
%%time

# Solve every zone of every municipality and record one summary tuple per solved zone:
# (mun, taz, status, runtime, solution count, POBTOT, POBHOG, POBCOL, TOTHOG).
r_list = []
for mun in tqdm.tqdm(taz_dict.keys(), desc='MUN', position=0):
    for taz in tqdm.tqdm(taz_dict[mun].ZONA, desc='TAZ', position=1, leave=False):
        sol_df, model = solve_gb(mun, taz, taz_dict, Y, U, C, obj_type='L1', save=True)
        # solve_gb returns no model for zones it does not solve; nothing to record
        if model is None:
            continue
        try:
            # Select the zone's row once instead of repeating the same query per field
            zone_row = taz_dict[mun].query(f'ZONA=={taz}')
            r_list.append((
                mun, taz, model.Status, model.Runtime, model.SolCount,
                zone_row.POBTOT.item(),
                zone_row.POBHOG.item(),
                zone_row.POBCOL.item(),
                zone_row.TOTHOG.item(),
            ))
        finally:
            # Always release the solver model, even if building the record fails
            model.dispose()

MUN:   0%|          | 0/18 [00:00<?, ?it/s]

TAZ:   0%|          | 0/4 [00:00<?, ?it/s]


Interrupt request received


TAZ:   0%|          | 0/123 [00:00<?, ?it/s]

TAZ:   0%|          | 0/35 [00:00<?, ?it/s]

TAZ:   0%|          | 0/16 [00:00<?, ?it/s]

TAZ:   0%|          | 0/12 [00:00<?, ?it/s]

TAZ:   0%|          | 0/51 [00:00<?, ?it/s]

TAZ:   0%|          | 0/22 [00:00<?, ?it/s]

In [18]:
# List every zone id of the Monterrey TAZ table, one per line
monterrey_zones = taz_dict["Monterrey"].ZONA
for zone_id in monterrey_zones:
    print(zone_id)

-10
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88


Wall time: 10h 3min 24s

In [156]:
# Release any solver model still referenced from r_list.
# The tuples built by the solve loop end with the TOTHOG number (models are
# already disposed inside that loop), so calling .dispose() unconditionally
# raised AttributeError. Only dispose trailing entries that actually support it.
for rr in r_list:
    dispose = getattr(rr[-1], "dispose", None)
    if callable(dispose):
        dispose()

In [157]:
# Pull the solver status (3rd field) and solution count (5th field) out of each result tuple
s_list = [status for _, _, status, *_ in r_list]
n_list = [sol_count for _, _, _, _, sol_count, *_ in r_list]

In [172]:
# Tally how many zones ended in each solver status.
# pd.value_counts is deprecated (it emitted a FutureWarning here); use the Series method.
pd.Series(s_list).value_counts()

  pd.value_counts(s_list)
  pd.value_counts(s_list)


9    415
2      7
Name: count, dtype: int64

In [184]:
# Re-solve Monterrey TAZ 88 without saving.
# NOTE(review): obj_type=0 differs from the 'L1' string used by the other solve_gb calls
# in this notebook — confirm which values solve_gb accepts.
sol_df, model = solve_gb('Monterrey', 88, taz_dict, Y, U, C, obj_type=0, save=False)


Interrupt request received


In [186]:
# Display the current value of the model's JSONSolDetail parameter
model.Params.JSONSolDetail

0

In [15]:
# Load the saved solutions for Monterrey TAZ 10 (path relative to the working directory).
# The f-prefix was dropped: the string has no placeholders.
# NOTE: pickle files can execute arbitrary code on load — only read files produced by this project.
pd.read_pickle('MUN_Monterrey_TAZ_10_gsols.pkl')

Unnamed: 0,obj_val,190390000001,190390000002,190390000003,190390000004,190390000005,190390000006,190390000007,190390000008,190390000009,...,41545,41546,41547,41548,41549,41550,41551,41552,41553,41554
0,0.008382,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.008383,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.01202,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Load the saved solutions for Monterrey TAZ 2 (path relative to the working directory).
# The f-prefix was dropped: the string has no placeholders.
# NOTE: pickle files can execute arbitrary code on load — only read files produced by this project.
pd.read_pickle('MUN_Monterrey_TAZ_2_gsols.pkl')

Unnamed: 0,obj_val,190390000001,190390000002,190390000003,190390000004,190390000005,190390000006,190390000007,190390000008,190390000009,...,41545,41546,41547,41548,41549,41550,41551,41552,41553,41554
0,0.000272,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0.000272,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0.000272,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0.000272,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0.000272,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5,0.000272,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
6,0.000272,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
7,0.000272,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
8,0.000272,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
9,0.000272,1,0,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Create mun df from best solutions
# NOTE(review): Y_mun, taz_gdf and C_taz_all are not defined in any cell visible in this
# notebook; this cell presumably relies on kernel state from removed cells — confirm
# before running on a fresh kernel.
Y_taz = pd.DataFrame(Y_mun)
# Add one zero-initialised column per ZONA value of taz_gdf
Y_taz[taz_gdf.ZONA.values.tolist()] = 0

for taz in C_taz_all.index:
    # Skip zones whose TVIVHAB entry is below 1
    if C_taz_all.loc[taz].TVIVHAB < 1:
        continue
    # Load the solutions saved for this municipality/zone pair
    sol_df = pd.read_pickle(f'MUN_{mun}_TAZ_{taz}_gsols.pkl')
    # Fill the zone's column from the first stored row, minus its obj_val entry
    # (presumably the first row is the best solution — verify the ordering of the pickle)
    Y_taz.loc[:, taz] = sol_df.iloc[0].drop('obj_val').values

In [None]:
# Create data frame for mun
# Flatten the per-row ID_VIV lists into a single sorted index of dwelling ids
# (replaces a list comprehension that was used only for its side effects).
viv_lists = h_to_y.loc[Y_taz.index].ID_VIV
idx = pd.Index(sorted(viv for viv_list in viv_lists for viv in viv_list))

# NOTE(review): the original line ended in a dangling `Y_taz.shape[1] -)`, a SyntaxError.
# Presumably the intent was one column per TAZ, i.e. the width of Y_taz minus the
# columns that came from Y_mun before the zone columns were added — confirm.
n_taz_cols = Y_taz.shape[1] - pd.DataFrame(Y_mun).shape[1]
Y_exp = pd.DataFrame(np.zeros((len(idx), n_taz_cols), dtype=int))

In [None]:
# Look up row 53259 once and show its dwelling ids alongside its Survey entry
y_row = h_to_y.loc[Y_taz.index].loc[53259]
id_viv, weights = y_row.ID_VIV, y_row.Survey
id_viv, weights

In [None]:
# Flatten the per-row ID_VIV lists into one sorted index of dwelling ids.
# A nested generator replaces the list comprehension that was used only for
# its side effects (and built a throwaway list of None values).
viv_lists = h_to_y.loc[Y_taz.index].ID_VIV
idx = pd.Index(sorted(viv for viv_list in viv_lists for viv in viv_list))
idx

In [None]:
# Recover the people lists (Survey column) for two specific household ids
household_ids = [190390003574, 190390004400]
Y.loc[household_ids].Survey