In [1]:
import json
import os
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd

from nad.equip import intersect_gdf_with_equipments
from nad.flooding import get_flooded_area
from nad.load import get_census_level, load_all_equipments, load_census
from nad.traffic import calculate_tdpa_exposure

In [2]:
def _env_path(variable: str) -> Path:
    """Return the path stored in the environment variable ``variable``.

    Raises ``KeyError`` when the variable is not set.
    """
    return Path(os.environ[variable])


# Input locations are read from the environment; results go to a local folder.
data_path = _env_path("DATA_PATH")
population_grids_path = _env_path("POPULATION_GRIDS_PATH")
segregation_path = _env_path("SEGREGATION_PATH")
geostatistical_framework_path = _env_path("GEOSTATISTICAL_FRAMEWORK_PATH")
census_path = _env_path("CENSUS_PATH")
results_path = Path("./results")

# Load base layers and fix neighborhood geometries

In [3]:
# Census table at block level.
df_census_base = load_census(census_path / "2020" / "08.csv")
df_census_blocks = get_census_level(df_census_base, level="block")

# AGEB polygons of the study zone, indexed by CVEGEO (their POBTOT column is dropped).
agebs_file = (
    population_grids_path / "final" / "zone_agebs" / "shaped" / "2020" / "08.2.03.gpkg"
)
df_agebs = gpd.read_file(agebs_file).drop(columns=["POBTOT"]).set_index("CVEGEO")

# Block polygons: keep only blocks whose first 13 CVEGEO characters match an
# AGEB of the study zone, drop the key components that are not used afterwards
# and project to EPSG:6372.
unused_block_columns = [
    "CVE_ENT",
    "CVE_MUN",
    "CVE_LOC",
    "CVE_MZA",
    "TIPOMZA",
    "AMBITO",
    "CVE_AGEB",
]
blocks_raw = gpd.read_file(
    geostatistical_framework_path / "2020" / "08_chihuahua" / "08m.shp",
)
block_in_zone = blocks_raw["CVEGEO"].str[:13].isin(df_agebs.index)
df_geom_blocks = (
    blocks_raw[block_in_zone]
    .drop(columns=unused_block_columns)
    .set_index("CVEGEO")
    .to_crs("EPSG:6372")
)

# Blocks that have both a geometry and census data; missing values become 0.
df_blocks = df_geom_blocks.join(df_census_blocks, how="inner").fillna(0)

In [4]:
# Minimum fraction of a neighborhood's area that must overlap the study-zone
# AGEBs for the neighborhood to be kept as urban.
MIN_URBAN_AREA_FRAC = 0.10

# Raw neighborhood polygons: drop unused attributes and exact duplicate
# geometries, project to EPSG:6372 and remember each polygon's original area.
df_neighborhoods = (
    gpd.read_file(data_path / "datos" / "colonias_raw.geojson")
    .drop(columns=["ENTIDAD", "MUNICIPIO", "CONTROL"])
    .drop_duplicates(subset=["geometry"])
    .to_crs("EPSG:6372")
    .assign(orig_area=lambda df: df["geometry"].area)
)

# The AGEB layer is read without reprojection, so align it with the
# neighborhoods before overlaying; intersecting layers in different CRSs
# yields meaningless areas.
agebs_for_overlay = df_agebs.reset_index()
if (
    agebs_for_overlay.crs is not None
    and agebs_for_overlay.crs != df_neighborhoods.crs
):
    agebs_for_overlay = agebs_for_overlay.to_crs(df_neighborhoods.crs)

# Fraction of every neighborhood covered by AGEBs. Neighborhoods with no
# overlap at all do not appear in this table.
overlay = (
    df_neighborhoods.overlay(agebs_for_overlay)
    .assign(area=lambda df: df["geometry"].area)
    .groupby("ID")
    .agg({"area": "sum", "orig_area": "first"})
    .assign(area_frac=lambda df: df["area"] / df["orig_area"])
    .fillna(0)
)

urban_neighborhoods = overlay[overlay["area_frac"] >= MIN_URBAN_AREA_FRAC].index

df_neighborhoods = df_neighborhoods[df_neighborhoods["ID"].isin(urban_neighborhoods)]

In [5]:
# Pair every neighborhood with the neighborhoods it lies within. A polygon is
# always within itself, so those trivial pairs are removed.
containment_candidates = df_neighborhoods.sjoin(
    df_neighborhoods,
    how="inner",
    predicate="within",
).reset_index(names="index_left")
is_other_polygon = (
    containment_candidates["index_left"] != containment_candidates["index_right"]
)
self_intersections = containment_candidates[is_other_polygon]

# Neighborhoods that are not contained in any other neighborhood.
is_contained = df_neighborhoods.index.isin(self_intersections["index_left"])
whole = df_neighborhoods[~is_contained]

In [6]:
# Cut every contained neighborhood out of the neighborhood that contains it,
# so the resulting polygons no longer overlap.
df_neighborhoods_fixed = df_neighborhoods.copy()
containment_pairs = zip(
    self_intersections["index_left"],
    self_intersections["index_right"],
)
for inner_label, outer_label in containment_pairs:
    inner_shape = df_neighborhoods_fixed.loc[inner_label, "geometry"]
    outer_shape = df_neighborhoods_fixed.loc[outer_label, "geometry"]
    df_neighborhoods_fixed.loc[outer_label, "geometry"] = outer_shape.difference(
        inner_shape,
    )

# From here on neighborhoods are addressed by their ID.
df_neighborhoods_fixed = df_neighborhoods_fixed.drop(columns=["orig_area"]).set_index(
    "ID",
)

# Colonias a manzanas

In [7]:
# Assign every block to the neighborhood that covers the largest share of its
# area, then collect the blocks of each neighborhood.
# The block lists are sorted so that the mapping (and the JSON file written
# below) is identical from run to run; iterating a set of strings depends on
# the interpreter's hash seed.
neighborhoods_to_blocks_map = (
    df_blocks[["geometry"]]
    .reset_index()
    .assign(orig_block_area=lambda df: df["geometry"].area)
    .overlay(
        df_neighborhoods_fixed[["geometry"]].reset_index(names="neighborhood_id"),
        how="intersection",
    )
    .assign(area_frac=lambda df: df["geometry"].area / df["orig_block_area"])
    .sort_values("area_frac", ascending=False)
    .groupby(["CVEGEO"])
    .first()
    .reset_index()
    .groupby("neighborhood_id")["CVEGEO"]
    .apply(lambda x: sorted(set(x)))
    .to_dict()
)

# Inverse lookup: block key -> neighborhood id.
blocks_to_neighborhoods_map = {
    block: neighborhood
    for neighborhood, blocks in neighborhoods_to_blocks_map.items()
    for block in blocks
}

# Persist the neighborhood -> blocks assignment.
with open("./colonias_a_manzanas.json", "w", encoding="utf-8") as f:
    json.dump(neighborhoods_to_blocks_map, f, indent=4)

In [8]:
# Block-level columns that are added up per neighborhood.
summed_columns = [
    "POBTOT",
    "P_0A2",
    "P_3A5",
    "P_0A5",
    "P_18YMAS",
    "P_60YMAS",
    "P18YM_PB",
    "TVIVPARHAB",
    "GRAPROESxPOBTOT",
]

# One row per (neighborhood, block) pair from the block assignment.
block_lists = df_neighborhoods_fixed.index.map(neighborhoods_to_blocks_map)
neighborhood_blocks = (
    df_neighborhoods_fixed.assign(CVEGEO=block_lists)["CVEGEO"]
    .explode()
    .to_frame()
    .reset_index(names="neighborhood_id")
)

# Attach the block attributes; the GRAPROES * POBTOT product is summed so that
# GRAPROES can be averaged with population weights below.
block_records = neighborhood_blocks.merge(
    df_blocks.reset_index(),
    on="CVEGEO",
    how="inner",
)
block_records = block_records.assign(
    GRAPROESxPOBTOT=block_records["GRAPROES"] * block_records["POBTOT"],
)

neighborhood_totals = block_records.groupby("neighborhood_id").agg(
    dict.fromkeys(summed_columns, "sum"),
)
neighborhood_totals = neighborhood_totals.assign(
    GRAPROES=(
        neighborhood_totals["GRAPROESxPOBTOT"] / neighborhood_totals["POBTOT"]
    ).round(2),
).drop(columns=["GRAPROESxPOBTOT"])

# Re-attach the fixed neighborhood polygons (aligned on the neighborhood id).
df_neighborhoods_final = gpd.GeoDataFrame(
    neighborhood_totals.assign(geometry=df_neighborhoods_fixed["geometry"]),
    crs=df_neighborhoods_fixed.crs,
    geometry="geometry",
)

# Add casefolded names; the two placeholder names are treated as missing.
neighborhood_names = df_neighborhoods[["ID", "NOMBRE"]].set_index("ID")
placeholder_names = dict.fromkeys(("sin nombre", "sin colonia"), np.nan)
df_neighborhoods_final = df_neighborhoods_final.join(neighborhood_names)
df_neighborhoods_final = df_neighborhoods_final.assign(
    NOMBRE=df_neighborhoods_final["NOMBRE"].str.casefold().replace(placeholder_names),
)

In [9]:
# Tag each block with the neighborhood it was assigned to; blocks without an
# entry in the lookup get a missing value.
block_neighborhood_ids = df_blocks.index.map(blocks_to_neighborhoods_map)
df_blocks = df_blocks.assign(neighborhood_id=block_neighborhood_ids)

# Equipamientos

In [10]:
# Load every equipment layer available under data_path.
df_equip = load_all_equipments(data_path)

# Stats

In [11]:
# Accumulator for the output table: output column name -> per-neighborhood
# values. The sections below fill it; the export cell turns it into a frame.
rows = {}

## Población, área y viviendas totales

In [12]:
# Total population, dwellings and polygon area of each neighborhood.
rows.update(
    total_poblacion=df_neighborhoods_final["POBTOT"],
    total_viviendas=df_neighborhoods_final["TVIVPARHAB"],
    total_area_m2=df_neighborhoods_final.to_crs("EPSG:6372")["geometry"].area,
)

## Porcentaje población 0-5 años

In [13]:
# Population aged 0-5 is the sum of the 0-2 and 3-5 groups.
pob_0a5 = df_neighborhoods_final["P_0A2"] + df_neighborhoods_final["P_3A5"]

rows["porcentaje_pob_0a5"] = pob_0a5 / df_neighborhoods_final["POBTOT"]
rows["total_pob_0a5"] = pob_0a5

## Porcentaje de población >60

In [14]:
# Population aged 60 and over, as a count and as a share of total population.
pob_60 = df_neighborhoods_final["P_60YMAS"]

rows["porcentaje_pob_60"] = pob_60 / df_neighborhoods_final["POBTOT"]
rows["total_pob_60"] = pob_60

## Porcentaje de población con menos de preparatoria terminada

In [15]:
# The share is the complement of P18YM_PB within the population aged 18+.
pob_18 = df_neighborhoods_final["P_18YMAS"]
pob_18_pb = df_neighborhoods_final["P18YM_PB"]

rows["porcentaje_menos_prepa_terminada"] = 1 - pob_18_pb / pob_18
rows["total_menos_prepa_terminada"] = pob_18 - pob_18_pb
rows["total_pob_18"] = pob_18

## Equipamientos

In [16]:
# df_equip is already loaded in the "Equipamientos" section above; loading it a
# second time here only repeated the file reads.
equip_counts = intersect_gdf_with_equipments(df_neighborhoods_final, df_equip)

# One output column per column returned by the intersection helper.
for col in equip_counts.columns:
    rows[f"num_equip_{col}"] = equip_counts[col]

## Marginación

In [17]:
# Municipality keys (first five CVEGEO characters) present in the study zone.
unique_muns = set(df_agebs.index.str[:5])

# Marginalisation grade (GM_2020) per neighborhood name, limited to the
# municipalities of the study zone.
# Names are casefolded BEFORE de-duplicating because the casefolded name is the
# key used to merge with the neighborhoods. De-duplicating on the raw spelling
# could leave two rows that collapse to the same key, which would duplicate
# neighborhoods in the merge. When several rows share a name, the one with the
# largest POB_TOT is kept.
df_bienestar = (
    pd.read_excel(data_path / "datos" / "IMUC_2020.xlsx", skiprows=3)
    .rename(columns={"NOM_COLONIA": "NOMBRE"})
    .dropna(subset=["CVE_MUN"])
    .assign(
        CVE_MUN=lambda df: df["CVE_MUN"].astype(int).astype(str).str.zfill(5),
        NOMBRE=lambda df: df["NOMBRE"].str.casefold(),
    )
    # Dropped after casefolding so that no missing merge key survives.
    .dropna(subset=["NOMBRE"])
    .query("CVE_MUN.isin(@unique_muns)")
    .sort_values("POB_TOT", ascending=False)
    .drop_duplicates(subset=["NOMBRE"], keep="first")
    .filter(["NOMBRE", "GM_2020"])
)

In [18]:
# When several neighborhoods share a name, only the most populated one takes
# part in the name-based match.
by_population = df_neighborhoods_final.sort_values("POBTOT", ascending=False)
df_neighborhoods_dup = by_population.drop_duplicates(subset=["NOMBRE"], keep="first")

# Match on the name and bring the grade back onto the full neighborhood index;
# neighborhoods without a match stay missing.
matched_by_name = (
    df_neighborhoods_dup.reset_index()
    .merge(df_bienestar, on="NOMBRE", how="inner")
    .set_index("neighborhood_id")
)
rows["indice_bienestar"] = matched_by_name["GM_2020"].reindex(
    df_neighborhoods_final.index,
    fill_value=np.nan,
)

## Industrias

In [19]:
# Harmful-industry layer read from results_path; the next cell tests which
# blocks lie within its geometries.
harmful_industries = gpd.read_file(results_path / "harmful_industries.gpkg")

In [20]:
# Block-level quantities that are counted as exposed to harmful industries.
vulnerable_columns = ["P_0A5", "P_60YMAS", "TVIVPARHAB"]

# sjoin compares raw coordinates, so the industry layer must share the blocks'
# CRS; align it when it differs.
industry_zones = harmful_industries[["geometry"]]
if industry_zones.crs is not None and industry_zones.crs != df_blocks.crs:
    industry_zones = industry_zones.to_crs(df_blocks.crs)

# Exposed totals per neighborhood, from blocks lying within an industry geometry.
# * Blocks that were not assigned to any neighborhood are dropped first:
#   their id is NaN and cannot be cast to int.
# * A block within several geometries is counted once.
# * The result is reindexed to every neighborhood so that unaffected ones get
#   a total of 0, consistent with their 0 percentage.
df_neighborhoods_affected = (
    df_blocks.filter(["geometry", *vulnerable_columns, "neighborhood_id"])
    .dropna(subset=["neighborhood_id"])
    .sjoin(industry_zones, how="inner", predicate="within")
    .reset_index(names="CVEGEO")
    .drop_duplicates(subset="CVEGEO")
    .assign(neighborhood_id=lambda df: df["neighborhood_id"].astype(int))
    .groupby("neighborhood_id")[vulnerable_columns]
    .sum()
    .reindex(df_neighborhoods_final.index, fill_value=0)
)

# Source column -> label used in the output column names.
vulnerable_labels = {
    "P_0A5": "infantes",
    "P_60YMAS": "adultos_mayores",
    "TVIVPARHAB": "viviendas",
}
for source_column, label in vulnerable_labels.items():
    affected_total = df_neighborhoods_affected[source_column]
    # 0 / 0 (nothing to expose in the neighborhood) is reported as 0.
    rows[f"porcentaje_{label}_vulnerables_industria"] = (
        affected_total / df_neighborhoods_final[source_column]
    ).fillna(0)
    rows[f"total_{label}_vulnerables_industria"] = affected_total

## Vulnerabilidad calor

In [21]:
# Heat-vulnerability class per neighborhood ID, stored as integers.
guhvi_layer = gpd.read_file(data_path / "datos" / "GUHVI" / "colonias_GUHVI.gpkg")
guhvi_class = guhvi_layer.set_index("ID")["guhvi_class"]
rows["vulnerabilidad_calor"] = guhvi_class.astype(int)

## Accesibilidad

In [22]:
def _household_share(frame, households_column):
    """Return ``households_column`` as a fraction of ``TVIVPARHAB``.

    Rows whose ``TVIVPARHAB`` is not greater than zero (including missing
    values) get 0 instead of the result of a division by zero.
    """
    has_dwellings = frame["TVIVPARHAB"] > 0
    return (frame[households_column] / frame["TVIVPARHAB"]).where(has_dwellings, 0)


# Aggregate block-level accessibility results per neighborhood: scores and
# travel times are averaged, household counts are summed, and each count is
# also expressed as a share of the neighborhood's dwellings.
df_neighborhoods_accessibility = (
    gpd.read_file(
        results_path / "accessibility_blocks.gpkg",
    )
    .set_index("CVEGEO")
    .assign(neighborhood_id=lambda df: df.index.map(blocks_to_neighborhoods_map))
    .groupby("neighborhood_id")
    .agg(
        {
            "accessibility_score": "mean",
            "tiempo_parque": "mean",
            "tiempo_clinica_hospital": "mean",
            "tiempo_preparatoria": "mean",
            "hogares_parque_15mi": "sum",
            "hogares_clinica_hospital_30mi": "sum",
            "hogares_preparatoria_30mi": "sum",
            "TVIVPARHAB": "sum",
            "geometry": "first",
        },
    )
    .assign(
        porcentaje_hogares_parque_15mi=lambda df: _household_share(
            df,
            "hogares_parque_15mi",
        ),
        porcentaje_hogares_clinica_hospital_30mi=lambda df: _household_share(
            df,
            "hogares_clinica_hospital_30mi",
        ),
        porcentaje_hogares_preparatoria_30mi=lambda df: _household_share(
            df,
            "hogares_preparatoria_30mi",
        ),
    )
    .drop(columns=["TVIVPARHAB"])
    .rename(
        columns={
            "accessibility_score": "indice_accesibilidad",
            "hogares_parque_15mi": "total_hogares_parque_15min",
            "hogares_clinica_hospital_30mi": "total_hogares_clinica_hospital_30min",
            "hogares_preparatoria_30mi": "total_hogares_preparatoria_30min",
            "porcentaje_hogares_parque_15mi": "porcentaje_hogares_parque_15min",
            "porcentaje_hogares_clinica_hospital_30mi": "porcentaje_hogares_clinica_hospital_30min",
            "porcentaje_hogares_preparatoria_30mi": "porcentaje_hogares_preparatoria_30min",
        },
    )
    # Store the neighborhood id as an integer index.
    .reset_index(names="neighborhood_id")
    .assign(neighborhood_id=lambda df: df["neighborhood_id"].astype(int))
    .set_index("neighborhood_id")
)

# Every aggregated column except the geometry becomes an output column.
for column in df_neighborhoods_accessibility.drop(columns=["geometry"]).columns:
    rows[column] = df_neighborhoods_accessibility[column]

## Tráfico

In [23]:
# Road layer read from results_path.
gdf_vialidades = gpd.read_file(results_path / "vialidades.gpkg")

# Traffic exposure per neighborhood; the neighborhood index is renamed to "id"
# for the call.
# NOTE(review): threshold_km=500 is forwarded as-is and its meaning is defined
# inside calculate_tdpa_exposure, which is not visible here — confirm that 500
# is really meant to be kilometres.
rows["tdpa_density"] = calculate_tdpa_exposure(
    df_neighborhoods_final.rename_axis("id"),
    gdf_vialidades,
    threshold_km=500,
)

## Inundaciones

In [24]:
# Flood flag per block, indexed by block key.
# CVEGEO is read explicitly as text: with type inference an all-digit key
# column is parsed as integers, which strips the leading zero and makes the
# keys stop matching the string index of df_blocks.
is_flooded = pd.read_csv(
    results_path / "flooded_blocks.csv",
    dtype={"CVEGEO": str},
).set_index("CVEGEO")["is_flooded"]

In [25]:
# Boolean flood mask aligned with df_blocks. Blocks that are missing from the
# flood table (or have a missing flag) count as not flooded instead of
# producing a mask that contains NaN.
flooded_mask = is_flooded.reindex(df_blocks.index).eq(True)

# Flooded population per neighborhood. Blocks without a neighborhood are
# skipped, and neighborhoods without any flooded block get 0, consistent with
# their 0 percentage.
pop_flooded = (
    df_blocks[flooded_mask]
    .dropna(subset=["neighborhood_id"])
    .astype({"neighborhood_id": int})
    .groupby("neighborhood_id")["POBTOT"]
    .sum()
    .reindex(df_neighborhoods_final.index, fill_value=0)
)

rows["total_poblacion_inundada"] = pop_flooded
# 0 / 0 (neighborhood without population) is reported as 0.
rows["porcentaje_poblacion_inundada"] = (
    pop_flooded / df_neighborhoods_final["POBTOT"]
).fillna(0)

In [26]:
# Flooded area of every neighborhood according to the reclassified raster.
# The helper's result provides the "flooded_area" and "flooded_area_frac"
# columns used below.
flooded_df = get_flooded_area(
    data_path / "datos" / "CdJuarez_60min" / "CdJuarez_60Min_Reclass.tif",
    df_neighborhoods_final,
)
rows["area_inundada_m2"] = flooded_df["flooded_area"]
rows["porcentaje_area_inundada"] = flooded_df["flooded_area_frac"]

# Out

In [27]:
# Display the casefolded neighborhood names as a quick check before exporting.
df_neighborhoods_final["NOMBRE"]

neighborhood_id
1                      vista del norte
2               excelencia universidad
3                           del futuro
4                              condesa
6               misiones de san miguel
                     ...              
3480    campos eliseos (cerrada norte)
3486         kali residencial etapa ii
3488                   ejido salvarcar
3491                     antonio silva
3500        roma (complejo industrial)
Name: NOMBRE, Length: 977, dtype: object

In [28]:
# Make sure the export folder exists before writing into it.
platform_dir = results_path / "platform"
platform_dir.mkdir(parents=True, exist_ok=True)

# One row per neighborhood, one column per collected statistic.
stats = pd.DataFrame.from_dict(rows, orient="index").transpose()

# The join is an inner join on purpose. Some statistics may carry ids that are
# not part of df_neighborhoods_final (for example values read from external
# files); with a left join those ids would be exported as rows without a name
# or geometry.
out = (
    gpd.GeoDataFrame(
        stats.join(df_neighborhoods_final[["NOMBRE", "geometry"]], how="inner"),
        crs=df_neighborhoods_final.crs,
        geometry="geometry",
    )
    .to_crs("EPSG:4326")
    .rename(columns={"NOMBRE": "nombre"})
    .reset_index(names="id_colonia")
)

# The statistics frame is built row-wise, which leaves its columns with object
# dtype. Cast every statistic to float, except the "indice*" columns and the
# identifier, name and geometry columns.
non_numeric_columns = {"geometry", "nombre", "id_colonia"}
float_columns = [
    c
    for c in out.columns
    if c not in non_numeric_columns and not c.startswith("indice")
]
out = out.astype({c: float for c in float_columns})

out.to_file(platform_dir / "colonias.geojson")
out.drop(columns=["geometry"]).to_csv(
    platform_dir / "colonias.csv",
    index=False,
)