In [None]:
import os

import contextily as ctx
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import urbanpy as up
from tqdm.notebook import tqdm

In [None]:
# Register tqdm's pandas integration using the notebook-flavoured progress bar.
# NOTE(review): no progress_apply call is visible in this notebook — presumably kept for
# urbanpy internals; confirm or drop.
tqdm.pandas()

In [None]:
# Load the Brazil grid cells written by an earlier step and report how many rows were read.
gdf_celdas = gpd.read_parquet(
    "outputs/celdas_country_bra_02_06_2025.parquet",
)
print(f"{len(gdf_celdas)} Cells loaded")

In [None]:
# Load the schools dataset (it is narrowed down to Brazil in a later cell).
gdf_schools = gpd.read_parquet(
    "outputs/amazon_schools.parquet",
)
# The count refers to schools, not cells.
print(f"{gdf_schools.shape[0]} Schools loaded")

In [None]:
# Expose every school's coordinates as plain "lat"/"lon" columns; later cells merge
# them onto the grid cells.
# NOTE(review): .y/.x assume every geometry is a Point — confirm for this dataset.
gdf_schools.loc[:, "lat"] = gdf_schools["geometry"].y
gdf_schools.loc[:, "lon"] = gdf_schools["geometry"].x

In [None]:
# Peek at the first rows of the cells frame.
gdf_celdas.head()

In [None]:
# Number of cells per value of the "code" column (missing values are not counted).
gdf_celdas["code"].value_counts()

In [None]:
# Number of cells whose "code" is missing.
gdf_celdas["code"].isna().sum()

In [None]:
# Number of schools per country code.
gdf_schools["country_code"].value_counts()

In [None]:
# Keep only the schools whose country_code is "BRA".
gdf_schools_bra = gdf_schools.loc[gdf_schools["country_code"].eq("BRA")]

In [None]:
# (rows, columns) of the Brazil-only schools frame.
gdf_schools_bra.shape

In [None]:
# Guard: schools and cells must be in the same CRS (compared through their string form).
assert gdf_schools.crs.to_string() == gdf_celdas.crs.to_string(), "CRS do not match"

In [None]:
# Load the polygons of Brazil's subregions.
# The file location can be overridden with the BRAZIL_REGIONS_GEOJSON environment
# variable so the notebook is not tied to a single machine; when the variable is
# unset, the original hard-coded path is used.
brazil_subregions_polys = gpd.read_file(
    os.environ.get(
        "BRAZIL_REGIONS_GEOJSON",
        "/Users/claudio/Downloads/regions_brazil.geojson",
    )
)
brazil_subregions_polys

In [None]:
# Show the CRS of the subregion polygons.
brazil_subregions_polys.crs.to_string()

In [None]:
# Visual sanity check of the subregion boundaries over satellite imagery.
fig, ax = plt.subplots(figsize=(10, 10))

# Colour the polygons by their "name" column, half-transparent over the basemap.
brazil_subregions_polys.plot(
    column="name",
    cmap="Set3",
    legend=True,
    alpha=0.5,
    ax=ax,
)

# The basemap is requested in the polygons' own CRS.
ctx.add_basemap(
    ax,
    crs=brazil_subregions_polys.crs,
    source=ctx.providers.Esri.WorldImagery,
)

ax.set_axis_off()

ax.set_title("Brazil Subregions")

plt.show()

In [None]:
# (rows, columns) of the subregion polygons frame.
brazil_subregions_polys.shape

In [None]:
# Attach to every Brazilian school the subregion polygon it intersects.
# Fail fast on a CRS mismatch: sjoin would otherwise only warn and return unreliable matches.
assert (
    gdf_schools_bra.crs.to_string() == brazil_subregions_polys.crs.to_string()
), "CRS do not match"

gdf_schools_bra_subregions = (
    gdf_schools_bra.sjoin(
        brazil_subregions_polys,
        how="left",  # schools outside every polygon are kept, with a missing subregion
        predicate="intersects",
    )
    .drop(columns=["index_right"])
    # Turn the original index into an "index" column (assumes an unnamed index; the
    # de-duplication below relies on that column name). The column stays in the result.
    .reset_index()
    # A school matched by several polygons is kept once, with its last match.
    .drop_duplicates(subset="index", keep="last")
    .rename(columns={"name": "subregion"})
)

In [None]:
# Row counts before and after the spatial join, shown side by side for comparison.
gdf_schools_bra.shape[0], gdf_schools_bra_subregions.shape[0]

In [None]:
# Number of schools per subregion, sorted by subregion name.
# Schools without a subregion (NaN) are not listed by value_counts().
print(
    "Schools by subregion\n",
    gdf_schools_bra_subregions["subregion"].value_counts().sort_index(),
)

In [None]:
# Attach to every grid cell the subregion polygon it intersects.
# Fail fast on a CRS mismatch: sjoin would otherwise only warn and return unreliable matches.
assert (
    gdf_celdas.crs.to_string() == brazil_subregions_polys.crs.to_string()
), "CRS do not match"

gdf_celdas_subregions = (
    gdf_celdas.sjoin(
        brazil_subregions_polys,
        how="left",  # cells outside every polygon are kept, with a missing subregion
        predicate="intersects",
    )
    # Turn the original index into an "index" column (assumes an unnamed index; the
    # de-duplication below relies on that column name). The column stays in the result.
    .reset_index()
    # A cell straddling several polygons is kept once, with its last match.
    .drop_duplicates(subset="index", keep="last")
    .drop(columns=["index_right"])
    .rename(columns={"name": "subregion"})
)

In [None]:
# Row counts before and after the spatial join, shown side by side for comparison.
gdf_celdas.shape[0], gdf_celdas_subregions.shape[0]

In [None]:
# Number of cells per subregion, sorted by subregion name.
# Cells without a subregion (NaN) are not listed by value_counts().
print(
    "Cells by subregion\n",
    gdf_celdas_subregions["subregion"].value_counts().sort_index(),
)

In [None]:
# Number of cells that did not match any subregion polygon.
gdf_celdas_subregions["subregion"].isna().sum()

In [None]:
# Guard: the subregion polygons and the joined cells must share a CRS (string comparison).
assert brazil_subregions_polys.crs.to_string() == gdf_celdas_subregions.crs.to_string()

In [None]:
# Independent copy of the schools assigned to the "Norte" subregion.
gdf_schools_bra_norte = gdf_schools_bra_subregions.loc[
    gdf_schools_bra_subregions["subregion"].eq("Norte")
].copy()

In [None]:
# Independent copy of the cells assigned to the "Norte" subregion.
gdf_celdas_bra_norte = gdf_celdas_subregions.loc[
    gdf_celdas_subregions["subregion"].eq("Norte")
].copy()

In [None]:
# List the available columns; the level flags used in the next cell come from here.
gdf_schools_bra_norte.columns

In [None]:
# Split the Norte schools by education level using the flag columns
# 'nivel_primaria', 'nivel_media' and 'nivel_secundaria' (a row is kept when its flag equals 1).
gdf_primary_schools_bra_norte = gdf_schools_bra_norte.loc[
    gdf_schools_bra_norte["nivel_primaria"].eq(1)
]
gdf_middle_schools_bra_norte = gdf_schools_bra_norte.loc[
    gdf_schools_bra_norte["nivel_media"].eq(1)
]
gdf_secondary_schools_bra_norte = gdf_schools_bra_norte.loc[
    gdf_schools_bra_norte["nivel_secundaria"].eq(1)
]

In [None]:
# Head-count of Norte schools overall and per level. The level subsets are filtered
# independently, so their sum is not necessarily equal to the total.
print("total # of schools in bra norte", len(gdf_schools_bra_norte))
print("primary:", len(gdf_primary_schools_bra_norte))
print("middle:", len(gdf_middle_schools_bra_norte))
print("secondary:", len(gdf_secondary_schools_bra_norte))
print(
    "add primary, middle, and secondary:",
    sum(
        len(level_gdf)
        for level_gdf in (
            gdf_primary_schools_bra_norte,
            gdf_middle_schools_bra_norte,
            gdf_secondary_schools_bra_norte,
        )
    ),
)

In [None]:
# Start urbanpy's OSRM routing server for the Norte extract with the "foot" profile.
# NOTE(review): the matching stop_osrm_server call lives in a later cell; if a cell in
# between fails, the server presumably keeps running and must be stopped by hand.
up.routing.start_osrm_server("south-america/brazil/norte", "foot")

In [None]:
# Compute travel times from the Norte cells to the primary schools (label "primary_schools").
# NOTE(review): the following merge relies on a "nearest_primary_schools_ix" column being
# added here — confirm against the urbanpy documentation.
gdf_celdas_bra_norte_access_primary = up.accessibility.travel_times(
    gdf_celdas_bra_norte, gdf_primary_schools_bra_norte, "primary_schools"
)

In [None]:
# Obtain the school unique_id and lat_lon
# reset_index() renumbers the primary schools 0..n-1 so that right_index=True lines up
# with "nearest_primary_schools_ix" (assumed to be positional — TODO confirm).
# NOTE(review): pandas applies `suffixes` only to column names present on BOTH sides. If
# the cells frame has no id_edificio/lat/lon columns of its own, the primary-school
# columns land WITHOUT the "_primary_school" suffix — verify the resulting column names.
gdf_celdas_bra_norte_access_primary = pd.merge(
    gdf_celdas_bra_norte_access_primary,
    gdf_primary_schools_bra_norte.reset_index()[["id_edificio", "lat", "lon"]],
    how="left",
    left_on="nearest_primary_schools_ix",
    right_index=True,
    suffixes=("", "_primary_school"),
)

In [None]:
# Compute travel times to the middle schools, starting from the frame that already
# carries the primary-school results so the columns accumulate.
gdf_celdas_bra_norte_access_middle = up.accessibility.travel_times(
    gdf_celdas_bra_norte_access_primary, gdf_middle_schools_bra_norte, "middle_schools"
)

In [None]:
# Obtain the school unique_id and lat_lon
# The middle schools are renumbered 0..n-1 to match "nearest_middle_schools_ix" (assumed
# positional — TODO confirm). id_edificio/lat/lon already exist on the left after the
# primary merge, so the incoming ones receive the "_middle_school" suffix.
gdf_celdas_bra_norte_access_middle = pd.merge(
    gdf_celdas_bra_norte_access_middle,
    gdf_middle_schools_bra_norte.reset_index()[["id_edificio", "lat", "lon"]],
    how="left",
    left_on="nearest_middle_schools_ix",
    right_index=True,
    suffixes=("", "_middle_school"),
)

In [None]:
# Compute travel times to the secondary schools on top of the primary and middle results.
gdf_celdas_bra_norte_access_total = up.accessibility.travel_times(
    gdf_celdas_bra_norte_access_middle,
    gdf_secondary_schools_bra_norte,
    "secondary_schools",
)

In [None]:
# Obtain the secondary school unique_id and lat_lon
# The secondary schools are renumbered 0..n-1 to match "nearest_secondary_schools_ix"
# (assumed positional — TODO confirm); the incoming id_edificio/lat/lon collide with
# existing columns and therefore receive the "_secondary_school" suffix.
gdf_celdas_bra_norte_access_total = pd.merge(
    gdf_celdas_bra_norte_access_total,
    gdf_secondary_schools_bra_norte.reset_index()[["id_edificio", "lat", "lon"]],
    how="left",
    left_on="nearest_secondary_schools_ix",
    right_index=True,
    suffixes=("", "_secondary_school"),
)

In [None]:
# Save the results
# Norte cells with the travel-time and nearest-school columns for all three levels.
gdf_celdas_bra_norte_access_total.to_parquet(
    "outputs/celdas_bra_norte_access_v3.parquet"
)

In [None]:
# Stop the Norte routing server before another subregion's server is started.
up.routing.stop_osrm_server("south-america/brazil/norte", "foot")

### Nordeste


Select the data for the subregion


In [None]:
# Independent copy of the cells assigned to the "Nordeste" subregion.
gdf_celdas_bra_nordeste = gdf_celdas_subregions[
    gdf_celdas_subregions["subregion"] == "Nordeste"
].copy()
# Label spelled like the subregion itself ("Nordeste").
print("# of cells in Nordeste:", gdf_celdas_bra_nordeste.shape[0])

In [None]:
# Independent copy of the schools assigned to the "Nordeste" subregion.
gdf_schools_bra_nordeste = gdf_schools_bra_subregions[
    gdf_schools_bra_subregions["subregion"] == "Nordeste"
].copy()
# Label spelled like the subregion itself ("Nordeste").
print("# of schools in Nordeste:", gdf_schools_bra_nordeste.shape[0])

In [None]:
# Split the Nordeste schools by education level (a row is kept when its flag equals 1).
gdf_primary_schools_bra_nordeste = gdf_schools_bra_nordeste.loc[
    gdf_schools_bra_nordeste["nivel_primaria"].eq(1)
]
gdf_middle_schools_bra_nordeste = gdf_schools_bra_nordeste.loc[
    gdf_schools_bra_nordeste["nivel_media"].eq(1)
]
gdf_secondary_schools_bra_nordeste = gdf_schools_bra_nordeste.loc[
    gdf_schools_bra_nordeste["nivel_secundaria"].eq(1)
]

In [None]:
# Head-count of Nordeste schools overall and per level. The level subsets are filtered
# independently, so their sum is not necessarily equal to the total.
# The first label names Nordeste (it previously said "norte", copied from the Norte section).
print("total # of schools in bra nordeste", gdf_schools_bra_nordeste.shape[0])
print("primary:", gdf_primary_schools_bra_nordeste.shape[0])
print("middle:", gdf_middle_schools_bra_nordeste.shape[0])
print("secondary:", gdf_secondary_schools_bra_nordeste.shape[0])
print(
    "add primary, middle, and secondary:",
    gdf_primary_schools_bra_nordeste.shape[0]
    + gdf_middle_schools_bra_nordeste.shape[0]
    + gdf_secondary_schools_bra_nordeste.shape[0],
)

In [None]:
# Start urbanpy's OSRM routing server for the Nordeste extract with the "foot" profile.
# NOTE(review): the matching stop_osrm_server call lives in a later cell; if a cell in
# between fails, the server presumably keeps running and must be stopped by hand.
up.routing.start_osrm_server("south-america/brazil/nordeste", "foot")

In [None]:
# Compute travel times from the Nordeste cells to the primary schools (label "primary_schools").
# NOTE(review): the following merge relies on a "nearest_primary_schools_ix" column being
# added here — confirm against the urbanpy documentation.
gdf_celdas_bra_nordeste_access_primary = up.accessibility.travel_times(
    gdf_celdas_bra_nordeste, gdf_primary_schools_bra_nordeste, "primary_schools"
)

In [None]:
# Obtain the school unique_id and lat_lon
# reset_index() renumbers the primary schools 0..n-1 so that right_index=True lines up
# with "nearest_primary_schools_ix" (assumed to be positional — TODO confirm).
# NOTE(review): pandas applies `suffixes` only to column names present on BOTH sides. If
# the cells frame has no id_edificio/lat/lon columns of its own, the primary-school
# columns land WITHOUT the "_primary_school" suffix — verify the resulting column names.
gdf_celdas_bra_nordeste_access_primary = pd.merge(
    gdf_celdas_bra_nordeste_access_primary,
    gdf_primary_schools_bra_nordeste.reset_index()[["id_edificio", "lat", "lon"]],
    how="left",
    left_on="nearest_primary_schools_ix",
    right_index=True,
    suffixes=("", "_primary_school"),
)

In [None]:
# Compute travel times to the middle schools, starting from the frame that already
# carries the primary-school results so the columns accumulate.
gdf_celdas_bra_nordeste_access_middle = up.accessibility.travel_times(
    gdf_celdas_bra_nordeste_access_primary,
    gdf_middle_schools_bra_nordeste,
    "middle_schools",
)

In [None]:
# Obtain the school unique_id and lat_lon
# The middle schools are renumbered 0..n-1 to match "nearest_middle_schools_ix" (assumed
# positional — TODO confirm). id_edificio/lat/lon already exist on the left after the
# primary merge, so the incoming ones receive the "_middle_school" suffix.
gdf_celdas_bra_nordeste_access_middle = pd.merge(
    gdf_celdas_bra_nordeste_access_middle,
    gdf_middle_schools_bra_nordeste.reset_index()[["id_edificio", "lat", "lon"]],
    how="left",
    left_on="nearest_middle_schools_ix",
    right_index=True,
    suffixes=("", "_middle_school"),
)

In [None]:
# Compute travel times to the secondary schools on top of the primary and middle results.
gdf_celdas_bra_nordeste_access_total = up.accessibility.travel_times(
    gdf_celdas_bra_nordeste_access_middle,
    gdf_secondary_schools_bra_nordeste,
    "secondary_schools",
)

In [None]:
# Obtain the secondary school unique_id and lat_lon
# The secondary schools are renumbered 0..n-1 to match "nearest_secondary_schools_ix"
# (assumed positional — TODO confirm); the incoming id_edificio/lat/lon collide with
# existing columns and therefore receive the "_secondary_school" suffix.
gdf_celdas_bra_nordeste_access_total = pd.merge(
    gdf_celdas_bra_nordeste_access_total,
    gdf_secondary_schools_bra_nordeste.reset_index()[["id_edificio", "lat", "lon"]],
    how="left",
    left_on="nearest_secondary_schools_ix",
    right_index=True,
    suffixes=("", "_secondary_school"),
)

In [None]:
# Save the results
# Nordeste cells with the travel-time and nearest-school columns for all three levels.
gdf_celdas_bra_nordeste_access_total.to_parquet(
    "outputs/celdas_bra_nordeste_access_v3.parquet"
)

In [None]:
# Stop the Nordeste routing server before another subregion's server is started.
up.routing.stop_osrm_server("south-america/brazil/nordeste", "foot")

### Centro-Oeste


In [None]:
# Independent copy of the cells assigned to the "Centro-oeste" subregion, plus a row count.
gdf_celdas_bra_centro_oeste = gdf_celdas_subregions.loc[
    gdf_celdas_subregions["subregion"].eq("Centro-oeste")
].copy()
print("# of cells in Centro-oeste:", len(gdf_celdas_bra_centro_oeste))

In [None]:
# Independent copy of the schools assigned to the "Centro-oeste" subregion, plus a row count.
gdf_schools_bra_centro_oeste = gdf_schools_bra_subregions.loc[
    gdf_schools_bra_subregions["subregion"].eq("Centro-oeste")
].copy()
print("# of schools in Centro-oeste:", len(gdf_schools_bra_centro_oeste))

In [None]:
# Split the Centro-oeste schools by education level (a row is kept when its flag equals 1).
gdf_primary_schools_bra_centro_oeste = gdf_schools_bra_centro_oeste.loc[
    gdf_schools_bra_centro_oeste["nivel_primaria"].eq(1)
]
gdf_middle_schools_bra_centro_oeste = gdf_schools_bra_centro_oeste.loc[
    gdf_schools_bra_centro_oeste["nivel_media"].eq(1)
]
gdf_secondary_schools_bra_centro_oeste = gdf_schools_bra_centro_oeste.loc[
    gdf_schools_bra_centro_oeste["nivel_secundaria"].eq(1)
]

In [None]:
# Head-count of Centro-oeste schools overall and per level. The level subsets are
# filtered independently, so their sum is not necessarily equal to the total.
# Labels corrected: the first one names Centro-oeste (it said "norte"), and the last one
# mentions all three levels that are actually summed.
print("total # of schools in bra centro-oeste", gdf_schools_bra_centro_oeste.shape[0])
print("primary:", gdf_primary_schools_bra_centro_oeste.shape[0])
print("middle:", gdf_middle_schools_bra_centro_oeste.shape[0])
print("secondary:", gdf_secondary_schools_bra_centro_oeste.shape[0])
print(
    "add primary, middle, and secondary:",
    gdf_primary_schools_bra_centro_oeste.shape[0]
    + gdf_middle_schools_bra_centro_oeste.shape[0]
    + gdf_secondary_schools_bra_centro_oeste.shape[0],
)

In [None]:
# Start urbanpy's OSRM routing server for the Centro-oeste extract with the "foot" profile.
# NOTE(review): the matching stop_osrm_server call lives in a later cell; if a cell in
# between fails, the server presumably keeps running and must be stopped by hand.
up.routing.start_osrm_server("south-america/brazil/centro-oeste", "foot")

In [None]:
# Compute travel times from the Centro-oeste cells to the primary schools (label "primary_schools").
# NOTE(review): the following merge relies on a "nearest_primary_schools_ix" column being
# added here — confirm against the urbanpy documentation.
gdf_celdas_bra_centro_oeste_access_primary = up.accessibility.travel_times(
    gdf_celdas_bra_centro_oeste, gdf_primary_schools_bra_centro_oeste, "primary_schools"
)

In [None]:
# Obtain the school unique_id and lat_lon
# reset_index() renumbers the primary schools 0..n-1 so that right_index=True lines up
# with "nearest_primary_schools_ix" (assumed to be positional — TODO confirm).
# NOTE(review): pandas applies `suffixes` only to column names present on BOTH sides. If
# the cells frame has no id_edificio/lat/lon columns of its own, the primary-school
# columns land WITHOUT the "_primary_school" suffix — verify the resulting column names.
gdf_celdas_bra_centro_oeste_access_primary = pd.merge(
    gdf_celdas_bra_centro_oeste_access_primary,
    gdf_primary_schools_bra_centro_oeste.reset_index()[["id_edificio", "lat", "lon"]],
    how="left",
    left_on="nearest_primary_schools_ix",
    right_index=True,
    suffixes=("", "_primary_school"),
)

In [None]:
# Compute travel times to the middle schools, starting from the frame that already
# carries the primary-school results so the columns accumulate.
gdf_celdas_bra_centro_oeste_access_middle = up.accessibility.travel_times(
    gdf_celdas_bra_centro_oeste_access_primary,
    gdf_middle_schools_bra_centro_oeste,
    "middle_schools",
)

In [None]:
# Obtain the school unique_id and lat_lon
# The middle schools are renumbered 0..n-1 to match "nearest_middle_schools_ix" (assumed
# positional — TODO confirm). id_edificio/lat/lon already exist on the left after the
# primary merge, so the incoming ones receive the "_middle_school" suffix.
gdf_celdas_bra_centro_oeste_access_middle = pd.merge(
    gdf_celdas_bra_centro_oeste_access_middle,
    gdf_middle_schools_bra_centro_oeste.reset_index()[["id_edificio", "lat", "lon"]],
    how="left",
    left_on="nearest_middle_schools_ix",
    right_index=True,
    suffixes=("", "_middle_school"),
)

In [None]:
# Compute travel times to the secondary schools on top of the primary and middle results.
gdf_celdas_bra_centro_oeste_access_total = up.accessibility.travel_times(
    gdf_celdas_bra_centro_oeste_access_middle,
    gdf_secondary_schools_bra_centro_oeste,
    "secondary_schools",
)

In [None]:
# Obtain the secondary school unique_id and lat_lon
# The secondary schools are renumbered 0..n-1 to match "nearest_secondary_schools_ix"
# (assumed positional — TODO confirm); the incoming id_edificio/lat/lon collide with
# existing columns and therefore receive the "_secondary_school" suffix.
gdf_celdas_bra_centro_oeste_access_total = pd.merge(
    gdf_celdas_bra_centro_oeste_access_total,
    gdf_secondary_schools_bra_centro_oeste.reset_index()[["id_edificio", "lat", "lon"]],
    how="left",
    left_on="nearest_secondary_schools_ix",
    right_index=True,
    suffixes=("", "_secondary_school"),
)

In [None]:
# Save the results
# Centro-oeste cells with the travel-time and nearest-school columns for all three levels.
gdf_celdas_bra_centro_oeste_access_total.to_parquet(
    "outputs/celdas_bra_centro_oeste_access_v3.parquet"
)

In [None]:
# Stop the Centro-oeste routing server now that its travel times have been computed.
up.routing.stop_osrm_server("south-america/brazil/centro-oeste", "foot")

In [None]:
# Peek at the final Centro-oeste result, including the merged school columns.
gdf_celdas_bra_centro_oeste_access_total.head()

In [None]:
# def add_education_level(gdf_celdas, gdf_schools):
#     # Perform a merge using the nearest schools index
#     gdf_schools["school_ix"] = range(len(gdf_schools))
#     gdf_schools.set_index("school_ix", inplace=True)
#     gdf_celdas = gdf_celdas.merge(
#         gdf_schools[["EduNivelPrimaria", "EduNivelSecundariaTotal"]],
#         left_on="nearest_schools_ix",
#         right_index=True,
#         how="left",
#     )
#     return gdf_celdas

In [None]:
# # Apply the function for each subregion
# gdf_celdas_bra_norte_access_comp = add_education_level(
#     gdf_celdas_bra_norte_access, gdf_schools_bra_norte
# )
# gdf_celdas_bra_nordeste_access_comp = add_education_level(
#     gdf_celdas_bra_nordeste_access, gdf_schools_bra_nordeste
# )
# gdf_celdas_bra_centro_oeste_access_comp = add_education_level(
#     gdf_celdas_bra_centro_oeste_access, gdf_schools_bra_centro_oeste
# )

In [None]:
# Stack the three in-memory subregion results (Norte, Nordeste, Centro-oeste) into a
# single GeoDataFrame with a fresh 0..n-1 index.
gdf_combined_comp = gpd.GeoDataFrame(
    pd.concat(
        objs=[
            gdf_celdas_bra_norte_access_total,
            gdf_celdas_bra_nordeste_access_total,
            gdf_celdas_bra_centro_oeste_access_total,
        ],
        ignore_index=True,
    )
)

# Show the first rows of the stacked frame.
gdf_combined_comp.head()

In [None]:
# gdf_combined_comp["nivel_educativo"] = gdf_combined_comp.apply(
#     lambda x: (
#         "Primaria"
#         if x["EduNivelPrimaria"] == 1
#         else "Secundaria" if x["EduNivelSecundariaTotal"] == 1 else "Ninguno"
#     ),
#     axis=1,
# )

In [None]:
# gdf_combined_comp["nivel_educativo"].value_counts()

In [None]:
# Save the combined GeoDataFrame to a new file
# It holds the Norte, Nordeste and Centro-oeste cells concatenated in the previous step.
gdf_combined_comp.to_parquet("outputs/celdas_combined_bra_access_v3.parquet")