In [None]:
import os

import contextily as ctx
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import plotly
import plotly.express as px
import rioxarray
import urbanpy as up
from geocube.api.core import make_geocube
from mpl_toolkits.axes_grid1 import make_axes_locatable
from tqdm.notebook import tqdm

In [None]:
# Register tqdm's progress-bar variants of the pandas apply methods
# (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

In [None]:
# Read the grid data
# One row per grid cell, loaded from a dated parquet snapshot relative to the
# notebook's working directory. Later cells rely on its `cell_id` and
# `geometry` columns.
gdf_celdas = gpd.read_parquet("outputs/celdas_02_06_2025.parquet")

In [None]:
# Set the base path for the data.
# The WorldPop rasters live outside the repository, so the folder can be
# overridden with the WORLDPOP_BASE_PATH environment variable; the original
# machine-specific location is kept as the fallback so existing setups keep
# working. BASE_PATH must stay a plain string because the raster file names
# are built from it by string concatenation in the aggregation loop.
BASE_PATH = os.environ.get(
    "WORLDPOP_BASE_PATH",
    "/Users/claudio/Documents/amazonia-bid/inputs/WorldPop",
)

# Lowercase country labels used in the WorldPop folder and file names
countries_labels = [
    "per",
    "col",
    "bol",
    "ecu",
]  # "bra", TODO: Download brazil data (4GB per file * 6 files)

# Lower bound of each 5-year age band present in the file names
age_groups = [5, 10, 15]  # 5-9, 10-14, 15-19

# Gender codes present in the file names
genders = ["m", "f"]

In [None]:
# Read the country polygons
# The path is under the user's home directory (`~`), i.e. outside this
# repository. Later cells use the `ADM0_PCODE` column and the frame's CRS.
countries = gpd.read_parquet(
    "~/Documents/amazonia-bid/outputs/amazon_countries.parquet"
)
# NOTE(review): displays the full frame; `.head()` would keep the output short.
countries

In [None]:
# Quick-look map of every country polygon, coloured by its ADM0 code
fig, ax = plt.subplots(figsize=(10, 10))
countries.plot(
    column="ADM0_PCODE",
    legend=True,
    edgecolor="k",
    alpha=0.5,
    ax=ax,
)

# Satellite imagery underneath, requested in the polygons' CRS
ctx.add_basemap(ax, crs=countries.crs, source=ctx.providers.Esri.WorldImagery)

# Title, axis labels and grid
ax.set(title="Country Boundaries", xlabel="Longitude", ylabel="Latitude")
ax.grid(True)

plt.show()

In [None]:
# Guyana, Suriname, Venezuela are not included in the analysis
countries_ADM0CODE = ["PE", "CO", "EC", "BO"]  # TODO: Fix "BR" data errors
amzn_countries = countries[countries["ADM0_PCODE"].isin(countries_ADM0CODE)]

In [None]:
countries_labels = sorted(countries_labels)
countries_labels

In [None]:
amzn_countries = amzn_countries.sort_values(by=["ADM0_PCODE"], ascending=True)
amzn_countries

In [None]:
amzn_countries["code"] = countries_labels

In [None]:
amzn_countries

In [None]:
# Now we will asign a country code to each cell in the grid
gdf_celdas_countries = gdf_celdas.sjoin(
    amzn_countries,
    how="left",
    predicate="intersects",
    lsuffix="celdas",
    rsuffix="countries",
)

In [None]:
# NOTE: We will simply drop the duplicated rows for now
gdf_celdas_countries = gdf_celdas_countries.drop_duplicates(subset=["cell_id"])

In [None]:
# Soft check: after de-duplication the joined frame should have exactly one
# row per grid cell. A mismatch is reported but does not stop the notebook.
n_cells_grid = gdf_celdas.shape[0]
n_cells_joined = gdf_celdas_countries.shape[0]

if n_cells_grid != n_cells_joined:
    print("AssertionError: Number of rows mismatch")
    print("Number of rows in gdf_celdas:", n_cells_grid)
    print("Number of rows in gdf_celdas_countries:", n_cells_joined)

In [None]:
# Save the joined data
# (written before cell_id is cast to int32 a few cells below, so the file
# keeps cell_id in its original dtype)
gdf_celdas_countries.to_parquet("outputs/celdas_countries_wo_br_02_06_2025.parquet")

In [None]:
# Check the unique country codes
# (cells that matched no selected country would show up here as a missing value)
gdf_celdas_countries["code"].unique()

In [None]:
# Check the number of cells per country
gdf_celdas_countries["code"].value_counts()

In [None]:
# Check the percentage of cells per country
# (value_counts ignores missing codes by default, so the shares are relative
# to the cells that did get a country)
gdf_celdas_countries["code"].value_counts(normalize=True) * 100

In [None]:
# Check the number of cells without a country code
gdf_celdas_countries["code"].isna().sum()

In [None]:
# Check the percentage of cells without a country code
(gdf_celdas_countries["code"].isna().sum() / gdf_celdas_countries.shape[0]) * 100

In [None]:
# Convert the cell_id to integer so it can be rasterized for population aggregation
# NOTE(review): this overwrites the column in place; the aggregation loop
# asserts that the dtype is int32, so this cell must run before it.
gdf_celdas_countries["cell_id"] = gdf_celdas_countries["cell_id"].astype("int32")
# check the type
gdf_celdas_countries["cell_id"].dtype

In [None]:
# Accumulate the population data for each country.
#
# For every processed country and every age-group/gender raster:
#   1. open the WorldPop GeoTIFF lazily (chunked),
#   2. clip it to the country's grid cells,
#   3. rasterize `cell_id` onto the clipped raster's grid,
#   4. sum the population pixels per `cell_id`.
# countries_geodata[code] ends up holding the per-raster tables stacked row-wise.
#
# NOTE: the loop variables (country, celdas_country, pop_col, xds_clipped,
# xds_mask, agg_pop, population_data) are inspected by debugging cells near the
# end of the notebook, so their names are kept.
countries_geodata = {}

for _, row in tqdm(
    amzn_countries.iterrows(), total=amzn_countries.shape[0], desc="Countries"
):

    # Filter the country cells
    country = row.code

    # Skip Peru for now: its table is loaded further down from the previously
    # exported peru_worldpop_school_age_celdas.parquet instead of recomputed.
    if country in ["per"]:
        continue

    celdas_country = gdf_celdas_countries.loc[
        gdf_celdas_countries["code"] == country, :
    ]

    # cell_id is burned into a raster below, so it must already be int32
    assert (
        celdas_country["cell_id"].dtype == "int32"
    ), f"cell_id is not int32 for {country}"

    # Accumulate the population data for each age group+gender pair
    population_data = []
    for age_group in tqdm(
        age_groups, total=len(age_groups), desc=f"Age Groups for {row.code}"
    ):  # 5-9, 10-14, 15-19
        for gender in tqdm(
            genders, total=len(genders), desc=f"Genders for {row.code} - {age_group}"
        ):  # m, f
            pop_col = f"pop_2020_{gender}_{age_group}"

            file_name = (
                BASE_PATH + f"/{country}/{country}_{gender}_{age_group}_2020.tif"
            )

            # Read the tiff file with the population data (lazy, chunked read)
            xds = rioxarray.open_rasterio(
                file_name, masked=True, chunks={"x": 1024, "y": 1024}
            )

            # Clip the raster with the country cells
            xds_clipped = xds.rio.clip(
                celdas_country.geometry.values, celdas_country.crs, from_disk=True
            )

            # Rasterize the cell_id to be able to calculate the total population
            # in each cell; pixels covered by no cell receive the fill value 0.
            xds_mask = make_geocube(
                celdas_country,
                measurements=["cell_id"],
                like=xds_clipped,
                fill=0,
            )

            # Add the clipped population raster to the mask dataset as a new
            # variable; `.values` loads the clipped raster into memory.
            xds_clipped_squeezed = xds_clipped.squeeze()
            xds_mask[pop_col] = (
                xds_clipped_squeezed.dims,
                xds_clipped_squeezed.values,
                xds_clipped_squeezed.attrs,
                xds_clipped_squeezed.encoding,
            )

            # Now we can calculate the total population in each cell
            agg_pop = (
                xds_mask.drop_vars("spatial_ref")
                .groupby(xds_mask.cell_id)
                .sum()
                .to_dataframe()
            )

            population_data.append(agg_pop)

    # Store the country's table once, after all of its rasters were processed
    # (previously this ran inside the age-group loop, rebuilding the table on
    # every iteration and leaving a partial table behind if a later raster
    # failed). The tables are stacked row-wise, so each population column is
    # empty on the rows that came from the other rasters; downstream cells drop
    # those gaps column by column and realign on the index.
    countries_geodata[country] = pd.concat(population_data)

In [None]:
import os

In [None]:
# Export each country's stacked population table to parquet.
# Existing files are left untouched: the original printed "Skipping." but then
# fell through and overwrote the file anyway.
for key, value in countries_geodata.items():
    # Peru is exported separately under its own file name
    if key == "per":
        continue

    output_fn = f"outputs/{key}_pop_2020_04_06_2025.parquet"
    if os.path.exists(output_fn):
        print(f"File {output_fn} already exists. Skipping.")
        continue

    # reset_index turns the table's index into a regular column before writing
    value.reset_index().to_parquet(output_fn, index=False)

In [None]:
# Export Peru's table under its own file name.
# The aggregation loop skips "per", so on a fresh kernel there is no such entry
# and the unguarded lookup raised KeyError; only export when the table is
# actually in memory.
if "per" in countries_geodata:
    countries_geodata["per"].reset_index().to_parquet(
        "peru_worldpop_school_age_celdas.parquet", index=False
    )
else:
    print('countries_geodata has no "per" entry; nothing exported.')

In [None]:
# Inspect Bolivia's stacked population table
countries_geodata["bol"]

In [None]:
# Inspect Colombia's stacked population table
countries_geodata["col"]

In [None]:
# Check the number of cells per country
gdf_celdas_countries["code"].value_counts()

In [None]:
# Country codes for which a population table is currently in memory
countries_geodata.keys()

In [None]:
# Number of rows in each country's table (the tables are stacked row-wise,
# one chunk per raster, so this is not the number of cells)
for key, value in countries_geodata.items():
    print(f"{key}\t {value.shape[0]}")

In [None]:
# Alias for Bolivia's table (same object, not a copy)
cells_pop_bol = countries_geodata["bol"]

In [None]:
cells_pop_bol

In [None]:
clean_dfs = []

for col in cells_pop_bol.columns:
    clean_dfs.append(cells_pop_bol[col].dropna())

In [None]:
clean_dfs_bol = pd.concat(clean_dfs, axis=1)

In [None]:
clean_dfs_bol.shape[0], (gdf_celdas_countries["code"] == "bol").sum()

In [None]:
celdas_bol = gdf_celdas_countries[gdf_celdas_countries["code"] == "bol"]

In [None]:
celdas_bol_with_pop = celdas_bol.merge(
    clean_dfs_bol,
    how="left",
    left_on="cell_id",
    right_index=True,
)

In [None]:
celdas_bol_with_pop.shape

In [None]:
celdas_bol_with_pop.columns

In [None]:
celdas_bol_with_pop_df = celdas_bol_with_pop[
    [
        "cell_id",
        "smod",
        "geometry",
        "code",
        "pop_2020_m_5",
        "pop_2020_f_5",
        "pop_2020_m_10",
        "pop_2020_f_10",
        "pop_2020_m_15",
        "pop_2020_f_15",
    ]
]

In [None]:
# poblacion hombres
celdas_bol_with_pop_df["pop_m"] = (
    celdas_bol_with_pop_df["pop_2020_m_5"]
    + celdas_bol_with_pop_df["pop_2020_m_10"]
    + celdas_bol_with_pop_df["pop_2020_m_15"]
)

# poblacion mujeres
celdas_bol_with_pop_df["pop_f"] = (
    celdas_bol_with_pop_df["pop_2020_f_5"]
    + celdas_bol_with_pop_df["pop_2020_f_10"]
    + celdas_bol_with_pop_df["pop_2020_f_15"]
)
# poblacion total
celdas_bol_with_pop_df["pop_total"] = (
    celdas_bol_with_pop_df["pop_m"] + celdas_bol_with_pop_df["pop_f"]
)

# poblacion en edad de escuela primaria
celdas_bol_with_pop_df["pop_primary_school_age"] = (
    celdas_bol_with_pop_df["pop_2020_m_5"] + celdas_bol_with_pop_df["pop_2020_f_5"]
)

# poblacion en edad de escuela secundaria
celdas_bol_with_pop_df["pop_secondary_school_age"] = (
    celdas_bol_with_pop_df["pop_2020_m_10"]
    + celdas_bol_with_pop_df["pop_2020_f_10"]
    + celdas_bol_with_pop_df["pop_2020_m_15"]
    + celdas_bol_with_pop_df["pop_2020_f_15"]
)

In [None]:
celdas_bol_with_pop_df.info()

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Prepare the data for plotting: long format with one row per
# (cell, age group) so seaborn can split the bars by age group
plot_data = celdas_bol_with_pop_df[
    ["code", "smod", "pop_primary_school_age", "pop_secondary_school_age"]
].copy()
plot_data = plot_data.melt(
    id_vars=["code", "smod"],
    value_vars=["pop_primary_school_age", "pop_secondary_school_age"],
    var_name="Age Group",
    value_name="Population",
)

# Create the bar plot: bar height is the summed population per SMOD class
plt.figure(figsize=(12, 6))
sns.barplot(
    data=plot_data,
    x="smod",
    y="Population",
    hue="Age Group",
    ci=None,
    estimator=sum,
    dodge=True,
)

# Customize the plot
# The x axis shows the SMOD class, not the country, so it is labelled
# accordingly; the title names the country like the other per-country plots.
plt.title(
    "Distribution of Population in Primary and Secondary School Age per Country per SMOD (Bolivia)"
)
plt.xlabel("Settlement Type (SMOD)")
plt.ylabel("Total Population")
plt.legend(title="Age Group")
plt.xticks(rotation=45)
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
peru_pop_data_proc = pd.read_parquet("peru_worldpop_school_age_celdas.parquet")

In [None]:
peru_pop_data_proc.shape

In [None]:
cells_pop_bol["pop_2020_f_5"].isna().sum()

In [None]:
countries_geodata["per"] = peru_pop_data_proca

In [None]:
# Peru: school-age population per settlement type.
# Strip the empty rows column by column, then place the columns side by side
# so they realign on the shared index.
clean_dfs_per = pd.concat(
    [
        countries_geodata["per"][name].dropna()
        for name in countries_geodata["per"].columns
    ],
    axis=1,
)

# Grid cells assigned to Peru, with the population columns attached
# (cell_id on the grid side is matched against the cleaned table's index)
is_per = gdf_celdas_countries["code"] == "per"
celdas_per = gdf_celdas_countries[is_per]
celdas_per_with_pop = celdas_per.merge(
    clean_dfs_per, left_on="cell_id", right_index=True, how="left"
)

# Narrow to the plotting columns and derive the two school-age totals
per_pop_cols = [
    "pop_2020_m_5",
    "pop_2020_f_5",
    "pop_2020_m_10",
    "pop_2020_f_10",
    "pop_2020_m_15",
    "pop_2020_f_15",
]
per_wide = celdas_per_with_pop[["code", "smod"] + per_pop_cols].copy()
per_wide["pop_primary_school_age"] = (
    per_wide["pop_2020_m_5"] + per_wide["pop_2020_f_5"]
)
per_wide["pop_secondary_school_age"] = (
    per_wide["pop_2020_m_10"]
    + per_wide["pop_2020_f_10"]
    + per_wide["pop_2020_m_15"]
    + per_wide["pop_2020_f_15"]
)

# Long format: one row per (cell, age group)
plot_data_per = per_wide.melt(
    id_vars=["code", "smod"],
    value_vars=["pop_primary_school_age", "pop_secondary_school_age"],
    var_name="Age Group",
    value_name="Population",
)

# Bar chart of the summed population per SMOD class, split by age group
per_fig, per_ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=plot_data_per,
    x="smod",
    y="Population",
    hue="Age Group",
    estimator=sum,
    ci=None,
    dodge=True,
    ax=per_ax,
)

# Labels, legend and layout
per_ax.set_title(
    "Distribution of Population in Primary and Secondary School Age per Country per SMOD (Peru)"
)
per_ax.set_xlabel("Settlement Type (SMOD)")
per_ax.set_ylabel("Total Population")
per_ax.legend(title="Age Group")
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

In [None]:
# Extract the population data for Colombia
cells_pop_col = countries_geodata["col"]

In [None]:
# Clean the data by dropping NaN values for each column
clean_dfs_col = []
for col in cells_pop_col.columns:
    clean_dfs_col.append(cells_pop_col[col].dropna())

In [None]:
# Combine the cleaned data into a single DataFrame
clean_dfs_col = pd.concat(clean_dfs_col, axis=1)

In [None]:
# Merge the cleaned data with the grid data for Colombia
celdas_col = gdf_celdas_countries[gdf_celdas_countries["code"] == "col"]
celdas_col_with_pop = celdas_col.merge(
    clean_dfs_col,
    how="left",
    left_on="cell_id",
    right_index=True,
)

In [None]:
# Prepare the data for plotting
plot_data_col = celdas_col_with_pop[
    [
        "code",
        "smod",
        "pop_2020_m_5",
        "pop_2020_f_5",
        "pop_2020_m_10",
        "pop_2020_f_10",
        "pop_2020_m_15",
        "pop_2020_f_15",
    ]
].copy()
plot_data_col["pop_primary_school_age"] = (
    plot_data_col["pop_2020_m_5"] + plot_data_col["pop_2020_f_5"]
)
plot_data_col["pop_secondary_school_age"] = (
    plot_data_col["pop_2020_m_10"]
    + plot_data_col["pop_2020_f_10"]
    + plot_data_col["pop_2020_m_15"]
    + plot_data_col["pop_2020_f_15"]
)
plot_data_col = plot_data_col.melt(
    id_vars=["code", "smod"],
    value_vars=["pop_primary_school_age", "pop_secondary_school_age"],
    var_name="Age Group",
    value_name="Population",
)

In [None]:
# Create the bar plot
plt.figure(figsize=(12, 6))
sns.barplot(
    data=plot_data_col,
    x="smod",
    y="Population",
    hue="Age Group",
    ci=None,
    estimator=sum,
    dodge=True,
)

# Customize the plot
plt.title(
    "Distribution of Population in Primary and Secondary School Age per Country per SMOD (Colombia)"
)
plt.xlabel("Settlement Type (SMOD)")
plt.ylabel("Total Population")
plt.legend(title="Age Group")
plt.xticks(rotation=45)
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Ecuador: school-age population per settlement type.
# Strip the empty rows column by column, then place the columns side by side
# so they realign on the shared index.
clean_dfs_ecu = pd.concat(
    [
        countries_geodata["ecu"][name].dropna()
        for name in countries_geodata["ecu"].columns
    ],
    axis=1,
)

# Grid cells assigned to Ecuador, with the population columns attached
# (cell_id on the grid side is matched against the cleaned table's index)
is_ecu = gdf_celdas_countries["code"] == "ecu"
celdas_ecu = gdf_celdas_countries[is_ecu]
celdas_ecu_with_pop = celdas_ecu.merge(
    clean_dfs_ecu, left_on="cell_id", right_index=True, how="left"
)

# Narrow to the plotting columns and derive the two school-age totals
ecu_pop_cols = [
    "pop_2020_m_5",
    "pop_2020_f_5",
    "pop_2020_m_10",
    "pop_2020_f_10",
    "pop_2020_m_15",
    "pop_2020_f_15",
]
ecu_wide = celdas_ecu_with_pop[["code", "smod"] + ecu_pop_cols].copy()
ecu_wide["pop_primary_school_age"] = (
    ecu_wide["pop_2020_m_5"] + ecu_wide["pop_2020_f_5"]
)
ecu_wide["pop_secondary_school_age"] = (
    ecu_wide["pop_2020_m_10"]
    + ecu_wide["pop_2020_f_10"]
    + ecu_wide["pop_2020_m_15"]
    + ecu_wide["pop_2020_f_15"]
)

# Long format: one row per (cell, age group)
plot_data_ecu = ecu_wide.melt(
    id_vars=["code", "smod"],
    value_vars=["pop_primary_school_age", "pop_secondary_school_age"],
    var_name="Age Group",
    value_name="Population",
)

# Bar chart of the summed population per SMOD class, split by age group
ecu_fig, ecu_ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=plot_data_ecu,
    x="smod",
    y="Population",
    hue="Age Group",
    estimator=sum,
    ci=None,
    dodge=True,
    ax=ecu_ax,
)

# Labels, legend and layout
ecu_ax.set_title(
    "Distribution of Population in Primary and Secondary School Age per Country per SMOD (Ecuador)"
)
ecu_ax.set_xlabel("Settlement Type (SMOD)")
ecu_ax.set_ylabel("Total Population")
ecu_ax.legend(title="Age Group")
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

In [None]:
# Count the empty entries of each female population column in Bolivia's
# stacked table. Because the per-raster tables are stacked row-wise, each
# column is expected to be empty on the rows contributed by the other rasters.
cells_pop_bol["pop_2020_f_5"].isna().sum()

In [None]:
cells_pop_bol["pop_2020_f_10"].isna().sum()

In [None]:
cells_pop_bol["pop_2020_f_15"].isna().sum()

In [None]:
# (rows, columns) of Bolivia's stacked table
countries_geodata["bol"].shape

In [None]:
# --- Debugging scratch ---
# The cells below inspect variables left over from the LAST iteration of the
# aggregation loop (`xds_clipped`, `celdas_country`, ...). They only work after
# that loop has run and always refer to the final country/age/gender processed.
xds_clipped

In [None]:
celdas_country

In [None]:
# Overlay the last country's grid cells (red) on the last clipped raster
fig, ax = plt.subplots()


xds_clipped.squeeze().plot.imshow(
    ax=ax,
    cmap="viridis",
    add_colorbar=True,
    cbar_kwargs={"label": "Population"},
)

celdas_country.plot(
    ax=ax,
    color="red",
    alpha=0.5,
)

plt.show()

In [None]:
celdas_country.head()

In [None]:
celdas_country.info()

In [None]:
# NOTE(review): `celdas_country` was produced by a `.loc` selection on
# gdf_celdas_countries, so this column assignment may raise
# SettingWithCopyWarning. It also changes cell_id from int32 to float, which
# the aggregation loop's dtype assertion would reject for this frame.
celdas_country["cell_id"] = celdas_country["cell_id"].astype(float)

In [None]:
celdas_country.shape

In [None]:
# Number of distinct cell ids; compare with the row count above
celdas_country["cell_id"].unique().shape

In [None]:
# --- Debugging scratch (continued) ---
# Re-runs the rasterize/aggregate steps of the aggregation loop by hand for the
# last country, using the leftover `celdas_country`, `xds_clipped` and `pop_col`.
# Without `measurements`, every non-geometry column passed in is rasterized;
# only cell_id is passed here.
xds_mask = make_geocube(
    vector_data=celdas_country[["cell_id", "geometry"]],
    # measurements=["cell_id"],
    like=xds_clipped,
    fill=0,
)
xds_mask

In [None]:
# Add a new dimension to the mask for the population using the clipped raster
# (`pop_col` still holds the column name from the loop's final iteration)
xds_clipped_squeezed = xds_clipped.squeeze()
xds_mask[pop_col] = (
    xds_clipped_squeezed.dims,
    xds_clipped_squeezed.values,
    xds_clipped_squeezed.attrs,
    xds_clipped_squeezed.encoding,
)
xds_mask

In [None]:
# Now we can calculate the total population in each cell and add it back to the geodataframe
agg_pop = (
    xds_mask.drop_vars("spatial_ref").groupby(xds_mask.cell_id).sum().to_dataframe()
)

In [None]:
# NOTE(review): `dask` is imported here but not referenced by name in the
# cells shown; confirm whether the import is still needed.
import dask

In [None]:
# Now we can calculate the total population in each cell and add it back to the geodataframe
# (same computation as `agg_pop` above, stored under a second name)
agg_pop_1 = (
    xds_mask.drop_vars("spatial_ref").groupby(xds_mask.cell_id).sum().to_dataframe()
)

In [None]:
agg_pop.shape, celdas_country.shape

In [None]:
# NOTE(review): not idempotent. `population_data` still holds the tables from
# the loop's last country, so this appends one more table for the last raster
# and overwrites that country's entry in countries_geodata. Each run of this
# cell adds another copy; confirm this is intended before running it.
population_data.append(agg_pop)

countries_geodata[country] = pd.concat(population_data)

In [None]:
# Enable plotly's offline notebook rendering
plotly.offline.init_notebook_mode()

In [None]:
# Choropleth of the estimated 2020 population.
# NOTE(review): `peru_access` is not defined anywhere in the cells shown, and
# no cell shown creates a "pop_2020" column; this cell looks carried over from
# another notebook and will raise NameError on a fresh kernel — confirm.
fig = up.plotting.choropleth_map(
    peru_access,
    "pop_2020",
    title="Estimated Population - 2020",
    opacity=0.5,
    width=800,
    height=800,
)

# Make space for the title
fig.update_layout(margin=dict(l=0, r=0, b=0, t=40))

fig.show()

In [None]:
# Get ordered category labels
# NOTE(review): `huancabamba_access` is not defined in the cells shown either.
# The `.categories` access presumably requires the column to be categorical —
# confirm against the frame that is actually meant here.
category_orders = (
    huancabamba_access["duration_to_nearest_schools_label"].unique().sort_values()
)
category_orders.categories

In [None]:
# Choropleth of the travel-time class to the nearest school, coloured with a
# reversed Plasma palette and ordered by the category labels computed above
fig = up.plotting.choropleth_map(
    peru_access,
    color_column="duration_to_nearest_schools_label",
    color_discrete_sequence=px.colors.sequential.Plasma_r,
    category_orders={"duration_to_nearest_schools_label": category_orders},
    opacity=0.5,
    labels={"duration_to_nearest_schools_label": "Minutes"},
    title="Travel Time to Nearest School",
    width=800,
    height=800,
)

# Make space for the title
fig.update_layout(margin=dict(l=0, r=0, b=0, t=40))

# Remove the hexagon outlines to make the map clearer
fig.update_traces(marker_line_width=0)

fig.show()