In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
from tqdm.notebook import tqdm

In [None]:
tqdm.pandas()

In [None]:
# Read the combined cells with access variables
gdf_cells_access = gpd.read_parquet("outputs/celdas_combined_access_v3.parquet")
gdf_cells_access_br = gpd.read_parquet("outputs/celdas_combined_bra_access_v3.parquet")

In [None]:
gdf_cells_access.info()

In [None]:
gdf_cells_access.head()

In [None]:
gdf_cells_access_br.head()

In [None]:
# Drop the leftover "index" column from the Brazil accessibility table.
# `errors="ignore"` keeps the cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
gdf_cells_access_br = gdf_cells_access_br.drop(columns="index", errors="ignore")

In [None]:
gdf_cells_access_br.columns

In [None]:
gdf_cells_access.columns

In [None]:
# Columns selected from both accessibility tables before they are concatenated,
# so that the two frames line up column-for-column.
filter_cols = [
    # Cell variables
    "cell_id",
    "polygon_id",
    "smod",  # urbanization degree
    "category",  # urbanization category
    "code",  # country
    "lon",
    "lat",  # cell centroid
    "geometry",  # cell polygon
    # Accessibility to primary schools
    "nearest_primary_schools_ix",
    "distance_to_nearest_primary_schools",
    "duration_to_nearest_primary_schools",
    "duration_to_nearest_primary_schools_label",
    # NOTE(review): unsuffixed, unlike the middle/secondary ids below —
    # presumably the building id of the nearest primary school; confirm.
    "id_edificio",
    "lat_primary_school",
    "lon_primary_school",
    # Accessibility to middle schools
    "nearest_middle_schools_ix",
    "distance_to_nearest_middle_schools",
    "duration_to_nearest_middle_schools",
    "duration_to_nearest_middle_schools_label",
    "id_edificio_middle_school",
    "lat_middle_school",
    "lon_middle_school",
    # Accessibility to secondary schools
    "nearest_secondary_schools_ix",
    "distance_to_nearest_secondary_schools",
    "duration_to_nearest_secondary_schools",
    "duration_to_nearest_secondary_schools_label",
    "id_edificio_secondary_school",
    "lat_secondary_school",
    "lon_secondary_school",
]

In [None]:
gdf_cells_access_concat = pd.concat(
    [gdf_cells_access[filter_cols], gdf_cells_access_br[filter_cols]], ignore_index=True
)

In [None]:
gdf_cells_access_concat.columns

In [None]:
# Read the combined cells with the worldpop variables
gdf_cells_pop = gpd.read_parquet("outputs/celdas_combined_pop.parquet")
gdf_cells_pop_bra = gpd.read_parquet(
    "outputs/brazil_worldpop_school_age_celdas.parquet"
)

In [None]:
gdf_cells_pop.info()

In [None]:
gdf_cells_pop.columns

In [None]:
gdf_cells_pop[
    [
        "pop_2020_m_5",
        "pop_2020_m_10",
        "pop_2020_m_15",
        "pop_2020_f_5",
        "pop_2020_f_10",
        "pop_2020_f_15",
    ]
]

In [None]:
gdf_cells_pop.head()

In [None]:
gdf_cells_pop_bra.head()

In [None]:
gdf_cells_pop.columns

In [None]:
# Give every Brazil population cell a synthetic, sequential polygon_id starting
# at 1,000,000 (any existing polygon_id column is overwritten).
# NOTE(review): the offset is presumably meant to keep these ids clear of the
# polygon_id values in the other countries' table — confirm they cannot collide.
gdf_cells_pop_bra["polygon_id"] = range(1000000, 1000000 + len(gdf_cells_pop_bra))

In [None]:
gdf_cells_pop_bra = gdf_cells_pop_bra.rename({"code": "country"}, axis=1)

In [None]:
gdf_cells_pop_bra.columns

In [None]:
gdf_cells_pop_concat = pd.concat(
    [gdf_cells_pop[gdf_cells_pop_bra.columns], gdf_cells_pop_bra], ignore_index=True
)

### IMPORTANT

Date: 04 Jun 2025

We are splitting the secondary school-age population into middle and secondary.

### Population Split

Middle: Population between 10 and 15 yo  
Secondary: Population between 15 and 20 yo


In [None]:
gdf_cells_pop_concat.columns

In [None]:
# School-age population per level: each level is the sum of the male and female
# counts of its own age band.
#   secondary -> band "15": pop_2020_m_15 + pop_2020_f_15
#   middle    -> band "10": pop_2020_m_10 + pop_2020_f_10
# Fix: the secondary total used to add the *female 10* band to the male 15 band,
# so girls of middle-school age were counted in both levels and girls of
# secondary-school age were not counted at all.
gdf_cells_pop_concat["pop_secondary_school_age"] = gdf_cells_pop_concat[
    ["pop_2020_m_15", "pop_2020_f_15"]
].sum(axis=1)
gdf_cells_pop_concat["pop_middle_school_age"] = gdf_cells_pop_concat[
    ["pop_2020_m_10", "pop_2020_f_10"]
].sum(axis=1)

In [None]:
index_col = ["cell_id"]
common_cols = ["smod", "polygon_id", "geometry", "country"]
access_cols = [
    "lat",
    "lon",
    "category",  # urbanization category (new in v3)
    "nearest_primary_schools_ix",
    "distance_to_nearest_primary_schools",
    "duration_to_nearest_primary_schools",
    "duration_to_nearest_primary_schools_label",
    "id_edificio",
    "lat_primary_school",
    "lon_primary_school",
    "nearest_middle_schools_ix",
    "distance_to_nearest_middle_schools",
    "duration_to_nearest_middle_schools",
    "duration_to_nearest_middle_schools_label",
    "id_edificio_middle_school",
    "lat_middle_school",
    "lon_middle_school",
    "nearest_secondary_schools_ix",
    "distance_to_nearest_secondary_schools",
    "duration_to_nearest_secondary_schools",
    "duration_to_nearest_secondary_schools_label",
    "id_edificio_secondary_school",
    "lat_secondary_school",
    "lon_secondary_school",
]
pop_cols = [
    "pop_2020_m_5",
    "pop_2020_f_5",
    "pop_2020_m_10",
    "pop_2020_f_10",
    "pop_2020_m_15",
    "pop_2020_f_15",
    "pop_m",
    "pop_f",
    "pop_total",
    "pop_primary_school_age",
    "pop_middle_school_age",
    "pop_secondary_school_age",
]

In [None]:
gdf_cells_pop_concat

In [None]:
gdf_cells_access_concat["polygon_id"].head()

In [None]:
gdf_cells_access_concat["polygon_id"].tail()

In [None]:
gdf_cells_pop_concat["polygon_id"].head()

In [None]:
gdf_cells_pop_concat["polygon_id"].tail()

In [None]:
gdf_cells_access_concat["cell_id"] = gdf_cells_access_concat["cell_id"].astype("int32")
gdf_cells_pop_concat["cell_id"] = gdf_cells_pop_concat["cell_id"].astype("int32")

In [None]:
gdf_cells_access_concat

In [None]:
# Combine the two datasets using the index_col.
# The left side contributes the accessibility attributes; the right side
# contributes the shared cell attributes (smod, polygon_id, geometry, country)
# and the population columns.
# No `how=` is given, so pandas performs an inner join: cells present in only
# one of the two tables are dropped from the result.
# NOTE(review): the join key is `cell_id` alone — this assumes cell_id is unique
# across all countries (including the appended Brazil cells); confirm, otherwise
# rows are silently duplicated.
# The suffixes only apply to overlapping non-key columns; the two column
# selections used here share no name other than the key.
gdf_combined = gdf_cells_access_concat[index_col + access_cols].merge(
    gdf_cells_pop_concat[index_col + common_cols + pop_cols],
    on=index_col,
    suffixes=("_access", "_pop"),
)

# Display the combined dataset
gdf_combined.head()

In [None]:
celdas_original = gpd.read_file(
    "inputs/Asentamientos humanos 2/Polígonos/Nuevos/Educación/CELDAS.gpkg"
)
celdas_original.shape

In [None]:
len(celdas_original.polygon_id.unique())

In [None]:
celdas_original["pop_2020"].sum()

In [None]:
celdas_original.drop_duplicates(subset="polygon_id")["pop_2020"].sum()

In [None]:
celdas_original.columns

In [None]:
# Population totals noted for reference.
# NOTE(review): presumably the pop_2020 sums of the kind computed in the cells
# above (with / without duplicated polygon_id) — confirm which run produced them.
# 333 804 798 <--- original cells, duplicated polygon_id included
#  38 028 451 <--- original cells, duplicated polygon_id removed
#  48 359 084 <--- corrected cells, duplicated polygon_id included
#   5 241 000 <--- corrected cells, duplicated polygon_id removed
# 211.9M in 2020 for all of Brazil according to IBGE

In [None]:
gdf_combined["polygon_id"] = gdf_combined["polygon_id"].astype(str)

In [None]:
gdf_combined = gpd.GeoDataFrame(gdf_combined)

In [None]:
gdf_combined.crs

In [None]:
gdf_combined.to_parquet("outputs/celdas_pop_distance_complete_v3.parquet", index=False)

In [None]:
gdf_combined.shape[0], gdf_cells_access_concat.shape[0], gdf_cells_pop_concat.shape[0]

In [None]:
gdf_combined.columns

In [None]:
gdf_combined = gpd.GeoDataFrame(
    gdf_combined, geometry=gdf_combined.geometry, crs=gdf_cells_access.crs
)

In [None]:
gdf_combined["polygon_id"] = gdf_combined["polygon_id"].astype(str)

In [None]:
gdf_combined.to_file(
    "outputs/celdas_pop_distance_complete_v3.geojson", driver="GeoJSON", index=False
)

In [None]:
gdf_combined.to_parquet("outputs/celdas_pop_distance_complete_v3.parquet", index=False)

In [None]:
# Non-spatial working table for the statistics and plots: identifiers,
# population totals and the distance / duration / label triplet of each level.
# An explicit copy is taken because the label columns of this table are
# reassigned further down; writing into a bare column slice of gdf_combined
# triggers pandas' SettingWithCopyWarning and leaves it ambiguous whether the
# parent frame is affected.
gdf_combined_stats = gdf_combined[
    [
        "cell_id",
        "category",  # urbanization category
        "country",
        "pop_total",
        "pop_primary_school_age",
        "pop_middle_school_age",
        "pop_secondary_school_age",
        "distance_to_nearest_primary_schools",
        "duration_to_nearest_primary_schools",
        "duration_to_nearest_primary_schools_label",
        "distance_to_nearest_middle_schools",
        "duration_to_nearest_middle_schools",
        "duration_to_nearest_middle_schools_label",
        "distance_to_nearest_secondary_schools",
        "duration_to_nearest_secondary_schools",
        "duration_to_nearest_secondary_schools_label",
    ]
].copy()

In [None]:
from ydata_profiling import ProfileReport

In [None]:
profile = ProfileReport(gdf_combined_stats, title="Data Report")

In [None]:
profile.to_file("outputs/data_report.html")

In [None]:
gdf_combined_stats.head().to_clipboard()

In [None]:
# Cells with a missing travel-time label get an explicit "No access" bin.
# The label columns are categorical, so the value has to be registered as a
# category (once) before fillna is allowed to use it.
for school_level in ("primary", "middle", "secondary"):
    label_col = f"duration_to_nearest_{school_level}_schools_label"

    if "No access" not in gdf_combined_stats[label_col].cat.categories:
        gdf_combined_stats[label_col] = gdf_combined_stats[
            label_col
        ].cat.add_categories("No access")

    gdf_combined_stats[label_col] = gdf_combined_stats[label_col].fillna("No access")

In [None]:
# Sum the primary-school-age population by country, urbanization category and
# primary-school travel-time bin.
# `observed=False` is spelled out because the travel-time label is categorical:
# it keeps every bin in the output (empty ones sum to 0), silences the pandas
# FutureWarning about the changing default, and pins the current behaviour for
# pandas versions where the default becomes `observed=True`.
result = gdf_combined_stats.groupby(
    ["country", "category", "duration_to_nearest_primary_schools_label"],
    as_index=False,
    observed=False,
)["pop_primary_school_age"].sum()

# Display the result
result

In [None]:
# Save to an excel file
result.to_excel("outputs/pop_primary_school_age_2025_06_04.xlsx", index=False)

In [None]:
# Sum the middle-school-age population by country, urbanization category and
# middle-school travel-time bin.
# `observed=False` is spelled out because the travel-time label is categorical:
# it keeps every bin in the output (empty ones sum to 0), silences the pandas
# FutureWarning about the changing default, and pins the current behaviour for
# pandas versions where the default becomes `observed=True`.
result_middle = gdf_combined_stats.groupby(
    ["country", "category", "duration_to_nearest_middle_schools_label"],
    as_index=False,
    observed=False,
)["pop_middle_school_age"].sum()

# Display the result
result_middle

In [None]:
# Sum the secondary-school-age population by country, urbanization category and
# secondary-school travel-time bin.
# `observed=False` is spelled out because the travel-time label is categorical:
# it keeps every bin in the output (empty ones sum to 0), silences the pandas
# FutureWarning about the changing default, and pins the current behaviour for
# pandas versions where the default becomes `observed=True`.
result_secondary = gdf_combined_stats.groupby(
    ["country", "category", "duration_to_nearest_secondary_schools_label"],
    as_index=False,
    observed=False,
)["pop_secondary_school_age"].sum()

# Display the result
result_secondary

In [None]:
# Save to an excel file
result_secondary.to_excel(
    "outputs/pop_secondary_school_age_2025_06_04.xlsx", index=False
)

In [None]:
result.head().to_clipboard()

In [None]:
# Import required libraries
import pandas as pd
import seaborn as sns

In [None]:
%matplotlib inline

In [None]:
# Create a bar plot with seaborn
plt.figure(figsize=(8, 6))
sns.barplot(
    data=result_secondary,
    x="duration_to_nearest_secondary_schools_label",
    y="pop_secondary_school_age",
    hue="category",
    errorbar=None,
)
plt.title("Population in Secondary School Age by Travel Time")
plt.xlabel("Travel Time to Nearest School (minutes)")
plt.ylabel("Population (Secondary School Age)")
plt.tight_layout()
plt.show()

In [None]:
result_secondary

In [None]:
# One bar chart per country: secondary-school-age population in each travel-time
# bin, split by urbanization category. Every figure is also written to outputs/.
for country_code in result_secondary["country"].unique():
    country_rows = result_secondary[result_secondary["country"] == country_code]

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(
        data=country_rows,
        x="duration_to_nearest_secondary_schools_label",
        y="pop_secondary_school_age",
        hue="category",
        errorbar=None,
        ax=ax,
    )
    ax.set_title(f"{country_code}: Population in Secondary School Age by Travel Time")
    ax.set_xlabel("Travel Time to Nearest School (minutes)")
    ax.set_ylabel("Population (Secondary School Age)")
    fig.tight_layout()
    fig.savefig(f"outputs/{country_code}_secondary_school_access_plot.png")
    plt.show()

In [None]:
result

In [None]:
# (optional) set a clean style
sns.set(style="whitegrid", context="talk")

In [None]:
# make a working df
df = gdf_combined_stats.copy()

plt.figure(figsize=(8, 6))
sns.scatterplot(
    x="duration_to_nearest_primary_schools",
    y="pop_primary_school_age",
    hue="country",
    palette="tab10",
    alpha=0.1,
    data=df,
)
plt.xlabel("Travel time to nearest primary school (min)")
plt.ylabel("Primary - school - age population")
plt.title("Primary - age children vs. travel time to primary school")
plt.legend(title="Country", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x="duration_to_nearest_secondary_schools",
    y="pop_secondary_school_age",
    hue="country",
    palette="tab10",
    alpha=0.1,
    data=df,
)
plt.xlabel("Travel time to nearest secondary school (min)")
plt.ylabel("Secondary - school - age population")
plt.title("Secondary - age children vs. travel time to secondary school")
plt.legend(title="Country", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()

In [None]:
# Country-level summary: total school-age population and the plain (per-cell,
# unweighted) mean travel time, for the primary and secondary levels.
country_stats = df.groupby("country").agg(
    total_primary_pop=("pop_primary_school_age", "sum"),
    mean_primary_time=("duration_to_nearest_primary_schools", "mean"),
    total_secondary_pop=("pop_secondary_school_age", "sum"),
    mean_secondary_time=("duration_to_nearest_secondary_schools", "mean"),
)
country_stats = country_stats.reset_index()

fig, ax = plt.subplots(2, 1, figsize=(10, 12), sharex=True, sharey=True)

# Top panel: primary, bottom panel: secondary. Each country is a bubble sized
# by its population and labelled with its upper-cased country code.
for panel, level in zip(ax, ("primary", "secondary")):
    time_col = f"mean_{level}_time"
    total_col = f"total_{level}_pop"

    sns.scatterplot(
        data=country_stats,
        x=time_col,
        y=total_col,
        size=total_col,
        sizes=(100, 2000),
        hue="country",
        legend=False,
        alpha=0.5,
        ax=panel,
    )
    for _, row in country_stats.iterrows():
        panel.text(
            row[time_col],
            row[total_col],
            row["country"].upper(),
            horizontalalignment="center",
            verticalalignment="center",
        )
    panel.set_xlabel(f"Avg. {level} - school travel time (min)")
    panel.set_ylabel(f"Total {level} - age population")
    panel.set_title(f"Country - level: {level} - age pop vs. avg. travel time")

plt.tight_layout()
plt.show()

In [None]:
category_stats = (
    df.groupby("category")
    .agg(
        total_primary_pop=("pop_primary_school_age", "sum"),
        mean_primary_time=("duration_to_nearest_primary_schools", "mean"),
        total_secondary_pop=("pop_secondary_school_age", "sum"),
        mean_secondary_time=("duration_to_nearest_secondary_schools", "mean"),
    )
    .sort_values("mean_primary_time", ascending=False)
    .reset_index()
)

# Primary
fig, ax = plt.subplots(1, 2, figsize=(14, 6), sharex=True)
sns.barplot(
    x="mean_primary_time",
    y="category",
    data=category_stats,
    ax=ax[0],
    order=category_stats.category,
)
ax[0].set_title("Avg. primary travel time by settlement type")
ax[0].set_xlabel("Time (min)")
ax[0].set_ylabel("Settlement type")

# Secondary
sns.barplot(
    x="mean_secondary_time",
    y="category",
    data=category_stats.sort_values("mean_secondary_time", ascending=False),
    ax=ax[1],
)
ax[1].set_title("Avg. secondary travel time by settlement type")
ax[1].set_xlabel("Time (min)")
ax[1].set_ylabel("")
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.cm as cm
import matplotlib.colors as mcolors

In [None]:
# Define duration bin order
duration_order = (
    gdf_combined_stats["duration_to_nearest_primary_schools_label"].dropna().unique()
)

In [None]:
duration_order.to_list()

In [None]:
duration_order = [
    "0-15",
    "15-30",
    "30-45",
    "45-60",
    "60-90",
    "90-120",
    ">120",
    "No access",
]

In [None]:
# Sample one viridis color per travel-time bin.
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `pyplot.get_cmap` accepts the same (name, lut) arguments and is still available.
cmap = plt.get_cmap("viridis", len(duration_order))

In [None]:
duration_colors = {
    label: mcolors.rgb2hex(cmap(i)) for i, label in enumerate(duration_order)
}
duration_colors

In [None]:
duration_colors["No access"] = "#808080"  # Set "No access" to gray

In [None]:
def plot_school_age_distribution(
    education_level, hue, pop_label_df, duration_order, duration_colors, **kwargs
):
    """
    Plot, as stacked bars, the percentage of school-age population per travel-time bin.

    One bar is drawn for every distinct value of ``hue``; the segments of a bar are
    the travel-time bins and add up to 100 %.

    Parameters:
        education_level (str): 'primary', 'middle' or 'secondary'. Selects the columns
            ``pop_{education_level}_school_age`` and
            ``duration_to_nearest_{education_level}_schools_label`` of ``pop_label_df``.
        hue (str): Name of the column to group by (one bar per value),
            e.g. 'country' or 'category'.
        pop_label_df (pd.DataFrame): Table containing the ``hue`` column plus the
            population and travel-time label columns named above.
        duration_order (list): Travel-time bin labels, in stacking order (first = bottom).
            Every label must be present among the label column's values/categories.
        duration_colors (dict): Mapping from travel-time bin label to color.
        **kwargs: Forwarded to ``DataFrame.plot`` (e.g. ``rot=0``).

    Returns:
        matplotlib.axes.Axes: The axes holding the stacked bar chart.
    """
    label_col = f"duration_to_nearest_{education_level}_schools_label"
    pop_col = f"pop_{education_level}_school_age"

    # Total population per (group, travel-time bin), pivoted to groups x bins.
    # observed=False keeps bins that are empty for every group when the label is
    # categorical, so selecting the columns in `duration_order` below cannot fail
    # on an empty bin, and it pins the behaviour across pandas versions.
    pop_by_group_and_bin = (
        pop_label_df.groupby([hue, label_col], observed=False)[pop_col]
        .sum()
        .unstack(fill_value=0)
    )

    # Row-normalise to percentages so every bar sums to 100 and matches the
    # "Percentage" axis label (the values used to be 0-1 fractions).
    # Groups whose total population is 0 yield NaN (0/0) and draw no bar.
    pop_label_pct = pop_by_group_and_bin.div(
        pop_by_group_and_bin.sum(axis=1), axis=0
    ).mul(100)

    # Plot
    ax = pop_label_pct[duration_order].plot(
        kind="bar",
        stacked=True,
        figsize=(10, 6),
        color=[duration_colors[label] for label in duration_order],
        width=0.8,
        linewidth=0,
        **kwargs,
    )
    ax.set_ylabel(f"Percentage of {education_level.capitalize()} population")
    ax.set_xlabel(hue.capitalize())
    ax.set_title(f"{education_level.capitalize()} age population by travel time bins")

    # Build the legend once, reversed, so its entries read top-to-bottom in the
    # same order as the stacked segments.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(
        handles[::-1],
        labels[::-1],
        title="Travel time (min)",
        bbox_to_anchor=(1.05, 1),
        loc="upper left",
    )

    plt.tight_layout()
    return ax

In [None]:
# Stacked travel-time distribution per country, one figure (and one saved file)
# for each education level.
for level in ("primary", "middle", "secondary"):
    plot_school_age_distribution(
        level, "country", gdf_combined_stats, duration_order, duration_colors
    )
    plt.savefig(f"outputs/figures/school_age_distribution_{level}_country", dpi=300)
    plt.show()

In [None]:
# Stacked travel-time distribution per urbanization category, one figure (and
# one saved file) for each education level. rot=0 keeps the tick labels horizontal.
for level in ("primary", "middle", "secondary"):
    plot_school_age_distribution(
        level,
        "category",
        gdf_combined_stats,
        duration_order,
        duration_colors,
        rot=0,
    )
    plt.savefig(f"outputs/figures/school_age_distribution_{level}_urban", dpi=300)
    plt.show()

In [None]:
def _lorenz_table(cells, duration_col, pop_col):
    """
    Build the cumulative shares needed to draw a population-weighted Lorenz curve.

    Parameters:
        cells (pd.DataFrame): Table with one row per cell.
        duration_col (str): Column holding the travel time of each cell.
        pop_col (str): Column holding the population of each cell.

    Returns:
        pd.DataFrame: Rows with non-null values and population > 0, sorted by
        ascending travel time, with the extra columns ``cum_pop``,
        ``cum_pop_share``, ``duration_weighted`` (travel time x population),
        ``cum_duration`` and ``cum_duration_share``.
    """
    table = cells[[duration_col, pop_col]].dropna()
    # Remove zero-pop cells, then order from best to worst access.
    table = table[table[pop_col] > 0].sort_values(duration_col)

    cum_pop = table[pop_col].cumsum()
    # Weight durations by population (population-weighted Lorenz curve)
    duration_weighted = table[duration_col] * table[pop_col]
    cum_duration = duration_weighted.cumsum()

    # assign() returns a new frame, so nothing is written into a filtered slice
    # (the in-place sort / column writes used to raise SettingWithCopyWarning).
    return table.assign(
        cum_pop=cum_pop,
        cum_pop_share=cum_pop / table[pop_col].sum(),
        duration_weighted=duration_weighted,
        cum_duration=cum_duration,
        cum_duration_share=cum_duration / duration_weighted.sum(),
    )


# Same computation for both levels; previously the pipeline was written out twice.
df_lorenz = _lorenz_table(
    gdf_combined_stats,
    "duration_to_nearest_primary_schools",
    "pop_primary_school_age",
)
df_lorenz_sec = _lorenz_table(
    gdf_combined_stats,
    "duration_to_nearest_secondary_schools",
    "pop_secondary_school_age",
)

# Plot both Lorenz curves on the same plot
plt.figure(figsize=(8, 6))
plt.plot(
    df_lorenz["cum_pop_share"],
    df_lorenz["cum_duration_share"],
    label="Primary school age",
    color="blue",
    alpha=0.5,
)
plt.plot(
    df_lorenz_sec["cum_pop_share"],
    df_lorenz_sec["cum_duration_share"],
    label="Secondary school age",
    color="green",
    alpha=0.5,
)
plt.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Line of equality")
plt.title("Lorenz Curve: Inequality in Access to Schools")
plt.xlabel("Cumulative share of population")
plt.ylabel("Cumulative share of total travel-time burden")
plt.legend()
plt.grid(True)
plt.tight_layout()

plt.savefig("outputs/figures/access_inequality_lorenz_curve.png", dpi=300)

plt.show()

In [None]:
# Distribution of travel times per country, primary vs. secondary side by side.
# Both levels are stacked into one long table and told apart with `hue`, so
# seaborn places the two boxes of each country next to each other.
# The previous version drew two boxplots on top of each other and passed
# `position=0`, which is not a seaborn / matplotlib boxplot argument and raises
# a TypeError.
level_frames = []
for level_name in ("primary", "secondary"):
    duration_col = f"duration_to_nearest_{level_name}_schools"
    # Only cells that actually have population of that school age.
    populated = gdf_combined_stats[f"pop_{level_name}_school_age"] > 0
    level_frames.append(
        gdf_combined_stats.loc[populated, ["country", duration_col]]
        .rename(columns={duration_col: "travel_time"})
        .assign(school_level=level_name.capitalize())
    )
travel_times_long = pd.concat(level_frames, ignore_index=True)

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=travel_times_long,
    x="country",
    y="travel_time",
    hue="school_level",
    hue_order=["Primary", "Secondary"],
    palette={"Primary": "skyblue", "Secondary": "orange"},
    showfliers=False,  # hide outliers for readability
    width=0.8,
    boxprops=dict(alpha=0.7),
    linewidth=1.5,
    ax=ax,
)
ax.set_ylabel("Travel time to school (min)")
ax.set_title("Distribution of Travel Times to Schools by Country")
sns.move_legend(ax, "best", title="School level")
plt.tight_layout()
plt.show()

In [None]:
# Summary stats for Ecuador
ecuador = gdf_combined_stats[gdf_combined_stats["country"] == "ecu"]
print("Number of rows for Ecuador:", len(ecuador))
print("Non-zero population cells:", (ecuador["pop_primary_school_age"] > 0).sum())
print(
    "Non-null durations:",
    ecuador["duration_to_nearest_primary_schools"].notnull().sum(),
)
print(
    "Unique duration values:", ecuador["duration_to_nearest_primary_schools"].unique()
)

In [None]:
gdf_combined_stats[gdf_combined_stats["country"] == "ecu"][
    ["pop_primary_school_age", "duration_to_nearest_primary_schools"]
].describe()

In [None]:
sns.boxplot(
    data=gdf_combined_stats[
        (gdf_combined_stats["pop_primary_school_age"] > 0)
        & (gdf_combined_stats["duration_to_nearest_primary_schools"] > 0)
    ],
    x="country",
    y="duration_to_nearest_primary_schools",
    showfliers=False,  # hide outliers for readability,
)
plt.ylabel("Travel time to primary school (min)")
plt.title("Travel Times to Primary Schools by Country")
plt.tight_layout()
plt.show()

In [None]:
gdf_combined_stats[gdf_combined_stats["pop_primary_school_age"] > 0]

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=gdf_combined_stats[gdf_combined_stats["pop_primary_school_age"] > 0],
    x="category",
    y="duration_to_nearest_primary_schools",
    order=["non_urban_area", "urban_area"],
    showfliers=False,
)
plt.ylabel("Travel time to primary school (min)")
plt.title("Travel Time Distribution by Settlement Type")
plt.tight_layout()
plt.show()

In [None]:
# Make sure all needed columns are included in the copy
df_facet = gdf_combined_stats[
    [
        "country",
        "category",
        "pop_primary_school_age",
        "pop_middle_school_age",
        "pop_secondary_school_age",
        "duration_to_nearest_primary_schools_label",
        "duration_to_nearest_middle_schools_label",
        "duration_to_nearest_secondary_schools_label",
    ]
].copy()

# Long table with one row per (cell, school level): the level's population and
# the travel-time label of that same level.
# Built by stacking one renamed slice per level instead of melt + a row-wise
# DataFrame.apply, which ran a Python lambda for every row of the melted table.
level_frames = []
for level_name in ("primary", "middle", "secondary"):
    pop_col = f"pop_{level_name}_school_age"
    label_col = f"duration_to_nearest_{level_name}_schools_label"
    level_frames.append(
        df_facet[["country", "category", pop_col, label_col]]
        .rename(columns={pop_col: "pop_school_age", label_col: "duration_label"})
        .assign(
            # Plain objects, so the three levels concatenate cleanly even if
            # their categorical dtypes differ.
            duration_label=lambda frame: frame["duration_label"].astype(object),
            school_level=level_name.capitalize(),
        )
    )
df_melted = pd.concat(level_frames, ignore_index=True)

# Filter out zero-pop rows
df_melted = df_melted[df_melted["pop_school_age"] > 0]

# Facet by country
g = sns.catplot(
    data=df_melted,
    kind="bar",
    x="duration_label",
    y="pop_school_age",
    hue="school_level",
    hue_order=["Primary", "Middle", "Secondary"],
    col="country",
    col_wrap=3,
    order=duration_order,
    height=10,
    aspect=0.7,
    errorbar=None,
    sharex=False,
    sharey=True,
)

g.set_axis_labels("Travel time bin (min)", "School-age population")
# Public API for relocating / retitling the figure legend (replaces the direct
# access to the private `g._legend` attribute).
sns.move_legend(g, "lower right", title="School level")
# Rotate x-axis labels for better readability
for ax in g.axes.flat:
    for label in ax.get_xticklabels():
        label.set_rotation(45)
        label.set_ha("right")
plt.tight_layout()

plt.savefig("outputs/figures/school_population_duration_country.png", dpi=300)
plt.show()