In [1]:
import os

import contextily as ctx
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import plotly
import plotly.express as px
import rioxarray
import urbanpy as up
from geocube.api.core import make_geocube
from mpl_toolkits.axes_grid1 import make_axes_locatable
from tqdm.notebook import tqdm

In [2]:
tqdm.pandas()

In [3]:
# Load the cell-level school-accessibility GeoDataFrame from parquet and list its columns
gdf_cells_access = gpd.read_parquet("outputs/celdas_combined_access.parquet")
gdf_cells_access.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 18275 entries, 0 to 18274
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   cell_id                            18275 non-null  object  
 1   smod                               18275 non-null  object  
 2   pop_1975                           18275 non-null  float64 
 3   pop_1980                           18275 non-null  float64 
 4   pop_1985                           18275 non-null  float64 
 5   pop_1990                           18275 non-null  float64 
 6   pop_1995                           18275 non-null  float64 
 7   pop_2000                           18275 non-null  float64 
 8   pop_2005                           18275 non-null  float64 
 9   pop_2010                           18275 non-null  float64 
 10  pop_2015                           18275 non-null  float64 
 11  pop_2020                         

In [4]:
gdf_cells_access.head()

Unnamed: 0,cell_id,smod,pop_1975,pop_1980,pop_1985,pop_1990,pop_1995,pop_2000,pop_2005,pop_2010,...,code,lon,lat,nearest_schools_ix,distance_to_nearest_schools,duration_to_nearest_schools,duration_to_nearest_schools_label,EduNivelPrimaria,EduNivelSecundariaTotal,nivel_educativo
0,13887,urban_cluster,104.825494,223.638531,365.367907,532.63118,756.33114,1045.05333,1385.446574,1766.728589,...,bol,-69.576016,-10.947963,8067,1.673,20.075,15-30,1.0,1.0,Primaria
1,13956,rural,27.399182,67.594099,122.557618,197.463636,290.430222,397.374168,536.507827,717.496279,...,bol,-69.221002,-11.143462,8063,1.0347,12.415,0-15,1.0,1.0,Primaria
2,13993,rural,1.727404,5.890069,19.501926,41.905899,93.348261,246.334796,451.661259,730.711286,...,bol,-69.069813,-11.281971,8032,0.203,2.436667,0-15,1.0,1.0,Primaria
3,13998,rural,10.727191,22.730851,37.306889,54.579556,88.652168,146.463163,221.419988,318.644286,...,bol,-69.025069,-11.184197,8047,0.5786,6.943333,0-15,1.0,1.0,Primaria
4,14081,rural,41.429706,93.69343,159.420188,241.236232,339.78843,458.116727,644.444557,928.415508,...,bol,-68.988741,-11.877038,7980,0.599,7.186667,0-15,1.0,1.0,Primaria


In [5]:
gdf_cells_access.columns

Index(['cell_id', 'smod', 'pop_1975', 'pop_1980', 'pop_1985', 'pop_1990',
       'pop_1995', 'pop_2000', 'pop_2005', 'pop_2010', 'pop_2015', 'pop_2020',
       'polygon_id', 'geometry', 'index_countries', 'ADM0_EN', 'ADM0_PCODE',
       'code', 'lon', 'lat', 'nearest_schools_ix',
       'distance_to_nearest_schools', 'duration_to_nearest_schools',
       'duration_to_nearest_schools_label', 'EduNivelPrimaria',
       'EduNivelSecundariaTotal', 'nivel_educativo'],
      dtype='object')

In [6]:
# Load the cell-level population GeoDataFrame from parquet and list its columns
gdf_cells_pop = gpd.read_parquet("outputs/celdas_combined_pop.parquet")
gdf_cells_pop.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 18275 entries, 0 to 22586
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   cell_id                   18275 non-null  int32   
 1   smod                      18275 non-null  object  
 2   pop_1975                  18275 non-null  float64 
 3   pop_1980                  18275 non-null  float64 
 4   pop_1985                  18275 non-null  float64 
 5   pop_1990                  18275 non-null  float64 
 6   pop_1995                  18275 non-null  float64 
 7   pop_2000                  18275 non-null  float64 
 8   pop_2005                  18275 non-null  float64 
 9   pop_2010                  18275 non-null  float64 
 10  pop_2015                  18275 non-null  float64 
 11  pop_2020                  18275 non-null  float64 
 12  polygon_id                18275 non-null  object  
 13  geometry                  18275 non-null  g

In [7]:
gdf_cells_pop.head()

Unnamed: 0,cell_id,smod,pop_1975,pop_1980,pop_1985,pop_1990,pop_1995,pop_2000,pop_2005,pop_2010,...,pop_2020_m_10,pop_2020_f_10,pop_2020_m_15,pop_2020_f_15,country,pop_m,pop_f,pop_total,pop_primary_school_age,pop_secondary_school_age
0,1,urban_cluster,773.876615,1497.169295,2214.783282,2936.382639,3759.68052,4684.930881,5639.160211,6578.468722,...,16.853684,15.952408,12.418421,11.182146,per,46.182592,43.726013,89.908605,33.501947,56.406659
1,2,rural,127.349266,262.085135,409.920769,567.952579,738.324368,919.774243,1098.632136,1267.729462,...,5.357383,5.070888,3.947519,3.554536,per,14.68034,13.899452,28.579792,10.649466,17.930326
2,3,rural,128.605609,273.402513,435.992956,620.728641,817.119036,1023.953282,1225.916293,1414.747791,...,4.599114,4.353169,3.388798,3.051437,per,12.602526,11.932163,24.534689,9.142171,15.392518
3,4,rural,127.938058,263.618495,410.432105,568.264752,735.777852,912.002276,1085.084391,1247.965492,...,5.083956,4.812084,3.746048,3.373122,per,13.931095,13.190062,27.121157,10.105947,17.015211
4,5,urban_cluster,176.856075,356.591034,548.408754,750.600224,1013.627816,1348.696712,1691.794096,2031.490143,...,8.575487,8.116899,6.318737,5.689696,per,23.498613,22.248657,45.747271,17.046451,28.70082


In [None]:
# Key used to join the accessibility and population tables.
index_col = ["cell_id"]

# Descriptive columns taken from the population table.
common_cols = ["smod", "polygon_id", "geometry", "country"]

# Columns taken from the accessibility table.
access_cols = [
    "lon",
    "lat",
    "nearest_schools_ix",
    "distance_to_nearest_schools",
    "duration_to_nearest_schools",
    "duration_to_nearest_schools_label",
    "EduNivelPrimaria",
    "EduNivelSecundariaTotal",
    "nivel_educativo",
]

# Population columns: the six age/sex columns (m then f for each of 5, 10, 15),
# followed by the derived totals.
pop_cols = [
    f"pop_2020_{sex}_{age}" for age in (5, 10, 15) for sex in ("m", "f")
] + [
    "pop_m",
    "pop_f",
    "pop_total",
    "pop_primary_school_age",
    "pop_secondary_school_age",
]

In [15]:
# cell_id is stored as `object` in the access table but as `int32` in the
# population table (see the .info() outputs above); cast it so the merge keys match.
gdf_cells_access["cell_id"] = gdf_cells_access["cell_id"].astype("int32")

In [None]:
# Join the accessibility metrics with the population estimates on cell_id.
# cell_id is the only column the two selections share, so the suffixes are only a
# safeguard in case the column lists ever overlap.
gdf_combined = gdf_cells_access[index_col + access_cols].merge(
    gdf_cells_pop[index_col + common_cols + pop_cols],
    how="inner",
    on=index_col,
    suffixes=("_access", "_pop"),
    # Fail loudly instead of silently multiplying rows if a cell_id is duplicated
    # on either side.
    validate="one_to_one",
)

# Display the combined dataset
gdf_combined.head()

Unnamed: 0,cell_id,lon,lat,nearest_schools_ix,distance_to_nearest_schools,duration_to_nearest_schools,duration_to_nearest_schools_label,EduNivelPrimaria,EduNivelSecundariaTotal,nivel_educativo,...,pop_2020_f_5,pop_2020_m_10,pop_2020_f_10,pop_2020_m_15,pop_2020_f_15,pop_m,pop_f,pop_total,pop_primary_school_age,pop_secondary_school_age
0,13887,-69.576016,-10.947963,8067,1.673,20.075,15-30,1.0,1.0,Primaria,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,13956,-69.221002,-11.143462,8063,1.0347,12.415,0-15,1.0,1.0,Primaria,...,0.03482,0.035746,0.032994,0.036197,0.029092,0.108391,0.096905,0.205296,0.071269,0.134028
2,13993,-69.069813,-11.281971,8032,0.203,2.436667,0-15,1.0,1.0,Primaria,...,0.688169,0.706471,0.65208,0.715383,0.574967,2.142228,1.915215,4.057443,1.408542,2.648901
3,13998,-69.025069,-11.184197,8047,0.5786,6.943333,0-15,1.0,1.0,Primaria,...,0.465295,0.47767,0.440894,0.483696,0.388756,1.448436,1.294945,2.743381,0.952365,1.791016
4,14081,-68.988741,-11.877038,7980,0.599,7.186667,0-15,1.0,1.0,Primaria,...,0.405146,0.415921,0.383899,0.421168,0.338501,1.261195,1.127546,2.388741,0.829252,1.559489


In [17]:
gdf_combined.shape[0], gdf_cells_access.shape[0], gdf_cells_pop.shape[0]

(18275, 18275, 18275)

In [20]:
gdf_combined.columns

Index(['cell_id', 'lon', 'lat', 'nearest_schools_ix',
       'distance_to_nearest_schools', 'duration_to_nearest_schools',
       'duration_to_nearest_schools_label', 'EduNivelPrimaria',
       'EduNivelSecundariaTotal', 'nivel_educativo', 'smod', 'polygon_id',
       'geometry', 'country', 'pop_2020_m_5', 'pop_2020_f_5', 'pop_2020_m_10',
       'pop_2020_f_10', 'pop_2020_m_15', 'pop_2020_f_15', 'pop_m', 'pop_f',
       'pop_total', 'pop_primary_school_age', 'pop_secondary_school_age'],
      dtype='object')

In [None]:
# The merged table takes its `geometry` column from gdf_cells_pop (via common_cols),
# so rebuild the GeoDataFrame with that frame's CRS rather than the access frame's.
gdf_combined = gpd.GeoDataFrame(
    gdf_combined, geometry="geometry", crs=gdf_cells_pop.crs
)

In [25]:
gdf_combined.to_parquet("outputs/celdas_pop_distance_complete.parquet", index=False)

In [None]:
# Columns needed for the summary tables (identifiers, travel metrics, population)
stats_columns = [
    "cell_id",
    "distance_to_nearest_schools",
    "duration_to_nearest_schools",
    "duration_to_nearest_schools_label",
    "nivel_educativo",
    "smod",
    "country",
    "pop_total",
    "pop_primary_school_age",
    "pop_secondary_school_age",
]
gdf_combined_stats = gdf_combined[stats_columns]

In [29]:
gdf_combined_stats["nivel_educativo"].unique()

array(['Primaria', 'Secundaria'], dtype=object)

In [None]:
# Keep the rows whose "nivel_educativo" is "Primaria" (compared case-insensitively)
filtered_data = gdf_combined_stats[
    gdf_combined_stats["nivel_educativo"].str.lower() == "primaria"
]

# Sum "pop_primary_school_age" for every (country, smod, travel-time label) group.
# observed=False is spelled out: it keeps the output the notebook already shows
# (label categories with no rows still appear, summing to 0) and avoids the pandas
# FutureWarning about that default changing.
# NOTE(review): assumes "duration_to_nearest_schools_label" is categorical — confirm.
result = filtered_data.groupby(
    ["country", "smod", "duration_to_nearest_schools_label"],
    as_index=False,
    observed=False,
)["pop_primary_school_age"].sum()

# Display the result
result

  result = filtered_data.groupby(["country", "smod", "duration_to_nearest_schools_label"], as_index=False)["pop_primary_school_age"].sum()


Unnamed: 0,country,smod,duration_to_nearest_schools_label,pop_primary_school_age
0,bol,rural,0-15,20163.586458
1,bol,rural,15-30,11951.135395
2,bol,rural,30-45,4925.574391
3,bol,rural,45-60,1182.985832
4,bol,rural,60-90,1099.179866
...,...,...,...,...
79,per,urban_cluster,30-45,3231.143488
80,per,urban_cluster,45-60,906.192512
81,per,urban_cluster,60-90,1724.940328
82,per,urban_cluster,90-120,167.940212


In [28]:
# Save to an excel file
result.to_excel("outputs/pop_primary_school_age.xlsx", index=False)

In [None]:
# Keep the rows whose "nivel_educativo" is "Secundaria" (compared case-insensitively)
filtered_data = gdf_combined_stats[
    gdf_combined_stats["nivel_educativo"].str.lower() == "secundaria"
]

# Sum "pop_secondary_school_age" for every (country, smod, travel-time label) group.
# observed=False is spelled out: it keeps the output the notebook already shows
# (label categories with no rows still appear, summing to 0) and avoids the pandas
# FutureWarning about that default changing.
# NOTE(review): assumes "duration_to_nearest_schools_label" is categorical — confirm.
result = filtered_data.groupby(
    ["country", "smod", "duration_to_nearest_schools_label"],
    as_index=False,
    observed=False,
)["pop_secondary_school_age"].sum()

# Display the result
result

  result = filtered_data.groupby(["country", "smod", "duration_to_nearest_schools_label"], as_index=False)["pop_secondary_school_age"].sum()


Unnamed: 0,country,smod,duration_to_nearest_schools_label,pop_secondary_school_age
0,bol,rural,0-15,3865.630571
1,bol,rural,15-30,2130.201166
2,bol,rural,30-45,278.213823
3,bol,rural,45-60,0.000000
4,bol,rural,60-90,92.059248
...,...,...,...,...
79,per,urban_cluster,30-45,1595.391455
80,per,urban_cluster,45-60,431.225470
81,per,urban_cluster,60-90,326.611035
82,per,urban_cluster,90-120,45.678576


In [31]:
# Save to an excel file
result.to_excel("outputs/pop_secondary_school_age.xlsx", index=False)

In [None]:
# Set the base path for the data.
# Built from the current user's home directory instead of a hard-coded
# /Users/<name> prefix, so the notebook is not tied to one machine.
BASE_PATH = os.path.expanduser("~/Documents/amazonia-bid/inputs/WorldPop")
countries_labels = [
    "per",
    "col",
    "bol",
    "ecu",
]  # "bra", TODO: Download brazil data (4GB per file * 6 files)
age_groups = [5, 10, 15]  # 5-9, 10-14, 15-19
genders = ["m", "f"]

In [None]:
# Read the country polygons
countries = gpd.read_parquet(
    "~/Documents/amazonia-bid/outputs/amazon_countries.parquet"
)
countries

In [None]:
# Visualize the countries
fig, ax = plt.subplots(figsize=(10, 10))

# Plot the polygon with the population data
countries.plot(ax=ax, column="ADM0_PCODE", alpha=0.5, edgecolor="k", legend=True)

# Add the basemap
ctx.add_basemap(ax, source=ctx.providers.Esri.WorldImagery, crs=countries.crs)

# Add the title
ax.set_title("Country Boundaries")

# Add the grid
ax.grid(True)

# Add the x and y labels
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")

# Show the plot
plt.show()

In [None]:
# Guyana, Suriname, Venezuela are not included in the analysis
countries_ADM0CODE = ["PE", "CO", "EC", "BO"]  # TODO: Fix "BR" data errors
amzn_countries = countries[countries["ADM0_PCODE"].isin(countries_ADM0CODE)]

In [None]:
countries_labels = sorted(countries_labels)
countries_labels

In [None]:
amzn_countries = amzn_countries.sort_values(by=["ADM0_PCODE"], ascending=True)
amzn_countries

In [None]:
# Attach the lowercase three-letter label by looking up each row's ADM0_PCODE,
# rather than relying on two independently sorted lists lining up by position.
pcode_to_label = {"BO": "bol", "BR": "bra", "CO": "col", "EC": "ecu", "PE": "per"}
amzn_countries["code"] = amzn_countries["ADM0_PCODE"].map(pcode_to_label)

# Guard: the selected countries must map to exactly the labels being processed.
if amzn_countries["code"].isna().any() or sorted(amzn_countries["code"]) != sorted(
    countries_labels
):
    raise ValueError("ADM0_PCODE values do not match countries_labels")

In [None]:
amzn_countries

In [None]:
# Now we will assign a country code to each cell in the grid.
# Left spatial join: every grid cell is kept, and a cell that intersects more than
# one country polygon yields one row per match.
# NOTE(review): `gdf_celdas` is not defined in the visible part of this notebook
# (the grid is loaded as `celdas` further down) — confirm where it comes from.
gdf_celdas_countries = gdf_celdas.sjoin(
    amzn_countries,
    how="left",
    predicate="intersects",
    lsuffix="celdas",
    rsuffix="countries",
)

In [None]:
# NOTE: We will simply drop the duplicated rows for now
gdf_celdas_countries = gdf_celdas_countries.drop_duplicates(subset=["cell_id"])

In [None]:
# Check the joined data has the same number of rows as the original grid data.
# A plain comparison is used (not `assert`) so the check cannot be optimised away
# and no exception has to be raised just to be caught again.
n_cells = gdf_celdas.shape[0]
n_joined = gdf_celdas_countries.shape[0]
if n_cells != n_joined:
    print("AssertionError: Number of rows mismatch")
    print("Number of rows in gdf_celdas:", n_cells)
    print("Number of rows in gdf_celdas_countries:", n_joined)

In [None]:
# Check the unique country codes
gdf_celdas_countries["code"].unique()

In [None]:
# Check the number of cells per country
gdf_celdas_countries["code"].value_counts()

In [None]:
# Check the percentage of cells per country
gdf_celdas_countries["code"].value_counts(normalize=True) * 100

In [None]:
# Check the number of cells without a country code
gdf_celdas_countries["code"].isna().sum()

In [None]:
# Check the percentage of cells without a country code
(gdf_celdas_countries["code"].isna().sum() / gdf_celdas_countries.shape[0]) * 100

In [None]:
# Convert the cell_id to integer so it can be rasterized for population aggregation
gdf_celdas_countries["cell_id"] = gdf_celdas_countries["cell_id"].astype("int32")
# check the type
gdf_celdas_countries["cell_id"].dtype

In [None]:
countries_labels

In [None]:
# Load each country's 2020 population table and clean it
country_geodata = {}
for country in countries_labels:
    print(country)
    source_path = f"./outputs/{country}_pop_2020.parquet"
    country_geodata_raw = pd.read_parquet(source_path).set_index("cell_id")
    print("raw n cols", country_geodata_raw.shape)

    # Drop the NaNs of every column separately, then line the columns up again
    # side by side on the cell_id index.
    non_null_columns = []
    for column_name in country_geodata_raw.columns:
        non_null_columns.append(country_geodata_raw[column_name].dropna())
    country_clean = pd.concat(non_null_columns, axis=1)

    # Tag every row with the country label it was loaded for.
    country_clean["country"] = country

    country_geodata[country] = country_clean
    print("clean n cols", country_clean.shape)

In [None]:
total_pop = pd.concat(country_geodata.values(), axis=0)
total_pop.head()

In [None]:
celdas = gpd.read_parquet("outputs/celdas.parquet")

In [None]:
celdas.shape, total_pop.shape

In [None]:
celdas["cell_id"] = celdas["cell_id"].astype("int32")
celdas_w_pop = celdas.merge(
    total_pop,
    how="inner",
    left_on="cell_id",
    right_index=True,
)
celdas_w_pop.head()

In [None]:
celdas_w_pop.shape

In [None]:
# Shorthand for the six 2020 age/sex columns (age groups start at 5, 10 and 15)
male_5 = celdas_w_pop["pop_2020_m_5"]
male_10 = celdas_w_pop["pop_2020_m_10"]
male_15 = celdas_w_pop["pop_2020_m_15"]
female_5 = celdas_w_pop["pop_2020_f_5"]
female_10 = celdas_w_pop["pop_2020_f_10"]
female_15 = celdas_w_pop["pop_2020_f_15"]

# male population
celdas_w_pop["pop_m"] = male_5 + male_10 + male_15

# female population
celdas_w_pop["pop_f"] = female_5 + female_10 + female_15

# total population (male + female over the three age groups)
celdas_w_pop["pop_total"] = celdas_w_pop["pop_m"] + celdas_w_pop["pop_f"]

# primary-school-age population
celdas_w_pop["pop_primary_school_age"] = male_5 + female_5

# secondary-school-age population
celdas_w_pop["pop_secondary_school_age"] = male_10 + female_10 + male_15 + female_15

In [None]:
celdas_w_pop.info()

In [None]:
# Save the combined GeoDataFrame to a new file
celdas_w_pop.to_parquet("outputs/celdas_combined_pop.parquet")

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Total population per (country, smod) pair
plot_data = (
    celdas_w_pop[["country", "smod", "pop_total"]]
    .copy()
    .groupby(["country", "smod"], as_index=False)
    .sum()
)

# Grouped bars: one group per country, one bar per SMOD class
plt.figure(figsize=(12, 6))
ax = sns.barplot(
    x="country",
    y="pop_total",
    hue="smod",
    data=plot_data,
    ci=None,
    dodge=True,
)

# Titles, axis labels and legend
ax.set_title("Population Distribution per Country and SMOD")
ax.set_xlabel("Country Code")
ax.set_ylabel("Total Population")
ax.legend(title="SMOD")
plt.xticks(rotation=45)
plt.tight_layout()

# Render the figure
plt.show()

import seaborn as sns

import matplotlib.pyplot as plt

# Prepare the data for plotting

plot_data = celdas_bol_with_pop_df[
["code", "smod", "pop_primary_school_age", "pop_secondary_school_age"]
].copy()
plot_data = plot_data.melt(
id_vars=["code", "smod"],
value_vars=["pop_primary_school_age", "pop_secondary_school_age"],
var_name="Age Group",
value_name="Population",
)

# Create the bar plot

plt.figure(figsize=(12, 6))
sns.barplot(
data=plot_data,
x="smod",
y="Population",
hue="Age Group",
ci=None,
estimator=sum,
dodge=True,
)

# Customize the plot

plt.title(
"Distribution of Population in Primary and Secondary School Age per Country per SMOD"
)
plt.xlabel("Country Code")
plt.ylabel("Total Population")
plt.legend(title="Age Group")
plt.xticks(rotation=45)
plt.tight_layout()

# Show the plot

plt.show()


In [None]:
peru_pop_data_proc = pd.read_parquet("peru_worldpop_school_age_celdas.parquet")

In [None]:
peru_pop_data_proc.shape

In [None]:
cells_pop_bol["pop_2020_f_5"].isna().sum()

In [None]:
# Register the Peru table loaded above (`peru_pop_data_proc`) under its country label
countries_geodata["per"] = peru_pop_data_proc

In [None]:
def _drop_nans_per_column(country_pop):
    """Drop the NaNs of every column on its own, then re-align the columns.

    Args:
        country_pop: DataFrame of population columns; its index is what the grid
            is later merged on (cell_id).

    Returns:
        DataFrame with the same columns, concatenated side by side after each
        column had its NaN entries removed.
    """
    return pd.concat(
        [country_pop[name].dropna() for name in country_pop.columns], axis=1
    )


def _school_age_long_table(cells_with_pop):
    """Build the long-format table used by the school-age bar plot.

    Args:
        cells_with_pop: frame holding "code", "smod" and the six
            "pop_2020_<m|f>_<5|10|15>" columns.

    Returns:
        DataFrame with columns "code", "smod", "Age Group" and "Population",
        where "Age Group" is either "pop_primary_school_age" or
        "pop_secondary_school_age".
    """
    table = cells_with_pop[
        [
            "code",
            "smod",
            "pop_2020_m_5",
            "pop_2020_f_5",
            "pop_2020_m_10",
            "pop_2020_f_10",
            "pop_2020_m_15",
            "pop_2020_f_15",
        ]
    ].copy()
    table["pop_primary_school_age"] = table["pop_2020_m_5"] + table["pop_2020_f_5"]
    table["pop_secondary_school_age"] = (
        table["pop_2020_m_10"]
        + table["pop_2020_f_10"]
        + table["pop_2020_m_15"]
        + table["pop_2020_f_15"]
    )
    return table.melt(
        id_vars=["code", "smod"],
        value_vars=["pop_primary_school_age", "pop_secondary_school_age"],
        var_name="Age Group",
        value_name="Population",
    )


def _plot_school_age_by_smod(long_table, title):
    """Draw the summed school-age population per SMOD class, split by age group.

    Args:
        long_table: output of `_school_age_long_table`.
        title: figure title.
    """
    plt.figure(figsize=(12, 6))
    sns.barplot(
        data=long_table,
        x="smod",
        y="Population",
        hue="Age Group",
        ci=None,
        estimator=sum,  # bars show totals, not means
        dodge=True,
    )
    plt.title(title)
    plt.xlabel("Settlement Type (SMOD)")
    plt.ylabel("Total Population")
    plt.legend(title="Age Group")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# Clean the Peru population table column by column
clean_dfs_per = _drop_nans_per_column(countries_geodata["per"])

# Merge the cleaned data with the grid cells assigned to Peru
celdas_per = gdf_celdas_countries[gdf_celdas_countries["code"] == "per"]
celdas_per_with_pop = celdas_per.merge(
    clean_dfs_per,
    how="left",
    left_on="cell_id",
    right_index=True,
)

# Prepare the data and draw the bar plot
plot_data_per = _school_age_long_table(celdas_per_with_pop)
_plot_school_age_by_smod(
    plot_data_per,
    "Distribution of Population in Primary and Secondary School Age per Country per SMOD (Peru)",
)

In [None]:
# Extract the population data for Colombia
cells_pop_col = countries_geodata["col"]

In [None]:
# Clean the data by dropping NaN values for each column
clean_dfs_col = []
for col in cells_pop_col.columns:
    clean_dfs_col.append(cells_pop_col[col].dropna())

In [None]:
# Combine the cleaned data into a single DataFrame
clean_dfs_col = pd.concat(clean_dfs_col, axis=1)

In [None]:
# Merge the cleaned data with the grid data for Colombia
celdas_col = gdf_celdas_countries[gdf_celdas_countries["code"] == "col"]
celdas_col_with_pop = celdas_col.merge(
    clean_dfs_col,
    how="left",
    left_on="cell_id",
    right_index=True,
)

In [None]:
# Prepare the data for plotting
plot_data_col = celdas_col_with_pop[
    [
        "code",
        "smod",
        "pop_2020_m_5",
        "pop_2020_f_5",
        "pop_2020_m_10",
        "pop_2020_f_10",
        "pop_2020_m_15",
        "pop_2020_f_15",
    ]
].copy()
plot_data_col["pop_primary_school_age"] = (
    plot_data_col["pop_2020_m_5"] + plot_data_col["pop_2020_f_5"]
)
plot_data_col["pop_secondary_school_age"] = (
    plot_data_col["pop_2020_m_10"]
    + plot_data_col["pop_2020_f_10"]
    + plot_data_col["pop_2020_m_15"]
    + plot_data_col["pop_2020_f_15"]
)
plot_data_col = plot_data_col.melt(
    id_vars=["code", "smod"],
    value_vars=["pop_primary_school_age", "pop_secondary_school_age"],
    var_name="Age Group",
    value_name="Population",
)

In [None]:
# Create the bar plot
plt.figure(figsize=(12, 6))
sns.barplot(
    data=plot_data_col,
    x="smod",
    y="Population",
    hue="Age Group",
    ci=None,
    estimator=sum,
    dodge=True,
)

# Customize the plot
plt.title(
    "Distribution of Population in Primary and Secondary School Age per Country per SMOD (Colombia)"
)
plt.xlabel("Settlement Type (SMOD)")
plt.ylabel("Total Population")
plt.legend(title="Age Group")
plt.xticks(rotation=45)
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Remove NaNs column by column, then put the columns back side by side
ecu_raw = countries_geodata["ecu"]
clean_dfs_ecu = pd.concat(
    [ecu_raw[name].dropna() for name in ecu_raw.columns], axis=1
)

# Attach the cleaned population columns to the grid cells assigned to Ecuador
is_ecuador = gdf_celdas_countries["code"] == "ecu"
celdas_ecu = gdf_celdas_countries[is_ecuador]
celdas_ecu_with_pop = celdas_ecu.merge(
    clean_dfs_ecu,
    how="left",
    left_on="cell_id",
    right_index=True,
)

# Build the long-format table for plotting
age_sex_columns = [
    "pop_2020_m_5",
    "pop_2020_f_5",
    "pop_2020_m_10",
    "pop_2020_f_10",
    "pop_2020_m_15",
    "pop_2020_f_15",
]
plot_data_ecu = celdas_ecu_with_pop[["code", "smod", *age_sex_columns]].copy()
primary_school_age = plot_data_ecu["pop_2020_m_5"] + plot_data_ecu["pop_2020_f_5"]
secondary_school_age = (
    plot_data_ecu["pop_2020_m_10"]
    + plot_data_ecu["pop_2020_f_10"]
    + plot_data_ecu["pop_2020_m_15"]
    + plot_data_ecu["pop_2020_f_15"]
)
plot_data_ecu["pop_primary_school_age"] = primary_school_age
plot_data_ecu["pop_secondary_school_age"] = secondary_school_age
plot_data_ecu = pd.melt(
    plot_data_ecu,
    id_vars=["code", "smod"],
    value_vars=["pop_primary_school_age", "pop_secondary_school_age"],
    var_name="Age Group",
    value_name="Population",
)

# Bar plot of summed population per SMOD class, split by age group
plt.figure(figsize=(12, 6))
sns.barplot(
    x="smod",
    y="Population",
    hue="Age Group",
    data=plot_data_ecu,
    estimator=sum,
    ci=None,
    dodge=True,
)

# Labels, legend and layout
plt.xlabel("Settlement Type (SMOD)")
plt.ylabel("Total Population")
plt.title(
    "Distribution of Population in Primary and Secondary School Age per Country per SMOD (Ecuador)"
)
plt.legend(title="Age Group")
plt.xticks(rotation=45)
plt.tight_layout()

# Render the figure
plt.show()

In [None]:
cells_pop_bol["pop_2020_f_5"].isna().sum()

In [None]:
cells_pop_bol["pop_2020_f_10"].isna().sum()

In [None]:
cells_pop_bol["pop_2020_f_15"].isna().sum()

In [None]:
countries_geodata["bol"].shape

In [None]:
xds_clipped

In [None]:
celdas_country

In [None]:
fig, ax = plt.subplots()


xds_clipped.squeeze().plot.imshow(
    ax=ax,
    cmap="viridis",
    add_colorbar=True,
    cbar_kwargs={"label": "Population"},
)

celdas_country.plot(
    ax=ax,
    color="red",
    alpha=0.5,
)

plt.show()

In [None]:
celdas_country.head()

In [None]:
celdas_country.info()

In [None]:
celdas_country["cell_id"] = celdas_country["cell_id"].astype(float)

In [None]:
celdas_country.shape

In [None]:
celdas_country["cell_id"].unique().shape

In [None]:
# Rasterize the cell polygons: burn each polygon's cell_id into a grid that
# matches xds_clipped (`like=`), writing 0 where no polygon covers a pixel.
# NOTE(review): with fill=0, uncovered pixels all carry the value 0 — if a real
# cell_id can be 0 it would be indistinguishable from "no cell"; confirm.
xds_mask = make_geocube(
    vector_data=celdas_country[["cell_id", "geometry"]],
    # measurements=["cell_id"],
    like=xds_clipped,
    fill=0,
)
xds_mask

In [None]:
# Add a new dimension to the mask for the population using the clipped raster
xds_clipped_squeezed = xds_clipped.squeeze()
xds_mask[pop_col] = (
    xds_clipped_squeezed.dims,
    xds_clipped_squeezed.values,
    xds_clipped_squeezed.attrs,
    xds_clipped_squeezed.encoding,
)
xds_mask

In [None]:
# Now we can calculate the total population in each cell and add it back to the geodataframe
agg_pop = (
    xds_mask.drop_vars("spatial_ref").groupby(xds_mask.cell_id).sum().to_dataframe()
)

In [None]:
import dask

In [None]:
# Sum every variable of xds_mask over the pixels that share a cell_id and flatten
# the result into a DataFrame (total population per cell).
# NOTE(review): this is the same expression as the `agg_pop` cell above, stored
# under a second name — confirm whether the duplicate is still needed.
agg_pop_1 = (
    xds_mask.drop_vars("spatial_ref").groupby(xds_mask.cell_id).sum().to_dataframe()
)

In [None]:
agg_pop.shape, celdas_country.shape

In [None]:
population_data.append(agg_pop)

countries_geodata[country] = pd.concat(population_data)

In [None]:
plotly.offline.init_notebook_mode()

In [None]:
fig = up.plotting.choropleth_map(
    peru_access,
    "pop_2020",
    title="Estimated Population - 2020",
    opacity=0.5,
    width=800,
    height=800,
)

# Make space for the title
fig.update_layout(margin=dict(l=0, r=0, b=0, t=40))

fig.show()

In [None]:
# Get ordered category labels
# NOTE(review): the order is derived from `huancabamba_access`, but the map in the
# next cell is drawn from `peru_access` — confirm both share the same label categories.
category_orders = (
    huancabamba_access["duration_to_nearest_schools_label"].unique().sort_values()
)
category_orders.categories

In [None]:
fig = up.plotting.choropleth_map(
    peru_access,
    color_column="duration_to_nearest_schools_label",
    color_discrete_sequence=px.colors.sequential.Plasma_r,
    category_orders={"duration_to_nearest_schools_label": category_orders},
    opacity=0.5,
    labels={"duration_to_nearest_schools_label": "Minutes"},
    title="Travel Time to Nearest School",
    width=800,
    height=800,
)

# Make space for the title
fig.update_layout(margin=dict(l=0, r=0, b=0, t=40))

# Remove the hexagon outlines to make the map clearer
fig.update_traces(marker_line_width=0)

fig.show()