In [None]:
import os
import geopandas as gpd
import urbanpy as up
import pandas as pd
import contextily as ctx
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# NOTE(review): bare attribute lookup that only displays its value; none of the
# visible cells read the result. Looks like leftover exploration -- consider removing.
gpd.io.file.uses_params

In [None]:
from osgeo import gdal

In [None]:
# Re-export the original protected-areas shapefile into inputs/AreasProtegidasFixed/
# with the ogr2ogr command-line tool, passing the GDAL config option
# SHAPE_RESTORE_SHX=YES (presumably to rebuild a missing/corrupt .shx index -- confirm).
# NOTE(review): requires ogr2ogr on PATH; the output directory is assumed to exist.
!ogr2ogr -f "ESRI Shapefile" "inputs/AreasProtegidasFixed/AreasProtegidasFixed.shp" "inputs/Areas protegidas/AFP AREAS PROTEGIDAS WGS84.shp" --config SHAPE_RESTORE_SHX YES

In [None]:
gdf = gpd.read_file("inputs/AreasProtegidasFixed/AreasProtegidasFixed.shp")

In [None]:
gdf.head()

In [None]:
gdf.plot("DESIG_ENG", legend=False, figsize=(10, 10))

In [None]:
# Set the GDAL option SHAPE_RESTORE_SHX before opening the original shapefile
# directly (same option the ogr2ogr cell passes on the command line).
gdal.SetConfigOption("SHAPE_RESTORE_SHX", "YES")
original_shp_path = "inputs/Areas protegidas/AFP AREAS PROTEGIDAS WGS84.shp"
areas_protegidas = gpd.read_file(original_shp_path, encoding="utf-8")

In [None]:
amazonas_hexs5 = gpd.read_parquet("outputs/amazonas_hexs_5.parquet")

In [None]:
amazonas_hexs5.to_file("outputs/amazonas_hexs_5.geojson", driver="GeoJSON")

In [None]:
amazonas_hexs5.drop("geometry", axis=1).to_csv(
    "outputs/amazonas_hexs_5.csv", index=False
)

In [None]:
amazonas_hexs6 = gpd.read_parquet("outputs/amazonas_hexs_5.parquet")

In [None]:
amazonas_hexs6.drop("geometry", axis=1).to_csv(
    "outputs/amazonas_hexs_6.csv", index=False
)

In [None]:
amazonas_hexs7.drop("geometry", axis=1).to_csv(
    "outputs/amazonas_hexs_7.csv", index=False
)

In [None]:
# Load the data
amazonas_limits = gpd.read_parquet("outputs/amazonas_clean.parquet")
amazonas_hexs7 = gpd.read_parquet("outputs/amazonas_hexs_7.parquet")

In [None]:
# Directory path
directory = "inputs/08. Bases limpias"

# Get all CSV files in the directory
csv_files = [file for file in os.listdir(directory) if file.endswith(".csv")]

In [None]:
csv_files

In [None]:
# Read each CSV found above and stack them into one frame with a fresh 0..n-1 index.
dfs = [pd.read_csv(os.path.join(directory, csv_name)) for csv_name in csv_files]
concatenated_df = pd.concat(dfs, ignore_index=True)

In [None]:
concatenated_df.shape

In [None]:
concatenated_df[["Longitud", "Latitud"]].head()

In [None]:
concatenated_df[["Longitud", "Latitud"]].dtypes

In [None]:
concatenated_df[["Longitud", "Latitud"]].isna().sum()

In [None]:
# Bolivia rows arrive with latitude and longitude swapped: exchange the two columns
# for those rows only.
# NOTE(review): this mutates concatenated_df in place and is not idempotent --
# running the cell a second time swaps the values back.
is_bolivia = concatenated_df["Pais"] == "Bolivia"
corrected_lat = concatenated_df.loc[is_bolivia, "Longitud"].values.tolist()
corrected_lon = concatenated_df.loc[is_bolivia, "Latitud"].values.tolist()
concatenated_df.loc[is_bolivia, "Latitud"] = corrected_lat
concatenated_df.loc[is_bolivia, "Longitud"] = corrected_lon

In [None]:
# Work on a copy: a plain assignment only aliases the frame, so columns added to
# concatenated_geodf in later cells would silently appear on concatenated_df too.
concatenated_geodf = concatenated_df.copy()

In [None]:
concatenated_geodf[["Pais", "IdEscuela"]].isna().sum()

In [None]:
concatenated_geodf[["Pais", "IdEscuela"]].duplicated().sum()

In [None]:
concatenated_geodf["unique_id"] = concatenated_geodf[
    "Pais"
].str.lower() + concatenated_geodf["IdEscuela"].astype(str)

In [None]:
# Education-level flag columns used in this analysis. The trailing numbers are kept
# from the original notes (presumably age ranges -- confirm).
# Columns not used here, per the original notes: EduNivelSecundaria (10-14),
# EduNivelMedia (15-19), EduNivelSecundariaYMedia (10-19).
edu_cols = [
    "EduNivelPrimaria",  # 5-9
    "EduNivelSecundariaTotal",  # <---- 10-19
]
# Print, for each column: its name, distinct-value count (NaN included),
# missing-value count and value frequencies.
for col in edu_cols:
    level_flags = concatenated_geodf[col]
    print(col)
    print(level_flags.nunique(dropna=False))
    print(level_flags.isna().sum())
    print(level_flags.value_counts())
    print()

In [None]:
concatenated_geodf[["EduNivelPrimaria", "EduNivelSecundariaTotal"]]

In [None]:
filter_nivel = (concatenated_geodf["EduNivelPrimaria"] == 1.0) | (
    concatenated_geodf["EduNivelSecundariaTotal"] == 1.0
)

In [None]:
primaria_secundaria_geodf = concatenated_geodf[filter_nivel]

In [None]:
(
    primaria_secundaria_geodf.shape[0] / concatenated_geodf.shape[0],
    primaria_secundaria_geodf.shape[0] / concatenated_df.shape[0],
)

In [None]:
primaria_secundaria_geodf = gpd.GeoDataFrame(
    primaria_secundaria_geodf,
    geometry=gpd.points_from_xy(
        primaria_secundaria_geodf.Longitud, primaria_secundaria_geodf.Latitud
    ),
    crs="EPSG:4326",
)

In [None]:
primaria_secundaria_geodf = primaria_secundaria_geodf.clip(amazonas_limits)

In [None]:
import contextily as ctx

In [None]:
type(primaria_secundaria_geodf)

In [None]:
ax = primaria_secundaria_geodf[primaria_secundaria_geodf["Pais"] == "Bolivia"].plot()
ctx.add_basemap(ax=ax, crs=primaria_secundaria_geodf.crs)

In [None]:
br_schools = gpd.read_parquet("inputs/brazil_schools_census_education_metrics.parquet")

In [None]:
br_schools.shape

In [None]:
br_schools["code_school"].isna().sum()

In [None]:
br_schools["code_school"].duplicated().sum()

In [None]:
br_schools["QT_MAT_FUND_AI"].isna().sum()
br_schools["QT_MAT_FUND_AF"].isna().sum()
br_schools["QT_MAT_MED"].isna().sum()

In [None]:
br_schools["QT_MAT_FUND_AI"].head()
# br_schools['QT_MAT_FUND_AF'].isna().sum()
# br_schools['QT_MAT_MED'].isna().sum()

In [None]:
mlb_edu = MultiLabelBinarizer(
    classes=[
        "Ensino Fundamental",
        "Educação Infantil",
        "Ensino Médio",
        "Educação de Jovens Adultos",
        "Educação Profissional",
        "",
    ]
)
edu_level = pd.DataFrame(
    mlb_edu.fit_transform(br_schools["education_level"].str.split(", ")),
    columns=mlb_edu.classes_,
    index=br_schools.index,
)
edu_level.columns = [
    "ensino_fundamental",
    "educacao_infantil",
    "ensino_medio",
    "educacao_jovens_adultos",
    "educacao_profissional",
    "no_specified",
]

In [None]:
edu_level.head()

In [None]:
# Derive the level flags for Brazil as 0/1 integers.
# Fix: `int_indicator & bool_mask` produced bool columns, unlike the 0/1 int used for
# EduNivelSecundariaYMedia and the numeric flags (compared against 1.0) of the other
# countries these rows are later concatenated with. Precedence of & over | is also
# made explicit with parentheses.
has_fundamental = edu_level["ensino_fundamental"].astype(bool)
has_medio = edu_level["ensino_medio"].astype(bool)

# Primary: "Ensino Fundamental" with at least one QT_MAT_FUND_AI enrollment
# (ensino fundamental, initial years).
br_schools["EduNivelPrimaria"] = (
    has_fundamental & (br_schools["QT_MAT_FUND_AI"] > 0)
).astype(int)

# Secondary (total): "Ensino Fundamental" with at least one QT_MAT_FUND_AF
# enrollment, or any "Ensino Médio" school.
br_schools["EduNivelSecundariaTotal"] = (
    (has_fundamental & (br_schools["QT_MAT_FUND_AF"] > 0)) | has_medio
).astype(int)

In [None]:
br_schools["EduNivelSecundariaYMedia"] = (
    (edu_level["ensino_fundamental"] + edu_level["ensino_medio"])
    .astype(bool)
    .astype(int)
)

In [None]:
br_schools["unique_id"] = "brasil" + br_schools["code_school"].astype(str)

In [None]:
primaria_secundaria_geodf.crs.to_string()

In [None]:
br_schools = br_schools.to_crs(primaria_secundaria_geodf.crs)

In [None]:
primaria_secundaria_geodf["Pais"].head()

In [None]:
br_schools["Pais"] = "Brasil"

In [None]:
school_columns = [
    "EduNivelPrimaria",
    "EduNivelSecundariaTotal",
    "Pais",
    "unique_id",
    "geometry",
]
complete_schools = pd.concat(
    [br_schools[school_columns], primaria_secundaria_geodf[school_columns]]
)

In [None]:
complete_schools.head()

In [None]:
(complete_schools["Pais"] == "Bolivia").sum()

In [None]:
complete_schools["Pais"].value_counts()

In [None]:
complete_schools.to_parquet("outputs/complete_schools.parquet")

In [None]:
complete_schools["Pais"] == "Bolivia"

In [None]:
# Count number of schools hexagon using urbanpy
amazonas_hexs7_schools = up.geom.merge_shape_hex(
    hexs=amazonas_hexs7,
    shape=complete_schools,
    agg={
        # Education level
        "EduNivelPrimaria": "sum",
        "EduNivelSecundariaTotal": "sum",
        # School count
        "unique_id": "size",
    },
)

In [None]:
amazonas_hexs7_schools.head()

In [None]:
# Downsample the resolution-7 school aggregates to a coarser hexagon grid, summing
# every column (unique_id already holds a per-hexagon school count, so it is summed).
# Fix: the target resolution was 5, while this variable, the output files
# (amazonas_hexs_4.csv, amazonas_hexs_4_2maps_schools.svg) and the notebook's
# resolution note all refer to resolution 4.
# NOTE(review): if resolution 5 was intended, rename the variable/outputs instead.
amazonas_hexs4 = up.geom.resolution_downsampling(
    amazonas_hexs7_schools,
    "hex",
    4,
    {"EduNivelPrimaria": "sum", "EduNivelSecundariaTotal": "sum", "unique_id": "sum"},
)

In [None]:
amazonas_hexs4["EduNivelPrimaria"] = amazonas_hexs4["EduNivelPrimaria"].astype(float)
amazonas_hexs4["EduNivelSecundariaTotal"] = amazonas_hexs4[
    "EduNivelSecundariaTotal"
].astype(float)

In [None]:
(
    amazonas_hexs4["EduNivelPrimaria"].hist(),
    amazonas_hexs4["EduNivelSecundariaTotal"].hist(),
)

In [None]:
# 4	= 1,770.347654491

In [None]:
amzn_countries = gpd.read_parquet("outputs/amazon_countries.parquet")

In [None]:
amzn_countries = amzn_countries.to_crs(amazonas_hexs4.crs)

In [None]:
amzn_countries

In [None]:
import numpy as np

In [None]:
amazonas_hexs4["COUNTRY"] = np.random.choice(
    amzn_countries["ADM0_PCODE"], amazonas_hexs4.shape[0]
)

In [None]:
amazonas_countries_random = amazonas_hexs4.groupby("COUNTRY").size().to_frame()
amazonas_countries_random

In [None]:
amazonas_countries_random["school_access"] = np.random.uniform(
    0, 100, amazonas_countries_random.shape[0]
)
amazonas_countries_random["school_count"] = np.random.randint(
    0, 100_000, amazonas_countries_random.shape[0]
)

In [None]:
amazonas_countries_random["school_access"].plot.box()

In [None]:
amazonas_hexs4.within(amzn_countries.iloc[0]["geometry"]).sum()

In [None]:
amzn_countries["var_random"] = 

In [None]:
amzn_countries.plot.box()

In [None]:
amzn_countries

In [None]:
# Two side-by-side box plots (one box per country code) built from PLACEHOLDER data.
fig, (ax, ax1) = plt.subplots(1, 2, figsize=(20, 10), sharex=True, sharey=True)

# Seeded generator so the saved figure is reproducible.
rng = np.random.default_rng(42)
# One column per country code; the column count follows the data instead of a
# hard-coded 8, which would fail if the number of countries differed.
country_codes = amzn_countries["ADM0_PCODE"].unique()
n_samples = 25

data = rng.standard_normal((n_samples, len(country_codes)))
df = pd.DataFrame(data, columns=country_codes)
df.plot.box(ax=ax)
ax.set_ylabel("(NOT FINAL) accesibility")

data = rng.standard_normal((n_samples, len(country_codes)))
df = pd.DataFrame(data, columns=country_codes)
df.plot.box(ax=ax1)
# Fix: this label was set on `ax`, overwriting the left panel's label and leaving
# the right panel unlabelled.
ax1.set_ylabel("(NOT FINAL) quantity")

plt.tight_layout()
plt.savefig(
    "outputs/amazonas_2boxplots_schools.svg", dpi=300, bbox_inches="tight", format="svg"
)

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [None]:
# Side-by-side choropleths of the two aggregated level columns; both panels use
# the same styling, so they are drawn in one loop.
fig, (ax, ax1) = plt.subplots(1, 2, figsize=(20, 10))

for map_ax, level_col in ((ax, "EduNivelPrimaria"), (ax1, "EduNivelSecundariaTotal")):
    # Narrow axis on the right of the panel to hold the colour bar.
    colorbar_ax = make_axes_locatable(map_ax).append_axes(
        "right", size="5%", pad=0.1
    )

    # Outline first, then only the hexagons with a positive value.
    amazonas_limits.plot(
        facecolor="none", edgecolor="black", linewidth=0.7, ax=map_ax
    )
    amazonas_hexs4.query(f"{level_col} > 0").plot(
        level_col,
        ax=map_ax,
        linewidth=0.3,
        cmap="viridis",
        categorical=False,
        legend=True,
        vmin=0,
        vmax=10,
        cax=colorbar_ax,
    )
    ctx.add_basemap(map_ax, crs=amazonas_limits.crs.to_string())
    map_ax.set_axis_off()

plt.tight_layout()
plt.savefig(
    "outputs/amazonas_hexs_4_2maps_schools.svg",
    dpi=300,
    bbox_inches="tight",
    format="svg",
)

In [None]:
amazonas_hexs4.drop("geometry", axis=1).to_csv(
    "outputs/amazonas_hexs_4.csv", index=False
)

In [None]:
amazonas_hexs7_schools["EduNivelPrimaria"].dropna().hist(bins=100)

In [None]:
amazonas_hexs7_schools.to_parquet("outputs/amazonas_hexs7_schools.parquet")

In [None]:
# Create a 2x2 subplot
fig, axs = plt.subplots(2, 2, figsize=(10, 10))

# Plot EduNivelPrimaria
axs[0, 0].set_title("EduNivelPrimaria")
amazonas_hexs7_schools.plot(
    column="EduNivelPrimaria",
    cmap="Blues",
    linewidth=0.8,
    ax=axs[0, 0],
    edgecolor="0.8",
    legend=True,
)

# Plot EduNivelSecundaria
axs[0, 1].set_title("EduNivelSecundaria")
amazonas_hexs7_schools.plot(
    column="EduNivelSecundaria",
    cmap="Greens",
    linewidth=0.8,
    ax=axs[0, 1],
    edgecolor="0.8",
    legend=True,
)

# Plot EduNivelMedia
axs[1, 0].set_title("EduNivelMedia")
amazonas_hexs7_schools.plot(
    column="EduNivelMedia",
    cmap="Oranges",
    linewidth=0.8,
    ax=axs[1, 0],
    edgecolor="0.8",
    legend=True,
)

# Plot EduNivelSecundariaYMedia
axs[1, 1].set_title("EduNivelSecundariaYMedia")
amazonas_hexs7_schools.plot(
    column="EduNivelSecundariaYMedia",
    cmap="Purples",
    linewidth=0.8,
    ax=axs[1, 1],
    edgecolor="0.8",
    legend=True,
)

# Remove axis labels for subplots
for ax in axs.flat:
    amazonas_limits.plot(facecolor="none", edgecolor="black", ax=ax)
    ax.set_axis_off()
    ctx.add_basemap(ax, crs=amazonas_limits.crs.to_string())

# Adjust spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()