In [1]:
import pandas as pd
import geopandas as gpd

# 1. Read every meetpunt (including its relevance flag) and remember each
#    row's original position in a helper column, so the forest info can be
#    merged back onto the full table later.
mp = pd.read_csv("alle_meetpunten_with_relevance.csv").reset_index()
mp = mp.rename(columns={"index": "orig_index"})

# 2. Only relevant points that are not labelled as noise (cluster -1) take
#    part in the spatial join.
is_relevant = mp["relevant"]
is_clustered = mp["cluster"].ne(-1)
mp_rel = mp.loc[is_relevant & is_clustered].copy()

# 3. Turn those points into an RD (EPSG:28992) GeoDataFrame, then swap each
#    point for its 5 km buffer.
rd_points = gpd.points_from_xy(
    mp_rel["GeometriePuntX_RD"],
    mp_rel["GeometriePuntY_RD"],
)
mp_rel_gdf = gpd.GeoDataFrame(mp_rel, geometry=rd_points, crs="EPSG:28992")
mp_rel_gdf["geometry"] = mp_rel_gdf.geometry.buffer(5000)  # 5 km

# 4. Geocoded food forests: drop rows without coordinates, build points from
#    Lon/Lat (EPSG:4326) and reproject them to the CRS of the buffers.
ff = pd.read_excel("Voedselbossen_geocoded.xlsx").dropna(subset=["Lat", "Lon"])
ff_wgs84 = gpd.GeoDataFrame(
    ff,
    geometry=gpd.points_from_xy(ff["Lon"], ff["Lat"]),
    crs="EPSG:4326",
)
ff_gdf = ff_wgs84.to_crs("EPSG:28992")

# 5. Left spatial join: every buffer is kept and paired with each forest
#    point it contains (a buffer holding several forests yields several rows).
joined = mp_rel_gdf.sjoin(ff_gdf, how="left", predicate="contains")

# 6. Give the three forest columns English names and keep only those plus
#    the helper key.
forest_columns = {
    "Voedselbos": "ForestName",
    "Eerste aanplant": "PlantingYear",
    "Grootte": "ForestSize",
}
joined = joined.rename(columns=forest_columns)
forest_info = joined[["orig_index", *forest_columns.values()]]

# 7. Attach the forest info to the full meetpunt table; points that were not
#    part of the join get empty forest columns.
final = pd.merge(mp, forest_info, on="orig_index", how="left")

# 8. The helper key is no longer needed: drop it and write the result.
final = final.drop(columns="orig_index")
final.to_csv("meetpunten_plus_forests_info.csv", index=False)
print("Wrote meetpunten_plus_forests_info.csv")




from pathlib import Path
import pandas as pd
from tqdm import tqdm

# Cluster label that marks noise (unclustered) points. The forest-join cell in
# this file filters on `cluster != -1` for the same reason.
NOISE_CLUSTER = -1


def find_measurement_files(
    folder: Path, pattern: str = "WKP_Meetwaarden_*.csv"
) -> "list[Path]":
    """Return the measurement CSVs in *folder* that match *pattern*, sorted.

    Sorting makes the read order (and thus the row order of the concatenated
    table) reproducible; ``glob`` itself yields files in arbitrary order.

    Raises:
        FileNotFoundError: if no file matches, instead of letting
            ``pd.concat`` fail later with "No objects to concatenate".
    """
    files = sorted(folder.glob(pattern))
    if not files:
        raise FileNotFoundError(
            f"No files matching '{pattern}' found in '{folder}'"
        )
    return files


def select_cluster_points(
    clusters: pd.DataFrame, noise_label: int = NOISE_CLUSTER
) -> pd.DataFrame:
    """Return one row per relevant, non-noise meetpunt.

    Expects the columns ``MeetobjectCode``, ``cluster`` and ``relevant``.
    Rows are dropped when ``relevant`` is not True or when ``cluster`` equals
    *noise_label*. Duplicate ``MeetobjectCode`` rows are reduced to the first
    occurrence so that a later merge on that key cannot multiply measurement
    rows and every meetpunt belongs to exactly one cluster.
    """
    keep = clusters["relevant"].eq(True) & clusters["cluster"].ne(noise_label)
    return clusters.loc[keep].drop_duplicates("MeetobjectCode")


def cluster_sizes(clusters: pd.DataFrame) -> pd.DataFrame:
    """Count distinct meetpunten per cluster.

    Returns a frame with the columns ``cluster`` and ``total_meetpunten``.
    """
    return (
        clusters
        .groupby("cluster")["MeetobjectCode"]
        .nunique()
        .rename("total_meetpunten")
        .reset_index()
    )


def coverage_by_cluster(
    all_wkp: pd.DataFrame, n_per_cluster: pd.DataFrame
) -> pd.DataFrame:
    """Compute, per (cluster, parameter), the share of meetpunten measured.

    *all_wkp* holds measurements already tagged with ``cluster``;
    *n_per_cluster* is the output of :func:`cluster_sizes`. The result has the
    columns ``cluster``, ``ParameterOmschrijving``, ``n_meetpunten``,
    ``total_meetpunten`` and ``pct_coverage`` (a fraction, not a percentage).
    """
    # nunique() already ignores repeated measurements of the same meetpunt,
    # so no explicit de-duplication is needed beforehand.
    coverage = (
        all_wkp
        .groupby(["cluster", "ParameterOmschrijving"])["MeetobjectCode"]
        .nunique()
        .reset_index(name="n_meetpunten")
    )
    coverage = coverage.merge(n_per_cluster, on="cluster")
    coverage["pct_coverage"] = (
        coverage["n_meetpunten"] / coverage["total_meetpunten"]
    )
    return coverage


def clusters_per_parameter(
    good_params: pd.DataFrame, n_clusters: int
) -> pd.DataFrame:
    """Count, per parameter, in how many clusters it appears in *good_params*.

    Returns the columns ``ParameterOmschrijving``, ``n_clusters`` and
    ``pct_clusters`` (``n_clusters`` divided by the *n_clusters* argument),
    sorted by ``pct_clusters`` in descending order.
    """
    counts = (
        good_params
        .groupby("ParameterOmschrijving")["cluster"]
        .nunique()
        .reset_index(name="n_clusters")
    )
    counts["pct_clusters"] = counts["n_clusters"] / n_clusters
    return counts.sort_values(
        "pct_clusters", ascending=False
    ).reset_index(drop=True)


# The guard is true both in a notebook and when the file is run directly, so
# the variables below are still created at top level; it only prevents the
# file I/O from running when the helpers above are imported.
if __name__ == "__main__":
    # 1. Read & concatenate all CSVs
    wkp_folder = Path("meetwaarden") / "Waterschap Brabantse Delta"
    csv_files = find_measurement_files(wkp_folder)
    usecols = ["MeetobjectCode", "Meetjaar", "ParameterOmschrijving", "Numeriekewaarde"]
    dtype = {"Meetjaar": "Int64", "Numeriekewaarde": "float64"}

    dfs = [
        pd.read_csv(fp, sep=";", usecols=usecols, dtype=dtype, low_memory=False)
        for fp in tqdm(csv_files, desc="Reading CSVs")
    ]
    all_wkp = pd.concat(dfs, ignore_index=True)

    # 2. Load your clusters file
    clusters = pd.read_csv(
        "alle_meetpunten_with_relevance.csv",
        usecols=["MeetobjectCode", "cluster", "relevant"]
    )

    # 3. Keep only relevant, non-noise points, one row per meetpunt
    clusters = select_cluster_points(clusters)

    # 4. Merge cluster info onto your measurements
    all_wkp = all_wkp.merge(
        clusters[["MeetobjectCode", "cluster"]],
        on="MeetobjectCode",
        how="inner"
    )

    # 5. Compute total meetpunten per cluster
    n_per_cluster = cluster_sizes(clusters)

    # 6-7. Per (cluster, parameter): unique meetpunten with at least one
    #      measurement, plus the share of the cluster they represent
    coverage_cluster = coverage_by_cluster(all_wkp, n_per_cluster)

    # 8. Filter by your threshold (e.g. >=50% of points in cluster)
    threshold_pct = 0.5
    good_params = coverage_cluster.query("pct_coverage >= @threshold_pct")

    # 8b. Save the coverage-by-cluster table
    good_params.to_csv("coverage_by_cluster.csv", index=False)
    print(f"Wrote coverage-by-cluster for {good_params['cluster'].nunique()} clusters "
          f"and {good_params['ParameterOmschrijving'].nunique()} parameters to "
          f"'coverage_by_cluster.csv'")

    # 9. Roll up to see, for each parameter, in how many clusters it meets the
    #    threshold. The denominator is the number of clusters that have at
    #    least one measurement in the loaded files.
    n_clusters = coverage_cluster["cluster"].nunique()

    # 10. Count, sort and save
    param_cluster_counts = clusters_per_parameter(good_params, n_clusters)

    param_cluster_counts.to_csv("parameter_across_clusters.csv", index=False)
    print(f"Wrote parameter-coverage-across-clusters for {len(param_cluster_counts)} parameters to "
          f"'parameter_across_clusters.csv'")

    # (Optional) Inspect the top few
    print(param_cluster_counts.head(10))

FileNotFoundError: [Errno 2] No such file or directory: 'alle_meetpunten_with_relevance.csv'