### Import packages

In [1]:
import geopandas as gpd
import rioxarray as rxr
import xarray as xr
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from pathlib import Path
import geopandas as gpd
import rioxarray as rxr
import numpy as np
import pandas as pd
from pathlib import Path
import geopandas as gpd
from pathlib import Path
import pandas as pd
from sklearn.utils import resample

### Define paths

In [2]:

# Project root is the directory the notebook is executed from.
base = Path(".").resolve()
data_dir = base.joinpath("Data")

# Shared parent folders, named once so the individual file paths stay short.
gbif_dir = data_dir.joinpath("Corallinales", "gbif_corallinales_all_20250405")
pseudo_absence_dir = data_dir.joinpath("Pseudo_absence", "Combined")

# Input occurrences, environmental rasters, and the deduplicated output file.
gpkg_path = gbif_dir / "corallinales_2010_onward.gpkg"
raster_dir = data_dir.joinpath("Bio_oracle", "2010_2020", "Mean_Cropped")
output_path = gpkg_path.with_name("corallinales_2010_onward_deduplicated.gpkg")

# GBIF presence files, keyed by time period.
coralline_files = dict([
    ("2000_2010", gbif_dir / "corallinales_2000_2010_deduplicated.gpkg"),
    ("2010_onward", gbif_dir / "corallinales_2010_onward_deduplicated.gpkg"),
])

# Combined pseudo-absence files, keyed by the same time periods.
pseudo_absence_combined = dict([
    ("2000_2010", pseudo_absence_dir / "comb_2000_2010.gpkg"),
    ("2010_onward", pseudo_absence_dir / "comb_2010_onward.gpkg"),
])



### Reduce datasets to environmentally unique points

In [3]:
# Environmental variables to sample; each name maps to "<name>.tif" in raster_dir.
# NOTE(review): the list previously named "dfe" twice, so that raster was opened
# and sampled twice for an identical column. The repeat is removed here; if a
# different variable was intended in its place, add it to the list.
selected_vars_tiff_files = [
    "chl", "clt", "dfe", "o2", "thetao",
    "no3", "kdpar", "ph",
    "sws", "swd", "terrain"
]

# Load GeoPackage and reproject to WGS84
gdf = gpd.read_file(gpkg_path).to_crs("EPSG:4326")

# Point coordinates as arrays (y = latitude, x = longitude in EPSG:4326).
# NOTE(review): sampling by lon/lat assumes the rasters are also in EPSG:4326 — confirm.
lats = gdf.geometry.y.to_numpy()
lons = gdf.geometry.x.to_numpy()

# Rows without usable coordinates cannot be sampled; they stay NaN and are
# dropped by the completeness mask further down.
has_coords = np.isfinite(lats) & np.isfinite(lons)

# Indexers sharing one dimension make `.sel` pick one cell per point
# (pointwise selection) instead of the full y-by-x grid.
y_idx = xr.DataArray(lats[has_coords], dims="points")
x_idx = xr.DataArray(lons[has_coords], dims="points")

# One column of sampled values per environmental variable
sampled_data = pd.DataFrame(index=gdf.index)

for var in selected_vars_tiff_files:
    raster_path = raster_dir / f"{var}.tif"  # construct raster file path

    # The context manager closes the raster's file handle once it has been sampled.
    with rxr.open_rasterio(raster_path, masked=True) as src:
        raster = src.squeeze()  # remove singleton dimensions

        # All points are sampled in one vectorised nearest-cell lookup. Errors are
        # no longer swallowed per point: a missing file or an unexpected raster
        # layout now fails loudly instead of yielding a column of NaN.
        # NOTE(review): no tolerance is set, so points outside the raster extent
        # presumably take the value of the nearest edge cell — confirm this is intended.
        values = np.full(len(gdf), np.nan, dtype="float64")
        if has_coords.any():
            values[has_coords] = raster.sel(y=y_idx, x=x_idx, method="nearest").values

    sampled_data[var] = values  # store results for current variable

# Keep only rows that have a value for every variable
mask = sampled_data.notnull().all(axis=1)
gdf_filtered = gdf.loc[mask]  # filter original GeoDataFrame
sampled_data = sampled_data.loc[mask].copy()  # explicit copy before adding a column

# Combine sampled data with spatial geometry
sampled_data["geometry"] = gdf_filtered.geometry.values
dedup_gdf = gpd.GeoDataFrame(sampled_data, geometry="geometry", crs="EPSG:4326")

# Drop rows whose combination of environmental values already occurred
dedup_gdf = dedup_gdf.drop_duplicates(subset=selected_vars_tiff_files)

# Save filtered and deduplicated GeoDataFrame to GeoPackage
dedup_gdf.to_file(output_path, driver="GPKG")
print(f"Saved {len(dedup_gdf)} unique environmental points to {output_path}")

Saved 285 unique environmental points to C:\Users\rasmu\Desktop\Universitet\Maerl\Data\Corallinales\gbif_corallinales_all_20250405\corallinales_2010_onward_deduplicated.gpkg


In [4]:
# Point `base` at the data directory defined in the paths cell (`data_dir`) rather
# than a hard-coded absolute path that only exists on one machine. When the notebook
# is run from the project folder, `data_dir` resolves to that same directory.
# Note that from here on `base` refers to the data directory, not the project root.
base = data_dir

# `coralline_files` and `pseudo_absence_combined` are already built from `data_dir`
# in the paths cell; they are reused here instead of being defined a second time.

# Create output directory for balanced pseudo-absence files
output_dir = base / "Pseudo_absence" / "Combined" / "Balanced"
# parents=True lets this succeed on a fresh checkout where the intermediate
# folders do not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

# Function to balance a dataset by equal sampling from unique values in a given column
def balance_by_order(gdf, n_total, order_col="order"):
    """Draw ``n_total`` rows from ``gdf``, spread evenly across the groups in ``order_col``.

    Each distinct non-null value of ``order_col`` contributes
    ``n_total // n_groups`` rows. When ``n_total`` is not an exact multiple of
    the number of groups, the leftover rows are handed out one each to the
    first groups (in order of first appearance), so the result always has
    exactly ``n_total`` rows. Groups smaller than their quota are sampled with
    replacement, all others without. Sampling and the final shuffle use a
    fixed seed (42).

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Input rows; must contain ``order_col`` and a ``geometry`` column.
    n_total : int
        Number of rows in the returned frame.
    order_col : str, default "order"
        Column whose values define the groups to balance across.

    Returns
    -------
    geopandas.GeoDataFrame
        ``n_total`` shuffled rows with the CRS of ``gdf``.

    Raises
    ------
    ValueError
        If ``gdf`` has no non-null values in ``order_col``.
    """
    # Null labels never match an equality filter, so they are excluded up front
    # rather than producing an empty subset that cannot be resampled.
    order_list = gdf[order_col].dropna().unique()
    if len(order_list) == 0:
        raise ValueError(f"No non-null values in column '{order_col}' to balance by.")

    # Base quota per group, plus how many groups must take one extra row so the
    # parts add up to exactly n_total.
    n_per_order, n_extra = divmod(n_total, len(order_list))
    balanced_parts = []

    for position, order in enumerate(order_list):
        subset = gdf[gdf[order_col] == order]  # rows belonging to the current group
        quota = n_per_order + (1 if position < n_extra else 0)
        # Upsample (with replacement) only when the group is too small.
        resampled = resample(
            subset,
            replace=len(subset) < quota,
            n_samples=quota,
            random_state=42,
        )
        balanced_parts.append(resampled)

    # The parts now total n_total rows, so this only shuffles them.
    balanced = pd.concat(balanced_parts).sample(n=n_total, random_state=42)
    return gpd.GeoDataFrame(balanced, geometry="geometry", crs=gdf.crs)

# Loop through both time periods and process
for period in ["2000_2010", "2010_onward"]:
    print(f"Processing {period}...")

    # The number of presence points sets how many pseudo-absences to draw.
    cor_gdf = gpd.read_file(coralline_files[period])  # load presence points
    n_target = len(cor_gdf)
    print(f"Target number of observations: {n_target}")

    pseudo_gdf = gpd.read_file(pseudo_absence_combined[period])  # load pseudo-absence data

    # The check is for a lowercase 'order' column; the message now names that
    # exact column instead of the capitalised 'Order'.
    if "order" not in pseudo_gdf.columns:
        raise ValueError("Expected an 'order' column in pseudo absence dataset.")

    # Draw n_target pseudo-absences spread evenly across the values of 'order'.
    balanced_gdf = balance_by_order(pseudo_gdf, n_total=n_target, order_col="order")

    output_file = output_dir / f"pseudo_absence_balanced_{period}.gpkg"  # define output path
    balanced_gdf.to_file(output_file, driver="GPKG")  # save to GeoPackage
    print(f"Saved balanced pseudo absence data to {output_file} ({len(balanced_gdf)} rows)\n")  # confirm


Processing 2000_2010...
Target number of observations: 42
Saved balanced pseudo absence data to C:\Users\rasmu\Desktop\Universitet\Maerl\Data\Pseudo_absence\Combined\Balanced\pseudo_absence_balanced_2000_2010.gpkg (42 rows)

Processing 2010_onward...
Target number of observations: 285
Saved balanced pseudo absence data to C:\Users\rasmu\Desktop\Universitet\Maerl\Data\Pseudo_absence\Combined\Balanced\pseudo_absence_balanced_2010_onward.gpkg (285 rows)

