<a href="https://colab.research.google.com/github/7yl4r/0ad-unit-net/blob/master/02_Sample_environmental_data_at_each_occurrence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [5]:
# Modify this section as needed.

# ==============================================================================
# === directory setup
# ==============================================================================
# PROJECT_DIR is where earlier steps wrote their outputs and where this notebook
# reads its inputs / writes its results.
import os

try:
    # If using google colab and google drive:
    from google.colab import drive
except ImportError:
    # === if using local machine
    # google.colab is not importable outside Colab, so work in the current
    # directory instead of failing on a fresh local kernel.
    PROJECT_DIR = './'
    os.makedirs(PROJECT_DIR, exist_ok=True)
else:
    drive.mount('/content/drive')
    PROJECT_DIR = '/content/drive/MyDrive/GSoC_SDM_Project'

# ==============================================================================
# === spatial coverage
# ==============================================================================
# Bounding box of the study area, in decimal degrees.
LATMIN = 24.11637699635014
LATMAX = 26.11949526731449
LONMIN = -82.51572158798965
LONMAX = -79.61106009492724
# ==============================================================================

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
!pip install rasterio pandas geopandas matplotlib  requests xarray netCDF4



Importing libraries

In [7]:
import pandas as pd
%matplotlib inline

Load the cleaned occurrence data saved to the project directory in step 01

In [8]:
# Load the occurrence records written by step 01 (stored as a parquet file
# inside the project directory).
occurrences_path = f'{PROJECT_DIR}/occurrences.parquet'
df_clean = pd.read_parquet(occurrences_path)

Function to prepare data for environmental sampling

In [9]:
from typing import Optional, List
import pandas as pd

def prepare_for_environmental_sampling(
    df: pd.DataFrame,
    species_column: str = 'scientificName',
    lat_column: str = 'decimalLatitude',
    lon_column: str = 'decimalLongitude',
    additional_columns: Optional[List[str]] = None
) -> pd.DataFrame:
    """Reduce an occurrence table to the columns needed for sampling.

    Parameters
    ----------
    df : pd.DataFrame
        Occurrence records.
    species_column, lat_column, lon_column : str
        Names of the species, latitude and longitude columns. All three must
        exist in ``df``.
    additional_columns : list of str, optional
        Extra columns to carry along. Names that are not present in ``df`` are
        silently skipped; names that repeat an already selected column are
        kept only once.

    Returns
    -------
    pd.DataFrame
        A new frame (``df`` is not modified) whose first column is
        ``occurrence_id`` (0..n-1 in row order), followed by the species,
        latitude, longitude and any additional columns.

    Raises
    ------
    KeyError
        If the species, latitude or longitude column is missing from ``df``.
    """
    requested = [species_column, lat_column, lon_column]

    if additional_columns:
        requested.extend(col for col in additional_columns if col in df.columns)

    # De-duplicate while preserving order: selecting the same label twice would
    # produce duplicate columns, which the reorder below would then multiply.
    columns = list(dict.fromkeys(requested))

    prepared_df = df[columns].copy()

    # Sequential identifier; overwrites any pre-existing 'occurrence_id' values.
    prepared_df['occurrence_id'] = range(len(prepared_df))

    # Put the identifier first, keep the remaining columns in selection order.
    ordered = ['occurrence_id'] + [col for col in columns if col != 'occurrence_id']
    return prepared_df[ordered]

This is just an example for testing the general functions


In [10]:
# TODO: move this to pyOBIS notebook

def save_occurrence_data(
    df: pd.DataFrame,
    filename: str,
    format: str = 'csv'
) -> None:
    """Write ``df`` to ``<filename>.csv`` or ``<filename>.parquet``.

    Parameters
    ----------
    df : pd.DataFrame
        Table to save; the index is not written.
    filename : str
        Output path without extension.
    format : str
        Either ``'csv'`` (default) or ``'parquet'``.

    Raises
    ------
    ValueError
        If ``format`` is neither ``'csv'`` nor ``'parquet'``.
    """
    # Reject unknown formats up front so nothing is written for them.
    if format not in ('csv', 'parquet'):
        raise ValueError(f"Unsupported format: {format}")

    if format == 'parquet':
        df.to_parquet(f"{filename}.parquet", index=False)
        print(f"Data saved to {filename}.parquet")
        return

    df.to_csv(f"{filename}.csv", index=False)
    print(f"Data saved to {filename}.csv")


Bringing in environmental data: functions for environmental data sampling

In [11]:
import rasterio
import xarray as xr
import numpy as np
from typing import Union, List, Tuple, Optional, Dict

In [13]:
def add_environmental_data(
    df: pd.DataFrame,
    raster_path: str,
    column_name: str = 'temperature',
    lat_col: str = 'decimalLatitude',
    lon_col: str = 'decimalLongitude'
) -> pd.DataFrame:
    """Sample the first band of a raster at every occurrence location.

    Parameters
    ----------
    df : pd.DataFrame
        Occurrence records containing ``lat_col`` and ``lon_col``.
    raster_path : str
        Path of a raster readable by ``rasterio.open``.
    column_name : str
        Name of the column that receives the sampled values.
    lat_col, lon_col : str
        Names of the latitude and longitude columns.
        NOTE(review): coordinates are passed to the raster as (lon, lat)
        without reprojection, so they presumably must be in the raster's
        CRS -- confirm for each input file.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with ``column_name`` added; samples equal to the
        raster's nodata value are stored as NaN. ``df`` itself is unchanged.
    """
    df_result = df.copy()

    with rasterio.open(raster_path) as src:
        # Build (x, y) pairs column-wise; iterating rows with iterrows() creates
        # a Series per row and is far slower for the same result.
        coords = list(zip(df[lon_col], df[lat_col]))

        nodata = src.nodata
        # Only the first band is used, so request just that band.
        # NOTE(review): if the raster defines no nodata value, nothing is
        # converted to NaN here -- check how points outside the raster extent
        # are reported by the installed rasterio version.
        values = [
            np.nan if nodata is not None and sample[0] == nodata else sample[0]
            for sample in src.sample(coords, indexes=1)
        ]
        df_result[column_name] = values

    print(f"Added {column_name}: {df_result[column_name].notna().sum()}/{len(df_result)} valid values")
    return df_result

# multidecadal environmental data from NOAA World Ocean Atlas
# https://www.ncei.noaa.gov/access/world-ocean-atlas-2023/bin/woa23.pl
# Each entry is (output column, raster file); layers are sampled in this order.
env_layers = [
    ('temperature', f'{PROJECT_DIR}/woa23_decav_t00_04_t_mn.tif'),
    ('temperature_sd', f'{PROJECT_DIR}/woa23_decav_t00_04_t_sd.tif'),
    # TODO: add more files here
]

# Start from the clean occurrences and append one column per layer.
df_with_env = df_clean
for layer_column, layer_path in env_layers:
    df_with_env = add_environmental_data(df_with_env, layer_path, layer_column)

print(df_with_env.head())

# Persist the combined table for the next step of the workflow.
df_with_env.to_parquet(f'{PROJECT_DIR}/occurrences_and_environment.parquet', engine='pyarrow', index=False)

Added temperature: 570/3013 valid values
Added temperature_sd: 570/3013 valid values
   decimalLatitude  decimalLongitude occurrenceStatus  date_year  \
0        25.138383        -80.624983                Q     2006.0   
1        24.996280        -80.649250                Q     2006.0   
2        25.248783        -80.403150                Q     2005.0   
3        25.252020        -80.397290                Q     2007.0   
4        25.472200        -80.325950                Q     2005.0   

       date_mid  temperature  temperature_sd  
0  1.147133e+12          NaN             NaN  
1  1.160006e+12    26.382675        2.261415  
2  1.115683e+12    27.391775        1.604826  
3  1.189469e+12          NaN             NaN  
4  1.129680e+12          NaN             NaN  
