In [1]:
import sys
import pathlib
import os
from skmap.catalog import s3_setup, DataCatalog
from skmap.loader import TiledDataLoader
from skmap.overlay import SpaceOverlay, SpaceTimeOverlay
from skmap.misc import find_files, GoogleSheet, ttprint
import random
import pandas as pd
import time
import skmap_bindings as sb
import numpy as np
from shapely.geometry import Point
from geopandas import gpd 
from pathlib import Path
import rasterio
from shapely.geometry import box
# warnings.filterwarnings('default')

folder_path = '/home/xuemeng/work_xuemeng/soc/SoilHealthDataCube/data'
df = pd.read_parquet(f'{folder_path}/003_data_overlaid.organized.pq')


# Split train, test and calibration dataset

In [2]:
from sklearn.model_selection import train_test_split

# Step 1: Bin `hzn_dep`
bins = [0, 20, 50, 100, 200]
labels = ['0-20', '20-50', '50-100', '100-200']
df['hzn_dep_bin'] = pd.cut(df['hzn_dep'], bins=bins, labels=labels, include_lowest=True)

# Step 2: Pre-isolate "water areas & wetland"
water_wetland_df = df[df['lc_survey'] == 'water areas & wetland']
other_lc_df = df[df['lc_survey'] != 'water areas & wetland']

# Step 3: Create stratification column for other land covers
other_lc_df['stratify_col'] = (
    other_lc_df['time'].astype(str) + "_" +
    other_lc_df['hzn_dep_bin'].astype(str) + "_" +
    other_lc_df['lc_survey'].astype(str)
)

# Step 4: Merge classes with too few samples for other land covers
class_counts = other_lc_df['stratify_col'].value_counts()
underrepresented_classes = class_counts[class_counts < 3].index
other_lc_df['stratify_col'] = other_lc_df['stratify_col'].replace(underrepresented_classes, 'merged_class')

# Step 5: Split other land covers
rest_other_lc, test_other_lc = train_test_split(
    other_lc_df,
    test_size=0.05,
    stratify=other_lc_df['stratify_col'],
    random_state=42
)

# Step 6: Split "water areas & wetland" separately
rest_water_wetland, test_water_wetland = train_test_split(
    water_wetland_df,
    test_size=0.2,  # Adjust as needed (e.g., 1:2 split between test and rest)
    random_state=42
)

# Step 7: Combine test and rest datasets
test_df = pd.concat([test_other_lc, test_water_wetland]).reset_index(drop=True)

# step 8: split the rest into train and calibration
train_other_lc, cal_other_lc = train_test_split(
    rest_other_lc,
    test_size=0.115,  # calibration: train = 15:80
    stratify=rest_other_lc['stratify_col'],
    random_state=42
)

# Step 9: Split "water areas & wetland" separately for train & cal
train_water_wetland, cal_water_wetland = train_test_split(
    rest_water_wetland,
    test_size=0.4, 
    random_state=42
)

# Step 10: Combine train and cal datasets
train_df = pd.concat([train_other_lc, train_water_wetland]).reset_index(drop=True)
cal_df = pd.concat([cal_other_lc, cal_water_wetland]).reset_index(drop=True)

# Step 11: Print dataset sizes and validate
print(f"Original DataFrame size: {len(df)}")
print(f"Test DataFrame size: {len(test_df)}")
print(f"Train DataFrame size: {len(train_df)}")
print(f"Calibration DataFrame size: {len(cal_df)}")


# Drop `stratify_col` from test data
test_df = test_df.drop(columns=['stratify_col'])
cal_df = cal_df.drop(columns=['stratify_col'])
train_df = train_df.drop(columns=['stratify_col'])

test_df.to_parquet(f'{folder_path}/004_data_test.pq')
cal_df.to_parquet(f'{folder_path}/005_data_cal.pq')
train_df.to_parquet(f'{folder_path}/006_data_train.pq')

print("Data successfully split into test, calibration, and training sets.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_lc_df['stratify_col'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other_lc_df['stratify_col'] = other_lc_df['stratify_col'].replace(underrepresented_classes, 'merged_class')


Original DataFrame size: 45616
Test DataFrame size: 2283
Train DataFrame size: 38348
Calibration DataFrame size: 4985
Data successfully split into test, calibration, and training sets.


In [3]:
print(len(test_df['time'].unique()),len(test_df['lc_survey'].unique()), len(test_df['hzn_dep_bin'].unique()))

19 8 4


In [4]:
print(len(train_df['time'].unique()),len(train_df['lc_survey'].unique()), len(train_df['hzn_dep_bin'].unique()))

19 8 4


In [5]:
print(len(cal_df['time'].unique()),len(cal_df['lc_survey'].unique()), len(cal_df['hzn_dep_bin'].unique()))

19 8 4


In [6]:
print(len(df['time'].unique()),len(df['lc_survey'].unique()), len(df['hzn_dep_bin'].unique()))

19 8 4


In [7]:
len(test_df)/len(df)

0.050048228691687124

In [8]:
len(train_df)/len(df)

0.8406699403717993

In [9]:
len(cal_df)/len(df)

0.1092818309365135