This notebook processes Sentinel-2 time-series data to extract relevant features and saves the entire dataset as one Parquet file per configuration. Each configuration combines a feature-extraction method with a time span of 1, 2, or 3 years. The data includes features such as amplitude, phase, offset, elevation, and aspect, along with the target variable for classification (deciduous or evergreen). Storing each configuration in Parquet keeps storage efficient and lets the subsequent cross-validation, feature selection, and model evaluation steps reuse the same files.

In [None]:
import os
from tqdm import tqdm
from utils import load_data

# Silence every warning for the rest of the session so that library noise
# does not clutter the per-config progress output below.
import warnings
warnings.filterwarnings("ignore")

# Feature-extraction variants to export. Each one is combined with every
# time span in `years` to form a configuration name.
methods = [
    "resampled_no_weights",
    "no_resample_no_weights",
    "no_resample_cloud_weights",
    "no_resample_cloud_disturbance_weights",
]

# Time spans; they appear in the configuration name as "<n>Y".
years = [1, 2, 3]

# Configuration names in method-major order: every span of the first
# method, then every span of the second, and so on.
configs = []
for method in methods:
    for year in years:
        configs.append(f"{method}_{year}Y")

# Input tiles are read from here; outputs go to ./data, created on demand.
data_dir = '/Users/arthurcalvi/Data/species/validation/tiles'
os.makedirs('data', exist_ok=True)

for config in tqdm(configs):
    print(f'Processing config: {config}')

    # `all_weights` is returned by load_data but not used in this cell.
    data, all_weights, tile_to_greco = load_data(data_dir, config)

    # Label each row via its tile id.
    # NOTE(review): presumably tile_to_greco maps tile ids to GRECO
    # regions -- confirm against utils.load_data.
    greco_region = data['tile_id'].map(tile_to_greco)
    data['greco_region'] = greco_region

    # One Parquet file per configuration, holding the entire dataset.
    output_path = f'data/entire_dataset_{config}.parquet'
    data.to_parquet(output_path)

print("All configurations processed and saved.")
