In [7]:
import os
import pickle
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler,
                                   QuantileTransformer, StandardScaler)
from tqdm.auto import tqdm


# helper functions

In [8]:
def sort_loc_time(data_path: str, output_path: str) -> None:
    """Sort data by location and time and save to a parquet file."""
    df = pd.read_parquet(data_path)
    df = df.sort_values(['location', 'time'])
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(output_path)


# processing functions

In [9]:
def tft_process(path: str, hist_len: int, fut_len: int, output_path: str) -> None:
    """Create TFT training data from a sorted parquet file."""
    output_filename = Path(path).with_suffix('.pkl').name

    df = pd.read_parquet(path)
    df['time'] = pd.to_datetime(df['time'])
    # Extract day of year
    df['doy'] = df['time'].dt.dayofyear

    df = df[['time', 'location', 'latitude', 'longitude', 'tmin', 'tmax',
             'precipitation', 'radiation', 'photoperiod', 'swvl1',
             'sif_clear_inst']]
    df = df.dropna()
    df['time'] = pd.to_datetime(df['time'])

    meta_attrs = ['time', 'location']
    known_attrs = ['tmin', 'tmax', 'radiation', 'precipitation', 'swvl1', 'photoperiod', 'doy']
    static_attrs = ['latitude', 'longitude']
    categorical_attrs = []

    all_cols = list(df.columns)
    feature_cols = [c for c in all_cols if c not in meta_attrs]

    feature_map = {
        'static_feats_numeric': [c for c in feature_cols if c in static_attrs and c not in categorical_attrs],
        'static_feats_categorical': [c for c in feature_cols if c in static_attrs and c in categorical_attrs],
        'historical_ts_numeric': [c for c in feature_cols if c not in static_attrs and c not in categorical_attrs],
        'historical_ts_categorical': [c for c in feature_cols if c not in static_attrs and c in categorical_attrs],
        'future_ts_numeric': [c for c in feature_cols if c in known_attrs and c not in categorical_attrs],
        'future_ts_categorical': [c for c in feature_cols if c in known_attrs and c in categorical_attrs],
    }

    scalers = {'numeric': {}, 'categorical': {}}
    categorical_cardinalities = {}

    for col in tqdm(feature_cols, desc="fit_scalers"):
        if col in categorical_attrs:
            enc = LabelEncoder().fit(df[col].values)
            scalers['categorical'][col] = enc
            categorical_cardinalities[col] = df[col].nunique()
        else:
            if col == 'sif_clear_inst':
                scaler = StandardScaler()
            elif col == 'day_of_year':
                scaler = MinMaxScaler()
            else:
                scaler = QuantileTransformer(n_quantiles=256)
            scalers['numeric'][col] = scaler.fit(df[col].astype(float).values.reshape(-1, 1))

    for col in tqdm(feature_cols, desc="transform"): 
        if col in categorical_attrs:
            df[col] = df[col].astype(np.int32)
        else:
            df[col] = scalers['numeric'][col].transform(df[col].values.reshape(-1, 1)).squeeze().astype(np.float32)

    train_subset = df[(df['time'] >= datetime(1982, 1, 1)) &
                      (df['time'] < datetime(2012, 1, 1))]
    
    val_subset   = df[(df['time'] >= datetime(2012, 1, 1)) &
                      (df['time'] < datetime(2017, 1, 1))]
    
    test_subset  = df[(df['time'] >= datetime(2017, 1, 1)) &
                      (df['time'] < datetime(2022, 1, 1))]
    
    subsets = {'train': train_subset,
               'validation': val_subset,
               'test': test_subset}

    data_sets = {k: {} for k in ['train', 'validation', 'test']}
    for subset in subsets.values():
        subset['id'] = subset['location'].astype(str) + '_' + subset['time'].astype(str)

    for subset_key, subset_data in subsets.items():
        samp_interval = hist_len + fut_len
        for i in range(0, len(subset_data), samp_interval):
            slc = subset_data.iloc[i:i + samp_interval]
            if len(slc) < samp_interval or slc.iloc[0]['location'] != slc.iloc[-1]['location']:
                continue
            data_sets[subset_key].setdefault('time_index', []).append(slc.iloc[hist_len - 1]['location'])
            data_sets[subset_key].setdefault('static_feats_numeric', []).append(
                slc.iloc[0][feature_map['static_feats_numeric']].values.astype(np.float32))
            data_sets[subset_key].setdefault('static_feats_categorical', []).append(
                slc.iloc[0][feature_map['static_feats_categorical']].values.astype(np.int32))
            data_sets[subset_key].setdefault('historical_ts_numeric', []).append(
                slc.iloc[:hist_len][feature_map['historical_ts_numeric']].values.astype(np.float32))
            data_sets[subset_key].setdefault('historical_ts_categorical', []).append(
                slc.iloc[:hist_len][feature_map['historical_ts_categorical']].values.astype(np.int32))
            data_sets[subset_key].setdefault('future_ts_numeric', []).append(
                slc.iloc[hist_len:][feature_map['future_ts_numeric']].values.astype(np.float32))
            data_sets[subset_key].setdefault('future_ts_categorical', []).append(
                slc.iloc[hist_len:][feature_map['future_ts_categorical']].values.astype(np.int32))
            data_sets[subset_key].setdefault('target', []).append(
                slc.iloc[hist_len:]['sif_clear_inst'].values.astype(np.float32))
            data_sets[subset_key].setdefault('id', []).append(
                slc.iloc[hist_len:]['id'].values.astype(str))

    for set_key, comps in data_sets.items():
        for arr_key, arr in comps.items():
            data_sets[set_key][arr_key] = np.array(arr)

    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / output_filename, 'wb') as f:
        pickle.dump({'data_sets': data_sets,
                     'feature_map': feature_map,
                     'scalers': scalers,
                     'categorical_cardinalities': categorical_cardinalities}, f, pickle.HIGHEST_PROTOCOL)


In [10]:
path= '/home/jovyan/research_code/Transformers/temportal_fusion_transformers/data/CSIFMETEO/BDT_50_20/sorted_BDT_50_20_merged_1982_2021_US_MMS.parquet'
hist_len = 60
fut_len = 10
output_path = '/home/jovyan/phenology-ml-clm/data/'
tft_process(path, hist_len, fut_len, output_path)

fit_scalers:   0%|          | 0/9 [00:00<?, ?it/s]

transform:   0%|          | 0/9 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['id'] = subset['location'].astype(str) + '_' + subset['time'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['id'] = subset['location'].astype(str) + '_' + subset['time'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['id'] = subset['location'].astype(st

In [10]:
!ls ../data/


sorted_BDT_50_20_merged_1982_2021_US_MMS.pkl


In [7]:
!pwd

/home/jovyan/phenology-ml-clm/notebooks


In [6]:
path= '/home/jovyan/research_code/Transformers/temportal_fusion_transformers/data/CSIFMETEO/BDT_50_20/sorted_BDT_50_20_merged_1982_2021_US_MMS.parquet'

df = pd.read_parquet(path)
df['time'] = pd.to_datetime(df['time'])

# Extract day of year
df['doy'] = df['time'].dt.dayofyear

df

Unnamed: 0,time,location,sif_clear_inst,tmin,tmax,radiation,precipitation,latitude,longitude,soil,photoperiod,swvl1,doy
0,1982-01-15,13243,0.106985,247.8750,271.5625,7538240.0,0.000977,39.50,-86.50,4.0,9.504436,-1.320243e-05,15
1,1982-01-16,13243,0.105275,248.4375,273.1875,11439232.0,0.004761,39.50,-86.50,4.0,9.527534,-1.043081e-06,16
2,1982-01-17,13243,0.103565,240.5000,256.9375,6629952.0,0.000916,39.50,-86.50,4.0,9.551383,-1.043081e-06,17
3,1982-01-18,13243,0.101855,257.0000,270.3750,8585728.0,0.000000,39.50,-86.50,4.0,9.575972,-1.043081e-06,18
4,1982-01-19,13243,0.100145,263.9375,274.2500,2473216.0,0.000122,39.50,-86.50,4.0,9.601283,1.111627e-05,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58379,2021-12-27,13437,0.077710,280.7500,292.2500,3886656.0,0.003540,39.25,-86.25,4.0,9.251109,-1.233816e-05,361
58380,2021-12-28,13437,0.077557,278.5000,286.8750,1294464.0,0.016479,39.25,-86.25,4.0,9.257565,-5.960464e-08,362
58381,2021-12-29,13437,0.077403,278.6250,282.0000,3664384.0,0.000488,39.25,-86.25,4.0,9.264963,1.221895e-05,363
58382,2021-12-30,13437,0.077249,277.6875,282.9375,3648448.0,0.002014,39.25,-86.25,4.0,9.273299,1.221895e-05,364
