# Libraries

## I/O

In [1]:
import os
from pprint import pprint
import pickle
from util_IO import (
    load_pickle_from_main_project_dir,
    load_attributes_df,
    load_timeseries_df
)
from datetime import datetime
from tqdm import tqdm

## Analysis

In [2]:
import pandas as pd
import numpy as np
from keras.utils import timeseries_dataset_from_array

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

2025-05-09 13:12:47.298824: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-09 13:12:48.257109: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/x86_64-linux-gnu/:/opt/conda/lib
2025-05-09 13:12:48.257217: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local

# Settings

## Packages

In [3]:
# Widen pandas' display limits: up to 300 columns and 1000 rows are rendered
pd.options.display.max_columns = 300
pd.options.display.max_rows = 1000

# Silence the SettingWithCopyWarning
pd.set_option('mode.chained_assignment', None)

## Parameters

### Load metadata from *1-DataAggregation.ipynb*

In [4]:
# Load the pickled parameters; the loader's result is unpacked into the
# parameters dictionary and a second value stored as `camels_gb_use_case_dir`
_loaded_metadata = load_pickle_from_main_project_dir('aggr_parameters_dict.pkl')
aggr_parameters_dict, camels_gb_use_case_dir = _loaded_metadata

# Show the imported parameters
pprint(aggr_parameters_dict)

{'attributes': {'aggregations': {'fundamental': {'chalk_streams_df': ['chalk_stream_flag'],
                                                 'climatic_attributes_df': [],
                                                 'humaninfluence_attributes_df': ['surfacewater_abs',
                                                                                  'groundwater_abs',
                                                                                  'discharges',
                                                                                  'num_reservoir',
                                                                                  'reservoir_cap'],
                                                 'hydrogeology_attributes_df': [],
                                                 'hydrologic_attributes_df': ['baseflow_index'],
                                                 'hydrometry_attributes_df': ['bankfull_flow'],
                                                 'landc

In [5]:
# Pull the individual settings used below out of the aggregation metadata
_timeseries_params = aggr_parameters_dict['timeseries']
date_field = _timeseries_params['date_field']
label_field = _timeseries_params['label_field']
attributes_index = aggr_parameters_dict["attributes"]["attributes_index"]

# Directories are stored under keys named like the variables themselves
(
    camels_gb_silver_dir,
    camels_gb_data_attributes_aggr_dir,
    camels_gb_data_timeseries_aggr_dir,
) = (
    aggr_parameters_dict[_dir_key]
    for _dir_key in (
        'camels_gb_silver_dir',
        'camels_gb_data_attributes_aggr_dir',
        'camels_gb_data_timeseries_aggr_dir',
    )
)

### Define reference data

In [6]:
# Setting: when True the next cell builds a small synthetic dataset instead of
# loading the aggregated CSV files, intermediate cells print/display full objects
# rather than shapes, and the final `model_feed` dictionary is neither built nor saved
example_input = False

In [7]:
if example_input:

    # Example 2D dataset with date, dynamic, and label columns:
    # 20 + 15 + 10 daily rows for three catchments
    n_example_obs = 20 + 15 + 10
    timeseries = {
        date_field: pd.date_range(start='2023-01-01', periods=n_example_obs, freq='D'),
        'catchmentID': ['10002'] * 20 + ['10003'] * 15 + ['10004'] * 10,
        # Each catchment is split into a '00' and a '01' group (two-character labels throughout)
        f"{date_field}_group": ['00'] * 15 + ['01'] * 5 + ['00'] * 8 + ['01'] * 7 + ['00'] * 8 + ['01'] * 2,
        'precipitation': np.random.rand(n_example_obs)*100,
        'temperature': np.random.rand(n_example_obs)*100,
        'humidity': np.random.rand(n_example_obs)*100,
        'time_ref': range(n_example_obs),
        label_field: np.arange(n_example_obs)*20
    }

    # Static variables
    attributes = {
        'catchmentID': ['10002', '10003', '10004'],
        'silt_perc': [10, 20, 30],
        'clay_perc': [0.5, 0.6, 0.7]
    }

    # Create DataFrames
    attributes_df = pd.DataFrame(attributes).set_index('catchmentID')
    timeseries_df = pd.DataFrame(timeseries)

    # Variables to be windowed
    X_vars_names = [
        "precipitation",
        "temperature",
        "humidity",
        "time_ref"
    ]

    # Define which columns need to be scaled with specific scaler
    X_minmax_columns_names_list = ['time_ref']  # Column names for MinMaxScaler
    X_standard_columns_names_list = ['precipitation', 'temperature', 'humidity']  # Column names for StandardScaler

    # Define window size
    sequence_length = 2

    # Define label field for model
    label_field_model = label_field

    # Time range's first year
    start_year = 2020

    # NO `example_input` parameters
    bfi = cs = csf = "N"

else:

    # __________
    # Timeseries

    timeseries_df = load_timeseries_df(
        camels_gb_data_timeseries_aggr_dir,
        "timeseries_postFEa.csv",
        date_field
    )

    display(timeseries_df.head(3))


    # __________
    # Attributes

    attributes_df = load_attributes_df(
        camels_gb_data_attributes_aggr_dir,
        "fundamental_postFEa.csv",
        attributes_index
    )

    display(attributes_df.head(3))


    # _____________
    # Aux variables

    # Variables to be windowed
    X_vars_names = [
        "precipitation",
        "temperature",
        "humidity",
        "shortwave_rad",
        "longwave_rad",
        "windspeed",
        "sin_year",
        "cos_year",
        "time_ref"
    ]

    # Column names for MinMaxScaler
    X_minmax_columns_names_list = [
        'time_ref'
    ]

    # Column names for StandardScaler
    X_standard_columns_names_list = [
        "precipitation",
        "temperature",
        "humidity",
        "shortwave_rad",
        "longwave_rad",
        "windspeed"
    ]

    # Define window size
    sequence_length = 30 # 30 15 10 5

    # Define transformed label
    label_transformation = "log1p"

    # Define label field for model
    label_transformed_field = f"{label_transformation}_{label_field}"

    #___________________________
    # "Moving fields" management

    # Which one do you want??
    label_field_model = label_transformed_field # label_transformed_field / label_field

    # Time range's first year
    start_year = 1985

    # Do you want to include `baseflow_index`?
    bfi = "N" # "Y" / "N"

    # Do you want to include chalk stream catchments?
    cs = "N" # "Y" / "N"

    # Do you want to include `chalk_stream_flag`?
    csf = "N" # "Y" / "N"

    # Overwrite, as `chalk_stream_flag` is useless without chalk stream catchments..
    #..as it would be a column of `False`, but cs == "Y" and csf = "N" is allowed
    if cs == "N":
        csf = "N"
    #___________________________

# Get sets to perform checks and warnings (built the same way for both inputs)
X_minmax_columns_names_set = set(X_minmax_columns_names_list)
X_standard_columns_names_set = set(X_standard_columns_names_list)

Unnamed: 0,catchmentID,date,precipitation,temperature,humidity,shortwave_rad,longwave_rad,windspeed,discharge_vol,date_group,log1p_discharge_vol,sin_year,cos_year,time_ref,time_ref_scaled
0,101002,1997-03-01,0.26,8.31,6.11,73.68,320.09,6.23,0.319,0,0.276874,-0.326029,0.94536,9648,0.58704
1,101002,1997-03-02,0.1,9.55,5.64,89.11,315.49,6.51,0.314,0,0.273076,-0.309718,0.950828,9649,0.587101
2,101002,1997-03-03,24.15,5.84,4.98,51.62,320.09,2.75,0.397,0,0.334327,-0.293316,0.956015,9650,0.587162


Unnamed: 0_level_0,baseflow_index,sand_perc,silt_perc,clay_perc,organic_perc,gauge_elev,area,dpsbar,elev_mean,elev_min,elev_10,elev_50,elev_90,elev_max,dwood_perc,ewood_perc,grass_perc,shrub_perc,crop_perc,urban_perc,inwater_perc,bares_perc,surfacewater_abs,groundwater_abs,discharges,num_reservoir,reservoir_cap,chalk_stream_flag
gauge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
53018,0.57,31.51,32.07,36.42,0.74,18.0,1569.35,48.9,109.0,20.3,53.3,104.0,172.1,304.6,6.38,0.65,50.46,0.04,35.01,6.91,0.19,0.48,0.034,0.051,0.066,0,0,False
33034,0.77,56.74,23.23,20.03,1.31,7.2,707.75,16.3,42.0,8.1,25.3,41.7,60.0,94.6,8.49,7.07,13.07,0.33,65.76,5.14,0.3,0.0,0.004,0.046,0.02,0,0,True
55018,0.56,36.89,36.01,27.09,0.47,55.4,143.82,68.0,138.0,57.2,77.0,142.3,199.2,252.4,5.4,0.12,56.11,0.0,36.59,1.97,0.04,0.03,0.0,0.0,0.007,0,0,False


#### Checks & Warnings

In [8]:
# A column must not be assigned to both scalers
assert X_minmax_columns_names_set.isdisjoint(X_standard_columns_names_set), (
    "There are repetition between columns to be scaled with MinMax scaler and those to be scaled with Standard scaler"
)

# Warnings
# Columns assigned to no scaler, listed in `X_vars_names` order (duplicates removed).
# A set difference would yield a hash-dependent order, and this list later fixes the
# position of the passthrough columns in the scaled arrays and in `X_cols_names`.
_X_scaled_columns_names_set = X_minmax_columns_names_set | X_standard_columns_names_set
X_no_scaled_columns_names_list = list(
    dict.fromkeys(
        col for col in X_vars_names if col not in _X_scaled_columns_names_set
    )
)
if X_no_scaled_columns_names_list:
    print("Columns that will NOT be scaled:")
    pprint(X_no_scaled_columns_names_list)

Columns that will NOT be scaled:
['sin_year', 'cos_year']


In [9]:
# Show the whole attributes table, only when running on the example input
if example_input:
    display(attributes_df)

In [10]:
# Show the whole time-series table, only when running on the example input
if example_input:
    display(timeseries_df)

### General parameters

In [11]:
# Fraction of each catchment-group series used for training; the rest is the test share
train_size = 0.7
test_size = 1 - train_size

# Number of dynamic variables carried by every window
X_n_vars = len(X_vars_names)

# Minimum series length accepted later on: `sequence_length` observations
# must fit in the test share of the series
min_required_obs = sequence_length / test_size

# Attributes

## `baseflow_index`

In [12]:
# Column count before the optional removal, for the consistency check below
n_col_pre = attributes_df.shape[1]

if bfi == "N":

    # `baseflow_index` is not wanted: rebind the frame without that column
    attributes_df = attributes_df.drop(columns=['baseflow_index'])

    assert n_col_pre - attributes_df.shape[1] == 1, ('Inconsistency after trying to remove `baseflow_index`')

## Chalk streams

In [13]:
# Shape before any removal, used by the consistency checks below
n_row_pre, n_col_pre = attributes_df.shape

# If chalk stream catchments need to be removed
if cs == "N":

    # Define chalk stream catchments list (useful for time series)
    chalk_streams_list = attributes_df[attributes_df['chalk_stream_flag'].eq(True)].index.to_list()

    # Remove chalk stream catchments
    attributes_df = attributes_df[~attributes_df.index.isin(chalk_streams_list)]

    assert n_row_pre > attributes_df.shape[0], ('Inconsistency after trying to remove chalk stream catchments')

# If either "N" conditions on chalk streams are met
if cs == "N" or csf == "N":

    # Remove `chalk_stream_flag` (non in-place: the frame may be a filtered slice at this point)
    attributes_df = attributes_df.drop(columns=['chalk_stream_flag'])

    # Exactly one column must have disappeared
    assert n_col_pre - attributes_df.shape[1] == 1, ('Inconsistency after trying to remove `chalk_stream_flag`')

## Scaler details

In [None]:
# All attribute columns, in data-frame order
X_static_vars_names = attributes_df.columns.to_list()

# Every attribute is standard-scaled unless moved to the "no scaling" list below
X_static_standard_columns_names_list = list(X_static_vars_names)
X_static_no_scaled_columns_names_list = []

# Columns exempt from scaling when they are kept:
# 'baseflow_index' spans [0, 1] by construction, 'chalk_stream_flag' is already Boolean
for _keep_flag, _column_name in (
    (bfi, 'baseflow_index'),
    (csf, 'chalk_stream_flag'),
):
    if _keep_flag == "Y":
        X_static_standard_columns_names_list.remove(_column_name)
        X_static_no_scaled_columns_names_list.append(_column_name)

# Timeseries

## Filter for time range

In [15]:
# Keep observations from 30 September of `start_year` onwards
# (day and month chosen because most recent observation: 30/09/2015)
_first_kept_date = datetime(start_year, 9, 30)
_is_in_time_range = pd.to_datetime(timeseries_df[date_field]) >= _first_kept_date
timeseries_df = timeseries_df[_is_in_time_range]

## Chalk streams

In [16]:
# Row count before the optional removal, for the consistency check below
n_row_pre = timeseries_df.shape[0]

if cs == "N":

    # Drop every observation belonging to a chalk stream catchment
    _is_chalk_stream_row = timeseries_df['catchmentID'].isin(chalk_streams_list)
    timeseries_df = timeseries_df[~_is_chalk_stream_row]

    assert n_row_pre > timeseries_df.shape[0], ("Inconsistency after trying to remove chalk stream catchments' TS")

## Sensor data windowing and registry (function definition)

In [17]:
def create_timeseries_for_sensor(sensor_data,
                                 sensor_id,
                                 group,
                                 sequence_length=15,
                                 sampling_rate=1,
                                 sequence_stride=1
):
    """Split one catchment-group series into train/test and cut both into windows.

    The series is split chronologically (no shuffling) using the module-level
    `train_size`; each part is windowed with
    `keras.utils.timeseries_dataset_from_array`. The target of a window is the
    label observed at the window's last timestep. Reads the module-level
    `X_vars_names`, `label_field_model`, `date_field` and `X_n_vars`.

    Parameters
    ----------
    sensor_data : pandas.DataFrame
        Observations of a single catchment-group; must contain the columns in
        `X_vars_names`, `label_field_model` and `date_field`.
    sensor_id
        Value written in the registry column 'catchmentID'.
    group
        Value written in the registry column 'group'.
    sequence_length : int
        Number of timesteps per window.
    sampling_rate : int
        Distance, in rows, between consecutive timesteps of a window.
    sequence_stride : int
        Distance, in rows, between the starts of consecutive windows.

    Returns
    -------
    tuple
        `(sequence_train, target_train), (sequence_test, target_test),
        (sensor_registry_train_df, sensor_registry_test_df)`; sequences have
        shape `(n_windows, sequence_length, X_n_vars)`, targets `(n_windows,)`,
        and each registry has one row per window with the columns
        'catchmentID', 'group', 'start_date', 'end_date'.

    Raises
    ------
    AssertionError
        If windows, targets and registry rows do not have the same count.
    """

    # ____________________________________
    # Split train/test (without shuffling)

    # Features, labels and timestamps are split together so they stay aligned
    X_train, X_test, y_train, y_test, timestamps_train, timestamps_test = (
        train_test_split(
            sensor_data[X_vars_names].values,
            sensor_data[label_field_model].values,
            sensor_data[date_field].values,
            train_size=train_size,
            shuffle=False
        )
    )

    # Row offset between the first and the last timestep of a window
    window_span = (sequence_length - 1) * sampling_rate

    # _________
    # Windowing
    windowed = {}
    for curr_set, X, y, timestamps in (
        ('train', X_train, y_train, timestamps_train),
        ('test', X_test, y_test, timestamps_test),
    ):

        # ______________________________
        # Windowing and numpy conversion

        # `targets[i]` is paired with the window starting at row i, hence the
        # label of the window's last timestep is found `window_span` rows ahead
        dataset = timeseries_dataset_from_array(
            X,
            targets=y[window_span:],
            sequence_length=sequence_length,
            sampling_rate=sampling_rate,
            sequence_stride=sequence_stride,
            batch_size=1 # One window per batch, so that batches can be stacked below
        )

        # Collect the data from batches in the dataset
        sequence_list = []
        target_list = []
        for curr_sequence, curr_target in dataset:
            sequence_list.append(curr_sequence)
            target_list.append(curr_target)

        # Convert into a 3D numpy array (and a 1D array for the targets)
        sequence = np.array(sequence_list).reshape(-1, sequence_length, X_n_vars)
        target = np.array(target_list).reshape(-1)


        # ____________________
        # Observation registry

        # Start and end dates of each window: starts are `sequence_stride` rows
        # apart and every window ends `window_span` rows after its start
        window_bounds = [
            (timestamps[start], timestamps[start + window_span])
            for start in range(0, len(timestamps) - window_span, sequence_stride)
        ]

        # Generate data frame to store registry
        sensor_registry_df = pd.DataFrame(window_bounds, columns=['start_date', 'end_date'])
        sensor_registry_df.insert(0, 'group', group)
        sensor_registry_df.insert(0, 'catchmentID', sensor_id)


        # ______
        # Checks
        n_sequence = sequence.shape[0]
        n_target = target.shape[0]
        n_timestamps = len(window_bounds)
        assert n_sequence == n_target == n_timestamps, "Sensor' sequence, target, and timestamp for data frame must have the same length for first dimension"

        # ______________
        # Set allocation
        windowed[curr_set] = (sequence, target, sensor_registry_df)

    sequence_train, target_train, sensor_registry_train_df = windowed['train']
    sequence_test, target_test, sensor_registry_test_df = windowed['test']

    return (sequence_train, target_train), (sequence_test, target_test), (sensor_registry_train_df, sensor_registry_test_df)

## Sensor data aggregation

In [None]:
# Per-group results are collected in lists and concatenated once after the loop:
# concatenating inside the loop re-copies everything accumulated so far at each iteration.
# The leading empty arrays keep the result well-formed when no group is long enough.
X_train_parts = [np.empty((0, sequence_length, X_n_vars))]
X_test_parts = [np.empty((0, sequence_length, X_n_vars))]
y_train_parts, y_test_parts = [np.array([])], [np.array([])]

# Per-group registry data frames
registry_train_parts = []
registry_test_parts = []

# Set verbosity
verbose=example_input

# Loop on sensor data to aggregate the data
for (curr_sensor_id, curr_date_group), curr_sensor_df in tqdm(
        timeseries_df.groupby(['catchmentID', f"{date_field}_group"]),
        desc="Processing catchmentID-groups"
):

    # Define current "sensor-group"
    curr_sensor_group = f"{curr_sensor_id}-{curr_date_group}"

    # Calculate number of observations
    n_curr_sensor_obs = curr_sensor_df.shape[0]

    # Check on the minimum of observation required
    if n_curr_sensor_obs >= min_required_obs:

        if verbose:
            print(f"{curr_sensor_group}... ", end="")

        # Function call
        (curr_X_train, curr_y_train), (curr_X_test, curr_y_test), (curr_registry_train_df, curr_registry_test_df)  = (
            create_timeseries_for_sensor(
                curr_sensor_df,
                curr_sensor_id,
                curr_date_group,
                sequence_length=sequence_length)
        )

        # Collect Train
        X_train_parts.append(curr_X_train)
        y_train_parts.append(curr_y_train)

        # Collect Test
        X_test_parts.append(curr_X_test)
        y_test_parts.append(curr_y_test)

        # Collect registry
        registry_train_parts.append(curr_registry_train_df)
        registry_test_parts.append(curr_registry_test_df)

        if verbose:
            print("OK!\t", end="")

    else:

        if verbose:
            print(f"{curr_sensor_group}: too short timeseries ({n_curr_sensor_obs} obs.)\t", end="")


# Aggregate everything in one pass
X_train = np.concatenate(X_train_parts, axis=0)
y_train = np.concatenate(y_train_parts, axis=0)
X_test = np.concatenate(X_test_parts, axis=0)
y_test = np.concatenate(y_test_parts, axis=0)

# `pd.concat` refuses an empty list, hence the fallback to an empty data frame
registry_train_df = (
    pd.concat(registry_train_parts, ignore_index=True) if registry_train_parts else pd.DataFrame()
)
registry_test_df = (
    pd.concat(registry_test_parts, ignore_index=True) if registry_test_parts else pd.DataFrame()
)

# Release the per-group pieces, now duplicated in the aggregated objects
del X_train_parts, y_train_parts, X_test_parts, y_test_parts
del registry_train_parts, registry_test_parts


# Checks
n_sequence = X_train.shape[0]
n_target = y_train.shape[0]
n_registry = registry_train_df.shape[0]
assert n_sequence == n_target == n_registry, "Train sequence, target must, and registry data frame must have the same length for first dimension"

n_sequence = X_test.shape[0]
n_target = y_test.shape[0]
n_registry = registry_test_df.shape[0]
assert n_sequence == n_target == n_registry, "Tests sequence, target must, and registry data frame must have the same length for first dimension"

Processing catchmentID-groups:   0%|          | 0/15522 [00:00<?, ?it/s]2025-05-09 13:13:29.392687: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-05-09 13:13:29.395449: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-05-09 13:13:29.397651: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-05-09 13:13:29.399781: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node z

In [None]:
# Whole array for the example input, only its shape otherwise
print(X_train if example_input else f"X train dimensions: {X_train.shape}")

X train dimensions: (2403628, 30, 9)


In [None]:
# Whole array for the example input, only its shape otherwise
print(y_train if example_input else f"y train dimensions: {y_train.shape}")

y train dimensions: (2403628,)


In [None]:
# Whole array for the example input, only its shape otherwise
print(X_test if example_input else f"X test dimensions: {X_test.shape}")

X test dimensions: (940116, 30, 9)


In [None]:
# Whole array for the example input, only its shape otherwise
print(y_test if example_input else f"y test dimensions: {y_test.shape}")

y test dimensions: (940116,)


## `X_train` and `X_test` scaling

### Columns management

In [None]:
# Translate each group of column names into positions within `X_vars_names`,
# i.e. positions along the last axis of the windowed arrays
X_minmax_columns_index = list(map(X_vars_names.index, X_minmax_columns_names_list))
X_standard_columns_index = list(map(X_vars_names.index, X_standard_columns_names_list))
X_no_scaled_columns_index = list(map(X_vars_names.index, X_no_scaled_columns_names_list))

### `multi_scaling` function

In [None]:
def multi_scaling(
    X,
    preprocessor
):
    """Apply a fitted preprocessor to every timestep of a 3D array.

    The array is flattened to 2D (one row per timestep), passed to
    `preprocessor.transform`, and the result is folded back to the original
    3D shape. The shape reached at each step is printed.

    Parameters
    ----------
    X : numpy.ndarray
        Array of shape `(n_samples, n_timesteps, n_features)`.
    preprocessor
        Object exposing `transform(2D array)`; its output must hold as many
        values as its input for the final reshape to succeed.

    Returns
    -------
    numpy.ndarray
        Transformed array of shape `(n_samples, n_timesteps, n_features)`.
    """

    # Remember the 3D layout
    original_shape = X.shape
    n_samples, n_timesteps, n_features = original_shape
    print(f"Input with dimensions:\t\t\t{original_shape}")

    # One row per timestep
    flat = X.reshape(-1, n_features)
    print(f"Reshaped to 2D with dimensions:\t\t{flat.shape}")

    # Transform the flattened data
    flat_scaled = preprocessor.transform(flat)
    print(f"After transformation dimensions:\t{flat_scaled.shape}")

    # Fold back into windows
    restored = flat_scaled.reshape(n_samples, n_timesteps, n_features)
    print(f"Reshaped to 3D with dimensions:\t\t{restored.shape}")

    return restored

### Scaling

In [None]:
# _______________________
# Define the preprocessor

# Flatten the train windows to 2D: this is the data the scalers are fitted on
X_mold = X_train.reshape(-1, X_n_vars)
print(f"Mold created with dimensions:\t\t\t\t{X_mold.shape}")

# Windowing repeats the same rows in several windows: keep each distinct row once
# so that the repetitions do not distort the fitted distribution
X_mold = np.unique(X_mold, axis=0)
print(f"Mold reduced by duplicated rows with dimensions:\t{X_mold.shape}")

# One transformer per scaling group; columns with no scaler pass through untouched
_X_transformers = [
    ('minmax', MinMaxScaler(), X_minmax_columns_index),
    ('standard', StandardScaler(), X_standard_columns_index),
    ('no_scale', 'passthrough', X_no_scaled_columns_index),
]
X_preprocessor = ColumnTransformer(transformers=_X_transformers)

# Column names in the order of the transformers above
X_cols_names = [
    *X_minmax_columns_names_list,
    *X_standard_columns_names_list,
    *X_no_scaled_columns_names_list,
]

# Fit on the train mold only
X_preprocessor.fit(X_mold)

# _____________
# Apply scaling

# Train
print("\nTrain")
X_train_scaled = multi_scaling(X_train, X_preprocessor)

# Test
print("\nTest")
X_test_scaled = multi_scaling(X_test, X_preprocessor)

Mold created with dimensions:				(72108840, 9)
Mold reduced by duplicated colums with dimensions:	(2567849, 9)

Train
Input with dimensions:			(2403628, 30, 9)
Reshaped to 2D with dimensions:		(72108840, 9)
After transformation dimensions:	(72108840, 9)
Reshaped to 3D with dimensions:		(2403628, 30, 9)

Test
Input with dimensions:			(940116, 30, 9)
Reshaped to 2D with dimensions:		(28203480, 9)
After transformation dimensions:	(28203480, 9)
Reshaped to 3D with dimensions:		(940116, 30, 9)


In [None]:
# Whole array for the example input, only its shape otherwise
print(X_train_scaled if example_input else f"Scaled X train dimensions:{X_train_scaled.shape}")

Scaled X train dimensions:(2403628, 30, 9)


In [None]:
# Whole array for the example input, only its shape otherwise
print(X_test_scaled if example_input else f"Scaled X test dimensions: {X_test_scaled.shape}")

Scaled X test dimensions: (940116, 30, 9)


## Train set shuffling

In [None]:
# Seed numpy's global generator so that the permutation is reproducible
np.random.seed(82)

# A single random ordering, shared by windows, targets and registry
n_samples = len(X_train_scaled)
shuffle_indices = np.random.permutation(n_samples)

# Reorder the X and y training sets
X_train_scaled_shuffled = X_train_scaled[shuffle_indices]
y_train_shuffled = y_train[shuffle_indices]

# Reorder the registry by position (🚩 without resetting the index!! 🚩)
registry_train_df_shuffled = registry_train_df.iloc[shuffle_indices]

In [None]:
# Whole array for the example input, only its shape otherwise
print(
    X_train_scaled_shuffled
    if example_input
    else f"Shuffled & scaled X train dimensions: {X_train_scaled_shuffled.shape}"
)

Shuffled & scaled X train dimensions: (2403628, 30, 9)


In [None]:
# Whole array for the example input, only its shape otherwise
print(
    y_train_shuffled
    if example_input
    else f"Shuffled & scaled y train dimensions: {y_train_shuffled.shape}"
)

Shuffled & scaled y train dimensions: (2403628,)


In [None]:
# Only the shape for the real data, the whole registry for the example input
if not example_input:
    print(f"Shuffled registry df dimensions: {registry_train_df_shuffled.shape}")
else:
    display(registry_train_df_shuffled)

Shuffled registry df dimensions: (2403628, 4)


# Attributes (*static variables*)

## Attributes merge via registry (function definition)

In [None]:
def merge_attributes(registry, attributes=None):
    """Attach the static attributes of its catchment to every registry row.

    Parameters
    ----------
    registry : pandas.DataFrame
        One row per window; must contain the column 'catchmentID'. Any other
        column is left out of the result.
    attributes : pandas.DataFrame, optional
        Static attributes indexed by catchment identifier. Defaults to the
        module-level `attributes_df`.

    Returns
    -------
    pandas.DataFrame
        The attribute columns, one row per registry row and in registry order,
        indexed by 'catchmentID'. Catchments missing from `attributes` yield
        rows of missing values (left join).

    Raises
    ------
    pandas.errors.MergeError
        If the index of `attributes` holds duplicated identifiers: the join
        would multiply rows and break the row alignment with the registry.
    """

    if attributes is None:
        attributes = attributes_df

    # Only the join key is needed from the registry; every other registry
    # column would be discarded after the join anyway
    merged_df = registry[['catchmentID']].merge(
        attributes,
        left_on='catchmentID',
        right_index=True,
        how='left',
        validate='many_to_one'
    )

    # Set the index to 'catchmentID'
    return merged_df.set_index('catchmentID')

## Train and test set merge & scale

In [None]:
# Get the indices of the columns for standard scaler
X_static_standard_columns_index = [X_static_vars_names.index(col) for col in X_static_standard_columns_names_list]

# ________________________
# Define transformers list
transformers=[
    ('standard', StandardScaler(), X_static_standard_columns_index)
]

if csf == "Y" or bfi == "Y":

    # Get the indices of the columns with no scaling
    X_static_no_scaled_columns_index = [X_static_vars_names.index(col) for col in X_static_no_scaled_columns_names_list]
    transformers.append(
         ('no_scale', 'passthrough', X_static_no_scaled_columns_index)
    )


# Column order of the preprocessor output: the transformers' outputs are placed side
# by side in list order, i.e. standard-scaled columns first, unscaled columns last.
# This is NOT the input column order as soon as an unscaled column exists.
X_static_cols_names = X_static_standard_columns_names_list + X_static_no_scaled_columns_names_list

# _____
# Train

# Merge
X_train_static_df = merge_attributes(registry_train_df_shuffled)

# Define the ColumnTransformer
X_static_preprocessor = ColumnTransformer(
    transformers=transformers
)

# Fit and transform, labelling the columns in the preprocessor's output order
X_train_static_scaled_df = (
    pd.DataFrame(
        X_static_preprocessor.fit_transform(X_train_static_df),
        columns=X_static_cols_names
    )
)


# ____
# Test

# Merge
X_test_static_df = merge_attributes(registry_test_df)

# Transform only (scalers fitted on train), same column labelling as train
X_test_static_scaled_df = (
    pd.DataFrame(
        X_static_preprocessor.transform(X_test_static_df),
        columns=X_static_cols_names
    )
)

In [None]:
# Only the shape for the real data, the whole scaled train attributes for the example input
if not example_input:
    print(f"Shuffled registry df AFTER merging dimensions: {X_train_static_scaled_df.shape}")
else:
    display(X_train_static_scaled_df)

Shuffled registry df AFTER merging dimensions: (2403628, 27)


In [None]:
# This cell reports on the TEST attributes: show the scaled test frame for the
# example input (the train frame is shown by the previous cell), its shape otherwise
if example_input:
    display(X_test_static_scaled_df)
else:
    print(f"Shuffled registry df AFTER merging dimensions: {X_test_static_scaled_df.shape}")

Shuffled registry df AFTER merging dimensions: (940116, 27)


# Save

## Reset train registry index 🚩

In [None]:
# Train registry: discard the index labels left over from the shuffling and
# renumber the rows from 0, matching the positions in the shuffled arrays
registry_train_df_shuffled = registry_train_df_shuffled.reset_index(drop=True)

display(registry_train_df_shuffled)

Unnamed: 0,catchmentID,group,start_date,end_date
0,54096,01,2001-12-13,2002-01-11
1,40004,31,2010-09-02,2010-10-01
2,33023,44,2004-09-01,2004-09-30
3,39061,05,1987-12-28,1988-01-26
4,76005,37,2004-03-03,2004-04-01
...,...,...,...,...
2403623,42008,02,1987-08-24,1987-09-22
2403624,44006,00,1986-09-22,1986-10-21
2403625,45005,16,1985-12-05,1986-01-03
2403626,76014,27,1999-08-20,1999-09-18


## Create the dictionary to collect data

In [None]:
if not example_input:
    model_feed = dict(
        # Run settings
        bfi=bfi,
        cs=cs,
        csf=csf,
        train_size=train_size,
        test_size=test_size,
        min_required_obs=min_required_obs,
        # Scaling groups of the dynamic and static variables
        X_minmax_columns_names_list=X_minmax_columns_names_list,
        X_standard_columns_names_list=X_standard_columns_names_list,
        X_no_scaled_columns_names_list=X_no_scaled_columns_names_list,
        X_static_standard_columns_names_list=X_static_standard_columns_names_list,
        X_static_no_scaled_columns_names_list=X_static_no_scaled_columns_names_list,
        # Windowed data: train is scaled and shuffled, test is scaled only
        X_train=X_train_scaled_shuffled,
        y_train=y_train_shuffled,
        X_test=X_test_scaled,
        y_test=y_test,
        # Scaled static attributes and the registries describing each window
        X_train_static_df=X_train_static_scaled_df,
        X_test_static_df=X_test_static_scaled_df,
        X_train_registry_df=registry_train_df_shuffled,
        X_test_registry_df=registry_test_df,
        # Fitted preprocessors and the column order of their outputs
        X_preprocessor=X_preprocessor,
        X_static_preprocessor=X_static_preprocessor,
        X_cols_names=X_cols_names,
        X_static_cols_names=X_static_cols_names,
        label_field_model=label_field_model,
    )

## Save the dictionary

In [None]:
# Store the dictionary as a pickle in the silver directory; the file name
# encodes window size, label field, first year and the bfi / cs / csf choices
if not example_input:
    _model_feed_file_name = f"model_feed-w{sequence_length}-{label_field_model}-{start_year}-bfi_{bfi}-cs_{cs}_csf-{csf}.pkl"
    _model_feed_path = os.path.join(camels_gb_silver_dir, _model_feed_file_name)

    with open(_model_feed_path, 'wb') as f:
        pickle.dump(model_feed, f)