In [1]:
import pandas as pd
import numpy as np

In [1]:
import pandas as pd
import numpy as np
import h5py
import random
from sklearn.model_selection import train_test_split

def split_hdf5_by_models(input_file_path, train_output_path='training.h5', valid_output_path='validation.h5',
                         test_size=0.3, random_state=42, key='models'):
    """
    Split an HDF5 table into training and validation files by model.

    Reads the table stored under ``key``, sorts it by ["Model", "Time"],
    collects the unique values of the "Model" column, randomly assigns each
    model to either the training or the validation set, and writes the rows
    of each set to its own HDF5 file. The split is made on models, not on
    rows, so every row of a given model ends up in the same output file.

    Parameters:
    -----------
    input_file_path : str
        Path to the input HDF5 file
    train_output_path : str
        Path to save the training data (default: 'training.h5')
    valid_output_path : str
        Path to save the validation data (default: 'validation.h5')
    test_size : float
        Proportion of models to be used for validation (default: 0.3);
        passed through to ``train_test_split``
    random_state : int
        Random seed for reproducibility (default: 42)
    key : str
        HDF5 key the table is read from in the input file and written under
        in both output files (default: 'models')

    Returns:
    --------
    tuple of (list, list)
        ``(train_models, valid_models)``: the model identifiers assigned to
        the training set and to the validation set.

    Notes:
    ------
    Both output files are opened with ``mode='w'``, so existing files at
    those paths are overwritten. The input table must provide "Model" and
    "Time"; pandas raises ``KeyError`` when sorting if either is missing.
    """
    # Load the table stored under `key`
    df = pd.read_hdf(input_file_path, key=key)

    # Order rows by model, then by time within each model; this order is
    # carried over into both output files
    df_sorted = df.sort_values(by=["Model", "Time"])

    # One entry per distinct model, in order of first appearance
    unique_models = df_sorted["Model"].unique().tolist()

    # Split the model identifiers (not the rows) into the two sets
    train_models, valid_models = train_test_split(
        unique_models,
        test_size=test_size,
        random_state=random_state,
    )

    # Select the rows belonging to each set of models
    train_df = df_sorted[df_sorted["Model"].isin(train_models)]
    valid_df = df_sorted[df_sorted["Model"].isin(valid_models)]

    # Write each subset to its own file under the same key it was read from
    train_df.to_hdf(train_output_path, key=key, mode='w')
    valid_df.to_hdf(valid_output_path, key=key, mode='w')

    print("Data split complete:")
    print(f"- Training set: {len(train_models)} models ({len(train_df)} records) saved to {train_output_path}")
    print(f"- Validation set: {len(valid_models)} models ({len(valid_df)} records) saved to {valid_output_path}")

    return train_models, valid_models

if __name__ == "__main__":
    # Demo run: split the raw UCLCHEM training table into a training file
    # and a validation file, using the function's default test_size and
    # random_state.
    input_file = "data/uclchem_rawdata_training.h5"

    train_models, valid_models = split_hdf5_by_models(
        input_file,
        train_output_path="data/training.h5",
        valid_output_path="data/validation.h5",
    )

: 