In [None]:
# cache.py

import numpy as np
import hashlib
from typing import Optional, Union, List, Dict, Any
from sklearn.base import BaseEstimator, TransformerMixin
import dataclasses


class AddVal(TransformerMixin, BaseEstimator):
    """Toy transformer used by the demo pipelines: adds a constant to its input."""

    def __init__(self, val):
        # Value added by ``transform``; stored under the same name as the
        # constructor argument.
        self.val = val

    def fit(self, X, y=None):
        """Nothing is learned from ``X``/``y``; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X + self.val``."""
        return X + self.val
    
# Toy input: 3 samples x 4 features, stored as float32.
data = np.array(
    [
        [1, 2, 3, 4],
        [10, 20, 30, 40],
        [100, 200, 300, 400],
    ],
    dtype=np.float32,
)

# Larger demo pipeline; it is replaced by the reassignment right below.
pipeline = [
    AddVal(1),
    {"s": [None, AddVal(0.1), AddVal(0.01)]},
    {"f": [None, AddVal(0.5)]},
    [AddVal(1000), AddVal(10000)],
]
# Pipeline actually used by the rest of this cell.
pipeline = [
    AddVal(0),
    {"s": [None, AddVal(1)]},
    {"f": [None, AddVal(0.5)]},
]

def run_pipeline(pipeline, data):
    """Run ``pipeline`` on a 2-D ``(n_samples, n_features)`` array.

    The input is expanded to the 4-D layout used by ``exec_step``:
    ``(n_augmentations=1, n_samples, n_transformations=1, n_features)``.

    Args:
        pipeline: step (or nested list/dict of steps) understood by ``exec_step``.
        data: 2-D array-like of samples.

    Returns:
        Whatever ``exec_step`` returns for the expanded data.
    """
    # id = register_data(data)
    # Work on a private copy: ``reshape`` alone returns a view, so any
    # in-place write made while executing the steps would otherwise leak
    # into the caller's array.
    data = np.array(data)
    data = data.reshape(1, data.shape[0], 1, data.shape[1])
    print(data.shape)
    return exec_step(pipeline, data)
    

def exec_pipeline(pipeline, data):
    """Thread ``data`` through every step of ``pipeline``, in order.

    Each step receives the output of the previous one via ``exec_step``;
    the output of the last step is returned.
    """
    current = data
    for stage in pipeline:
        current = exec_step(stage, current)
    return current

def exec_step(step, data):
    """Apply one pipeline step to a 4-D data block and return the result.

    ``data`` is indexed as
    ``(n_augmentations, n_samples, n_transformations, n_features)``.

    Supported steps:
      * ``None``          -- identity, ``data`` is returned as is;
      * ``list``          -- sub-steps executed sequentially;
      * ``{'s': [...]}``  -- sample augmentation: every branch is run on the
                             incoming data and the results are laid out as
                             consecutive blocks along axis 0;
      * ``{'f': [...]}``  -- feature augmentation: same, along axis 2
                             (``'s'`` wins when both keys are present);
      * any object exposing ``fit_transform`` -- applied separately to every
        slice along the transformation axis.

    The input array is never modified; a new array is returned whenever a
    transformer or an augmentation is executed.

    Raises:
        ValueError: a dict step has neither an ``'s'`` nor an ``'f'`` key.
        TypeError: ``step`` is of an unsupported type.
    """
    if step is None:
        # Identity transformation
        return data

    if isinstance(step, dict):
        if 's' in step:
            # Sample augmentation
            return _exec_branches(step['s'], data, axis=0)
        if 'f' in step:
            # Feature augmentation
            return _exec_branches(step['f'], data, axis=2)
        raise ValueError(f"Unknown augmentation key in dict: {list(step.keys())}")

    if isinstance(step, list):
        # Sequential execution
        for sub_step in step:
            data = exec_step(sub_step, data)
        return data

    if hasattr(step, 'fit_transform'):
        # Transformer execution (applied to every transformation slice)
        return _apply_transformer(step, data)

    raise TypeError(f"Unknown step type: {type(step)}")


def _exec_branches(branches, data, axis):
    """Run every branch on ``data`` and concatenate the outputs along ``axis``."""
    if len(branches) == 0:
        # Zero branches -> zero-length result along ``axis``.
        return np.repeat(data, 0, axis=axis)
    # Block layout [branch0(data), branch1(data), ...]: each branch sees the
    # *whole* incoming data, including augmentations produced by earlier steps.
    outputs = [exec_step(branch, data) for branch in branches]  # should be parallel execution
    return np.concatenate(outputs, axis=axis)


def _apply_transformer(step, data):
    """Fit/transform each transformation slice of ``data`` with ``step``."""
    n_augmentations, n_samples, n_transformations, n_features = data.shape
    if n_transformations == 0:
        return data

    slices = []
    for tr in range(n_transformations):  # should be parallel execution
        # Copy first so a transformer that works in place cannot touch the input.
        rows = data[:, :, tr, :].copy().reshape(n_augmentations * n_samples, n_features)
        transformed = np.asarray(step.fit_transform(rows))
        # ``-1`` lets the transformer change the number of features.
        slices.append(transformed.reshape(n_augmentations, n_samples, -1))
    # Stacking (instead of writing back into ``data``) keeps the dtype the
    # transformer produced, e.g. int input + 0.5 yields floats.
    return np.stack(slices, axis=2)

def filter_data(data, bool_mask=None, union_type=None):
    """Select samples from a 4-D block and optionally flatten it.

    ``data`` is indexed as
    ``(n_augmentations, n_samples, n_transformations, n_features)``.

    Args:
        data: 4-D array-like.
        bool_mask: optional index applied to the sample axis (axis 1). It is
            passed straight to numpy indexing, so a boolean mask or a list of
            integer positions both work.
        union_type: how to fold the result:
            * ``None``              -- return the (masked) 4-D block;
            * ``'concat'``          -- one row per (augmentation, sample), the
              transformations laid side by side:
              ``[[1, 2, 3, 4], [10, 20, 30, 40]] -> [1, 2, 3, 4, 10, 20, 30, 40]``;
            * ``'interlaced'``      -- same rows, features interleaved:
              ``[[1, 2, 3, 4], [10, 20, 30, 40]] -> [1, 10, 2, 20, 3, 30, 4, 40]``;
            * ``'union'``           -- fold the augmentation axis:
              ``(rows, n_transformations, n_features)``;
            * ``'transpose_union'`` -- as ``'union'`` with the last two axes
              swapped.

    Raises:
        ValueError: ``data`` is not 4-D, or ``union_type`` is not one of the
            values listed above.
    """
    data = np.asarray(data)
    if data.ndim != 4:
        raise ValueError(
            f"filter_data expects a 4-D array "
            f"(augmentations, samples, transformations, features), got {data.ndim}-D"
        )

    if bool_mask is not None:
        data = data[:, bool_mask, :, :]

    if union_type is None:
        return data

    # Read the shape after masking so the sample count is up to date.
    n_augmentations, n_samples, n_transformations, n_features = data.shape
    total_samples = n_augmentations * n_samples

    if union_type == 'concat':
        return data.reshape(total_samples, n_transformations * n_features)

    folded = data.reshape(total_samples, n_transformations, n_features)
    if union_type == 'interlaced':
        # Fortran order makes the transformation index vary fastest within a
        # row, which interleaves the features; the row axis keeps its size, so
        # rows are not mixed.
        return folded.reshape(total_samples, n_transformations * n_features, order='F')
    if union_type == 'union':
        return folded
    if union_type == 'transpose_union':
        return np.transpose(folded, (0, 2, 1))

    # An unknown mode used to yield a silent ``None``.
    raise ValueError(f"Unknown union_type: {union_type}")

# Show the raw input, run the pipeline, then show the 4-D result.
print(data.shape)
print(data, "\n", "x" * 50)
data = run_pipeline(pipeline, data)
print(data.shape)
print(data)
print("*" * 50)

## test filter_data function
# One entry per call; a "=" ruler is printed between consecutive results.
filter_cases = [
    {"bool_mask": [0, 2]},
    {"bool_mask": [0, 2], "union_type": 'concat'},
    {"union_type": 'interlaced'},
    {"union_type": 'union'},
    {"union_type": 'transpose_union'},
]
for case_no, case_kwargs in enumerate(filter_cases):
    if case_no > 0:
        print("=" * 50)
    print(filter_data(data, **case_kwargs))

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from joblib import Parallel, delayed


class PipelineRunner:
    """Run a nested augmentation pipeline on a train array and an optional test array.

    Pipeline grammar (see ``exec_step``):
      * ``list``           -- steps applied one after another;
      * ``{'s': [...]}``   -- sample augmentation: every branch runs on the
                              current data, results are stacked along axis 0;
      * ``{'f': [...]}``   -- feature augmentation: same, stacked along axis 2;
      * ``None``           -- identity;
      * object with ``fit`` and ``transform`` -- a transformer.

    While the pipeline runs, the data is a list ``[train]`` or
    ``[train, test]`` of 4-D arrays shaped
    ``(n_augmentations, n_samples, n_transformations, n_features)``;
    index 0 is always the train array.

    Args:
        pipeline: the step structure to execute.
        cache_enabled: reuse transformer outputs for identical inputs.
        fit_on_train_only: fit transformers on the train array only and reuse
            that fit for the test array; otherwise refit on every array.
        n_jobs: ``n_jobs`` given to joblib for feature augmentation branches;
            ``1`` or ``None`` runs them sequentially without joblib.
    """

    def __init__(self, pipeline, cache_enabled=True, fit_on_train_only=True, n_jobs=-1):
        self.pipeline = pipeline
        self.cache_enabled = cache_enabled
        self.fit_on_train_only = fit_on_train_only
        self.n_jobs = n_jobs
        self.cache = {}

    def run_pipeline(self, data):
        """Execute the pipeline on ``data``.

        Args:
            data: a 2-D ``(n_samples, n_features)`` array, or a
                ``(train, test)`` tuple of such arrays (``test`` may be ``None``).

        Returns:
            The transformed data flattened to 2-D (all leading axes folded
            into rows): a single array, or a ``(train, test)`` tuple when a
            test array was supplied.
        """
        if isinstance(data, tuple):
            train_data, test_data = data
        else:
            train_data, test_data = data, None
        has_test = test_data is not None

        arrays = [np.asarray(a) for a in ((train_data, test_data) if has_test else (train_data,))]
        # Reshape data to match expected dimensions
        reshaped_data = [a.reshape(1, a.shape[0], 1, a.shape[1]) for a in arrays]

        # Execute the pipeline
        transformed_data = self.exec_pipeline(self.pipeline, reshaped_data)

        # Fold every leading axis into rows
        transformed_data = [d.reshape(-1, d.shape[-1]) for d in transformed_data]
        return tuple(transformed_data) if has_test else transformed_data[0]

    def exec_pipeline(self, pipeline, data_list):
        """Entry point used by ``run_pipeline``; delegates to ``exec_step``."""
        return self.exec_step(pipeline, data_list)

    def exec_step(self, step, data_list):
        """Apply ``step`` to every array in ``data_list``.

        Returns a list with one output array per input array (same order).

        Raises:
            Exception: unknown dict key, empty augmentation list, or
                unsupported step type.
        """
        if isinstance(step, dict):
            if 's' in step:
                # Sample augmentation
                return self._exec_branches(step['s'], data_list, axis=0, parallel=False)
            if 'f' in step:
                # Feature augmentation
                return self._exec_branches(step['f'], data_list, axis=2, parallel=True)
            raise Exception(f"Unknown augmentation key in dict: {step.keys()}")

        if isinstance(step, list):
            # Sequential execution
            for s in step:
                data_list = self.exec_step(s, data_list)
            return data_list

        if step is None:
            return data_list

        # Duck-typed so that any fit/transform object works.
        if hasattr(step, 'fit') and hasattr(step, 'transform'):
            return self._exec_transformer(step, data_list)

        raise Exception(f"Unknown step type: {type(step)}")

    def _exec_branches(self, branches, data_list, axis, parallel):
        """Run every branch on a copy of ``data_list`` and stack the outputs along ``axis``."""
        if not branches:
            raise Exception("Augmentation step has no transformers")

        def branch_input():
            # Each branch gets private copies so an in-place transformer in
            # one branch cannot affect its siblings.
            return [d.copy() for d in data_list]

        if parallel and self.n_jobs not in (None, 1) and len(branches) > 1:
            outputs = Parallel(n_jobs=self.n_jobs)(
                delayed(self.exec_step)(branch, branch_input()) for branch in branches
            )
        else:
            outputs = [self.exec_step(branch, branch_input()) for branch in branches]

        # Keep one array per dataset (so index 0 stays "train") and grow it
        # along ``axis`` instead of appending extra list entries.
        return [
            np.concatenate([branch_out[i] for branch_out in outputs], axis=axis)
            for i in range(len(data_list))
        ]

    def _exec_transformer(self, step, data_list):
        """Fit/transform every array in ``data_list`` with ``step``."""
        transformed_data_list = []
        fit_view = None     # rows ``step`` must be fitted on for the current array
        fitted_view = None  # rows ``step`` was actually fitted on during this call
        for idx, data in enumerate(data_list):
            n_augmentations, n_samples, n_transformations, n_features = data.shape

            # We combine augmentations, samples and transformations into rows
            data_view = data.reshape(-1, n_features)
            if idx == 0 or not self.fit_on_train_only:
                fit_view = data_view

            cache_key = None
            if self.cache_enabled:
                cache_key = self.get_cache_key(step, idx, data=data_view, fit_data=fit_view)

            if cache_key is not None and cache_key in self.cache:
                # Copy so callers cannot corrupt the cached array.
                transformed_data = self.cache[cache_key].copy()
            else:
                # A cache hit on an earlier array skips its fit, so make sure
                # the transformer is fitted on the right rows before use.
                if fitted_view is not fit_view:
                    step.fit(fit_view)
                    fitted_view = fit_view
                transformed_data = np.asarray(step.transform(data_view))
                if cache_key is not None:
                    self.cache[cache_key] = transformed_data.copy()

            # Reshape back; ``-1`` lets the transformer change the feature count
            transformed_data = transformed_data.reshape(
                n_augmentations, n_samples, n_transformations, -1
            )
            transformed_data_list.append(transformed_data)
        return transformed_data_list

    @staticmethod
    def _digest(array):
        """Content fingerprint of ``array``: shape, dtype and a hash of its bytes."""
        import hashlib

        array = np.ascontiguousarray(array)
        return (array.shape, array.dtype.str, hashlib.sha256(array.tobytes()).hexdigest())

    def get_cache_key(self, transformer, data_idx, data=None, fit_data=None):
        """Build the cache key for one transformer application.

        With only ``transformer`` and ``data_idx`` the key is
        ``(id(transformer), data_idx)``. When ``data`` / ``fit_data`` are
        given, their content fingerprints are appended so the same transformer
        applied to different data (or fitted on different data) gets its own
        entry.

        Note: parameter changes made to a transformer after it was cached are
        not detected; reset ``self.cache`` in that case.
        """
        key = (id(transformer), data_idx)
        if data is None and fit_data is None:
            return key

        data_digest = self._digest(data) if data is not None else None
        if fit_data is data:
            fit_digest = data_digest
        else:
            fit_digest = self._digest(fit_data) if fit_data is not None else None
        return key + (data_digest, fit_digest)

    def filter_data(self, data, bool_mask=None, union_type=None):
        """Select samples from a 4-D block and optionally flatten it.

        Args:
            data: array shaped
                ``(n_augmentations, n_samples, n_transformations, n_features)``.
            bool_mask: optional index applied to the sample axis (axis 1).
            union_type: ``None`` (return the 4-D block), ``'concat'``,
                ``'interlaced'``, ``'union'`` or ``'transpose_union'``.

        Raises:
            Exception: ``union_type`` is not one of the supported values.
        """
        n_augmentations, n_samples, n_transformations, n_features = data.shape

        if bool_mask is not None:
            data = data[:, bool_mask, :, :]
            n_samples = data.shape[1]

        total_samples = n_augmentations * n_samples

        if union_type is None:
            return data

        if union_type == 'concat':
            return data.reshape(total_samples, n_transformations * n_features)
        elif union_type == 'interlaced':
            data = data.reshape(total_samples, n_transformations, n_features)
            return data.reshape(total_samples, n_transformations * n_features, order='F')
        elif union_type == 'union':
            return data.reshape(total_samples, n_transformations, n_features)
        elif union_type == 'transpose_union':
            return np.transpose(data.reshape(total_samples, n_transformations, n_features), (0, 2, 1))
        else:
            raise Exception(f"Unknown union_type: {union_type}")

# Example usage


class AddVal(TransformerMixin, BaseEstimator):
    """Toy transformer for the example below: adds a constant to its input.

    Same definition as the ``AddVal`` class in the first cell.
    """

    def __init__(self, val):
        # Value added by ``transform``; stored under the same name as the
        # constructor argument.
        self.val = val

    def fit(self, X, y=None):
        """Nothing is learned from ``X``/``y``; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X + self.val``."""
        return X + self.val


data = np.array([[1, 2, 3, 4],
                 [10, 20, 30, 40],
                 [100, 200, 300, 400]], dtype=np.float32)

pipeline = [AddVal(0), {"s": [None, AddVal(1)]}, {"f": [None, AddVal(0.5)]}]

runner = PipelineRunner(pipeline)
train_data, test_data = data, data * 2
train_transformed, test_transformed = runner.run_pipeline((train_data, test_data))

print("Transformed Train Data:")
print(train_transformed)
print("Transformed Test Data:")
print(test_transformed)

# test filter_data function
# ``filter_data`` is a method of the runner and needs the 4-D layout
# (augmentations, samples, transformations, features), so the 2-D ``data``
# is expanded with singleton augmentation/transformation axes first.
data_4d = data[np.newaxis, :, np.newaxis, :]
print(runner.filter_data(data_4d, bool_mask=[0, 2]))
print("=" * 50)
print(runner.filter_data(data_4d, bool_mask=[0, 2], union_type='concat'))
print("=" * 50)
print(runner.filter_data(data_4d, union_type='interlaced'))
print("=" * 50)
print(runner.filter_data(data_4d, union_type='union'))
print("=" * 50)
print(runner.filter_data(data_4d, union_type='transpose_union'))

In [3]:
import numpy as np

# Draw a 10x5 matrix of uniform samples, then insert singleton axes at
# positions 0 and 2 to obtain a (1, 10, 1, 5) view of the same values.
random_2D_array = np.random.rand(10, 5)
expanded_4d_array = np.expand_dims(random_2D_array, axis=(0, 2))
for shown in (random_2D_array, expanded_4d_array):
    print(shown, shown.shape)

[[0.5487874  0.83054995 0.85328039 0.34156538 0.68640189]
 [0.75376423 0.9629305  0.23135579 0.05982465 0.93923489]
 [0.67724546 0.88052452 0.19532868 0.06138371 0.61079279]
 [0.56105231 0.51670443 0.61890682 0.57356218 0.55995812]
 [0.50246966 0.18248516 0.05080848 0.70851576 0.57797089]
 [0.5573045  0.24025563 0.2526094  0.40955923 0.41291195]
 [0.98278545 0.4290053  0.29444606 0.26173855 0.39284563]
 [0.09153474 0.4609441  0.09106405 0.63149356 0.33590656]
 [0.28339143 0.64299079 0.31539358 0.08798257 0.92695799]
 [0.64796579 0.0352152  0.74557884 0.28477654 0.34293417]] (10, 5)
[[[[0.5487874  0.83054995 0.85328039 0.34156538 0.68640189]]

  [[0.75376423 0.9629305  0.23135579 0.05982465 0.93923489]]

  [[0.67724546 0.88052452 0.19532868 0.06138371 0.61079279]]

  [[0.56105231 0.51670443 0.61890682 0.57356218 0.55995812]]

  [[0.50246966 0.18248516 0.05080848 0.70851576 0.57797089]]

  [[0.5573045  0.24025563 0.2526094  0.40955923 0.41291195]]

  [[0.98278545 0.4290053  0.29444606 0.