In [None]:
import os
import pandas as pd

# Root of the per-worker input data; every entry below it is handed to combine()
base_dir = "02_grouped_workers/minimized_warehouse_4/"
# Root under which the combined feather files are written, using the same
# folder names as found below base_dir
combined_dir = "03_combined_data/minimized_warehouse_4/"

def combine(input_dir, output_dir, worker_ids=range(1, 6)):
    """Merge the worker-wise feather files of each subdirectory into one file.

    For every subdirectory of ``input_dir`` the files ``worker{id}.feather``
    (one per id in ``worker_ids``) that exist are loaded, reduced to the
    columns they all share, stacked row-wise and written to
    ``output_dir/<subdirectory>.feather``. Subdirectories without any worker
    file produce no output.

    Example:

    Input:
        Multiple worker-wise dataframes inside one subdirectory:
        - input_dir/CPU/worker1.feather
        - input_dir/CPU/worker2.feather

    Output:
        Single dataframe for that subdirectory:
        - output_dir/CPU.feather

    Args:
        input_dir: Directory whose subdirectories hold the worker files.
        output_dir: Directory receiving the combined files; created if it
            does not exist.
        worker_ids: Iterable of worker ids to look for. Defaults to 1..5.
            Ids without a matching file are skipped.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing (and log) order does not depend on the
    # arbitrary order in which the OS lists directory entries.
    for subfolder in sorted(os.listdir(input_dir)):
        subfolder_path = os.path.join(input_dir, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        candidate_paths = (
            os.path.join(subfolder_path, f"worker{worker_id}.feather")
            for worker_id in worker_ids
        )
        data_frames = [
            pd.read_feather(path)
            for path in candidate_paths
            if os.path.exists(path)
        ]
        if not data_frames:
            continue

        # Columns present in every frame. The set is only used for
        # membership; the final order follows the first frame so the output
        # layout is stable across runs (set iteration order is not).
        shared = set(data_frames[0].columns).intersection(
            *(df.columns for df in data_frames[1:])
        )
        common_columns = [
            column for column in data_frames[0].columns if column in shared
        ]

        # ignore_index gives the result a fresh default index, which the
        # feather writer requires.
        combined_df = pd.concat(
            [df[common_columns] for df in data_frames],
            ignore_index=True,
            axis=0,
        )

        combined_feather_path = os.path.join(output_dir, f"{subfolder}.feather")
        combined_df.to_feather(combined_feather_path)
        print(f"Saved combined DataFrame to: {combined_feather_path}")


# Run combine() for every folder below base_dir, writing the results to the
# folder of the same name below combined_dir.
for folder in sorted(os.listdir(base_dir)):
    input_dir = os.path.join(base_dir, folder)
    # os.listdir also returns plain files; combine() would create a useless
    # output directory for them and then fail with NotADirectoryError, so
    # anything that is not a directory is skipped.
    if not os.path.isdir(input_dir):
        continue
    print(folder)
    output_dir = os.path.join(combined_dir, folder)
    combine(input_dir, output_dir)