In [None]:
import pre_processing as prep

# Input/output locations for this stage. Both keep their trailing "/" because
# the splitting cell substitutes `root_folder` inside each worker path.
save_folder = '02_grouped_workers/minimized_warehouse_4/'
root_folder = "01_preprocessed_workers/minimized_warehouse_4/"

# Collect the pre-processed worker files found under `root_folder`.
# NOTE(review): matching is delegated to prep.find_files_in_folder; the pattern
# looks glob-style, but confirm against pre_processing.
WORKER_FILE_PATTERN = 'worker*.feather'
worker_paths = prep.find_files_in_folder(root_folder, WORKER_FILE_PATTERN)

In [None]:
""" Split the data into subgroups (e.g., CPU.feather, Memory.feather, ...) """
import pandas as pd
import os
from filters import categorize_columns
from collections import Counter
import numpy as np

# Split every worker file into one feather file per column category and collect
# per-category column statistics along the way.
#
# Reads : worker_paths, root_folder, save_folder (defined in an earlier cell)
# Writes: one feather file per (worker, category) below save_folder/<category>/
# Leaves behind for later cells:
#   total_size_mb              - summed in-memory size of everything written
#   category_col_counts        - category -> total number of columns over all files
#   all_col_strings            - category -> list of per-file column lists
#   col_occurrences            - category -> {column: number of files containing it}
#   column_counts_per_category - category -> list of per-file column counts
total_size_mb = 0
category_col_counts = Counter()

all_col_strings = {}
col_occurrences = {}
column_counts_per_category = {}

# `save_folder` may already end in "/"; strip it so the output path built below
# does not contain a doubled separator ("...//<category>/...").
save_root = save_folder.rstrip("/")

for p in worker_paths:
    # The output path is derived by substituting `root_folder` inside `p`, so a
    # path without it cannot be mapped. Check before the expensive read so that
    # no counters are partially updated when this fails.
    if root_folder not in p:
        raise ValueError("FIXME: Root folder not found in path - cant replace")

    df = pd.read_feather(p)
    categorized_cols = categorize_columns(df)
    for category, cols in categorized_cols.items():
        n_cols = len(cols)
        category_col_counts[category] += n_cols

        # Count in how many files each column shows up for this category.
        occurrences = col_occurrences.setdefault(category, {})
        for col in cols:
            occurrences[col] = occurrences.get(col, 0) + 1

        # Append in place instead of rebuilding the list on every iteration.
        all_col_strings.setdefault(category, []).append(cols)
        column_counts_per_category.setdefault(category, []).append(n_cols)

        df2 = df[cols].copy()
        df2["timestamp"] = df["timestamp"]  # Keep timestamp for now

        output_path = p.replace(root_folder, f"{save_root}/{category}/")
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        df2.to_feather(output_path)

        # In-memory footprint as reported by pandas, not the on-disk file size.
        size_mb = df2.memory_usage().sum() / (1024 * 1024)
        total_size_mb += size_mb
        print(f"Saved to {output_path} with {n_cols} columns, size {size_mb:.2f} MB")

# Compute and print distributions of the number of columns per category
print("\nColumn distributions per category (min, max, median):")
for category, column_counts in column_counts_per_category.items():
    min_val = min(column_counts)
    max_val = max(column_counts)
    median_val = np.median(column_counts)
    print(f"{category}: min={min_val}, max={max_val}, median={median_val}")

print(f"\n\n>>> Total size: {total_size_mb:.2f} MB")

In [None]:
from pprint import pprint

# Report, per category, the columns that occur in (almost) every worker file.
#
# Reads : worker_paths, col_occurrences (category -> {column: file count})
# Prints: each qualifying (column, count) pair, sorted by ascending count
# Leaves behind: max_cols, tolerance, x

# How many worker files a column may be absent from and still be reported.
MAX_MISSING_FILES = 400

# Number of worker files. Presumably also the highest possible occurrence
# count, assuming each file lists a column at most once per category - verify
# against categorize_columns.
max_cols = len(worker_paths)
# Clamp to 1 so the threshold stays meaningful (and the summary readable) when
# there are fewer than MAX_MISSING_FILES files; every recorded column has a
# count of at least 1, so the clamp never changes which columns are selected.
tolerance = max(max_cols - MAX_MISSING_FILES, 1)
x = 0  # number of columns that met the threshold, over all categories
for key, cols in col_occurrences.items():
    print(key)
    sorted_items = sorted(cols.items(), key=lambda item: item[1])  # ascending by count
    for item in sorted_items:
        if item[1] >= tolerance:
            print(item)
            x += 1
    print("\n")

print(f"Max is {max_cols}")
# The count covers every column at or above the threshold, not only those
# present in all files, so say so explicitly.
print(f"Found {x} cols present in at least {tolerance} of {max_cols} files")