# Create a dataset that uses the power metric as the prediction target

In [7]:
import difflib
from typing import List
import pandas as pd
import glob
import os
from matplotlib import pyplot as plt

# Root directory of the recorded runs; it is globbed below for
# "*/intermediate/full.feather" files.
hpa_dir = "../../data/minimized_warehouse_6b/"
# Directory whose entries are each turned into one combined dataset.
base_dir = "../../src_predict/03_combined_data/minimized_warehouse_6b/"
# Directory that receives one "<folder>.feather" file per entry of base_dir.
output_base_dir = "04_datasets/power_6b/"
# Offset in hours between the timestamps of the hpa files and the prometheus
# files, which were recorded in different timezones.
# NOTE(review): this value is not referenced in the code visible here -
# confirm whether the timestamps still need to be shifted before merging.
timezone_difference = 2


def get_power_column(dataframe):
    """Derive a per-row ``total_power`` series from kepler counter columns.

    The rows are ordered by ``timestamp``. Among the columns whose name
    contains "kepler", up to five whose names most closely resemble the
    target phrase are selected by fuzzy matching. Each selected column is
    replaced by its row-to-row difference, and the differences are summed
    across columns. The input frame is left untouched.

    Args:
        dataframe: Frame with a ``timestamp`` column and kepler columns.

    Returns:
        A frame holding only ``total_power`` and ``timestamp``, ordered by
        ``timestamp`` and keeping the original index labels.
    """
    # Differences between consecutive rows only make sense in time order;
    # sort_values returns a new frame, so the caller's data is not modified.
    ordered = dataframe.sort_values(by="timestamp")

    # Fuzzy-match the kepler column names against the target phrase.
    target_word = 'kepler node package joules total dynamic'
    kepler_names = ordered.filter(regex="kepler").columns
    matched = difflib.get_close_matches(
        target_word, kepler_names, n=5, cutoff=0.05
    )

    # One difference column per matched counter column.
    worker_labels = []
    for position, column in enumerate(matched, start=1):
        label = f'worker_{position}_power'
        ordered[label] = ordered[column].diff()
        worker_labels.append(label)

    print(matched)

    # Row-wise sum over all workers; missing differences (such as the first
    # row of each column) are skipped by the sum.
    ordered['total_power'] = ordered[worker_labels].sum(axis=1)

    return ordered[["total_power", "timestamp"]]



# Build one power table from every run directory found under hpa_dir.
power_dfs = []
for file in glob.glob(f"{hpa_dir}/*/intermediate/full.feather"):
    # Read the dataframe
    df = pd.read_feather(file)
    print(file)
    # fig = plt.figure()
    # Keep only the "total_power" and "timestamp" columns for this run.
    powerr = get_power_column(df)
    # powerr.plot(x='timestamp', y='total_power')
    power_dfs.append(powerr)
# Stack the per-run frames; no ignore_index is passed, so each frame keeps its
# own index labels. create_dataset joins against this frame on "timestamp".
power_df = pd.concat(power_dfs)
# power_df.index = power_df["timestamp"]
# power_df.drop(columns=["timestamp"], inplace=True)
# power_df = power_df.sort_index()


../../data/minimized_warehouse_6b\1738295802_(30.1000)\intermediate\full.feather
['{"__name__":"kepler_node_package_joules_total","container":"kepler-exporter","endpoint":"http","exported_instance":"worker5","instance":"worker5","job":"kepler-exporter","mode":"dynamic","namespace":"kepler","package":"0","pod":"kepler-exporter-znsrp","service":"kepler-exporter","source":"intel_rapl"}', '{"__name__":"kepler_node_package_joules_total","container":"kepler-exporter","endpoint":"http","exported_instance":"worker4","instance":"worker4","job":"kepler-exporter","mode":"dynamic","namespace":"kepler","package":"0","pod":"kepler-exporter-jjmt4","service":"kepler-exporter","source":"intel_rapl"}', '{"__name__":"kepler_node_package_joules_total","container":"kepler-exporter","endpoint":"http","exported_instance":"worker3","instance":"worker3","job":"kepler-exporter","mode":"dynamic","namespace":"kepler","package":"0","pod":"kepler-exporter-7vb55","service":"kepler-exporter","source":"intel_rapl"}', 

In [8]:
def create_dataset(input_dir, output_path):
    """Combine the feather files of ``input_dir`` into one labelled dataset.

    Every ``*.feather`` file directly inside ``input_dir`` is read and
    left-joined with the module-level ``power_df`` on ``timestamp``. The joined
    ``total_power`` value becomes the ``target`` column; rows without a
    matching timestamp get a missing target. ``total_power`` and ``timestamp``
    are then dropped. Only columns present in every file are kept, the frames
    are concatenated with a fresh index, and the result is written to
    ``output_path`` in feather format.

    Args:
        input_dir: Directory containing the input ``*.feather`` files.
        output_path: Destination feather file. Missing parent directories
            are created.

    Returns:
        The combined DataFrame that was written to ``output_path``.

    Raises:
        ValueError: If ``input_dir`` contains no ``*.feather`` files.
    """
    # glob returns files in arbitrary order; sort so the row order of the
    # combined dataset is reproducible.
    feather_files = sorted(glob.glob(f"{input_dir}/*.feather"))
    if not feather_files:
        raise ValueError(f"No feather files found in {input_dir!r}")

    dataframes = []
    for file in feather_files:
        df = pd.read_feather(file)

        # Add power as prediction target
        df = df.merge(
            power_df[["timestamp", "total_power"]],
            on="timestamp",
            how="left",
        )
        df["target"] = df["total_power"]

        # The merge key guarantees that "timestamp" exists at this point, so
        # both helper columns can be dropped unconditionally.
        df = df.drop(columns=["total_power", "timestamp"])

        dataframes.append(df)
        print(file)

    # Keep only the columns shared by all frames. The order is taken from the
    # first frame rather than from set iteration, so the column layout of the
    # saved file does not change between runs.
    shared = set.intersection(*(set(frame.columns) for frame in dataframes))
    common_columns = [
        column
        for column in dict.fromkeys(dataframes[0].columns)
        if column in shared
    ]
    dataframes = [frame[common_columns] for frame in dataframes]

    # os.makedirs("") raises, so only create the parent when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    combined_df = pd.concat(dataframes, ignore_index=True)
    combined_df.to_feather(output_path)
    print(f"Saved to {output_path}")
    return combined_df


# Build one dataset per sub-directory of base_dir. os.listdir returns entries
# in arbitrary order and also lists plain files, so the entries are sorted and
# anything that is not a directory is skipped.
for folder in sorted(os.listdir(base_dir)):
    input_dir = os.path.join(base_dir, folder)
    if not os.path.isdir(input_dir):
        continue
    print(folder)
    output_path = os.path.join(output_base_dir, f"{folder}.feather")
    create_dataset(input_dir, output_path)

CPU
../../src_predict/03_combined_data/minimized_warehouse_6b/CPU\1738295802_(30.1000).feather
../../src_predict/03_combined_data/minimized_warehouse_6b/CPU\1738299373_(30.5000).feather
../../src_predict/03_combined_data/minimized_warehouse_6b/CPU\1738302955_(30.10000).feather
../../src_predict/03_combined_data/minimized_warehouse_6b/CPU\1738306582_(20.1000).feather
../../src_predict/03_combined_data/minimized_warehouse_6b/CPU\1738310315_(20.5000).feather
../../src_predict/03_combined_data/minimized_warehouse_6b/CPU\1738313769_(20.10000).feather
../../src_predict/03_combined_data/minimized_warehouse_6b/CPU\1738317446_(10.1000).feather
../../src_predict/03_combined_data/minimized_warehouse_6b/CPU\1738320455_(10.5000).feather
../../src_predict/03_combined_data/minimized_warehouse_6b/CPU\1738323262_(10.10000).feather
../../src_predict/03_combined_data/minimized_warehouse_6b/CPU\1738326706_(9.1000).feather
../../src_predict/03_combined_data/minimized_warehouse_6b/CPU\1738329593_(9.5000).fe