In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import config

Plot mean source and target features as bar plots.

In [None]:
# Load and save source features
#
# For every diversity step (`div`) and every run, average each feature over
# the time series that belong to that run's source set, then store one row
# per (div, run) in source_features_num.parquet.
feats = ['abs_energy','intermittency','mean','median','kurtosis','skewness','standard_deviation','agg_autocorrelation_max','erraticness','agg_linear_trend_slope']
divs = ["10", "5", "0"]
runs = ["case1_run1", "case1_run2", "case1_run3", "case1_run4", "case1_run5"]

# The feature table depends only on the run, not on `div`, so it is read once
# per run here instead of once per (div, run) combination.
concat_by_run = {
    run: pd.read_parquet(f"{config.path_to_data}/{run}/interim/CONCAT_feat.parquet")[['id_ts']+feats]
    for run in runs
}

results_list = []

for div in divs:
    for run in runs:
        concat = concat_by_run[run]
        source = pd.read_parquet(f"{config.path_to_data}/{run}/interim/source_div{div}{run}/source_div{div}{run}.parquet")
        # keep only the feature rows of series contained in this source set
        source_feat = concat[concat.id_ts.isin(source.id_ts.unique())]
        source_feat_mean = source_feat[feats].mean(numeric_only=True, axis=0)

        row_result = {'div': div, 'run': run}
        row_result.update({feat: source_feat_mean[feat] for feat in feats})
        results_list.append(row_result)

final_source_df = pd.DataFrame(results_list)
print(final_source_df)
final_source_df.to_parquet(f"{config.path_to_evaluation}/source_features_num.parquet")


In [None]:
# get average source features per div
#
# For each diversity step, compute the per-run feature means of the source
# set, then the mean and standard deviation of those means across runs, and
# draw one bar plot per feature (one bar per diversity step, std as error bar).
feats = ['abs_energy','intermittency','mean','median','kurtosis','skewness','standard_deviation','agg_autocorrelation_max','erraticness','agg_linear_trend_slope']
div_labels = ["10", "5", "0"]
runs = ["case1_run1", "case1_run2", "case1_run3", "case1_run4", "case1_run5"]

# The feature table depends only on the run, so read it once per run rather
# than once per (div, run) combination.
concat_by_run = {
    run: pd.read_parquet(f"{config.path_to_data}/{run}/interim/CONCAT_feat.parquet")[['id_ts']+feats]
    for run in runs
}

div_results_mean = {}
div_results_std = {}

for div in div_labels:
    all_means = []
    for run in runs:
        concat = concat_by_run[run]
        source = pd.read_parquet(f"{config.path_to_data}/{run}/interim/source_div{div}{run}/source_div{div}{run}.parquet")
        source_feat = concat[concat.id_ts.isin(source.id_ts.unique())]
        source_feat_mean = source_feat[feats].mean(numeric_only=True, axis=0)
        all_means.append(source_feat_mean)

    # calculate mean and standard deviation of feature means over runs
    means_df = pd.DataFrame(all_means)

    overall_mean = means_df.mean(axis=0)
    overall_std = means_df.std(axis=0)

    div_results_mean[div] = overall_mean
    div_results_std[div] = overall_std

# Dataframe for all divs (rows: features, columns: div steps); long feature
# names are shortened so they fit as subplot titles.
short_names = {
    "agg_autocorrelation_max": "autocorr",
    "agg_linear_trend_slope": "trend",
    "standard_deviation": "sd"
}
plot_data_mean = pd.DataFrame(div_results_mean).rename(index=short_names)
plot_data_std = pd.DataFrame(div_results_std).rename(index=short_names)

# from here on `feats` holds the shortened display names, in the same order
feats = [short_names.get(name, name) for name in feats]

# create barplot for each feature with one bar per div step
fig, axes = plt.subplots(5, 2, figsize=(16, 25))
axes = axes.flatten()

colors = ['skyblue', 'lightcoral', 'lightgreen']

for idx, feature in enumerate(feats):
    ax = axes[idx]

    values = [plot_data_mean.loc[feature, div] for div in div_labels]
    errors = [plot_data_std.loc[feature, div] for div in div_labels]

    bars = ax.bar(div_labels, values, yerr=errors, color=colors,
                  capsize=8, alpha=0.8, error_kw={'elinewidth': 2})

    ax.set_title(f'{feature}', fontsize=24, pad=15)
    ax.set_xlabel('Div', fontsize=20)
    ax.set_ylabel('Mean (std)', fontsize=20)
    ax.grid(axis='y', alpha=0.3)

    ax.set_xticks(range(len(div_labels)))
    ax.set_xticklabels(["Most", "Median", "Least"], fontsize=18)
    # Only the font size of the y tick labels should change. Re-setting the
    # label texts would freeze them while the tick locator stays automatic,
    # so the labels could go stale once tight_layout resizes the axes.
    ax.tick_params(axis='y', labelsize=18)


plt.tight_layout(pad=3.0)
plt.savefig(f"{config.path_to_evaluation}/source_features.pdf")
plt.show()

In [None]:
# get average source features over divs
#
# Pool the per-run source feature means of all diversity steps and report,
# per feature, the mean and standard deviation across all (div, run) pairs.
feats = ['abs_energy','intermittency','mean','median','kurtosis','skewness','standard_deviation','agg_autocorrelation_max','erraticness','agg_linear_trend_slope']
divs = ["10", "5", "0"]
runs = ["case1_run1", "case1_run2", "case1_run3", "case1_run4", "case1_run5"]

# The feature table depends only on the run, so read it once per run rather
# than once per (div, run) combination.
concat_by_run = {
    run: pd.read_parquet(f"{config.path_to_data}/{run}/interim/CONCAT_feat.parquet")[['id_ts']+feats]
    for run in runs
}

all_means = []
for div in divs:
    for run in runs:
        concat = concat_by_run[run]
        source = pd.read_parquet(f"{config.path_to_data}/{run}/interim/source_div{div}{run}/source_div{div}{run}.parquet")
        source_feat = concat[concat.id_ts.isin(source.id_ts.unique())]
        source_feat_mean = source_feat[feats].mean(numeric_only=True, axis=0)
        all_means.append(source_feat_mean)

# one row per (div, run), one column per feature
means_df = pd.DataFrame(all_means)

overall_mean = means_df.mean(axis=0)
overall_std = means_df.std(axis=0)

# format as "mean (std)" strings with two decimals for a compact table
result = pd.DataFrame(index=overall_mean.index, columns=['mean (std)'])
for feat in feats:
    result.loc[feat, 'mean (std)'] = f"{overall_mean[feat]:.2f} ({overall_std[feat]:.2f})"

print(result)

In [None]:
# get and save target features per div, p, and run
#
# For every combination of diversity step (`div`), target size (`p`) and run,
# average each feature over the time series in that target set and store one
# row per combination in target_features_num.parquet.
feats = ['abs_energy','intermittency','mean','median','kurtosis','skewness','standard_deviation','agg_autocorrelation_max','erraticness','agg_linear_trend_slope']
divs = ["10", "5", "0"]
sizes = ["5000", "12000", "25000", "50000", "100000", "all"]
runs = ["case1_run1", "case1_run2", "case1_run3", "case1_run4", "case1_run5"]

# The feature table depends only on the run, so read it once per run rather
# than once per (div, p, run) combination.
concat_by_run = {
    run: pd.read_parquet(f"{config.path_to_data}/{run}/interim/CONCAT_feat.parquet")[['id_ts']+feats]
    for run in runs
}

results_list = []

for div in divs:
    for p in sizes:
        for run in runs:
            concat = concat_by_run[run]
            target = pd.read_parquet(f"{config.path_to_data}/{run}/interim/target_div{div}_sim{p}{run}/target_div{div}_sim{p}{run}.parquet")
            target_ids = target.id_ts.unique().tolist()
            # keep only the feature rows of series contained in this target set
            target_feat = concat[concat["id_ts"].isin(target_ids)]
            target_feat_mean = target_feat[feats].mean(numeric_only=True, axis=0)

            row_result = {'div': div, 'p': p, 'run': run}
            row_result.update({feat: target_feat_mean[feat] for feat in feats})
            results_list.append(row_result)

final_df = pd.DataFrame(results_list)
print(final_df)
# Write next to the source feature table via the configured evaluation
# directory instead of a machine-specific absolute path.
final_df.to_parquet(f"{config.path_to_evaluation}/target_features_num.parquet")

In [None]:
# get average target features per p over div and runs
#
# For each target size `p`, pool the per-run target feature means of all
# diversity steps and compute, per feature, the mean and standard deviation
# across all (div, run) pairs. The results are kept in `p_results_mean` /
# `p_results_std` and printed as "mean (std)" tables.

feats = ['abs_energy','intermittency','mean','median','kurtosis','skewness','standard_deviation','agg_autocorrelation_max','erraticness','agg_linear_trend_slope']
sizes = ["5000", "12000", "25000", "50000", "100000", "all"]
divs = ["10", "5", "0"]
runs = ["case1_run1", "case1_run2", "case1_run3", "case1_run4", "case1_run5"]

# The feature table depends only on the run, so read it once per run rather
# than once per (p, div, run) combination.
concat_by_run = {
    run: pd.read_parquet(f"{config.path_to_data}/{run}/interim/CONCAT_feat.parquet")[['id_ts']+feats]
    for run in runs
}

p_results_mean = {}
p_results_std = {}

for p in sizes:
    all_means = []
    for div in divs:
        for run in runs:
            concat = concat_by_run[run]
            target = pd.read_parquet(f"{config.path_to_data}/{run}/interim/target_div{div}_sim{p}{run}/target_div{div}_sim{p}{run}.parquet")
            target_ids = target.id_ts.unique().tolist()
            target_feat = concat[concat["id_ts"].isin(target_ids)]
            target_feat_mean = target_feat[feats].mean(numeric_only=True, axis=0)
            all_means.append(target_feat_mean)

    # one row per (div, run), one column per feature
    means_df = pd.DataFrame(all_means)

    overall_mean = means_df.mean(axis=0)
    overall_std = means_df.std(axis=0)
    p_results_mean[p] = overall_mean
    p_results_std[p] = overall_std

    # format as "mean (std)" strings with two decimals for a compact table
    result = pd.DataFrame(index=overall_mean.index, columns=['mean (std)'])
    for feat in feats:
        result.loc[feat, 'mean (std)'] = f"{overall_mean[feat]:.2f} ({overall_std[feat]:.2f})"

    print(f"p: {p}", result)


In [None]:
# plot mean target features per p
#
# Uses `p_results_mean` / `p_results_std` (one Series of feature statistics
# per target size) and draws one bar plot per feature, with one bar per
# target size and the standard deviation as error bar.

# long feature names are shortened so they fit as subplot titles
short_names = {
    "agg_autocorrelation_max": "autocorr",
    "agg_linear_trend_slope": "trend",
    "standard_deviation": "sd"
}
plot_data_mean = pd.DataFrame(p_results_mean).rename(index=short_names)
plot_data_std = pd.DataFrame(p_results_std).rename(index=short_names)

feats = ['abs_energy','intermittency','mean','median','kurtosis','skewness','sd','autocorr','erraticness','trend']

# create one bar plot per feature with error bars
fig, axes = plt.subplots(5, 2, figsize=(18, 25))
axes = axes.flatten()

colors = ['skyblue', 'lightcoral', 'lightgreen', 'orange', 'purple', 'brown']
p_labels = ["5000", "12000", "25000", "50000", "100000", "all"]
positions = range(len(p_labels))

for idx, feature in enumerate(feats):
    ax = axes[idx]

    values = [plot_data_mean.loc[feature, p] for p in p_labels]
    errors = [plot_data_std.loc[feature, p] for p in p_labels]

    bars = ax.bar(positions, values, yerr=errors, color=colors,
                  capsize=5, alpha=0.8, error_kw={'elinewidth': 1.5})

    ax.set_title(f'{feature}', fontsize=24, pad=15)
    ax.set_xlabel('p', fontsize=20)
    ax.set_ylabel('Mean (std)', fontsize=20)
    ax.grid(axis='y', alpha=0.3)

    ax.set_xticks(positions)
    ax.set_xticklabels(["5k", "12k", "25k", "50k", "100k", "all"], fontsize=18)
    # Only the font size of the y tick labels should change. Re-setting the
    # label texts would freeze them while the tick locator stays automatic,
    # so the labels could go stale once tight_layout resizes the axes.
    ax.tick_params(axis='y', labelsize=18)

plt.tight_layout(pad=3.0)
# NOTE(review): the source-feature figure is saved under
# config.path_to_evaluation; confirm whether this path should use it as well.
plt.savefig(f"{config.path_to_data}/evaluation/target_features.pdf")
plt.show()