# DATAIMPORT/ CLEANUP

In [7]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns
import numpy as np
import glob
from sklearn.preprocessing import MinMaxScaler
import scipy.stats as stats
from scipy.stats import levene
from itertools import pairwise
from summarytools import dfSummary



In [8]:
path = r"C:\Users\cr\Downloads"

dfAll = pd.DataFrame()
for version in range(7):
    pattern = os.path.join(path, f"Version{version}_*{"1_1_true"}.txt")
    files = glob.glob(pattern)

    for file in files:
        df = pd.read_csv(file, header=0, names=["ObjInstance", "Identifier", "MethodName", "ElapsedMilliseconds"])
        df["Version"] = f"Version{version}"
        df = df.loc[df.groupby(["ObjInstance", "MethodName"])["ElapsedMilliseconds"].idxmax()]
        dfAll = pd.concat([dfAll, df], ignore_index=True)

output_path = os.path.join(r"D:\projects\private\Bachelorarbeit_Blazor_Wasm\BenchmarkResults", "dfAll_1_1_true.csv")
dfAll.to_csv(output_path, index=False, encoding="utf-8")


In [9]:
dfAll.head()

In [10]:
dfAll = dfAll.dropna(subset=["ElapsedMilliseconds"])
dfAll = dfAll.drop_duplicates(subset=["ObjInstance", "Identifier", "MethodName", "ElapsedMilliseconds"])

dfAll["ElapsedMilliseconds"] = pd.to_numeric(dfAll["ElapsedMilliseconds"])
dfAll = dfAll[dfAll["ElapsedMilliseconds"] >= 0]

#Reihenfolge für Plots
custom_order = [
    "SetParam",
    "SetParam_OnInit",
    "GenerateOrders",
    "PopulateChartOrderState",
    "OnInit_OnParam",
    "OnParam_OnAfterRender",
    "VisualizeOrderStatusSuccess",
    "FINISH"
]

lifecycle = [
    "SetParam",
    "SetParam_OnInit",
    "OnInit_OnParam",
    "OnParam_OnAfterRender",
    "FINISH"
]


dfAll["MethodName"] = pd.Categorical(dfAll["MethodName"], categories=custom_order, ordered=True)
#Um das Gruppieren zum umgehen und bei Plots nach Version und Komponente zu unterscheiden
dfAll["VersionObjInstance"] = dfAll["Version"].astype(str) + " - " + dfAll["ObjInstance"].astype(str)

dfLifecycle = dfAll[dfAll["MethodName"].isin(lifecycle)]
dfLifecycle["VersionObjInstance"] = dfAll["Version"].astype(str) + " - " + dfAll["ObjInstance"].astype(str)




KeyError: ['ElapsedMilliseconds']

# CHECK SIGNIFICANCE

## PLOT

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 8))

sns.boxplot(data=dfAll, x="MethodName", y="ElapsedMilliseconds", hue="VersionObjInstance", ax=ax1, legend=False)
ax1.set_ylim(0, 30)

sns.boxplot(data=dfAll, x="MethodName", y="ElapsedMilliseconds", hue="VersionObjInstance", ax=ax2 , legend=False)
ax2.set_ylim(60, 180)

sns.boxplot(data=dfAll, x="MethodName", y="ElapsedMilliseconds", hue="VersionObjInstance", ax=ax3)
ax3.set_ylim(180, 300)

for ax in [ax1, ax2, ax3]:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.set_xlabel("Methodenname")
    ax.set_ylabel("ElapsedMilliseconds")

plt.legend(title="Version+Komponenten", bbox_to_anchor=(1.05, 0.5), loc="center left")

plt.tight_layout()
plt.show()


In [None]:
#chatgpt
def remove_outliers_grouped(df, value_col, group_cols, threshold=1.5):
    def filter_group(group):
        Q1 = group[value_col].quantile(0.25)
        Q3 = group[value_col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        return group[(group[value_col] >= lower_bound) & (group[value_col] <= upper_bound)]
    
    return df.groupby(group_cols, group_keys=False).apply(filter_group)

dfAll = remove_outliers_grouped(dfAll, "ElapsedMilliseconds", ["MethodName", "VersionObjInstance"])


In [None]:
uniqueVersionObjInstances = dfAll["VersionObjInstance"].unique()

num_plots = len(uniqueVersionObjInstances)
num_cols = 3  
num_rows = (num_plots + num_cols - 1) // num_cols 

fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows), sharey=True)
axes = axes.flatten()  

for i, VersionObjInstance in enumerate(uniqueVersionObjInstances):
    ax = axes[i]
    
    sns.kdeplot(data=dfAll[dfAll["VersionObjInstance"] == VersionObjInstance], 
                x="ElapsedMilliseconds", 
                hue="MethodName", 
                fill=True, 
                common_norm=False, 
                legend=True if i % 2 else False,
                ax=ax)
    ax.set_title(f"{VersionObjInstance}")
    ax.set_xlabel("ElapsedMilliseconds")
    ax.set_ylabel("Dichte")
    ax.set_ylim(0, 0.2)

plt.tight_layout()
plt.show()



In [None]:
plt.figure(figsize=(12, 6))
sns.stripplot(data=dfAll, x="MethodName", y="ElapsedMilliseconds", hue="VersionObjInstance", dodge=True)
plt.xticks(rotation=90)
plt.title("Version+Komponenten nach ElapsedMilliseconds")
plt.show()

## CALCULATION

In [None]:
uniqueVersionObjInstances = dfAll["VersionObjInstance"].unique()
uniqueMethods = dfAll["MethodName"].unique()

for version in uniqueVersionObjInstances:
    versionObjIni = dfAll[dfAll["VersionObjInstance"] == version]
    
    for method in uniqueMethods:
        #Fehler, weil kindkomponente nur lifecyclemethoden hat
        if (method not in lifecycle and (versionObjIni["ObjInstance"] == "BenchmarkComponent").any()):
            continue

        data = versionObjIni[versionObjIni["MethodName"] == method]["ElapsedMilliseconds"]

        modeResult = stats.mode(data).mode
        median = data.median()
        mean = data.mean()
        
        skew = stats.skew(data)
        #unbiased ddof
        std = np.std(data, ddof=1) 

        if modeResult is not None:
            stdPercent = std / mean
            
            if stdPercent > 0.1:
                print(f"{version}, Method: {method}; Mode: {modeResult}; Median: {median}; Std: {std:.2f}")

            #if abs(skew) > 0.1:
                #print(f"{version}; Method: {method}; Skew: {skew:.2f}")


In [None]:
versions = [f"Version{i}" for i in range(7)]


for version in uniqueVersionObjInstances:
    for method in uniqueMethods:
        print(f"\n{version} - Methode: {method}")
    
        df_method = dfAll[dfAll["MethodName"] == method]
    
        for version_a, version_b in pairwise(versions):
            version_a_data = df_method[df_method["Version"] == version_a]["ElapsedMilliseconds"]
            version_b_data = df_method[df_method["Version"] == version_b]["ElapsedMilliseconds"]
        
            t_stat, p_value = stats.ttest_ind(version_a_data, version_b_data, equal_var=False)
            if p_value < 0.05:
                print(f"{version_b} und {version_a} welchs : t-statistic = {t_stat}, p-value = {p_value}")
        
            u_stat, p_value_u = stats.mannwhitneyu(version_a_data, version_b_data)
            if p_value_u < 0.05:
                print(f"{version_b} und {version_a} Mann : U-statistic = {u_stat}, p-value = {p_value_u}")


# DATA EXPLORATION

In [None]:
dfTilt = dfAll.pivot_table(
    values="ElapsedMilliseconds",
    index=["MethodName", "ObjInstance"],
    columns="Version",
    aggfunc=np.median
).sort_index(level=0)

plt.figure(figsize=(12, 8))
sns.heatmap(dfTilt, annot=True, fmt=".0f")

plt.title("Auflistung Methodenmittelwert")
plt.xlabel("Versionen")
plt.ylabel("Methoden und Komponenten")

plt.tight_layout()
plt.show()


In [None]:
df_pivot = dfAll.pivot_table(
    values="ElapsedMilliseconds",
    index=["MethodName", "ObjInstance"],
    columns="Version",
    aggfunc=np.median
).sort_index(level=0)

df_diff = df_pivot.diff(axis=1).iloc[:, 1:]

plt.figure(figsize=(12, 8))
sns.heatmap(df_diff, annot=True, fmt=".0f", linewidths=0.5)

plt.title("Differenz Mittelwert zur vorherigen Version")
plt.xlabel("Versionen")
plt.ylabel("Methoden und Komponenten")

plt.tight_layout()
plt.show()
