In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import config

Data shares of the target datasets

In [None]:
# Average number of unique target time series per originating dataset ("data"
# column) over all runs, printed as "mean (std)" for every div / p combination.
runs = ["case1_run1", "case1_run2", "case1_run3", "case1_run4", "case1_run5"]
p_values = ["5000", "12000", "25000", "50000", "100000", "all"]

for div in ["10", "5", "0"]:
    print(f"\ndiv={div}:")
    for p in p_values:
        dataframes = []
        for run in runs:
            target = pd.read_parquet(f"{config.path_to_data}/{run}/interim/target_div{div}_sim{p}{run}/target_div{div}_sim{p}{run}.parquet")
            # Count every time series (id_ts) once, grouped by its dataset.
            shares = target.drop_duplicates(subset="id_ts").groupby(by="data").count()["id_ts"]
            dataframes.append(shares)

        # One column per run, aligned on the dataset name. A dataset that does
        # not occur in a run contributes zero series there; without fillna(0)
        # the NaN would be skipped by mean()/std() and the statistic would only
        # cover the runs in which the dataset happens to appear.
        shares_over_runs = pd.concat(dataframes, axis=1).fillna(0)
        mean_shares = shares_over_runs.mean(axis=1)
        std_shares = shares_over_runs.std(axis=1)

        result = pd.DataFrame({
            'data': mean_shares.index,
            f'shares_p{p}': [f"{mean_val:.2f} ({std_val:.2f})" for mean_val, std_val in zip(mean_shares.values, std_shares.values)]
        })

        print(f"\np={p}:")
        print(result.to_string(index=False))

In [None]:
# Pie charts of the average target data shares over runs: one subplot per p,
# all for the single div selected below.
div = "10"  # one of "10", "5", "0"
p_values = ["5000", "12000", "25000", "50000", "100000", "all"]
runs = ["case1_run1", "case1_run2", "case1_run3", "case1_run4", "case1_run5"]

# Fixed colour per group so the wedges are comparable across subplots.
color_map = {
    'others': '#cc0000',
    'kaggle': '#3366cc',
    'm4': '#ff9900',
    'm5': '#339933'
}

fig, axes = plt.subplots(2, 3, figsize=(18, 12), gridspec_kw={'hspace': 0.4, 'wspace': 0.3})
axes = axes.flatten()

for idx, p in enumerate(p_values):
    dataframes = []
    for run in runs:
        target = pd.read_parquet(f"{config.path_to_data}/{run}/interim/target_div{div}_sim{p}{run}/target_div{div}_sim{p}{run}.parquet")
        # Count every time series (id_ts) once, grouped by its dataset.
        shares = target.drop_duplicates(subset="id_ts").groupby(by="data").count()["id_ts"]
        dataframes.append(shares)

    # A dataset missing from a run counts as zero series in that run;
    # otherwise mean() would skip the NaN and overstate its average.
    shares_over_runs = pd.concat(dataframes, axis=1).fillna(0)
    mean_shares = shares_over_runs.mean(axis=1)

    # Group the data into m4, m5, kaggle and one group for the rest
    grouped_data = {}
    other_sum = 0
    for dataset, value in mean_shares.items():
        if dataset in ['m4', 'm5', 'kaggle']:
            grouped_data[dataset] = value
        else:
            other_sum += value

    # Only add the collective wedge when it is non-empty.
    if other_sum > 0:
        grouped_data['others'] = other_sum

    grouped_series = pd.Series(grouped_data)

    # None lets matplotlib pick a colour for any label without a fixed one.
    colors = [color_map.get(label) for label in grouped_series.index]

    ax = axes[idx]
    wedges, texts, autotexts = ax.pie(
        grouped_series.values,
        labels=grouped_series.index,
        autopct='%1.1f%%',
        startangle=90,
        pctdistance=0.75,
        labeldistance=1.1,
        colors=colors,
        textprops={'fontsize': 18}
    )

    # Percentage labels sit inside the wedges: white and bold for contrast.
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
        autotext.set_fontsize(16)

    for text in texts:
        text.set_fontsize(18)

    ax.set_title(f'p={p}', fontsize=18, pad=10)

plt.tight_layout()
# Derive the file name from div so that plotting another div does not
# overwrite the div10 figure.
plt.savefig(f"{config.path_to_evaluation}/target_shares_div{div}.pdf")
plt.show()

Data shares of the source datasets

In [None]:
# Average number of unique source time series per originating dataset ("data"
# column) over all runs, one "mean (std)" column per div, merged into a single
# table that is written to CSV and printed.
runs = ["case1_run1", "case1_run2", "case1_run3", "case1_run4", "case1_run5"]
results_list = []

for div in ["10", "5", "0"]:
    dataframes = []
    for run in runs:
        source = pd.read_parquet(f"{config.path_to_data}/{run}/interim/source_div{div}{run}/source_div{div}{run}.parquet")
        # Count every time series (id_ts) once, grouped by its dataset.
        shares = source.drop_duplicates(subset="id_ts").groupby(by="data").count()["id_ts"]
        dataframes.append(shares)

    # One column per run, aligned on the dataset name. A dataset missing from a
    # run counts as zero series in that run; otherwise mean()/std() would skip
    # the NaN and only cover the runs in which the dataset appears.
    shares_over_runs = pd.concat(dataframes, axis=1).fillna(0)
    mean_shares = shares_over_runs.mean(axis=1)
    std_shares = shares_over_runs.std(axis=1)

    result = pd.DataFrame({
        'data': mean_shares.index,
        f'div={div}': [f"{mean_val:.2f} ({std_val:.2f})" for mean_val, std_val in zip(mean_shares.values, std_shares.values)]
    })

    results_list.append(result)

# Outer merge keeps datasets that occur for only some of the divs; their cells
# for the other divs stay empty (NaN).
final_result = results_list[0]
for next_result in results_list[1:]:
    final_result = pd.merge(final_result, next_result, on='data', how='outer')

final_result = final_result.sort_values('data').reset_index(drop=True)
# NOTE(review): to_csv also writes the RangeIndex as an unnamed first column,
# unlike the printed table below — pass index=False if that is not wanted.
final_result.to_csv(f"{config.path_to_evaluation}/source_data_shares.csv")
print(final_result.to_string(index=False))

In [None]:
# Pie chart of the average source data shares over runs for the single div
# selected below.
div = "10"
runs = ["case1_run1", "case1_run2", "case1_run3", "case1_run4", "case1_run5"]

dataframes = []
for run in runs:
    source = pd.read_parquet(f"{config.path_to_data}/{run}/interim/source_div{div}{run}/source_div{div}{run}.parquet")
    # Count every time series (id_ts) once, grouped by its dataset.
    shares = source.drop_duplicates(subset="id_ts").groupby(by="data").count()["id_ts"]
    dataframes.append(shares)

# A dataset missing from a run counts as zero series in that run; otherwise
# mean() would skip the NaN and overstate its average.
shares_over_runs = pd.concat(dataframes, axis=1).fillna(0)
mean_shares = shares_over_runs.mean(axis=1)

# Group the data into m4, m5, kaggle and one group for the rest
grouped_data = {}
other_sum = 0

for dataset, value in mean_shares.items():
    if dataset in ['m4', 'm5', 'kaggle']:
        grouped_data[dataset] = value
    else:
        other_sum += value

# Only add the collective wedge when it is non-empty.
if other_sum > 0:
    grouped_data['others'] = other_sum

grouped_series = pd.Series(grouped_data)

# Fixed colour per group, matching the other share plots.
color_map = {
    'others': '#cc0000',
    'kaggle': '#3366cc',
    'm4': '#ff9900',
    'm5': '#339933'
}

# None lets matplotlib pick a colour for any label without a fixed one.
colors = [color_map.get(label) for label in grouped_series.index]

plt.figure(figsize=(10, 8))

wedges, texts, autotexts = plt.pie(
    grouped_series.values,
    labels=grouped_series.index,
    autopct='%1.1f%%',
    startangle=90,
    pctdistance=0.75,
    labeldistance=1.1,
    colors=colors,
    textprops={'fontsize': 18}
)

# Percentage labels sit inside the wedges: white and bold for contrast.
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontweight('bold')
    autotext.set_fontsize(16)

for text in texts:
    text.set_fontsize(18)
    text.set_horizontalalignment('center')

plt.axis('equal')  # keep the pie circular
plt.tight_layout()
# Derive the file name from div so that plotting another div does not
# overwrite the div10 figure.
plt.savefig(f"{config.path_to_evaluation}/source_shares_div{div}.pdf")
plt.show()