# Imports

In [1]:
import os
import json
from collections import defaultdict
import pandas as pd

# Functions

In [2]:
def json_to_df(file_path):
    """
    Estimate the per-round performance of the global model from one raw json file.

    The file holds a list of records; for each correlationId there is one record
    per client. Every record carries a ``correlationId`` and a ``data`` field,
    which is itself a JSON-encoded string containing ``training_loss``,
    ``training_accuracy``, ``test_loss`` and ``test_accuracy``.

    Each distinct correlationId is treated as one round, labelled "Round 1",
    "Round 2", ... in the order the ids first appear in the file. The metrics
    of all records sharing a correlationId are averaged.

    Parameters
    ----------
    file_path : str or path-like
        Path of the json file to read.

    Returns
    -------
    pandas.DataFrame
        Indexed by round label (index name ``Round``), ordered by round number,
        with the columns ``training_accuracy``, ``training_loss``,
        ``test_accuracy`` and ``test_loss``. Empty when the file has no records.

    Raises
    ------
    KeyError
        If a record lacks ``correlationId``, ``data`` or one of the metrics.
    """
    metric_columns = ['training_accuracy', 'training_loss', 'test_accuracy', 'test_loss']

    # Load the JSON file (explicit encoding so the result does not depend on the locale)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Flatten every record into its correlationId plus the four metrics.
    # The 'data' field is a JSON string nested inside the outer JSON document.
    rows = []
    for record in data:
        training_info = json.loads(record['data'])
        row = {'correlationId': record['correlationId']}
        row.update({metric: training_info[metric] for metric in metric_columns})
        rows.append(row)

    # Nothing to aggregate: return an empty frame with the usual layout
    if not rows:
        return pd.DataFrame(columns=metric_columns, index=pd.Index([], name='Round'), dtype=float)

    df = pd.DataFrame(rows)

    # Map each correlationId to a readable round label, numbered by first appearance
    round_mapping = {
        correlation_id: f'Round {i + 1}'
        for i, correlation_id in enumerate(df['correlationId'].unique())
    }
    df['Round'] = df['correlationId'].map(round_mapping)

    # sort=False keeps groups in first-appearance order, which is exactly the
    # numeric round order, so no lexicographic "Round 10" < "Round 2" fix-up is needed.
    return df.groupby('Round', sort=False)[metric_columns].mean()

# Solution

First we call the `json_to_df` function on each .json file to estimate the performance of the global model, which gives one pandas dataframe per file. Each dataframe is indexed by Round and has four metric columns: training_accuracy, training_loss, test_accuracy and test_loss.


Then we calculate the mean for each attribute over all the pandas dataframes that were created with the function above.


Finally we save the merged dataframe as a json file

In [7]:
# Specify the directory
root = "Gradient_X100/50_clients_40_rounds"
dir_to_merge = "10%"
filename = "merged.json"  # output file, written into the same directory as the inputs

# Define paths: collect every .json file in the directory except a previously
# written merge result. Sorted because os.listdir returns entries in arbitrary order.
path = os.path.join(root, dir_to_merge)
file_paths = []
for item in sorted(os.listdir(path)):
    candidate = os.path.join(path, item)
    if os.path.isfile(candidate) and item.endswith('.json') and not item.endswith('merged.json'):
        file_paths.append(candidate)

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not file_paths:
    raise FileNotFoundError(f"No .json files to merge in {path!r}")


# Load data from files
# Each json becomes a df with 4 columns. One for each metric.
# The comprehension variable does not leak, so `path` keeps pointing at the directory.
dfs = [json_to_df(file_path) for file_path in file_paths]


# Merge rounds
merged_df = pd.concat(dfs)  # Concatenate the DataFrames
merged_df = merged_df.groupby('Round').mean()  # Group by 'Round' and calculate the mean


# Sort by round (index): groupby orders the labels lexicographically
# ("Round 10" before "Round 2"), so reorder by the number inside the label.
merged_df.index = merged_df.index.astype(str)
numeric_index = merged_df.index.str.extract(r'(\d+)')[0].astype(int)
merged_df = merged_df.iloc[numeric_index.argsort()]


# View result
print(merged_df)


# Save
json_output = merged_df.to_json(orient='index', indent=4)
with open(os.path.join(path, filename), 'w', encoding='utf-8') as file:
    file.write(json_output)

          training_accuracy  training_loss  test_accuracy  test_loss
Round                                                               
Round 1            0.122389       2.294455       0.121867   2.294880
Round 2            0.151417       2.281256       0.151633   2.281484
Round 3            0.189517       2.266066       0.189200   2.266156
Round 4            0.230383       2.248210       0.229300   2.248124
Round 5            0.269933       2.226629       0.269767   2.226350
Round 6            0.307611       2.200446       0.305500   2.200016
Round 7            0.345817       2.168743       0.342533   2.168198
Round 8            0.385961       2.130873       0.381733   2.130079
Round 9            0.425217       2.086286       0.420600   2.085092
Round 10           0.458728       2.034281       0.455133   2.032522
Round 11           0.489522       1.974421       0.485600   1.971864
Round 12           0.517606       1.906608       0.515967   1.903047
Round 13           0.546817       