# Imports

In [1]:
import os
import json
from collections import defaultdict
import pandas as pd

# Functions

In [43]:
def json_to_df(file_path):
    """
    Extract training accuracy and loss + validation accuracy and loss from the raw json files
    
    From each correlationId there is one json item for each client
    
    In this functions all items for a specific correlationId are merged to estimate
    the performance of the global model
    
    """
    
    # Load the JSON file
    with open(file_path, 'r') as file:
        data = json.load(file)

    # Extracting relevant data from each record
    extracted_data = []
    for record in data:
        client_name = record['sender']['name']
        correlation_id = record['correlationId']
        training_info = json.loads(record['data'])
        extracted_data.append({
            'client': client_name,
            'correlationId': correlation_id,
            'training_loss': training_info['training_loss'],
            'training_accuracy': training_info['training_accuracy'],
            'test_loss': training_info['test_loss'],
            'test_accuracy': training_info['test_accuracy']
        })

    # Creating a DataFrame from the extracted data
    df = pd.DataFrame(extracted_data)

    # Creating a mapping of correlationId to readable round numbers
    unique_correlation_ids = df['correlationId'].unique()
    round_mapping = {correlation_id: f'Round {i+1}' for i, correlation_id in enumerate(unique_correlation_ids)}

    # Applying the mapping to the DataFrame
    df['Round'] = df['correlationId'].map(round_mapping)

    # Initializing the aggregated data dictionary with lists for each metric
    aggregated_data = {
        'Round': [],
        'training_accuracy': [],
        'training_loss': [],
        'test_accuracy': [],
        'test_loss': []
    }


    # Aggregating data for each round
    for round_label in df['Round'].unique():
        round_data = df[df['Round'] == round_label]
        aggregated_data['Round'].extend([round_label] * len(round_data))
        aggregated_data['training_accuracy'].extend(round_data['training_accuracy'])
        aggregated_data['training_loss'].extend(round_data['training_loss'])
        aggregated_data['test_accuracy'].extend(round_data['test_accuracy'])
        aggregated_data['test_loss'].extend(round_data['test_loss'])

    # Converting aggregated data into a DataFrame for plotting
    df_aggregated = pd.DataFrame(aggregated_data)

    # Create copy of df
    df = df_aggregated.copy()

    # Group by Round and calculate mean for 
    df = df.groupby('Round').mean()

    # Sort by index
    df.index = df.index.astype(str)
    numeric_index = df.index.str.extract('(\d+)')[0].astype(int)
    df = df.iloc[numeric_index.argsort()]
    
    return df

# Solution

First we call the json_to_pdf function to estimate the performance of the global model for each .json file and create a pandas dataframe for each of those. Containing 5 attributes: Round, train_acc, train_loss, val_acc and val_loss


Then we calculate the mean for each attribute over all the pandas dataframes that were created with the function above.


Finally we save the merged dataframe as a json file

In [93]:
# Specify the directory
#root = "Backdoor/50_clients_40_rounds"
#dir_to_merge_list = ["Baseline", "2%", "4%", "6%", "8%", "10%", "12%", "14%", "16%", "18%", "20%" ]

#root = "Backdoor/10_clients_40_rounds"
#dir_to_merge_list = ["0%", "10%", "20%" ]

root = "Gradient_X100/50_clients_40_rounds"
dir_to_merge_list = ["Baseline", "2%", "4%", "6%", "8%", "10%"]



print(root)
for dir_to_merge in dir_to_merge_list:
    # Define paths
    path = os.path.join(root, dir_to_merge)
    file_paths = []
    for item in os.listdir(path):
        if os.path.isfile(os.path.join(path, item)) and not item.endswith('merged.json') and item.endswith('.json'):
            file_paths.append(os.path.join(path, item))
    
    
    # Load data from files
    dfs = [] # List of pandas dataframes
    for path in file_paths:
        dfs.append(json_to_df(path))  # Each json becomes a df with 4 columns. One for each metric

    print(f"{dir_to_merge}")
    print(f"  Number of training sessions: {len(dfs)}\n")
    print("   Number of rounds for each training session:")
    for i in range(len(dfs)):
        print(f"   {len(dfs[i])}")
    print("____________________________________________")
    
    # Merge rounds
    merged_df = pd.concat(dfs) # Concatenate the DataFrames
    #print(len(merged_df))
    
    merged_df = merged_df.groupby('Round').mean() # Group by 'Round' and calculate the mean

    
    
    
    # Sort by round (index)
    merged_df.index = merged_df.index.astype(str)
    numeric_index = merged_df.index.str.extract('(\d+)')[0].astype(int)
    merged_df = merged_df.iloc[numeric_index.argsort()]
    
    
    # View result
    #print(merged_df)
    
    
    # Save
    json_output = merged_df.to_json(orient='index', indent=4)
    filename = "merged.json"
    with open(os.path.join(root, dir_to_merge, filename), 'w') as file:
        file.write(json_output)

Gradient_X100/50_clients_40_rounds
Baseline
  Number of training sessions: 3

   Number of rounds for each training session:
   39
   39
   39
____________________________________________
2%
  Number of training sessions: 3

   Number of rounds for each training session:
   39
   39
   39
____________________________________________
4%
  Number of training sessions: 3

   Number of rounds for each training session:
   39
   39
   39
____________________________________________
6%
  Number of training sessions: 3

   Number of rounds for each training session:
   39
   39
   39
____________________________________________
8%
  Number of training sessions: 3

   Number of rounds for each training session:
   39
   39
   39
____________________________________________
10%
  Number of training sessions: 3

   Number of rounds for each training session:
   39
   39
   39
____________________________________________


# Debugging

In [15]:
import os
import json
import pandas as pd

def json_to_df(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)

    extracted_data = []
    for record in data:
        training_info = json.loads(record['data'])
        correlation_id = record['correlationId']
        # Attempt to extract round number, default to unknown if not possible
        try:
            round_number = int(correlation_id.split("-")[-1])
            round_label = f'Round {round_number}'
        except ValueError:
            round_label = 'Unknown Round'
        extracted_data.append({
            'client': record['sender']['name'],
            'correlationId': correlation_id,
            'training_loss': training_info['training_loss'],
            'training_accuracy': training_info['training_accuracy'],
            'test_loss': training_info['test_loss'],
            'test_accuracy': training_info['test_accuracy'],
            'Round': round_label
        })

    df = pd.DataFrame(extracted_data)
    return df


root = "Backdoor/10_clients_40_rounds"
dir_to_merge_list = ["10%", "20%"]

for dir_to_merge in dir_to_merge_list:
    path = os.path.join(root, dir_to_merge)
    file_paths = [os.path.join(path, item) for item in os.listdir(path) if item.endswith('.json') and not item.endswith('merged.json')]

    # Diagnostic: Check if all files are found
    print(f"Processing directory: {dir_to_merge}, Total files found: {len(file_paths)}")

    dfs = []
    for file_path in file_paths:
        df = json_to_df(file_path)
        dfs.append(df)

        # Diagnostic: Check if Round 40 is in each file
        if 'Round 40' not in df['Round'].values:
            print(f"Round 40 missing in file: {file_path}")

    merged_df = pd.concat(dfs)
    merged_df = merged_df.groupby('Round').mean()

    # Diagnostic: Check if Round 40 is in the final DataFrame
    if 'Round 40' not in merged_df.index:
        print(f"Round 40 missing after merging in directory: {dir_to_merge}")

    merged_df.index = merged_df.index.astype(str)
    numeric_index = merged_df.index.str.extract('(\d+)')[0].astype(int)
    merged_df = merged_df.iloc[numeric_index.argsort()]

    # Save the result
    json_output = merged_df.to_json(orient='index', indent=4)
    filename = "merged.json"
    with open(os.path.join(root, dir_to_merge, filename), 'w') as file:
        file.write(json_output)


Processing directory: 10%, Total files found: 10
Round 40 missing in file: Backdoor/10_clients_40_rounds/10%/10_clients_40_rounds_3.json
Round 40 missing in file: Backdoor/10_clients_40_rounds/10%/10_clients_40_rounds_2.json
Round 40 missing in file: Backdoor/10_clients_40_rounds/10%/10_clients_40_rounds_5.json
Round 40 missing in file: Backdoor/10_clients_40_rounds/10%/10_clients_40_rounds_9.json
Round 40 missing in file: Backdoor/10_clients_40_rounds/10%/10_clients_40_rounds_8.json
Round 40 missing in file: Backdoor/10_clients_40_rounds/10%/10_clients_40_rounds_4.json
Round 40 missing in file: Backdoor/10_clients_40_rounds/10%/10_clients_40_rounds_7.json
Round 40 missing in file: Backdoor/10_clients_40_rounds/10%/10_clients_40_rounds_6.json
Round 40 missing in file: Backdoor/10_clients_40_rounds/10%/10_clients_40_rounds_1.json
Round 40 missing in file: Backdoor/10_clients_40_rounds/10%/10_clients_40_rounds_10.json


TypeError: agg function failed [how->mean,dtype->object]