## Calculating the metrics

In [None]:
import numpy as np
import pandas as pd
import os

# Experiment grid used by the loops below: dataset names, seeds, percentage
# settings (used verbatim in file names), and the multi-dataset groupings.
datasets = 'algo004 comp ml virtualshakespeare'.split()
seeds = [18, 61, 53, 29, 69, 42, 2, 21, 78, 99]
percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75]
# One entry per grouping; members are separated by '+' and split into a list.
combinations = [
    group.split('+')
    for group in (
        'algo004+comp',
        'algo004+ml',
        'algo004+virtualshakespeare',
        'comp+ml',
        'comp+virtualshakespeare',
        'ml+virtualshakespeare',
        'algo004+comp+ml',
        'algo004+comp+virtualshakespeare',
        'algo004+ml+virtualshakespeare',
        'comp+ml+virtualshakespeare',
        'algo004+comp+ml+virtualshakespeare',
    )
]

# Function to calculate metrics from confusion matrix
def calculate_metrics(cm):
    """Derive precision and sample-normalised TP/FP rates from a confusion matrix.

    Args:
        cm: array-like with exactly four entries which, when flattened, read
            ``tn, fp, fn, tp`` (i.e. laid out as ``[[tn, fp], [fn, tp]]``).

    Returns:
        Tuple ``(precision, tp_div_all, fp_div_all)``. ``precision`` is 0 when
        there are no positive predictions; both ratios are 0 when the matrix
        contains no samples at all.

    Raises:
        ValueError: if ``cm`` does not flatten to exactly four values.
    """
    tn, fp, fn, tp = np.asarray(cm).ravel()
    total_samples = tn + fp + fn + tp
    predicted_positive = tp + fp
    # Precision: share of positive predictions that are correct
    precision = tp / predicted_positive if predicted_positive > 0 else 0
    # Guard the empty matrix so the ratios are 0 instead of NaN (0 / 0)
    if total_samples > 0:
        tp_div_all = tp / total_samples
        fp_div_all = fp / total_samples
    else:
        tp_div_all = 0
        fp_div_all = 0
    return precision, tp_div_all, fp_div_all

# Main processing
for seed in seeds:
    for percentage in percentages:
        percentage_str = f'{percentage}p'  # 'p'-suffixed form used in file names
        for ds in datasets:
            # Each candidate is (value for the 'Combination' column,
            # confusion-matrix path, message printed when the file is absent).
            # The dataset's own matrix comes first, then every combination
            # that contains the dataset, in the order of `combinations`.
            candidates = [(
                ds,
                os.path.join(
                    f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results',
                    'cm_npy',
                    str(seed),
                    f'cm_{ds}_{percentage_str}_seed{seed}.npy',
                ),
                f'Confusion matrix not found for {ds} at {percentage}% seed {seed}',
            )]
            for combo in combinations:
                if ds not in combo:
                    continue
                combo_name = '_'.join(combo)
                candidates.append((
                    combo_name,
                    os.path.join(
                        f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/combined_results/cm_npy/npy_{percentage}/{seed}',
                        f'cm_{ds}_{combo_name}_{percentage_str}_seed{seed}.npy',
                    ),
                    f'Confusion matrix not found for {ds} in combination {combo_name} at {percentage}% seed {seed}',
                ))

            # Turn every confusion matrix that exists on disk into one result row
            results = []
            for label, cm_path, missing_message in candidates:
                if not os.path.exists(cm_path):
                    print(missing_message)
                    continue
                precision, tp_div_all, fp_div_all = calculate_metrics(np.load(cm_path))
                results.append({
                    'Dataset': ds,
                    'Combination': label,
                    'Percentage': percentage,
                    'Seed': seed,
                    'Precision': precision,
                    'TP_div_all': tp_div_all,
                    'FP_div_all': fp_div_all
                })

            if not results:
                print(f'No results to save for {ds} at {percentage}% seed {seed}')
                continue

            # Write one CSV per (dataset, seed, percentage)
            csv_dir = f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/csv/combined_results'
            os.makedirs(csv_dir, exist_ok=True)
            csv_save_path = os.path.join(csv_dir, f'metrics_{ds}_seed{seed}_{percentage_str}.csv')
            pd.DataFrame(results).to_csv(csv_save_path, index=False)


## Metrics with differences

In [None]:
import numpy as np
import pandas as pd
import os

# Experiment grid for the difference computation: dataset names, seeds,
# percentage settings (used verbatim in file names), and dataset groupings.
datasets = ['algo004', 'comp', 'ml', 'virtualshakespeare']
seeds = [18, 61, 53, 29, 69, 42, 2, 21, 78, 99]
percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75]
# Groupings are written as tuples and converted so each entry is a list.
combinations = [
    list(group)
    for group in (
        ('algo004', 'comp'),
        ('algo004', 'ml'),
        ('algo004', 'virtualshakespeare'),
        ('comp', 'ml'),
        ('comp', 'virtualshakespeare'),
        ('ml', 'virtualshakespeare'),
        ('algo004', 'comp', 'ml'),
        ('algo004', 'comp', 'virtualshakespeare'),
        ('algo004', 'ml', 'virtualshakespeare'),
        ('comp', 'ml', 'virtualshakespeare'),
        ('algo004', 'comp', 'ml', 'virtualshakespeare'),
    )
]

# Function to calculate metrics from confusion matrix
def calculate_metrics(cm):
    """Compute precision, TP share and FP share from a four-entry confusion matrix.

    Args:
        cm: array-like whose flattened values are ``tn, fp, fn, tp`` in that
            order (a ``[[tn, fp], [fn, tp]]`` layout).

    Returns:
        ``(precision, tp_div_all, fp_div_all)`` where the last two are TP and
        FP divided by the total number of samples. Any ratio whose denominator
        is zero is reported as 0 rather than NaN.

    Raises:
        ValueError: if ``cm`` does not flatten to exactly four values.
    """
    tn, fp, fn, tp = np.asarray(cm).ravel()
    total_samples = tn + fp + fn + tp

    def _safe_ratio(numerator, denominator):
        # A zero denominator (no positive predictions / empty matrix) yields 0
        return numerator / denominator if denominator > 0 else 0

    precision = _safe_ratio(tp, tp + fp)
    tp_div_all = _safe_ratio(tp, total_samples)
    fp_div_all = _safe_ratio(fp, total_samples)
    return precision, tp_div_all, fp_div_all

# Main processing
for seed in seeds:
    for percentage in percentages:
        percentage_str = f'{percentage}p'  # Convert to string with 'p' suffix
        for ds in datasets:
            # Rows collected for this dataset, seed, and percentage
            results = []

            # First, the individual dataset's metrics (the baseline row)
            base_dir = f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results'
            cm_path = os.path.join(base_dir, 'cm_npy', str(seed), f'cm_{ds}_{percentage_str}_seed{seed}.npy')
            if os.path.exists(cm_path):
                precision, tp_div_all, fp_div_all = calculate_metrics(np.load(cm_path))
                results.append({
                    'Dataset': ds,
                    'Combination': ds,
                    'Percentage': percentage,
                    'Seed': seed,
                    'Precision': precision,
                    'TP_div_all': tp_div_all,
                    'FP_div_all': fp_div_all
                })
            else:
                print(f'Confusion matrix not found for {ds} at {percentage}% seed {seed}')

            # Then the metrics for every combination that includes this dataset
            for combo in combinations:
                if ds not in combo:
                    continue
                combo_name = '_'.join(combo)
                cm_npy_dir = f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/combined_results/cm_npy/npy_{percentage}/{seed}'
                cm_path = os.path.join(cm_npy_dir, f'cm_{ds}_{combo_name}_{percentage_str}_seed{seed}.npy')
                if os.path.exists(cm_path):
                    precision, tp_div_all, fp_div_all = calculate_metrics(np.load(cm_path))
                    results.append({
                        'Dataset': ds,
                        'Combination': combo_name,
                        'Percentage': percentage,
                        'Seed': seed,
                        'Precision': precision,
                        'TP_div_all': tp_div_all,
                        'FP_div_all': fp_div_all
                    })
                else:
                    print(f'Confusion matrix not found for {ds} in combination {combo_name} at {percentage}% seed {seed}')

            if not results:
                print(f'No results to save for {ds} at {percentage}% seed {seed}')
                continue

            results_df = pd.DataFrame(results)

            # Differences are only meaningful relative to the individual
            # dataset. If its confusion matrix was missing, the first row would
            # be a combination, so skip instead of writing wrong differences.
            is_baseline = results_df['Combination'] == ds
            if not is_baseline.any():
                print(f'Baseline metrics missing for {ds} at {percentage_str} for seed {seed}; differences not computed')
                continue

            # Baseline row first, then the combinations ordered by name
            results_df = (
                results_df.assign(_is_combination=~is_baseline)
                .sort_values(by=['_is_combination', 'Combination'])
                .drop(columns='_is_combination')
                .reset_index(drop=True)
            )
            baseline = results_df.iloc[0]

            # Difference of every row (baseline included, giving 0) to the baseline
            for column in ('Precision', 'TP_div_all', 'FP_div_all'):
                results_df[f'{column}_Difference'] = results_df[column] - baseline[column]

            # Save one CSV per (dataset, seed, percentage)
            csv_dir = f'/content/drive/MyDrive/Colab Notebooks/Fed Learning Research/{ds}_results/csv/combined_csv'
            os.makedirs(csv_dir, exist_ok=True)
            csv_save_path = os.path.join(csv_dir, f'metrics_{ds}_seed{seed}_{percentage_str}.csv')
            results_df.to_csv(csv_save_path, index=False)
            print(f'Saved metrics for {ds} at {percentage_str} for seed {seed} to {csv_save_path}')


## Generating graphs from differences

In [None]:
# Shell escape: lists the kernel's current working directory; nothing below reads its output.
!ls

## Charts for the metric differences (Precision, TP/all, FP/all)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Experiment grid; the percentage values are decimals that appear verbatim in file names
datasets = 'algo004 comp ml virtualshakespeare'.split()
seeds = [18, 61, 53, 29, 69, 42, 2, 21, 78, 99]
percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75]

# CSV columns holding the difference of each metric to the individual dataset
difference_metrics = [f'{name}_Difference' for name in ('Precision', 'TP_div_all', 'FP_div_all')]

# Bar colours for differences >= 0 and < 0 respectively
positive_color, negative_color = 'blue', 'red'

# Root folder under which the charts are written
charts_parent_dir = '/content/drive/MyDrive/Colab Notebooks/GNN Research/charts'

# Main processing
for ds in datasets:
    for seed in seeds:
        for percentage in percentages:
            percentage_str = f'{percentage}p'
            # NOTE(review): the cell that writes these CSVs saves them under
            # '.../Fed Learning Research/...', while this cell reads from
            # '.../GNN Research/...' — confirm which folder is current.
            csv_file = f'/content/drive/MyDrive/Colab Notebooks/GNN Research/{ds}_results/csv/combined_csv/metrics_{ds}_seed{seed}_{percentage_str}.csv'
            if not os.path.exists(csv_file):
                print(f'CSV file not found: {csv_file}')
                continue

            df = pd.read_csv(csv_file)
            # A single row holds only the individual dataset: nothing to compare
            if df.shape[0] <= 1:
                print(f'Not enough data to plot for {ds}, seed {seed}, percentage {percentage}')
                continue

            chart_dir = os.path.join(charts_parent_dir, ds, f'seed_{seed}', f'{percentage}p')
            os.makedirs(chart_dir, exist_ok=True)

            # Row 0 is expected to be the individual dataset (the baseline), so
            # it is excluded. A local name is used so the global `combinations`
            # list defined by other cells is not overwritten.
            combo_labels = df['Combination'].iloc[1:].tolist()

            for metric in difference_metrics:
                differences = df[metric].iloc[1:].tolist()
                colors = [positive_color if x >= 0 else negative_color for x in differences]
                metric_name = metric.replace('_Difference', '').replace('_', '')

                # One single-bar chart per combination
                for combination, diff, color in zip(combo_labels, differences, colors):
                    fig, ax = plt.subplots(figsize=(6, 1))
                    ax.barh([combination], [diff], color=color)
                    ax.axvline(x=0, color='black', linewidth=0.8)
                    ax.set_xlabel(metric_name)
                    ax.set_title(f'{metric_name} Difference for {ds} with {combination}\n(Seed {seed}, {percentage}%)')
                    fig.tight_layout()
                    # Filename: dataset_combination_percent_seed_metric.png
                    chart_filename = f'{ds}_{combination}_{percentage}p_seed{seed}_{metric_name}.png'
                    fig.savefig(os.path.join(chart_dir, chart_filename), dpi=200)
                    plt.close(fig)

                # Grouped chart with every combination. It gets its own file
                # name; previously it was saved over the last per-combination
                # chart because the same path variable was reused.
                fig, ax = plt.subplots(figsize=(10, len(combo_labels) * 0.5))
                ax.barh(combo_labels, differences, color=colors)
                ax.axvline(x=0, color='black', linewidth=0.8)
                ax.set_xlabel(metric.replace('_', ' '))
                ax.set_ylabel('Combination')
                ax.set_title(f'{metric.replace("_", " ")} for {ds} (Seed {seed}, {percentage}%)')
                fig.tight_layout()
                group_filename = f'{ds}_all_combinations_{percentage}p_seed{seed}_{metric_name}.png'
                group_save_path = os.path.join(chart_dir, group_filename)
                fig.savefig(group_save_path, dpi=200)
                plt.close(fig)
                print(f'Saved chart: {group_save_path}')


## Calculating AUC mean, ACC mean, Precision mean, and standard deviation

In [None]:
import numpy as np
import pandas as pd
import os

# Experiment grid for the per-combination means
datasets = ['algo004', 'comp', 'ml', 'virtualshakespeare']
seeds = [18, 61, 53, 29, 69, 42, 2, 21, 78, 99]
percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75]
# Dataset groupings, listed by size: six pairs, four triples, then all four
combinations = (
    [['algo004', partner] for partner in ('comp', 'ml', 'virtualshakespeare')]
    + [['comp', partner] for partner in ('ml', 'virtualshakespeare')]
    + [['ml', 'virtualshakespeare']]
    + [['algo004', 'comp', last] for last in ('ml', 'virtualshakespeare')]
    + [['algo004', 'ml', 'virtualshakespeare']]
    + [['comp', 'ml', 'virtualshakespeare']]
    + [['algo004', 'comp', 'ml', 'virtualshakespeare']]
)

# Function to calculate metrics from confusion matrix
def calculate_metrics(cm):
    """Return ``(precision, accuracy)`` for a four-entry confusion matrix.

    ``cm.ravel()`` must yield ``tn, fp, fn, tp`` in that order. Precision is 0
    when there are no positive predictions, and accuracy is 0 when the matrix
    holds no samples.
    """
    tn, fp, fn, tp = cm.ravel()
    total_samples = tn + fp + fn + tp
    predicted_positive = tp + fp

    # Share of positive predictions that are correct
    if predicted_positive > 0:
        precision = tp / predicted_positive
    else:
        precision = 0

    # Share of all samples that are classified correctly
    if total_samples > 0:
        accuracy = (tp + tn) / total_samples
    else:
        accuracy = 0

    return precision, accuracy

# Main processing
for ds in datasets:
    for percentage in percentages:
        percentage_str = f'{percentage}p'  # 'p'-suffixed form used in file names
        # The individual dataset first, then every combination containing it
        ds_combinations = [[ds]] + [combo for combo in combinations if ds in combo]
        # One row of mean metrics per combination
        results = []
        for combo in ds_combinations:
            combo_name = '_'.join(combo)
            is_individual = len(combo) == 1
            precisions, accuracies, aucs = [], [], []
            for seed in seeds:
                # Locate the confusion matrix for this seed
                if is_individual:
                    cm_path = os.path.join(
                        f'/content/drive/MyDrive/Colab Notebooks/GNN Research/{ds}_results',
                        'cm_npy',
                        str(seed),
                        f'cm_{ds}_{percentage_str}_seed{seed}.npy',
                    )
                else:
                    cm_path = os.path.join(
                        f'/content/drive/MyDrive/Colab Notebooks/GNN Research/combined_results/cm_npy/npy_{percentage}/{seed}',
                        f'cm_{ds}_{combo_name}_{percentage_str}_seed{seed}.npy',
                    )
                # Without a confusion matrix the seed is skipped entirely (AUC included)
                if not os.path.exists(cm_path):
                    print(f'Confusion matrix not found for {ds} in combination {combo_name} at {percentage*100}% seed {seed}')
                    continue
                precision, accuracy = calculate_metrics(np.load(cm_path))
                precisions.append(precision)
                accuracies.append(accuracy)

                # Locate the CSV holding the AUC for this seed
                if is_individual:
                    csv_path = os.path.join(
                        f'/content/drive/MyDrive/Colab Notebooks/GNN Research/{ds}_results/csv/{seed}',
                        f'results_{ds}_{percentage_str}_seed{seed}.csv',
                    )
                else:
                    csv_path = os.path.join(
                        f'/content/drive/MyDrive/Colab Notebooks/GNN Research/combined_results/csv/csv_{percentage}/{seed}',
                        f'Combined_{combo_name}_{percentage_str}_seed{seed}.csv',
                    )
                if not os.path.exists(csv_path):
                    print(f'AUC file not found for {ds} in combination {combo_name} at {percentage*100}% seed {seed}')
                    continue

                df = pd.read_csv(csv_path)
                if 'Dataset' not in df.columns:
                    # No per-dataset rows: the first AUC entry is taken as-is
                    aucs.append(df['AUC'].values[0])
                    continue
                # Per-dataset rows: use the first row belonging to this dataset
                filtered_df = df[df['Dataset'] == ds]
                if filtered_df.empty:
                    print(f'AUC value not found for dataset {ds} in CSV file {csv_path}')
                else:
                    aucs.append(filtered_df['AUC'].values[0])

            # Average over the seeds that produced a value; None when there were none
            results.append({
                'Dataset': ds,
                'Combination': combo_name,
                'Percentage': percentage,
                'Mean_Precision': np.mean(precisions) if precisions else None,
                'Mean_Accuracy': np.mean(accuracies) if accuracies else None,
                'Mean_AUC': np.mean(aucs) if aucs else None
            })

        if not results:
            print(f'No results to save for {ds} at {percentage*100}%')
            continue
        # Write one CSV of means per (dataset, percentage)
        csv_dir = f'/content/drive/MyDrive/Colab Notebooks/GNN Research/mean_results/{ds}'
        os.makedirs(csv_dir, exist_ok=True)
        csv_save_path = os.path.join(csv_dir, f'mean_metrics_{ds}_{percentage_str}.csv')
        pd.DataFrame(results).to_csv(csv_save_path, index=False)


In [None]:
import numpy as np
import pandas as pd
import os

# Experiment grid for the per-combination means and standard deviations
datasets = ['algo004', 'comp', 'ml', 'virtualshakespeare']
seeds = [18, 61, 53, 29, 69, 42, 2, 21, 78, 99]
percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75]
# Groupings written the way they appear in file names ('_'-joined) and split
# back into member lists; no dataset name contains an underscore.
combinations = [
    joined.split('_')
    for joined in (
        'algo004_comp',
        'algo004_ml',
        'algo004_virtualshakespeare',
        'comp_ml',
        'comp_virtualshakespeare',
        'ml_virtualshakespeare',
        'algo004_comp_ml',
        'algo004_comp_virtualshakespeare',
        'algo004_ml_virtualshakespeare',
        'comp_ml_virtualshakespeare',
        'algo004_comp_ml_virtualshakespeare',
    )
]

# Function to calculate metrics from confusion matrix
def calculate_metrics(cm):
    """Compute precision and accuracy from a confusion matrix.

    The matrix is flattened with ``cm.ravel()`` and read as ``tn, fp, fn, tp``.
    Returns ``(precision, accuracy)``; a ratio with a zero denominator is 0.
    """
    tn, fp, fn, tp = cm.ravel()

    def ratio(numerator, denominator):
        # Zero denominators (no positive predictions / no samples) map to 0
        return numerator / denominator if denominator > 0 else 0

    return ratio(tp, tp + fp), ratio(tp + tn, tn + fp + fn + tp)

# Main processing
for ds in datasets:
    for percentage in percentages:
        percentage_str = f'{percentage}p'  # Convert to string with 'p' suffix
        # Human-readable form for messages (e.g. 0.15 -> '15%'); avoids float
        # artefacts such as '15.000000000000002%' from percentage*100.
        percent_label = f'{percentage:.0%}'
        # Get combinations that include the current dataset
        ds_combinations = [combo for combo in combinations if ds in combo]
        # Include the individual dataset as the first entry
        ds_combinations.insert(0, [ds])
        # One row of mean/std metrics per combination
        results = []
        for combo in ds_combinations:
            combo_name = '_'.join(combo)
            is_individual = len(combo) == 1
            precisions = []
            accuracies = []
            aucs = []
            for seed in seeds:
                # Load confusion matrix
                if is_individual:
                    base_dir = f'/content/drive/MyDrive/Colab Notebooks/GNN Research/{ds}_results'
                    cm_npy_dir = os.path.join(base_dir, 'cm_npy', str(seed))
                    cm_filename = f'cm_{ds}_{percentage_str}_seed{seed}.npy'
                else:
                    cm_npy_dir = f'/content/drive/MyDrive/Colab Notebooks/GNN Research/combined_results/cm_npy/npy_{percentage}/{seed}'
                    cm_filename = f'cm_{ds}_{combo_name}_{percentage_str}_seed{seed}.npy'
                cm_path = os.path.join(cm_npy_dir, cm_filename)
                if not os.path.exists(cm_path):
                    # Without a confusion matrix the seed is skipped entirely (AUC included)
                    print(f'Confusion matrix not found for {ds} in combination {combo_name} at {percent_label} seed {seed}')
                    continue
                precision, accuracy = calculate_metrics(np.load(cm_path))
                precisions.append(precision)
                accuracies.append(accuracy)

                # Try to load AUC value
                if is_individual:
                    csv_dir = f'/content/drive/MyDrive/Colab Notebooks/GNN Research/{ds}_results/csv/{seed}'
                    csv_filename = f'results_{ds}_{percentage_str}_seed{seed}.csv'
                else:
                    csv_dir = f'/content/drive/MyDrive/Colab Notebooks/GNN Research/combined_results/csv/csv_{percentage}/{seed}'
                    csv_filename = f'Combined_{combo_name}_{percentage_str}_seed{seed}.csv'
                csv_path = os.path.join(csv_dir, csv_filename)
                if not os.path.exists(csv_path):
                    print(f'AUC file not found for {ds} in combination {combo_name} at {percent_label} seed {seed}')
                    continue
                df = pd.read_csv(csv_path)
                if 'Dataset' in df.columns:
                    # Per-dataset rows: use the first row belonging to this dataset
                    filtered_df = df[df['Dataset'] == ds]
                    if not filtered_df.empty:
                        aucs.append(filtered_df['AUC'].values[0])
                    else:
                        print(f'AUC value not found for dataset {ds} in CSV file {csv_path}')
                else:
                    # No 'Dataset' column: the first AUC entry is taken as-is
                    aucs.append(df['AUC'].values[0])

            # Mean over the seeds that produced a value; None when there were none
            mean_precision = np.mean(precisions) if precisions else None
            mean_accuracy = np.mean(accuracies) if accuracies else None
            mean_auc = np.mean(aucs) if aucs else None

            # Sample standard deviation (ddof=1) needs at least two values
            std_precision = np.std(precisions, ddof=1) if len(precisions) > 1 else None
            std_accuracy = np.std(accuracies, ddof=1) if len(accuracies) > 1 else None
            std_auc = np.std(aucs, ddof=1) if len(aucs) > 1 else None

            results.append({
                'Dataset': ds,
                'Combination': combo_name,
                'Percentage': percentage,
                'Mean_Precision': mean_precision,
                'Std_Precision': std_precision,
                'Mean_Accuracy': mean_accuracy,
                'Std_Accuracy': std_accuracy,
                'Mean_AUC': mean_auc,
                'Std_AUC': std_auc,
                # Number of seeds behind each statistic. Missing files reduce
                # it below len(seeds), which matters for any later test that
                # needs the sample size.
                'N_Precision': len(precisions),
                'N_Accuracy': len(accuracies),
                'N_AUC': len(aucs)
            })

        # Save the collected mean and std results into a CSV file
        if results:
            results_df = pd.DataFrame(results)
            csv_dir = f'/content/drive/MyDrive/Colab Notebooks/GNN Research/mean_results/{ds}'
            os.makedirs(csv_dir, exist_ok=True)
            csv_filename = f'mean_metrics_{ds}_{percentage_str}.csv'
            csv_save_path = os.path.join(csv_dir, csv_filename)
            results_df.to_csv(csv_save_path, index=False)
            print(f'Saved mean and std metrics for {ds} at {percent_label}')
        else:
            print(f'No results to save for {ds} at {percent_label}')


## Plot the means for comparison

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Datasets and percentage settings; the percentages are decimals that appear
# verbatim in the CSV file names
datasets = 'algo004 comp ml virtualshakespeare'.split()
percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75]

# CSV column -> axis label and bar colour for each plotted metric
metrics_info = {
    column: {'label': label, 'color': color}
    for column, label, color in (
        ('Mean_AUC', 'Mean AUC', 'blue'),
        ('Mean_Accuracy', 'Mean Accuracy', 'green'),
        ('Mean_Precision', 'Mean Precision', 'purple'),
    )
}

# Folder the mean-metric CSVs are read from, and folder the charts are written to
mean_results_dir = '/content/drive/MyDrive/Colab Notebooks/GNN Research/mean_results'
charts_dir = '/content/drive/MyDrive/Colab Notebooks/GNN Research/mean_charts'

# Make sure the root chart folder exists before any plot is written
os.makedirs(charts_dir, exist_ok=True)

# One bar chart per (dataset, percentage, metric)
for ds in datasets:
    for percentage in percentages:
        percentage_str = f'{percentage}p'
        # Mean-metric CSV for this dataset and percentage
        csv_file = os.path.join(mean_results_dir, ds, f'mean_metrics_{ds}_{percentage_str}.csv')
        if not os.path.exists(csv_file):
            print(f'CSV file not found: {csv_file}')
            continue
        df = pd.read_csv(csv_file)
        if df.empty:
            print(f'No data to plot for {ds} at {percentage}%')
            continue

        # Charts are grouped in a folder per dataset and percentage
        plot_save_dir = os.path.join(charts_dir, ds, percentage_str)
        os.makedirs(plot_save_dir, exist_ok=True)

        combo_labels = df['Combination']
        for metric in metrics_info:
            info = metrics_info[metric]
            values = df[metric]
            # Nothing to draw when the metric is missing for every combination
            if values.isnull().all():
                print(f'All values are NaN for {metric} in {ds} at {percentage}%')
                continue

            plt.figure(figsize=(12, 6))
            plt.bar(combo_labels, values, color=info['color'])
            # Print each value just above its bar; missing values get no label
            for position, value in enumerate(values):
                if pd.isnull(value):
                    continue
                plt.text(position, value + 0.005, f'{value:.3f}', ha='center', va='bottom', fontsize=9)
            plt.xlabel('Combination', fontsize=12)
            plt.ylabel(info['label'], fontsize=12)
            plt.title(f'{info["label"]} for {ds} at {percentage}%', fontsize=14)
            plt.xticks(rotation=45, ha='right', fontsize=10)
            plt.yticks(fontsize=10)
            plt.tight_layout()

            # Save and release the figure
            plot_save_full_path = os.path.join(plot_save_dir, f'{ds}_{percentage_str}_{metric}.png')
            plt.savefig(plot_save_full_path, dpi=200)
            plt.close()
            print(f'Saved plot: {plot_save_full_path}')


## Hypothesis Testing

In [15]:
import os
import numpy as np
import pandas as pd
from scipy import stats

# Datasets and percentage settings; the percentages are decimals that appear
# verbatim in the CSV file names
datasets = 'algo004 comp ml virtualshakespeare'.split()
percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75]
# Sample size passed to the t-tests for both groups
num_seeds = 10

# Significance level: p-values below this reject the null hypothesis
alpha = 0.05

# Input (mean metrics) and output (test results) folders under the project root
base_dir = '/content/drive/MyDrive/Colab Notebooks/GNN Research'
mean_results_dir, hypothesis_testing_dir = (
    os.path.join(base_dir, subdir) for subdir in ('mean_results', 'hypothesis_testing')
)
os.makedirs(hypothesis_testing_dir, exist_ok=True)


def _observation_count(stats_row, column):
    """Return the sample size stored in ``column`` of a mean-metrics row.

    Falls back to ``num_seeds`` when the CSV has no such column or the value
    is missing, which reproduces the fixed sample size used before.
    """
    count = stats_row.get(column, num_seeds)
    return num_seeds if pd.isna(count) else int(count)


def _one_sided_welch(ind_mean, ind_std, ind_n, comb_mean, comb_std, comb_n):
    """Welch t-test of H0 'individual <= combined' vs H1 'individual > combined'.

    Returns ``(t_statistic, p_value, decision)``; H0 is rejected when the
    p-value is below ``alpha``.
    """
    t_stat, p_value = stats.ttest_ind_from_stats(
        mean1=ind_mean, std1=ind_std, nobs1=ind_n,
        mean2=comb_mean, std2=comb_std, nobs2=comb_n,
        equal_var=False, alternative='greater'
    )
    decision = 'Reject Null Hypothesis' if p_value < alpha else 'Fail to Reject Null Hypothesis'
    return t_stat, p_value, decision


# Perform hypothesis testing
for ds in datasets:
    # Results are grouped in one folder per dataset
    ds_hypothesis_dir = os.path.join(hypothesis_testing_dir, ds)
    os.makedirs(ds_hypothesis_dir, exist_ok=True)

    for percentage in percentages:
        percentage_str = f'{percentage}p'

        # Mean metrics CSV for this dataset and percentage
        mean_csv_path = os.path.join(mean_results_dir, ds, f'mean_metrics_{ds}_{percentage_str}.csv')
        if not os.path.exists(mean_csv_path):
            print(f'Mean metrics CSV not found for {ds} at {percentage}%')
            continue
        mean_df = pd.read_csv(mean_csv_path)

        # The row whose combination is the dataset itself is the base case
        individual_row = mean_df[mean_df['Combination'] == ds]
        if individual_row.empty:
            print(f'No individual data found for {ds} at {percentage}%')
            continue
        individual = individual_row.iloc[0]

        # A NaN baseline would turn every test into NaN and be reported as
        # 'Fail to Reject', so skip the whole percentage instead.
        baseline_columns = ['Mean_AUC', 'Std_AUC', 'Mean_Precision', 'Std_Precision']
        if any(pd.isna(individual[column]) for column in baseline_columns):
            print(f'Skipping {ds} at {percentage}% due to NaN values in the individual dataset statistics')
            continue

        ind_n_auc = _observation_count(individual, 'N_AUC')
        ind_n_precision = _observation_count(individual, 'N_Precision')

        # Every other row is a combination to compare against the base case
        combination_rows = mean_df[mean_df['Combination'] != ds]
        hypothesis_results = []

        for _, row in combination_rows.iterrows():
            combination_name = row['Combination']

            comb_mean_auc = row['Mean_AUC']
            comb_std_auc = row['Std_AUC']
            comb_mean_precision = row['Mean_Precision']
            comb_std_precision = row['Std_Precision']

            # Check for valid data
            if pd.isna(comb_mean_auc) or pd.isna(comb_std_auc) or pd.isna(comb_mean_precision) or pd.isna(comb_std_precision):
                print(f'Skipping combination {combination_name} due to NaN values at {percentage}%')
                continue

            # One-sided tests: sample 1 is the individual dataset, sample 2 the combination
            auc_t_stat, auc_p_value, auc_decision = _one_sided_welch(
                individual['Mean_AUC'], individual['Std_AUC'], ind_n_auc,
                comb_mean_auc, comb_std_auc, _observation_count(row, 'N_AUC')
            )
            precision_t_stat, precision_p_value, precision_decision = _one_sided_welch(
                individual['Mean_Precision'], individual['Std_Precision'], ind_n_precision,
                comb_mean_precision, comb_std_precision, _observation_count(row, 'N_Precision')
            )

            # Store results
            hypothesis_results.append({
                'Dataset': ds,
                'Percentage': percentage,
                'Combination': combination_name,
                'AUC_T-Statistic': auc_t_stat,
                'AUC_P-Value': auc_p_value,
                'AUC_Decision': auc_decision,
                'Precision_T-Statistic': precision_t_stat,
                'Precision_P-Value': precision_p_value,
                'Precision_Decision': precision_decision
            })

        # Save the hypothesis test results to a CSV file
        if hypothesis_results:
            hypothesis_df = pd.DataFrame(hypothesis_results)
            hypothesis_save_path = os.path.join(ds_hypothesis_dir, f'hypothesis_testing_{ds}_{percentage_str}.csv')
            hypothesis_df.to_csv(hypothesis_save_path, index=False)
            print(f'Hypothesis test results saved for {ds} at {percentage}%: {hypothesis_save_path}')
        else:
            print(f'No hypothesis test results for {ds} at {percentage}%')


Hypothesis test results saved for algo004 at 0.05%: /content/drive/MyDrive/Colab Notebooks/GNN Research/hypothesis_testing/algo004/hypothesis_testing_algo004_0.05p.csv
Hypothesis test results saved for algo004 at 0.1%: /content/drive/MyDrive/Colab Notebooks/GNN Research/hypothesis_testing/algo004/hypothesis_testing_algo004_0.1p.csv
Hypothesis test results saved for algo004 at 0.15%: /content/drive/MyDrive/Colab Notebooks/GNN Research/hypothesis_testing/algo004/hypothesis_testing_algo004_0.15p.csv
Hypothesis test results saved for algo004 at 0.2%: /content/drive/MyDrive/Colab Notebooks/GNN Research/hypothesis_testing/algo004/hypothesis_testing_algo004_0.2p.csv
Hypothesis test results saved for algo004 at 0.25%: /content/drive/MyDrive/Colab Notebooks/GNN Research/hypothesis_testing/algo004/hypothesis_testing_algo004_0.25p.csv
Hypothesis test results saved for algo004 at 0.5%: /content/drive/MyDrive/Colab Notebooks/GNN Research/hypothesis_testing/algo004/hypothesis_testing_algo004_0.5p.cs