ELAPSE Traces 

Dataset Properties

In [1]:
import csv


rows = [
    ("Adult", "(gender, age, race)"),
    ("KDD", "(gender, age, race)"),
    ("DC", "(gender, age)"),
    ("MobiAct", "(gender, age)"),
    ("ARS", "(gender)"),
    ("celeba", "(gender, age)"),
    ("fairface", "(age, race)"),
    ("audioMNIST", "(gender, age)"),
    ("voxceleb", "(race)"),
]


with open("../../traces/DatasetProperties.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Dataset", "SensitiveAttributes"])  
    writer.writerows(rows)


Experiment Configurations

In [2]:
import csv

# --- Datasets et modèles autorisés par groupe ---
# 1) "SVM", "MLP", "LR" avec "Adult", "KDD", "DC", "MobiAct", "ARS"
group1_datasets = ["Adult", "KDD", "DC", "MobiAct", "ARS"]
group1_models = ["SVM", "MLP", "LR"]

# 2) "celeba", "fairface" avec "ResNet18", "VGG"
group2_datasets = ["celeba", "fairface"]
group2_models = ["ResNet18", "VGG"]

# 3) "audioMnist", "voxceleb" avec "LSTM", "CNN"
group3_datasets = ["audioMnist", "voxceleb"]
group3_models = ["LSTM", "CNN"]

# Construction de la table Dataset -> Modèles autorisés
dataset_to_models = {}
for d in group1_datasets:
    dataset_to_models[d] = list(group1_models)
for d in group2_datasets:
    dataset_to_models[d] = list(group2_models)
for d in group3_datasets:
    dataset_to_models[d] = list(group3_models)

# >>> Spécificité demandée : MobiAct uniquement MLP <<<
dataset_to_models["MobiAct"] = ["MLP"]

# Liste ordonnée finale des datasets (pour l'ordre d’itération)
datasets = group1_datasets + group2_datasets + group3_datasets

# --- Autres paramètres ---
selection_methods = ["Full", "Craig", "Glister", "GradMatch", "Random"]
selection_ratios_partial = [0.05, 0.1, 0.2, 0.3]  # pour toutes les méthodes sauf "Full"
selection_frequency = 20
num_runs = 5

# --- Génération des configurations ---
configurations = []
ec_id = 1

for dataset in datasets:
    models = dataset_to_models[dataset]
    for model in models:
        for method in selection_methods:
            ratios = [1.0] if method == "Full" else selection_ratios_partial
            for ratio in ratios:
                configurations.append([
                    ec_id,
                    dataset,
                    model,
                    method,
                    ratio,
                    selection_frequency,
                    num_runs
                ])
                ec_id += 1

# --- Écriture du CSV ---
with open("../../traces/ExperimentConfigurations.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([
        "EC_ID",
        "dataset",
        "model",
        "system",
        "ratio",
        "Selection frequency",
        "#Runs"
    ])
    writer.writerows(configurations)


Experiment Measurements

In [3]:
import os
import re
import pandas as pd

# === Paths & settings ===
systems_path = [
    '../../results/ars-selection/CRAIGPB',
    '../../results/ars-selection/GLISTERPB',
    '../../results/ars-selection/GradMatchPB',
    '../../results/ars-selection/Random'
]
ratio_path = ['/ars_0.05', '/ars_0.1', '/ars_0.2', '/ars_0.3']
directory_full = '../../results/ars-selection/Full/ars_1'
models = ['Logreg', 'MLP', 'SVM']
excluded_columns = []  # add any columns to drop from fair metrics

# --- Model-specific row windows (start inclusive, stop exclusive) ---
MODEL_WINDOWS = {
    'selection': {
        'Logreg': (0, 150),
        'MLP':    (0, 150),
        'SVM':    (0, 80),
    },
    'full': {
        'Logreg': (0, 150),
        'MLP':    (0, 150),
        'SVM':    (0, 80),
    }
}

def _slice_window(model: str, phase: str):
    try:
        return MODEL_WINDOWS[phase][model]
    except KeyError:
        raise KeyError(f"Aucune fenêtre définie pour phase='{phase}', model='{model}'")

def _clean_df(df: pd.DataFrame) -> pd.DataFrame:
    # Drop unnamed index-like columns
    df = df.loc[:, ~df.columns.str.contains(r'^Unnamed')]
    return df

def _ensure_epoch_id_inplace(df_out: pd.DataFrame):
    """
    Renomme 'epoch' -> 'epochID' (ou 'Epoch' -> 'epochID').
    Si aucune des deux colonnes n'existe, crée 'epochID' = range(len(df_out)).
    Aucune colonne 'epoch'/'Epoch' ne reste après appel.
    """
    if 'epoch' in df_out.columns:
        df_out.rename(columns={'epoch': 'epochID'}, inplace=True)
    elif 'Epoch' in df_out.columns:
        df_out.rename(columns={'Epoch': 'epochID'}, inplace=True)
    else:
        df_out['epochID'] = list(range(len(df_out)))

def load_epochs(fair_path, model):
    start, stop = _slice_window(model, 'selection')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    # Retirer colonnes exclues, ne pas multiplier par 100, ne pas toucher Accuracy
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    # Vérifie présence d'Accuracy (vient uniquement de fair_path)
    _ = fair_df['Accuracy']  # lève KeyError si absent

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df  # fair_df contient déjà epochID

def load_epochs_full(fair_path, model):
    start, stop = _slice_window(model, 'full')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    _ = fair_df['Accuracy']  # assert presence

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df

def load_first_training_time(cost_path):
    df = pd.read_csv(cost_path)
    df = _clean_df(df)
    return df['Full_training_time'].iloc[0] if 'Full_training_time' in df.columns and len(df) > 0 else None

def extract_run_id(filename):
    match = re.search(r'_(\d+)\.csv$', filename)
    return int(match.group(1)) if match else None

all_epochs = []

# === Full traces ===
for model in models:
    if not os.path.isdir(directory_full):
        continue
    for filename in os.listdir(directory_full):
        if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
            run_id = extract_run_id(filename)
            if run_id is None:
                continue

            fair_file = os.path.join(directory_full, filename)

            cost_file = None
            for f in os.listdir(directory_full):
                if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                    cost_file = os.path.join(directory_full, f)
                    break
            if not cost_file:
                continue

            _, df_metrics = load_epochs_full(fair_file, model)
            full_time = load_first_training_time(cost_file)

            df_metrics['dataset'] = 'ars'
            df_metrics['model'] = model
            df_metrics['system'] = 'Full'
            df_metrics['ratio'] = 1.0
            df_metrics['Full_training_time'] = full_time
            df_metrics['runID'] = run_id

            all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            if not os.path.isdir(directory):
                continue

            system = directory_path_1.split('/')[-1]
            ratio = float(directory_path_2.split('_')[-1])

            for filename in os.listdir(directory):
                if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
                    run_id = extract_run_id(filename)
                    if run_id is None:
                        continue

                    fair_file = os.path.join(directory, filename)

                    cost_file = None
                    for f in os.listdir(directory):
                        if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                            cost_file = os.path.join(directory, f)
                            break
                    if not cost_file:
                        continue

                    _, df_metrics = load_epochs(fair_file, model)
                    full_time = load_first_training_time(cost_file)

                    df_metrics['dataset'] = 'ars'
                    df_metrics['model'] = model
                    df_metrics['system'] = system
                    df_metrics['ratio'] = ratio
                    df_metrics['Full_training_time'] = full_time
                    df_metrics['runID'] = run_id

                    all_epochs.append(df_metrics)

if not all_epochs:
    raise ValueError("Aucun fichier trouvé : vérifie les noms des fichiers fair_metrics et cost_metrics.")

df_all_epochs = pd.concat(all_epochs, ignore_index=True)

os.makedirs('../../results/test', exist_ok=True)
df_all_epochs.to_csv('../../results/test/ars_epoch_traces.csv', index=False)


In [4]:
import os
import re
import pandas as pd

# === Paths & settings ===
systems_path = [
    '../../results/adult-selection/CRAIGPB',
    '../../results/adult-selection/GLISTERPB',
    '../../results/adult-selection/GradMatchPB',
    '../../results/adult-selection/Random'
]
ratio_path = ['/adult_0.05', '/adult_0.1', '/adult_0.2', '/adult_0.3']
directory_full = '../../results/adult-selection/Full/adult_1'
models = ['Logreg', 'MLP', 'SVM']
excluded_columns = []  # add any columns to drop from fair metrics


# --- Model-specific row windows (start inclusive, stop exclusive) ---
MODEL_WINDOWS = {
    'selection': {
        'Logreg': (0, 150),
        'MLP':    (0, 400),
        'SVM':    (0, 150),
    },
    'full': {
        'Logreg': (0, 150),
        'MLP':    (0, 400),
        'SVM':    (0, 150),
    }
}

def _slice_window(model: str, phase: str):
    try:
        return MODEL_WINDOWS[phase][model]
    except KeyError:
        raise KeyError(f"Aucune fenêtre définie pour phase='{phase}', model='{model}'")

def _clean_df(df: pd.DataFrame) -> pd.DataFrame:
    # Drop unnamed index-like columns
    df = df.loc[:, ~df.columns.str.contains(r'^Unnamed')]
    return df

def _ensure_epoch_id_inplace(df_out: pd.DataFrame):
    """
    Renomme 'epoch' -> 'epochID' (ou 'Epoch' -> 'epochID').
    Si aucune des deux colonnes n'existe, crée 'epochID' = range(len(df_out)).
    Aucune colonne 'epoch'/'Epoch' ne reste après appel.
    """
    if 'epoch' in df_out.columns:
        df_out.rename(columns={'epoch': 'epochID'}, inplace=True)
    elif 'Epoch' in df_out.columns:
        df_out.rename(columns={'Epoch': 'epochID'}, inplace=True)
    else:
        df_out['epochID'] = list(range(len(df_out)))

def load_epochs(fair_path, model):
    start, stop = _slice_window(model, 'selection')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    # Retirer colonnes exclues, ne pas multiplier par 100, ne pas toucher Accuracy
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    # Vérifie présence d'Accuracy (vient uniquement de fair_path)
    _ = fair_df['Accuracy']  # lève KeyError si absent

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df  # fair_df contient déjà epochID

def load_epochs_full(fair_path, model):
    start, stop = _slice_window(model, 'full')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    _ = fair_df['Accuracy']  # assert presence

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df

def load_first_training_time(cost_path):
    df = pd.read_csv(cost_path)
    df = _clean_df(df)
    return df['Full_training_time'].iloc[0] if 'Full_training_time' in df.columns and len(df) > 0 else None

def extract_run_id(filename):
    match = re.search(r'_(\d+)\.csv$', filename)
    return int(match.group(1)) if match else None

all_epochs = []

# === Full traces ===
for model in models:
    if not os.path.isdir(directory_full):
        continue
    for filename in os.listdir(directory_full):
        if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
            run_id = extract_run_id(filename)
            if run_id is None:
                continue

            fair_file = os.path.join(directory_full, filename)

            cost_file = None
            for f in os.listdir(directory_full):
                if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                    cost_file = os.path.join(directory_full, f)
                    break
            if not cost_file:
                continue

            _, df_metrics = load_epochs_full(fair_file, model)
            full_time = load_first_training_time(cost_file)

            df_metrics['dataset'] = 'adult'
            df_metrics['model'] = model
            df_metrics['system'] = 'Full'
            df_metrics['ratio'] = 1.0
            df_metrics['Full_training_time'] = full_time
            df_metrics['runID'] = run_id

            all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            if not os.path.isdir(directory):
                continue

            system = directory_path_1.split('/')[-1]
            ratio = float(directory_path_2.split('_')[-1])

            for filename in os.listdir(directory):
                if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
                    run_id = extract_run_id(filename)
                    if run_id is None:
                        continue

                    fair_file = os.path.join(directory, filename)

                    cost_file = None
                    for f in os.listdir(directory):
                        if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                            cost_file = os.path.join(directory, f)
                            break
                    if not cost_file:
                        continue

                    _, df_metrics = load_epochs(fair_file, model)
                    full_time = load_first_training_time(cost_file)

                    df_metrics['dataset'] = 'adult'
                    df_metrics['model'] = model
                    df_metrics['system'] = system
                    df_metrics['ratio'] = ratio
                    df_metrics['Full_training_time'] = full_time
                    df_metrics['runID'] = run_id

                    all_epochs.append(df_metrics)

if not all_epochs:
    raise ValueError("Aucun fichier trouvé : vérifie les noms des fichiers fair_metrics et cost_metrics.")

df_all_epochs = pd.concat(all_epochs, ignore_index=True)

os.makedirs('../../results/test', exist_ok=True)
df_all_epochs.to_csv('../../results/test/adult_epoch_traces.csv', index=False)


In [5]:
import os
import re
import pandas as pd

# === Paths & settings ===
systems_path = [
    '../../results/kdd-selection/CRAIGPB',
    '../../results/kdd-selection/GLISTERPB',
    '../../results/kdd-selection/GradMatchPB',
    '../../results/kdd-selection/Random'
]
ratio_path = ['/kdd_0.05', '/kdd_0.1', '/kdd_0.2', '/kdd_0.3']
directory_full = '../../results/kdd-selection/Full/kdd_1'
models = ['Logreg', 'MLP', 'SVM']
excluded_columns = []  # add any columns to drop from fair metrics


# --- Model-specific row windows (start inclusive, stop exclusive) ---
MODEL_WINDOWS = {
    'selection': {
        'Logreg': (0, 80),
        'MLP':    (0, 80),
        'SVM':    (0, 80),
    },
    'full': {
        'Logreg': (0, 80),
        'MLP':    (0, 80),
        'SVM':    (0, 80),
    }
}

def _slice_window(model: str, phase: str):
    try:
        return MODEL_WINDOWS[phase][model]
    except KeyError:
        raise KeyError(f"Aucune fenêtre définie pour phase='{phase}', model='{model}'")

def _clean_df(df: pd.DataFrame) -> pd.DataFrame:
    # Drop unnamed index-like columns
    df = df.loc[:, ~df.columns.str.contains(r'^Unnamed')]
    return df

def _ensure_epoch_id_inplace(df_out: pd.DataFrame):
    """
    Renomme 'epoch' -> 'epochID' (ou 'Epoch' -> 'epochID').
    Si aucune des deux colonnes n'existe, crée 'epochID' = range(len(df_out)).
    Aucune colonne 'epoch'/'Epoch' ne reste après appel.
    """
    if 'epoch' in df_out.columns:
        df_out.rename(columns={'epoch': 'epochID'}, inplace=True)
    elif 'Epoch' in df_out.columns:
        df_out.rename(columns={'Epoch': 'epochID'}, inplace=True)
    else:
        df_out['epochID'] = list(range(len(df_out)))

def load_epochs(fair_path, model):
    start, stop = _slice_window(model, 'selection')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    # Retirer colonnes exclues, ne pas multiplier par 100, ne pas toucher Accuracy
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    # Vérifie présence d'Accuracy (vient uniquement de fair_path)
    _ = fair_df['Accuracy']  # lève KeyError si absent

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df  # fair_df contient déjà epochID

def load_epochs_full(fair_path, model):
    start, stop = _slice_window(model, 'full')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    _ = fair_df['Accuracy']  # assert presence

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df

def load_first_training_time(cost_path):
    df = pd.read_csv(cost_path)
    df = _clean_df(df)
    return df['Full_training_time'].iloc[0] if 'Full_training_time' in df.columns and len(df) > 0 else None

def extract_run_id(filename):
    match = re.search(r'_(\d+)\.csv$', filename)
    return int(match.group(1)) if match else None

all_epochs = []

# === Full traces ===
for model in models:
    if not os.path.isdir(directory_full):
        continue
    for filename in os.listdir(directory_full):
        if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
            run_id = extract_run_id(filename)
            if run_id is None:
                continue

            fair_file = os.path.join(directory_full, filename)

            cost_file = None
            for f in os.listdir(directory_full):
                if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                    cost_file = os.path.join(directory_full, f)
                    break
            if not cost_file:
                continue

            _, df_metrics = load_epochs_full(fair_file, model)
            full_time = load_first_training_time(cost_file)

            df_metrics['dataset'] = 'kdd'
            df_metrics['model'] = model
            df_metrics['system'] = 'Full'
            df_metrics['ratio'] = 1.0
            df_metrics['Full_training_time'] = full_time
            df_metrics['runID'] = run_id

            all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            if not os.path.isdir(directory):
                continue

            system = directory_path_1.split('/')[-1]
            ratio = float(directory_path_2.split('_')[-1])

            for filename in os.listdir(directory):
                if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
                    run_id = extract_run_id(filename)
                    if run_id is None:
                        continue

                    fair_file = os.path.join(directory, filename)

                    cost_file = None
                    for f in os.listdir(directory):
                        if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                            cost_file = os.path.join(directory, f)
                            break
                    if not cost_file:
                        continue

                    _, df_metrics = load_epochs(fair_file, model)
                    full_time = load_first_training_time(cost_file)

                    df_metrics['dataset'] = 'kdd'
                    df_metrics['model'] = model
                    df_metrics['system'] = system
                    df_metrics['ratio'] = ratio
                    df_metrics['Full_training_time'] = full_time
                    df_metrics['runID'] = run_id

                    all_epochs.append(df_metrics)

if not all_epochs:
    raise ValueError("Aucun fichier trouvé : vérifie les noms des fichiers fair_metrics et cost_metrics.")

df_all_epochs = pd.concat(all_epochs, ignore_index=True)

os.makedirs('../../results/test', exist_ok=True)
df_all_epochs.to_csv('../../results/test/kdd_epoch_traces.csv', index=False)


In [6]:
import os
import re
import pandas as pd

# === Paths & settings ===
systems_path = [
    '../../results/dc-selection/CRAIGPB',
    '../../results/dc-selection/GLISTERPB',
    '../../results/dc-selection/GradMatchPB',
    '../../results/dc-selection/Random'
]
ratio_path = ['/dc_0.05', '/dc_0.1', '/dc_0.2', '/dc_0.3']
directory_full = '../../results/dc-selection/Full/dc_1'
models = ['Logreg', 'MLP', 'SVM']
excluded_columns = []  # add any columns to drop from fair metrics


# --- Model-specific row windows (start inclusive, stop exclusive) ---
MODEL_WINDOWS = {
    'selection': {
        'Logreg': (0, 280),
        'MLP':    (0, 120),
        'SVM':    (0, 120),
    },
    'full': {
        'Logreg': (0, 280),
        'MLP':    (0, 120),
        'SVM':    (0, 120),
    }
}

def _slice_window(model: str, phase: str):
    try:
        return MODEL_WINDOWS[phase][model]
    except KeyError:
        raise KeyError(f"Aucune fenêtre définie pour phase='{phase}', model='{model}'")

def _clean_df(df: pd.DataFrame) -> pd.DataFrame:
    # Drop unnamed index-like columns
    df = df.loc[:, ~df.columns.str.contains(r'^Unnamed')]
    return df

def _ensure_epoch_id_inplace(df_out: pd.DataFrame):
    """
    Renomme 'epoch' -> 'epochID' (ou 'Epoch' -> 'epochID').
    Si aucune des deux colonnes n'existe, crée 'epochID' = range(len(df_out)).
    Aucune colonne 'epoch'/'Epoch' ne reste après appel.
    """
    if 'epoch' in df_out.columns:
        df_out.rename(columns={'epoch': 'epochID'}, inplace=True)
    elif 'Epoch' in df_out.columns:
        df_out.rename(columns={'Epoch': 'epochID'}, inplace=True)
    else:
        df_out['epochID'] = list(range(len(df_out)))

def load_epochs(fair_path, model):
    start, stop = _slice_window(model, 'selection')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    # Retirer colonnes exclues, ne pas multiplier par 100, ne pas toucher Accuracy
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    # Vérifie présence d'Accuracy (vient uniquement de fair_path)
    _ = fair_df['Accuracy']  # lève KeyError si absent

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df  # fair_df contient déjà epochID

def load_epochs_full(fair_path, model):
    start, stop = _slice_window(model, 'full')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    _ = fair_df['Accuracy']  # assert presence

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df

def load_first_training_time(cost_path):
    df = pd.read_csv(cost_path)
    df = _clean_df(df)
    return df['Full_training_time'].iloc[0] if 'Full_training_time' in df.columns and len(df) > 0 else None

def extract_run_id(filename):
    match = re.search(r'_(\d+)\.csv$', filename)
    return int(match.group(1)) if match else None

all_epochs = []

# === Full traces ===
for model in models:
    if not os.path.isdir(directory_full):
        continue
    for filename in os.listdir(directory_full):
        if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
            run_id = extract_run_id(filename)
            if run_id is None:
                continue

            fair_file = os.path.join(directory_full, filename)

            cost_file = None
            for f in os.listdir(directory_full):
                if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                    cost_file = os.path.join(directory_full, f)
                    break
            if not cost_file:
                continue

            _, df_metrics = load_epochs_full(fair_file, model)
            full_time = load_first_training_time(cost_file)

            df_metrics['dataset'] = 'dc'
            df_metrics['model'] = model
            df_metrics['system'] = 'Full'
            df_metrics['ratio'] = 1.0
            df_metrics['Full_training_time'] = full_time
            df_metrics['runID'] = run_id

            all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            if not os.path.isdir(directory):
                continue

            system = directory_path_1.split('/')[-1]
            ratio = float(directory_path_2.split('_')[-1])

            for filename in os.listdir(directory):
                if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
                    run_id = extract_run_id(filename)
                    if run_id is None:
                        continue

                    fair_file = os.path.join(directory, filename)

                    cost_file = None
                    for f in os.listdir(directory):
                        if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                            cost_file = os.path.join(directory, f)
                            break
                    if not cost_file:
                        continue

                    _, df_metrics = load_epochs(fair_file, model)
                    full_time = load_first_training_time(cost_file)

                    df_metrics['dataset'] = 'dc'
                    df_metrics['model'] = model
                    df_metrics['system'] = system
                    df_metrics['ratio'] = ratio
                    df_metrics['Full_training_time'] = full_time
                    df_metrics['runID'] = run_id

                    all_epochs.append(df_metrics)

if not all_epochs:
    raise ValueError("Aucun fichier trouvé : vérifie les noms des fichiers fair_metrics et cost_metrics.")

df_all_epochs = pd.concat(all_epochs, ignore_index=True)

os.makedirs('../../results/test', exist_ok=True)
df_all_epochs.to_csv('../../results/test/dc_epoch_traces.csv', index=False)


In [7]:
import os
import re
import pandas as pd

# === Paths & settings ===
BASE_DIR = '../../results/mobiact-selection'  # <- sans le doublon 'results'
systems = ['CRAIGPB', 'GLISTERPB', 'GradMatchPB', 'Random']
systems_path = [os.path.join(BASE_DIR, s) for s in systems]
ratio_path = ['/mobiact_0.05', '/mobiact_0.1', '/mobiact_0.2', '/mobiact_0.3']  # ajoute /mobiact_0.5 si tu en as
directory_full = os.path.join(BASE_DIR, 'Full', 'mobiact_1')
models = ['MLP']
excluded_columns = []

# Fenêtres iloc spécifiques
MODEL_WINDOWS = {
    'selection': {'MLP': (0, 300)},
    'full': {'MLP': (0, 300)},
}

def _slice_window(model: str, phase: str):
    try:
        return MODEL_WINDOWS[phase][model]
    except KeyError:
        raise KeyError(f"Aucune fenêtre définie pour phase='{phase}', model='{model}'")

def _clean_df(df: pd.DataFrame) -> pd.DataFrame:
    return df.loc[:, ~df.columns.str.contains(r'^Unnamed')]

def _ensure_epoch_id_inplace(df_out: pd.DataFrame):
    if 'epoch' in df_out.columns:
        df_out.rename(columns={'epoch': 'epochID'}, inplace=True)
    elif 'Epoch' in df_out.columns:
        df_out.rename(columns={'Epoch': 'epochID'}, inplace=True)
    else:
        df_out['epochID'] = list(range(len(df_out)))

# --- Tolérance sur les noms de fichiers ---
FAIR_PATTERNS = [
    'train_mobiact_fair_metrics_',  # ex: train_mobiact_fair_metrics_..._12.csv
    'mobiact_fair_metrics_',
    'fair_metrics_',                # ex: fair_metrics_MLP_12.csv
]
COST_PATTERNS = [
    'train_mobiact_cost_metrics_',
    'mobiact_cost_metrics_',
    'cost_metrics_',
]

RUNID_REGEXES = [
    re.compile(r'_(\d+)\.csv$'),     # ..._12.csv
    re.compile(r'run[_-]?(\d+)\.csv$'), # ...run12.csv ou ...run_12.csv
    re.compile(r'-(\d+)\.csv$'),     # ...-12.csv
]

def extract_run_id(filename: str):
    for rgx in RUNID_REGEXES:
        m = rgx.search(filename)
        if m:
            try:
                return int(m.group(1))
            except ValueError:
                pass
    return None

def match_cost_for_fair(dir_path: str, fair_filename: str, run_id: int | None):
    """
    Cherche le cost correspondant :
    1) même run_id si possible
    2) nom voisin en remplaçant 'fair'->'cost'
    3) s'il n'y a qu'un seul cost dans le dossier, on le prend
    """
    files = [f for f in os.listdir(dir_path) if f.endswith('.csv')]
    cost_files = [f for f in files if any(f.startswith(p) for p in COST_PATTERNS)]

    # 1) par run_id
    if run_id is not None:
        for cf in cost_files:
            if extract_run_id(cf) == run_id:
                return os.path.join(dir_path, cf)

    # 2) par nom voisin
    candidates = []
    neighbor = fair_filename.replace('fair', 'cost')
    for cf in cost_files:
        if cf == neighbor:
            return os.path.join(dir_path, cf)
        # tolérance: même tronc + cost
        if cf.split('.csv')[0].endswith(str(run_id)) and run_id is not None:
            candidates.append(cf)

    if candidates:
        return os.path.join(dir_path, candidates[0])

    # 3) unique cost dans le dossier
    if len(cost_files) == 1:
        return os.path.join(dir_path, cost_files[0])

    return None

def load_epochs_from_fair(fair_path: str, model: str, phase: str):
    start, stop = _slice_window(model, phase)
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)
    fair_df = fair_df_raw.iloc[start:stop].copy()
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    # Vérifie 'Accuracy' dans le fair
    _ = fair_df['Accuracy']  # lève KeyError si absent

    _ensure_epoch_id_inplace(fair_df)
    return fair_df

def load_first_training_time(cost_path: str):
    df = pd.read_csv(cost_path)
    df = _clean_df(df)
    return df['Full_training_time'].iloc[0] if 'Full_training_time' in df.columns and len(df) > 0 else None

def list_matching_fair_files(dir_path: str):
    return [
        f for f in os.listdir(dir_path)
        if f.endswith('.csv') and any(f.startswith(p) for p in FAIR_PATTERNS)
    ]

all_epochs = []

# === Full traces ===
if os.path.isdir(directory_full):
    for fair_name in list_matching_fair_files(directory_full):
        fair_file = os.path.join(directory_full, fair_name)
        run_id = extract_run_id(fair_name)
        cost_file = match_cost_for_fair(directory_full, fair_name, run_id)
        if not cost_file:
            continue

        df_metrics = load_epochs_from_fair(fair_file, 'MLP', 'full')
        full_time = load_first_training_time(cost_file)

        df_metrics['dataset'] = 'mobiact'
        df_metrics['model'] = 'MLP'
        df_metrics['system'] = 'Full'
        df_metrics['ratio'] = 1.0
        df_metrics['Full_training_time'] = full_time
        df_metrics['runID'] = run_id if run_id is not None else -1

        all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for sys_dir in systems_path:
        if not os.path.isdir(sys_dir):
            continue
        system = os.path.basename(sys_dir)
        for r in ratio_path:
            directory = sys_dir + r
            if not os.path.isdir(directory):
                continue
            ratio = float(r.split('_')[-1])

            for fair_name in list_matching_fair_files(directory):
                fair_file = os.path.join(directory, fair_name)
                run_id = extract_run_id(fair_name)
                cost_file = match_cost_for_fair(directory, fair_name, run_id)
                if not cost_file:
                    continue

                df_metrics = load_epochs_from_fair(fair_file, model, 'selection')
                full_time = load_first_training_time(cost_file)

                df_metrics['dataset'] = 'mobiact'
                df_metrics['model'] = model
                df_metrics['system'] = system
                df_metrics['ratio'] = ratio
                df_metrics['Full_training_time'] = full_time
                df_metrics['runID'] = run_id if run_id is not None else -1

                all_epochs.append(df_metrics)

if not all_epochs:
    # aide au debug: affiche quelques fichiers trouvés dans Full + un dossier de sélection
    dbg = []
    if os.path.isdir(directory_full):
        dbg += [os.path.join(directory_full, f) for f in os.listdir(directory_full)[:10]]
    for p in systems_path:
        d = p + ratio_path[0]
        if os.path.isdir(d):
            dbg += [os.path.join(d, f) for f in os.listdir(d)[:10]]
            break
    raise ValueError(
        "Aucun fichier trouvé : vérifie les NOMS de fichiers fair/cost.\n"
        f"Patrons fair acceptés: {FAIR_PATTERNS}\n"
        f"Patrons cost acceptés: {COST_PATTERNS}\n"
        f"Exemples de fichiers vus: {dbg}"
    )

df_all_epochs = pd.concat(all_epochs, ignore_index=True)

out_dir = '../../results/test'
os.makedirs(out_dir, exist_ok=True)
out_csv = os.path.join(out_dir, 'mobiact_epoch_traces.csv')
df_all_epochs.to_csv(out_csv, index=False)
print(f"Trace écrite: {out_csv}")


Trace écrite: ../../results/test/mobiact_epoch_traces.csv


In [8]:
import os
import re
import pandas as pd

# === Paths & settings ===
systems_path = [
    '../../results/celeba-selection/CRAIGPB',
    '../../results/celeba-selection/GLISTERPB',
    '../../results/celeba-selection/GradMatchPB',
    '../../results/celeba-selection/Random'
]
ratio_path = ['/celeba_0.05', '/celeba_0.1', '/celeba_0.2', '/celeba_0.3']
directory_full = '../../results/celeba-selection/Full/celeba_1'
models = ['ResNet18', 'VGG']
excluded_columns = []  # add any columns to drop from fair metrics


# --- Model-specific row windows (start inclusive, stop exclusive) ---
MODEL_WINDOWS = {
    'selection': {
        'ResNet18': (0, 150),
        'VGG':    (0, 150),
    },
    'full': {
        'ResNet18': (0, 150),
        'VGG':    (0, 150),
    }
}

def _slice_window(model: str, phase: str):
    try:
        return MODEL_WINDOWS[phase][model]
    except KeyError:
        raise KeyError(f"Aucune fenêtre définie pour phase='{phase}', model='{model}'")

def _clean_df(df: pd.DataFrame) -> pd.DataFrame:
    # Drop unnamed index-like columns
    df = df.loc[:, ~df.columns.str.contains(r'^Unnamed')]
    return df

def _ensure_epoch_id_inplace(df_out: pd.DataFrame):
    """
    Renomme 'epoch' -> 'epochID' (ou 'Epoch' -> 'epochID').
    Si aucune des deux colonnes n'existe, crée 'epochID' = range(len(df_out)).
    Aucune colonne 'epoch'/'Epoch' ne reste après appel.
    """
    if 'epoch' in df_out.columns:
        df_out.rename(columns={'epoch': 'epochID'}, inplace=True)
    elif 'Epoch' in df_out.columns:
        df_out.rename(columns={'Epoch': 'epochID'}, inplace=True)
    else:
        df_out['epochID'] = list(range(len(df_out)))

def load_epochs(fair_path, model):
    start, stop = _slice_window(model, 'selection')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    # Retirer colonnes exclues, ne pas multiplier par 100, ne pas toucher Accuracy
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    # Vérifie présence d'Accuracy (vient uniquement de fair_path)
    _ = fair_df['Accuracy']  # lève KeyError si absent

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df  # fair_df contient déjà epochID

def load_epochs_full(fair_path, model):
    start, stop = _slice_window(model, 'full')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    _ = fair_df['Accuracy']  # assert presence

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df

def load_first_training_time(cost_path):
    df = pd.read_csv(cost_path)
    df = _clean_df(df)
    return df['Full_training_time'].iloc[0] if 'Full_training_time' in df.columns and len(df) > 0 else None

def extract_run_id(filename):
    match = re.search(r'_(\d+)\.csv$', filename)
    return int(match.group(1)) if match else None

all_epochs = []

# === Full traces ===
for model in models:
    if not os.path.isdir(directory_full):
        continue
    for filename in os.listdir(directory_full):
        if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
            run_id = extract_run_id(filename)
            if run_id is None:
                continue

            fair_file = os.path.join(directory_full, filename)

            cost_file = None
            for f in os.listdir(directory_full):
                if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                    cost_file = os.path.join(directory_full, f)
                    break
            if not cost_file:
                continue

            _, df_metrics = load_epochs_full(fair_file, model)
            full_time = load_first_training_time(cost_file)

            df_metrics['dataset'] = 'celeba'
            df_metrics['model'] = model
            df_metrics['system'] = 'Full'
            df_metrics['ratio'] = 1.0
            df_metrics['Full_training_time'] = full_time
            df_metrics['runID'] = run_id

            all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            if not os.path.isdir(directory):
                continue

            system = directory_path_1.split('/')[-1]
            ratio = float(directory_path_2.split('_')[-1])

            for filename in os.listdir(directory):
                if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
                    run_id = extract_run_id(filename)
                    if run_id is None:
                        continue

                    fair_file = os.path.join(directory, filename)

                    cost_file = None
                    for f in os.listdir(directory):
                        if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                            cost_file = os.path.join(directory, f)
                            break
                    if not cost_file:
                        continue

                    _, df_metrics = load_epochs(fair_file, model)
                    full_time = load_first_training_time(cost_file)

                    df_metrics['dataset'] = 'celeba'
                    df_metrics['model'] = model
                    df_metrics['system'] = system
                    df_metrics['ratio'] = ratio
                    df_metrics['Full_training_time'] = full_time
                    df_metrics['runID'] = run_id

                    all_epochs.append(df_metrics)

if not all_epochs:
    raise ValueError("Aucun fichier trouvé : vérifie les noms des fichiers fair_metrics et cost_metrics.")

df_all_epochs = pd.concat(all_epochs, ignore_index=True)

os.makedirs('../../results/test', exist_ok=True)
df_all_epochs.to_csv('../../results/test/celeba_epoch_traces.csv', index=False)


In [9]:
import os
import re
import pandas as pd

# === Paths & settings ===
systems_path = [
    '../../results/fairface-selection/CRAIGPB',
    '../../results/fairface-selection/GLISTERPB',
    '../../results/fairface-selection/GradMatchPB',
    '../../results/fairface-selection/Random'
]
ratio_path = ['/fairface_0.05', '/fairface_0.1', '/fairface_0.2', '/fairface_0.3']
directory_full = '../../results/fairface-selection/Full/fairface_1'
models = ['ResNet18', 'VGG']
excluded_columns = []  # add any columns to drop from fair metrics


# --- Model-specific row windows (start inclusive, stop exclusive) ---
MODEL_WINDOWS = {
    'selection': {
        'ResNet18': (0, 150),
        'VGG':    (0, 150),
    },
    'full': {
        'ResNet18': (0, 150),
        'VGG':    (0, 150),
    }
}

def _slice_window(model: str, phase: str):
    try:
        return MODEL_WINDOWS[phase][model]
    except KeyError:
        raise KeyError(f"Aucune fenêtre définie pour phase='{phase}', model='{model}'")

def _clean_df(df: pd.DataFrame) -> pd.DataFrame:
    # Drop unnamed index-like columns
    df = df.loc[:, ~df.columns.str.contains(r'^Unnamed')]
    return df

def _ensure_epoch_id_inplace(df_out: pd.DataFrame):
    """
    Renomme 'epoch' -> 'epochID' (ou 'Epoch' -> 'epochID').
    Si aucune des deux colonnes n'existe, crée 'epochID' = range(len(df_out)).
    Aucune colonne 'epoch'/'Epoch' ne reste après appel.
    """
    if 'epoch' in df_out.columns:
        df_out.rename(columns={'epoch': 'epochID'}, inplace=True)
    elif 'Epoch' in df_out.columns:
        df_out.rename(columns={'Epoch': 'epochID'}, inplace=True)
    else:
        df_out['epochID'] = list(range(len(df_out)))

def load_epochs(fair_path, model):
    start, stop = _slice_window(model, 'selection')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    # Retirer colonnes exclues, ne pas multiplier par 100, ne pas toucher Accuracy
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    # Vérifie présence d'Accuracy (vient uniquement de fair_path)
    _ = fair_df['Accuracy']  # lève KeyError si absent

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df  # fair_df contient déjà epochID

def load_epochs_full(fair_path, model):
    start, stop = _slice_window(model, 'full')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    _ = fair_df['Accuracy']  # assert presence

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df

def load_first_training_time(cost_path):
    df = pd.read_csv(cost_path)
    df = _clean_df(df)
    return df['Full_training_time'].iloc[0] if 'Full_training_time' in df.columns and len(df) > 0 else None

def extract_run_id(filename):
    match = re.search(r'_(\d+)\.csv$', filename)
    return int(match.group(1)) if match else None

all_epochs = []

# === Full traces ===
for model in models:
    if not os.path.isdir(directory_full):
        continue
    for filename in os.listdir(directory_full):
        if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
            run_id = extract_run_id(filename)
            if run_id is None:
                continue

            fair_file = os.path.join(directory_full, filename)

            cost_file = None
            for f in os.listdir(directory_full):
                if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                    cost_file = os.path.join(directory_full, f)
                    break
            if not cost_file:
                continue

            _, df_metrics = load_epochs_full(fair_file, model)
            full_time = load_first_training_time(cost_file)

            df_metrics['dataset'] = 'fairface'
            df_metrics['model'] = model
            df_metrics['system'] = 'Full'
            df_metrics['ratio'] = 1.0
            df_metrics['Full_training_time'] = full_time
            df_metrics['runID'] = run_id

            all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            if not os.path.isdir(directory):
                continue

            system = directory_path_1.split('/')[-1]
            ratio = float(directory_path_2.split('_')[-1])

            for filename in os.listdir(directory):
                if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
                    run_id = extract_run_id(filename)
                    if run_id is None:
                        continue

                    fair_file = os.path.join(directory, filename)

                    cost_file = None
                    for f in os.listdir(directory):
                        if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                            cost_file = os.path.join(directory, f)
                            break
                    if not cost_file:
                        continue

                    _, df_metrics = load_epochs(fair_file, model)
                    full_time = load_first_training_time(cost_file)

                    df_metrics['dataset'] = 'fairface'
                    df_metrics['model'] = model
                    df_metrics['system'] = system
                    df_metrics['ratio'] = ratio
                    df_metrics['Full_training_time'] = full_time
                    df_metrics['runID'] = run_id

                    all_epochs.append(df_metrics)

if not all_epochs:
    raise ValueError("Aucun fichier trouvé : vérifie les noms des fichiers fair_metrics et cost_metrics.")

df_all_epochs = pd.concat(all_epochs, ignore_index=True)

os.makedirs('../../results/test', exist_ok=True)
df_all_epochs.to_csv('../../results/test/fairface_epoch_traces.csv', index=False)


In [10]:
import os
import re
import pandas as pd

# === Paths & settings ===
systems_path = [
    '../../results/audiomnist-selection/CRAIGPB',
    '../../results/audiomnist-selection/GLISTERPB',
    '../../results/audiomnist-selection/GradMatchPB',
    '../../results/audiomnist-selection/Random'
]
ratio_path = ['/audiomnist_0.05', '/audiomnist_0.1', '/audiomnist_0.2', '/audiomnist_0.3']
directory_full = '../../results/audiomnist-selection/Full/audiomnist_1'
models = ['AudioCNN', 'AudioLSTM']
excluded_columns = []  # add any columns to drop from fair metrics


# --- Model-specific row windows (start inclusive, stop exclusive) ---
MODEL_WINDOWS = {
    'selection': {
        'AudioCNN': (0, 150),
        'AudioLSTM':    (0, 150),
    },
    'full': {
        'AudioCNN': (0, 150),
        'AudioLSTM':    (0, 150),
    }
}

def _slice_window(model: str, phase: str):
    try:
        return MODEL_WINDOWS[phase][model]
    except KeyError:
        raise KeyError(f"Aucune fenêtre définie pour phase='{phase}', model='{model}'")

def _clean_df(df: pd.DataFrame) -> pd.DataFrame:
    # Drop unnamed index-like columns
    df = df.loc[:, ~df.columns.str.contains(r'^Unnamed')]
    return df

def _ensure_epoch_id_inplace(df_out: pd.DataFrame):
    """
    Renomme 'epoch' -> 'epochID' (ou 'Epoch' -> 'epochID').
    Si aucune des deux colonnes n'existe, crée 'epochID' = range(len(df_out)).
    Aucune colonne 'epoch'/'Epoch' ne reste après appel.
    """
    if 'epoch' in df_out.columns:
        df_out.rename(columns={'epoch': 'epochID'}, inplace=True)
    elif 'Epoch' in df_out.columns:
        df_out.rename(columns={'Epoch': 'epochID'}, inplace=True)
    else:
        df_out['epochID'] = list(range(len(df_out)))

def load_epochs(fair_path, model):
    start, stop = _slice_window(model, 'selection')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    # Retirer colonnes exclues, ne pas multiplier par 100, ne pas toucher Accuracy
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    # Vérifie présence d'Accuracy (vient uniquement de fair_path)
    _ = fair_df['Accuracy']  # lève KeyError si absent

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df  # fair_df contient déjà epochID

def load_epochs_full(fair_path, model):
    start, stop = _slice_window(model, 'full')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    _ = fair_df['Accuracy']  # assert presence

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df

def load_first_training_time(cost_path):
    df = pd.read_csv(cost_path)
    df = _clean_df(df)
    return df['Full_training_time'].iloc[0] if 'Full_training_time' in df.columns and len(df) > 0 else None

def extract_run_id(filename):
    match = re.search(r'_(\d+)\.csv$', filename)
    return int(match.group(1)) if match else None

all_epochs = []

# === Full traces ===
for model in models:
    if not os.path.isdir(directory_full):
        continue
    for filename in os.listdir(directory_full):
        if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
            run_id = extract_run_id(filename)
            if run_id is None:
                continue

            fair_file = os.path.join(directory_full, filename)

            cost_file = None
            for f in os.listdir(directory_full):
                if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                    cost_file = os.path.join(directory_full, f)
                    break
            if not cost_file:
                continue

            _, df_metrics = load_epochs_full(fair_file, model)
            full_time = load_first_training_time(cost_file)

            df_metrics['dataset'] = 'audiomnist'
            df_metrics['model'] = model
            df_metrics['system'] = 'Full'
            df_metrics['ratio'] = 1.0
            df_metrics['Full_training_time'] = full_time
            df_metrics['runID'] = run_id

            all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            if not os.path.isdir(directory):
                continue

            system = directory_path_1.split('/')[-1]
            ratio = float(directory_path_2.split('_')[-1])

            for filename in os.listdir(directory):
                if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
                    run_id = extract_run_id(filename)
                    if run_id is None:
                        continue

                    fair_file = os.path.join(directory, filename)

                    cost_file = None
                    for f in os.listdir(directory):
                        if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                            cost_file = os.path.join(directory, f)
                            break
                    if not cost_file:
                        continue

                    _, df_metrics = load_epochs(fair_file, model)
                    full_time = load_first_training_time(cost_file)

                    df_metrics['dataset'] = 'audiomnist'
                    df_metrics['model'] = model
                    df_metrics['system'] = system
                    df_metrics['ratio'] = ratio
                    df_metrics['Full_training_time'] = full_time
                    df_metrics['runID'] = run_id

                    all_epochs.append(df_metrics)

if not all_epochs:
    raise ValueError("Aucun fichier trouvé : vérifie les noms des fichiers fair_metrics et cost_metrics.")

df_all_epochs = pd.concat(all_epochs, ignore_index=True)

os.makedirs('../../results/test', exist_ok=True)
df_all_epochs.to_csv('../../results/test/audiomnist_epoch_traces.csv', index=False)


In [11]:
import os
import re
import pandas as pd

# === Paths & settings ===
systems_path = [
    '../../results/voxceleb-selection/CRAIGPB',
    '../../results/voxceleb-selection/GLISTERPB',
    '../../results/voxceleb-selection/GradMatchPB',
    '../../results/voxceleb-selection/Random'
]
ratio_path = ['/voxceleb_0.05', '/voxceleb_0.1', '/voxceleb_0.2', '/voxceleb_0.3']
directory_full = '../../results/voxceleb-selection/Full/voxceleb_1'
models = ['AudioCNN', 'AudioLSTM']
excluded_columns = []  # add any columns to drop from fair metrics


# --- Model-specific row windows (start inclusive, stop exclusive) ---
MODEL_WINDOWS = {
    'selection': {
        'AudioCNN': (0, 150),
        'AudioLSTM':    (0, 150),
    },
    'full': {
        'AudioCNN': (0, 150),
        'AudioLSTM':    (0, 150),
    }
}

def _slice_window(model: str, phase: str):
    try:
        return MODEL_WINDOWS[phase][model]
    except KeyError:
        raise KeyError(f"Aucune fenêtre définie pour phase='{phase}', model='{model}'")

def _clean_df(df: pd.DataFrame) -> pd.DataFrame:
    # Drop unnamed index-like columns
    df = df.loc[:, ~df.columns.str.contains(r'^Unnamed')]
    return df

def _ensure_epoch_id_inplace(df_out: pd.DataFrame):
    """
    Renomme 'epoch' -> 'epochID' (ou 'Epoch' -> 'epochID').
    Si aucune des deux colonnes n'existe, crée 'epochID' = range(len(df_out)).
    Aucune colonne 'epoch'/'Epoch' ne reste après appel.
    """
    if 'epoch' in df_out.columns:
        df_out.rename(columns={'epoch': 'epochID'}, inplace=True)
    elif 'Epoch' in df_out.columns:
        df_out.rename(columns={'Epoch': 'epochID'}, inplace=True)
    else:
        df_out['epochID'] = list(range(len(df_out)))

def load_epochs(fair_path, model):
    start, stop = _slice_window(model, 'selection')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    # Retirer colonnes exclues, ne pas multiplier par 100, ne pas toucher Accuracy
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    # Vérifie présence d'Accuracy (vient uniquement de fair_path)
    _ = fair_df['Accuracy']  # lève KeyError si absent

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df  # fair_df contient déjà epochID

def load_epochs_full(fair_path, model):
    start, stop = _slice_window(model, 'full')
    fair_df_raw = pd.read_csv(fair_path)
    fair_df_raw = _clean_df(fair_df_raw)

    fair_df = fair_df_raw.iloc[start:stop].copy()
    fair_df = fair_df.drop(columns=excluded_columns, errors='ignore')

    _ = fair_df['Accuracy']  # assert presence

    # Assure epochID (rename en place, sans doublon)
    _ensure_epoch_id_inplace(fair_df)

    return fair_df_raw, fair_df

def load_first_training_time(cost_path):
    df = pd.read_csv(cost_path)
    df = _clean_df(df)
    return df['Full_training_time'].iloc[0] if 'Full_training_time' in df.columns and len(df) > 0 else None

def extract_run_id(filename):
    match = re.search(r'_(\d+)\.csv$', filename)
    return int(match.group(1)) if match else None

all_epochs = []

# === Full traces ===
for model in models:
    if not os.path.isdir(directory_full):
        continue
    for filename in os.listdir(directory_full):
        if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
            run_id = extract_run_id(filename)
            if run_id is None:
                continue

            fair_file = os.path.join(directory_full, filename)

            cost_file = None
            for f in os.listdir(directory_full):
                if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                    cost_file = os.path.join(directory_full, f)
                    break
            if not cost_file:
                continue

            _, df_metrics = load_epochs_full(fair_file, model)
            full_time = load_first_training_time(cost_file)

            df_metrics['dataset'] = 'voxceleb'
            df_metrics['model'] = model
            df_metrics['system'] = 'Full'
            df_metrics['ratio'] = 1.0
            df_metrics['Full_training_time'] = full_time
            df_metrics['runID'] = run_id

            all_epochs.append(df_metrics)

# === Selection traces ===
for model in models:
    for directory_path_1 in systems_path:
        for directory_path_2 in ratio_path:
            directory = directory_path_1 + directory_path_2
            if not os.path.isdir(directory):
                continue

            system = directory_path_1.split('/')[-1]
            ratio = float(directory_path_2.split('_')[-1])

            for filename in os.listdir(directory):
                if f"fair_metrics_{model}_" in filename and filename.endswith('.csv'):
                    run_id = extract_run_id(filename)
                    if run_id is None:
                        continue

                    fair_file = os.path.join(directory, filename)

                    cost_file = None
                    for f in os.listdir(directory):
                        if f.startswith(f"cost_metrics_{model}_") and f.endswith(f"{run_id}.csv"):
                            cost_file = os.path.join(directory, f)
                            break
                    if not cost_file:
                        continue

                    _, df_metrics = load_epochs(fair_file, model)
                    full_time = load_first_training_time(cost_file)

                    df_metrics['dataset'] = 'voxceleb'
                    df_metrics['model'] = model
                    df_metrics['system'] = system
                    df_metrics['ratio'] = ratio
                    df_metrics['Full_training_time'] = full_time
                    df_metrics['runID'] = run_id

                    all_epochs.append(df_metrics)

if not all_epochs:
    raise ValueError("Aucun fichier trouvé : vérifie les noms des fichiers fair_metrics et cost_metrics.")

df_all_epochs = pd.concat(all_epochs, ignore_index=True)

os.makedirs('../../results/test', exist_ok=True)
df_all_epochs.to_csv('../../results/test/voxceleb_epoch_traces.csv', index=False)


In [12]:
import pandas as pd

def concatenate_csv_files(file_paths):
    target_columns = [
       'SPD_gender', 'EOD_gender', 'AOD_gender', 'DI_gender', 'DcI_gender',
       'SPD_race', 'EOD_race', 'AOD_race', 'DI_race', 'DcI_race', 'SPD_age',
       'EOD_age', 'AOD_age', 'DI_age', 'DcI_age', 'F1_score', 'Precision',
       'Recall', 'Accuracy', 'dataset', 'model', 'system', 'ratio',
       'Full_training_time', 'runID', 'epochID'
    ]
    
    dfs = []

    for file_path in file_paths:
        df = pd.read_csv(file_path, usecols=lambda c: c in target_columns, engine='python')
        missing_cols = set(target_columns) - set(df.columns)
        for col in missing_cols:
            df[col] = pd.NA
        df = df.reindex(columns=target_columns)
        dfs.append(df)

    result = pd.concat(dfs, ignore_index=True)

    return result

In [13]:
file_paths = ['../../results/test/ars_epoch_traces.csv', 
              '../../results/test/dc_epoch_traces.csv', 
              '../../results/test//mobiact_epoch_traces.csv', 
              '../../results/test/adult_epoch_traces.csv', 
              '../../results/test/kdd_epoch_traces.csv',
              '../../results/test/celeba_epoch_traces.csv', 
              '../../results/test/fairface_epoch_traces.csv',
              '../../results/test/audiomnist_epoch_traces.csv',
              '../../results/test/voxceleb_epoch_traces.csv' 
              ]
result_df = concatenate_csv_files(file_paths)
result_df.to_csv('../../results/test/epoch_traces.csv', index=False)
print(len(result_df))

283900


In [14]:
import pandas as pd
from pathlib import Path

RES_PATH = Path("../../results/test/epoch_traces.csv")
CFG_PATH = Path("../../traces/ExperimentConfigurations.csv")
OUT_PATH = Path("../../results/test/epoch_traces_EC_ID.csv")

# --- normalisations cohérentes ---
def norm_str(s):
    return str(s).strip()

def to_key(s):
    return norm_str(s).lower()

# Dictionnaires en MINUSCULES car on utilise .lower() pour la clé
MAP_MODEL = {
    'logreg':'LR','lr':'LR',
    'mlp':'MLP',
    'svm':'SVM',
    'audiolstm':'LSTM','lstm':'LSTM',
    'audiocnn':'CNN','cnn':'CNN',
    'dc':'DC'
}
MAP_DATASET = {
    'census':'Adult','adult':'Adult',
    'kdd':'KDD',
    'dc':'DC',
    'mobiact':'MobiAct',
    'ars':'ARS',
    'celeba':'CelebA',
    'fairface':'Fairface',
    'audiomnist':'audioMNIST',
    'voxceleb':'voxceleb'  # la config que tu génères utilise 'voxceleb' en minuscules
}
# IMPORTANT: retirer les suffixes "PB" et normaliser le nom du système
MAP_SYSTEM = {
    'full':'Full',
    'craig':'Craig','craigpb':'Craig',
    'glister':'Glister','glisterpb':'Glister',
    'gradmatch':'GradMatch','gradmatchpb':'GradMatch',
    'random':'Random'
}

def canon_model(v):
    k = to_key(v)
    return MAP_MODEL.get(k, norm_str(v))

def canon_dataset(v):
    k = to_key(v)
    return MAP_DATASET.get(k, norm_str(v))

def canon_system(v):
    k = to_key(v)
    # retire un éventuel suffixe '-pb' ou 'pb' isolé
    k = k.replace('-pb','').replace('_pb','')
    if k.endswith('pb'):
        k = k[:-2]
    return MAP_SYSTEM.get(k, norm_str(v))

def canon_ratio(x, ndigits=3):
    try:
        return round(float(x), ndigits)
    except Exception:
        return None

# --- lecture ---
df_res = pd.read_csv(RES_PATH, dtype=str)
df_cfg = pd.read_csv(CFG_PATH, dtype=str)

# hygiène colonnes
need_res = {"dataset","model","system","ratio"}
if not need_res.issubset(df_res.columns):
    missing = need_res - set(df_res.columns)
    raise ValueError(f"[epoch_traces.csv] colonnes manquantes: {missing}")

need_cfg = {"EC_ID","dataset","model","system","ratio"}
if not need_cfg.issubset(df_cfg.columns):
    missing = need_cfg - set(df_cfg.columns)
    raise ValueError(f"[ExperimentConfigurations.csv] colonnes manquantes: {missing}")

# --- canonisation des deux côtés ---
# résultats
df_res["ds_c"]     = df_res["dataset"].map(canon_dataset)
df_res["model_c"]  = df_res["model"].map(canon_model)
df_res["system_c"] = df_res["system"].map(canon_system)
df_res["ratio_c"]  = df_res["ratio"].map(lambda x: canon_ratio(x, ndigits=3))

# config
df_cfg["ds_c"]     = df_cfg["dataset"].map(canon_dataset)
df_cfg["model_c"]  = df_cfg["model"].map(canon_model)
df_cfg["system_c"] = df_cfg["system"].map(canon_system)
df_cfg["ratio_c"]  = df_cfg["ratio"].map(lambda x: canon_ratio(x, ndigits=3))

# --- dédup config par clé canonique (évite duplication après merge) ---
key = ["ds_c","model_c","system_c","ratio_c"]
dup = df_cfg.duplicated(key).sum()
if dup:
    print(f"[WARN] {dup} doublon(s) dans la config sur la clé canonique. On garde le premier.")
df_cfg_u = df_cfg.drop_duplicates(key, keep="first")

# --- jointure m:1 sur la clé canonique ---
df_merge = df_res.merge(df_cfg_u[key + ["EC_ID"]], on=key, how="left", validate="m:1")

# --- diag des NaN restants ---
nan_mask = df_merge["EC_ID"].isna()
nan_count = int(nan_mask.sum())
if nan_count:
    print(f"[WARN] {nan_count} ligne(s) sans EC_ID après normalisation.")
    print("Top (dataset, model, system, ratio) manquants :")
    print(df_merge.loc[nan_mask, key].value_counts().head(15).to_string())

# --- sortie : on reprend les colonnes d'origine + EC_ID propre ---
df_out = df_res.copy()
df_out["EC_ID"] = df_merge["EC_ID"]
df_out.to_csv(OUT_PATH, index=False)
print(f"[OK] écrit : {OUT_PATH}  ({len(df_out)} lignes)")


[OK] écrit : ../../results/test/epoch_traces_EC_ID.csv  (283900 lignes)


In [15]:
import pandas as pd

file_path = '../../results/test/epoch_traces_EC_ID.csv'  
df = pd.read_csv(file_path)


df = df.rename(columns={
    'runID': 'Run ID',
    'epochID': 'Epoch ID',
    'accuracy': 'Accuracy',
    'Full_training_time': 'Time'
})


desired_order = [
    'EC_ID', 'Run ID', 'Epoch ID', 'Time', 'Accuracy',
    'F1_score', 'Precision', 'Recall',
    'SPD_gender', 'EOD_gender', 'AOD_gender', 'DI_gender', 'DcI_gender',
    'SPD_age', 'EOD_age', 'AOD_age', 'DI_age', 'DcI_age',
    'SPD_race', 'EOD_race', 'AOD_race', 'DI_race', 'DcI_race'
]

columns_to_keep = [col for col in desired_order if col in df.columns]

df = df[columns_to_keep]

df.to_csv('../../results/test/ExperimentMeasurements.csv', index=False)


In [17]:
import pandas as pd
from pathlib import Path

# ---------- chemins ----------
IN_PATH  = Path("../../results/test/epoch_traces_EC_ID.csv")
OUT_DIR  = Path("../../traces/")

# ---------- lecture ----------
df = pd.read_csv(IN_PATH, dtype=str)
initial_rows = len(df)

# --- renommer les colonnes si elles existent (idempotent) ---
rename_map = {
    'runID': 'Run ID',
    'epochID': 'Epoch ID',
    'Full_training_time': 'Time',
    'accuracy': 'Accuracy',
}
existing_map = {k: v for k, v in rename_map.items() if k in df.columns}
if existing_map:
    df = df.rename(columns=existing_map)

# hygiène de base
for col in ["EC_ID", "dataset"]:
    if col not in df.columns:
        raise ValueError(f"Colonne manquante dans {IN_PATH.name}: '{col}'")
df["EC_ID"] = df["EC_ID"].astype(str).str.strip()
df["dataset"] = df["dataset"].astype(str).str.strip()

# normalisation dataset (casse/espaces)
norm = lambda s: str(s).strip().lower()
df["dataset_norm"] = df["dataset"].map(norm)

# ---------- colonnes demandées (avec l'ordre exact) ----------
BASE   = ['EC_ID', 'Run ID', 'Epoch ID', 'Time', 'Accuracy', 'F1_score', 'Precision', 'Recall']
GENDER = ['SPD_gender', 'EOD_gender', 'AOD_gender', 'DI_gender', 'DcI_gender']
AGE    = ['SPD_age', 'EOD_age', 'AOD_age', 'DI_age', 'DcI_age']
RACE   = ['SPD_race', 'EOD_race', 'AOD_race', 'DI_race', 'DcI_race']

def n(x): return norm(x)

groups = [
    {
        "name": "Adult_KDD",
        "datasets": {n("Adult"), n("KDD")},
        "cols": BASE + GENDER + AGE + RACE
    },
    {
        "name": "DC_MobiAct_CelebA_audioMNIST",
        "datasets": {n("DC"), n("MobiAct"), n("CelebA"), n("audioMNIST")},
        "cols": BASE + GENDER + AGE
    },
    {
        "name": "ARS",
        "datasets": {n("ARS")},
        "cols": BASE + GENDER
    },
    {
        "name": "fairface",
        "datasets": {n("fairface")},
        "cols": BASE + AGE + RACE
    },
    {
        "name": "voxceleb",
        "datasets": {n("voxceleb")},
        "cols": BASE + RACE
    },
]

# ---------- utilitaire d’écriture ----------
def write_group(df_all: pd.DataFrame, datasets_norm: set, desired_cols: list, out_name: str) -> int:
    sub = df_all[df_all["dataset_norm"].isin(datasets_norm)].copy()
    if sub.empty:
        print(f"[INFO] Aucun enregistrement pour {out_name} (datasets={sorted(datasets_norm)})")
        return 0

    # enlever dataset/dataset_norm des exports
    sub.drop(columns=[c for c in ["dataset", "dataset_norm"] if c in sub.columns],
             inplace=True, errors="ignore")

    # ne garder QUE les colonnes existantes parmi celles demandées (et donc dans l'ordre voulu)
    present_cols = [c for c in desired_cols if c in sub.columns]
    sub = sub[present_cols]

    # tri si dispo
    for k in ["Run ID", "Epoch ID"]:
        if k in sub.columns:
            sub[k] = pd.to_numeric(sub[k], errors="ignore")
    sort_keys = [k for k in ["EC_ID", "Run ID", "Epoch ID"] if k in sub.columns]
    if sort_keys:
        sub = sub.sort_values(sort_keys)

    out_path = OUT_DIR / f"ExperimentMeasurements_{out_name}.csv"
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    sub.to_csv(out_path, index=False)
    print(f"[OK] Écrit: {out_path.name}  ({len(sub)} lignes, {len(sub.columns)} colonnes)")
    return len(sub)

# ---------- génération ----------
total_rows = 0
covered_mask = pd.Series(False, index=df.index)
for g in groups:
    total_rows += write_group(df, g["datasets"], g["cols"], g["name"])
    covered_mask = covered_mask | df["dataset_norm"].isin(g["datasets"])

# ---------- check final ----------
covered_rows = int(covered_mask.sum())
uncovered_rows = initial_rows - covered_rows

print("\n=== RÉCAP ===")
print(f"Lignes dans le fichier source                  : {initial_rows}")
print(f"Lignes couvertes par les 5 groupes             : {covered_rows}")
print(f"Somme des lignes dans les fichiers générés     : {total_rows}")
if uncovered_rows > 0:
    missing_ds = (df.loc[~covered_mask, "dataset"].fillna("(NA)").value_counts())
    print(f"[INFO] {uncovered_rows} ligne(s) hors groupes. Répartition :")
    print(missing_ds.to_string())

if total_rows == covered_rows:
    print("[CHECK] Somme cohérente (somme = lignes couvertes).")
else:
    print("[CHECK] Incohérence : somme ≠ lignes couvertes. Vérifie la présence/orthographe des colonnes et datasets.")


[OK] Écrit: ExperimentMeasurements_Adult_KDD.csv  (79900 lignes, 23 colonnes)
[OK] Écrit: ExperimentMeasurements_DC_MobiAct_CelebA_audioMNIST.csv  (120700 lignes, 18 colonnes)
[OK] Écrit: ExperimentMeasurements_ARS.csv  (32300 lignes, 13 colonnes)
[OK] Écrit: ExperimentMeasurements_fairface.csv  (25500 lignes, 18 colonnes)
[OK] Écrit: ExperimentMeasurements_voxceleb.csv  (25500 lignes, 13 colonnes)

=== RÉCAP ===
Lignes dans le fichier source                  : 283900
Lignes couvertes par les 5 groupes             : 283900
Somme des lignes dans les fichiers générés     : 283900
[CHECK] Somme cohérente (somme = lignes couvertes).


Experiment Statistics

In [18]:
import pandas as pd

path_var = "../../results/test_var/std_avg_vc_ttest_results_with_vc_4c.csv"
path_sel = "../../results/test/ttest_5-5c-w-random.csv"

df_var = pd.read_csv(path_var)


df_var = df_var.rename(columns={
    col: col + "_var"
    for col in df_var.columns if col.startswith("test_")
})



df_sel = pd.read_csv(path_sel)

join_cols = ['dataset', 'model', 'system', 'ratio']


df_merged = None
df_merged = pd.merge(df_sel, df_var, on=join_cols, how='inner')

df_merged.to_csv("../../results/test/t-tests.csv", index=False)


In [19]:
import pandas as pd


file_path = "../../results/test/t-tests.csv"
df = pd.read_csv(file_path)

rename_map = {
    'test_acc': 'test_Accuracy',
    'test_acc_var': 'test_Accuracy_var',
    'test_f1': 'test_F1_score',
    'test_f1_var': 'test_F1_score_var',
    'test_recall': 'test_Recall',
    'test_recall_var': 'test_Recall_var',
    'test_precision': 'test_Precision',
    'test_precision_var': 'test_Precision_var'
}

df = df.rename(columns=rename_map)

df.to_csv(file_path, index=False)


In [20]:
import pandas as pd


df = pd.read_csv("../../results/test/t-tests.csv")  

 
id_vars = ['dataset', 'model', 'system', 'ratio']

metrics = [
    'Full_training_time', 'Accuracy', 'Precision', 'Recall', 'F1_score',
    'SPD_gender', 'EOD_gender', 'AOD_gender', 'DI_gender', 'DcI_gender',
    'SPD_age', 'EOD_age', 'AOD_age', 'DI_age', 'DcI_age',
    'SPD_race', 'EOD_race', 'AOD_race', 'DI_race', 'DcI_race'
]

suffixes = {
    'Mean': '_avg_avg',
    'Standard deviation': '_std_avg',
    'Variability coefficient': '_vc_avg',
    'Selection impact': 'test_{}',
    'Variability impact': 'test_{}_var'
}

rows = []

for _, row in df.iterrows():
    for metric in metrics:
        new_row = {key: row[key] for key in id_vars}
        new_row["Evaluation metric"] = metric
        
        for col_name, suffix in suffixes.items():
            if metric == "Full_training_time" and col_name == "SelImpact":
                column_name = "test_time"
            elif '{}' in suffix:
                column_name = suffix.format(metric)
            else:
                column_name = metric + suffix

            if column_name in df.columns:
                new_row[col_name] = row[column_name]
            else:
                new_row[col_name] = None

        rows.append(new_row)

df_reshaped = pd.DataFrame(rows)

def should_remove(row):
    metric = row["Evaluation metric"]
    dataset = row["dataset"].lower()
    if dataset == "ars" and (metric.endswith("_age") or metric.endswith("_race")):
        return True
    if dataset.startswith("mobiact") and metric.endswith("_race"):
        return True
    if dataset.startswith("dc") and metric.endswith("_race"):
        return True
    return False

df_filtered = df_reshaped[~df_reshaped.apply(should_remove, axis=1)]

# Sauvegarde dans un nouveau fichier CSV
df_filtered.to_csv("../../results/test/t-tests-metrics.csv", index=False)


In [27]:
# === AJOUT ROBUSTE DE EC_ID + DROP LIGNES TOUTES VIDES + EXPORT ===
import re
import pandas as pd
from pathlib import Path

IN_METRICS = Path("../../results/test/t-tests-metrics.csv")
IN_CFG     = Path("../../traces/ExperimentConfigurations.csv")
OUT_STATS  = Path("../../traces/ExperimentStatistics.csv")

def s(x):  # strip string
    return str(x).strip()

def k(x):  # key form
    return s(x).lower()

# Mappings en minuscules (on mappe sur .lower())
MAP_DATASET = {
    'census':'Adult','adult':'Adult',
    'kdd':'KDD','dc':'DC','mobiact':'MobiAct','ars':'ARS',
    'celeba':'CelebA','fairface':'Fairface',
    'audiomnist':'audioMNIST','audio mnist':'audioMNIST','audio_mnist':'audioMNIST',
    'voxceleb':'voxceleb'
}
MAP_MODEL = {
    'logreg':'LR','lr':'LR','mlp':'MLP','svm':'SVM','dc':'DC',
    'audiocnn':'CNN','cnn':'CNN','audiolstm':'LSTM','lstm':'LSTM',
    'resnet18':'ResNet18','vgg':'VGG'
}
MAP_SYSTEM = {
    'full':'Full',
    'craig':'Craig','craigpb':'Craig',
    'glister':'Glister','glisterpb':'Glister',
    'gradmatch':'GradMatch','gradmatchpb':'GradMatch',
    'random':'Random'
}

def canon_dataset(v): return MAP_DATASET.get(k(v), s(v))
def canon_model(v):   return MAP_MODEL.get(k(v), s(v))
def canon_system(v):
    kk = re.sub(r'[-_]?pb$', '', k(v))  # retire suffixe PB éventuel
    return MAP_SYSTEM.get(kk, s(v))
def canon_ratio(x, nd=3):
    try: return round(float(x), nd)
    except Exception: return None

# --- Lecture ---
df_res = pd.read_csv(IN_METRICS, dtype=str)
df_cfg = pd.read_csv(IN_CFG, dtype=str)

need_res = {"dataset","model","system","ratio","Evaluation metric"}
need_cfg = {"EC_ID","dataset","model","system","ratio"}
if not need_res.issubset(df_res.columns):
    raise ValueError(f"[t-tests-metrics.csv] colonnes manquantes: {need_res - set(df_res.columns)}")
if not need_cfg.issubset(df_cfg.columns):
    raise ValueError(f"[ExperimentConfigurations.csv] colonnes manquantes: {need_cfg - set(df_cfg.columns)}")

# --- Canonisation des deux côtés ---
for df in (df_res, df_cfg):
    df["dataset_c"] = df["dataset"].map(canon_dataset)
    df["model_c"]   = df["model"].map(canon_model)
    df["system_c"]  = df["system"].map(canon_system)
    df["ratio_c"]   = df["ratio"].map(lambda x: canon_ratio(x, 3))

# Dédup config
key = ["dataset_c","model_c","system_c","ratio_c"]
df_cfg_u = df_cfg.drop_duplicates(key, keep="first")

# --- Jointure m:1 pour récupérer EC_ID ---
df_j = df_res.merge(df_cfg_u[key + ["EC_ID"]], on=key, how="left", validate="m:1")

# --- Retirer les lignes où TOUS les 5 champs sont vides ---
value_cols = ["Mean", "Standard deviation", "Variability coefficient",
              "Selection impact", "Variability impact"]
existing_value_cols = [c for c in value_cols if c in df_j.columns]

# convertir chaînes vides/espaces en NA pour ces colonnes
for c in existing_value_cols:
    df_j[c] = df_j[c].replace(r'^\s*$', pd.NA, regex=True)

if existing_value_cols:
    before = len(df_j)
    df_j = df_j.dropna(subset=existing_value_cols, how="all")
    after = len(df_j)
    

# --- Retirer la ligne temps comme avant ---
df_j = df_j[df_j["Evaluation metric"] != "Full_training_time"].copy()

# --- Nettoyage des colonnes auxiliaires & ré-ordonnancement ---
drop_cols = ["dataset","model","system","ratio","dataset_c","model_c","system_c","ratio_c"]
df_j.drop(columns=[c for c in drop_cols if c in df_j.columns], inplace=True, errors="ignore")

# EC_ID en premier
cols = df_j.columns.tolist()
if "EC_ID" in cols:
    cols.insert(0, cols.pop(cols.index("EC_ID")))
    df_j = df_j[cols]

# --- Export ---
df_j.to_csv(OUT_STATS, index=False)



Taille des tables 

In [29]:
import pandas as pd

# === chemins vers tes fichiers CSV ===
file0 = "../../traces/DatasetProperties.csv"
file1 = "../../traces/ExperimentConfigurations.csv"
file2 = "../../traces/ExperimentMeasurements_Adult_KDD.csv"
file3 = "../../traces/ExperimentMeasurements_ARS.csv"
file4 = "../../traces/ExperimentMeasurements_DC_MobiAct_celeba_audioMNIST.csv"
file5 = "../../traces/ExperimentMeasurements_fairface.csv"
file6 = "../../traces/ExperimentMeasurements_voxceleb.csv"
file7 = "../../traces/ExperimentStatistics.csv"

# === lecture des fichiers ===
df0 = pd.read_csv(file0)
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)
df3 = pd.read_csv(file3)

# === calcul des tailles ===
size0 = len(df0)
size1 = len(df1)
size2 = len(df2)
size3 = len(df3)
size4 = len(df3)
size5 = len(df3)
size6 = len(df3)
size7 = len(df3)
total = size0 + size1 + size2 + size3 + size4 + size5 + size6 + size7

# === affichage ===
print(f"{size0} lignes")
print(f"{size1} lignes")
print(f"{size2 + size3 + size4 + size5 + size6} lignes")
print(f"{size7} lignes")
print("-" * 40)
print(f"Total : {total} lignes")


9 lignes
357 lignes
209100 lignes
32300 lignes
----------------------------------------
Total : 241766 lignes
