In [1]:
import os
import re
import ast
import numpy as np
import pandas as pd
from collections import defaultdict


def _parse_literal_after(line, marker):
    """Return the Python literal that follows *marker* in *line*.

    Returns None when the text after the marker is not a valid literal, so
    that one malformed log line does not abort parsing of the whole file.
    """
    text = line.split(marker)[1].strip()
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors ast.literal_eval raises on malformed input.
        return None


def parse_ranpac_log(log_file_path):
    """Parse a RanPAC log file to extract metrics.

    Args:
        log_file_path: Path of the log file to read.

    Returns:
        None if the file does not contain exactly one 'Starting new run' line
        and exactly one 'Finishing run' line. Otherwise a dict with the keys
        ``model_name``, ``convnet_type``, ``exp_name``, ``dataset``, ``seed``
        (strings, empty when absent), ``final_accuracy`` (last value of the
        last usable 'Top1 curve'), ``average_accuracy`` (mean of that curve)
        and ``forgetting`` (mean over groups of max accuracy minus last
        accuracy). The three metrics stay at 0.0 when the log has no data
        for them.
    """
    with open(log_file_path, 'r') as f:
        lines = f.readlines()

    # A valid log holds exactly one complete run.
    starting_run_count = sum(1 for line in lines if 'Starting new run' in line)
    finishing_run_count = sum(1 for line in lines if 'Finishing run' in line)

    if starting_run_count != 1 or finishing_run_count != 1:
        print(f"Invalid log file {log_file_path}: "
              f"Starting new run: {starting_run_count}, Finishing run: {finishing_run_count}")
        return None

    result = {
        "model_name": "",
        "convnet_type": "",
        "exp_name": "",
        "dataset": "",
        "seed": "",
        "final_accuracy": 0.0,
        "average_accuracy": 0.0,
        "forgetting": 0.0,
    }

    # Header fields. Only the first matching key (in this order) is taken
    # from a line, and a later line overrides an earlier one.
    header_keys = ("model_name", "convnet_type", "exp_name", "dataset", "seed")
    for line in lines:
        for key in header_keys:
            marker = f"{key}:"
            if marker in line:
                result[key] = line.split(marker)[1].strip()
                break

    # Fall back to a four-digit seed embedded in the file name ("1993_...").
    if not result["seed"]:
        filename = os.path.basename(log_file_path)
        seed_match = re.search(r'(\d{4})_', filename)
        if seed_match:
            result["seed"] = seed_match.group(1)

    # Keep the last usable Top1 curve. Empty or non-sequence values are
    # skipped so that they cannot raise on the [-1] lookup below.
    final_curve = None
    for line in lines:
        if "Top1 curve:" in line:
            curve = _parse_literal_after(line, "Top1 curve:")
            if isinstance(curve, (list, tuple)) and curve:
                final_curve = curve

    if final_curve is not None:
        result["final_accuracy"] = final_curve[-1]
        result["average_accuracy"] = np.mean(final_curve)

    # Accuracy history per group, in log order.
    group_accuracies_history = defaultdict(list)
    for line in lines:
        if "Group Accuracies after this task:" in line:
            acc_dict = _parse_literal_after(line, "Group Accuracies after this task:")
            if not isinstance(acc_dict, dict):
                continue
            for group, accuracy in acc_dict.items():
                group_accuracies_history[group].append(accuracy)

    # Forgetting per group = best accuracy ever reached - final accuracy.
    # Groups seen only once cannot have forgotten anything and are skipped.
    forgetting_values = [
        max(accuracies) - accuracies[-1]
        for accuracies in group_accuracies_history.values()
        if len(accuracies) > 1
    ]

    if forgetting_values:
        result["forgetting"] = np.mean(forgetting_values)

    return result


def parse_logs_folder(logs_folder):
    """Collect parsed results for every ``.log`` file below *logs_folder*.

    The folder is walked recursively. Each log is handed to
    ``parse_ranpac_log``; logs it rejects (returns None for) are skipped, and
    any exception raised while handling a log is reported on stdout and the
    log is skipped. Every kept result gets a ``log_path`` entry, and an empty
    ``exp_name`` is replaced by the name of the log's parent directory.

    Returns:
        A list of result dicts, one per successfully parsed log.
    """
    collected = []

    for dirpath, _subdirs, filenames in os.walk(logs_folder):
        candidate_paths = (
            os.path.join(dirpath, filename)
            for filename in filenames
            if filename.endswith('.log')
        )
        for log_path in candidate_paths:
            try:
                parsed = parse_ranpac_log(log_path)
                if parsed is not None:
                    # Keep the origin of each row for debugging.
                    parsed["log_path"] = log_path

                    # Fall back to the parent directory name as exp_name.
                    if not parsed["exp_name"]:
                        segments = log_path.split(os.sep)
                        if len(segments) >= 2:
                            parsed["exp_name"] = segments[-2]

                    collected.append(parsed)
            except Exception as e:
                print(f"Error processing {log_path}: {e}")

    return collected


def generate_csv_results(logs_folder, output_prefix="ranpac_results", min_seeds=3):
    """Generate CSV files with per-run results and per-experiment statistics.

    Writes ``{output_prefix}_raw.csv`` with one row per parsed log and, when
    at least one experiment is complete, ``{output_prefix}_stats.csv`` with
    the mean and standard deviation of each metric per experiment. Progress
    and a summary table are printed to stdout.

    Args:
        logs_folder: Folder that is searched recursively for ``.log`` files.
        output_prefix: Path prefix of the two CSV files that are written.
        min_seeds: Minimum number of runs an experiment (a unique combination
            of model_name, convnet_type, exp_name and dataset) needs to be
            treated as complete and included in the statistics.

    Returns:
        The sorted DataFrame of raw results, or None when no valid log was
        found (in which case no file is written).
    """
    all_results = parse_logs_folder(logs_folder)

    if not all_results:
        print("No valid results found!")
        return

    # Create DataFrame
    df = pd.DataFrame(all_results)

    # Rearrange columns
    columns = ["model_name", "convnet_type", "exp_name", "dataset", "seed",
               "final_accuracy", "average_accuracy", "forgetting", "log_path"]
    df = df[columns]

    # Sort by relevant columns
    df = df.sort_values(by=["model_name", "dataset", "convnet_type", "exp_name", "seed"])

    # Save raw results
    df.to_csv(f"{output_prefix}_raw.csv", index=False)
    print(f"Saved raw results to {output_prefix}_raw.csv")

    # One group per experiment setting; the seed is what varies inside it.
    groupby_cols = ["model_name", "convnet_type", "exp_name", "dataset"]

    complete_experiments = []
    incomplete_experiments = []

    for name, group in df.groupby(groupby_cols):
        seeds = sorted(group["seed"].astype(str).tolist())
        if len(seeds) >= min_seeds:
            complete_experiments.append((name, group))
        else:
            incomplete_experiments.append((name, seeds))

    print(f"\nFound {len(complete_experiments)} complete experiments")
    print(f"Found {len(incomplete_experiments)} incomplete experiments")

    if incomplete_experiments:
        print("\nIncomplete experiments:")
        for name, seeds in incomplete_experiments:
            print(f"  {name}: seeds {seeds}")

    # Mean and std per complete experiment. Series.std() is the sample
    # standard deviation, so it is NaN for a group with a single run.
    if complete_experiments:
        stats_data = []

        for name, group in complete_experiments:
            stats = {
                "model_name": name[0],
                "convnet_type": name[1],
                "exp_name": name[2],
                "dataset": name[3],
                "num_seeds": len(group),
                "seeds": ",".join(sorted(group["seed"].astype(str))),
                "final_accuracy_mean": group["final_accuracy"].mean(),
                "final_accuracy_std": group["final_accuracy"].std(),
                "average_accuracy_mean": group["average_accuracy"].mean(),
                "average_accuracy_std": group["average_accuracy"].std(),
                "forgetting_mean": group["forgetting"].mean(),
                "forgetting_std": group["forgetting"].std(),
            }
            stats_data.append(stats)

        stats_df = pd.DataFrame(stats_data)
        stats_df = stats_df.sort_values(by=["model_name", "dataset", "convnet_type", "exp_name"])
        stats_df.to_csv(f"{output_prefix}_stats.csv", index=False)
        print(f"Saved statistics to {output_prefix}_stats.csv")

        # Display summary
        print("\nSummary of results:")
        print(stats_df.to_string(index=False))

    return df


# Main execution: parse every log below ../logs (relative to the current
# working directory) and write ranpac_results_raw.csv / ranpac_results_stats.csv.
# `df` holds the raw per-run results, or None when no valid log was found.
logs_folder = "../logs"
df = generate_csv_results(logs_folder)


Saved raw results to ranpac_results_raw.csv

Found 90 complete experiments
Found 2 incomplete experiments

Incomplete experiments:
  ('adapter', 'pretrained_vit_b16_224_in21k_adapter', 'RP_for_pca', 'cifar224'): seeds ['1993']
  ('adapter', 'pretrained_vit_b16_224_in21k_adapter', 'noRP_for_pca', 'cifar224'): seeds ['1993']
Saved statistics to ranpac_results_stats.csv

Summary of results:
model_name                         convnet_type                                    exp_name  dataset  num_seeds          seeds  final_accuracy_mean  final_accuracy_std  average_accuracy_mean  average_accuracy_std  forgetting_mean  forgetting_std
   adapter pretrained_vit_b16_224_in21k_adapter                      10000_1_1step_balanceG cifar224          3 1993,1994,1995            91.656667            0.196554              94.817333              0.126065         3.385185        0.124887
   adapter pretrained_vit_b16_224_in21k_adapter                               10000_1_2step cifar224          3 1993,

### Add missing seeds and renumber the ID column of the develop-arguments CSV

In [2]:
import pandas as pd
import os

csv_file_path = "/home/yanhongwei/RanPAC/args/cifar224_develop_moe.csv"

# Seeds every experiment configuration is expected to have a row for.
required_seeds = {1993, 1994, 1995}


def _rows_matching(frame, config, columns):
    """Return the rows of *frame* whose *columns* equal the values in *config*.

    A missing value in *config* matches missing values in *frame*. A plain
    ``==`` comparison would never match them (NaN != NaN), which would make
    such a configuration look as if it had no rows at all.
    """
    mask = pd.Series(True, index=frame.index, dtype=bool)
    for col in columns:
        value = config[col]
        if pd.isna(value):
            mask &= frame[col].isna()
        else:
            mask &= frame[col] == value
    return frame[mask]


# Read the CSV file
csv_data = pd.read_csv(csv_file_path)

# A configuration is identified by every column except 'ID' and 'seed'.
check_columns = [col for col in csv_data.columns if col not in ['ID', 'seed']]

# Find unique experiment configurations. Plain dict records are used for the
# per-configuration loops so that each value keeps its own column's type.
unique_configs = csv_data[check_columns].drop_duplicates()
config_records = unique_configs.to_dict('records')

# For each configuration, create a row for every required seed it lacks.
new_rows = []
for config in config_records:
    existing_seeds = set(_rows_matching(csv_data, config, check_columns)['seed'])
    for seed in sorted(required_seeds - existing_seeds):
        new_rows.append({**config, 'seed': seed})

# Add new rows to dataframe
if new_rows:
    csv_data = pd.concat([csv_data, pd.DataFrame(new_rows)], ignore_index=True)

# Renumber the ID column 0..n-1 over the (possibly extended) table.
csv_data['ID'] = range(len(csv_data))

# Save updated CSV (overwrites the input file in place).
csv_data.to_csv(csv_file_path, index=False)

# Build one summary row per configuration from the updated table.
summary_data = []
for config in config_records:
    seeds = sorted(_rows_matching(csv_data, config, check_columns)['seed'].unique())
    summary_row = dict(config)
    summary_row['seeds'] = ','.join(map(str, seeds))
    # Complete means the configuration has exactly the required seeds.
    summary_row['complete'] = 'Yes' if set(seeds) == required_seeds else 'No'
    summary_data.append(summary_row)

# Create summary DataFrame
summary_df = pd.DataFrame(summary_data)

# Only columns that differ between configurations are worth displaying.
varying_columns = [
    col for col in check_columns if len(summary_df[col].unique()) > 1
]

# Add seeds and complete status to display columns
display_columns = varying_columns + ['seeds', 'complete']

# Display summary table with only varying columns
print("\nSummary of experiment configurations:")
print(summary_df[display_columns].to_string(index=False))


Summary of experiment configurations:
         seeds complete
1993,1994,1995      Yes
