In [1]:
import pandas as pd
import ast
from pathlib import Path
import wandb

# 1. Download Sweep data from W&B API
Do so only if new sweeps were run under the project name `domShift`.

In [2]:
# Set to True to re-fetch every run of the W&B project and rewrite the CSV dump.
# Left at False, this cell performs no network or file access.
download_again = False

if download_again:
    # setup
    api = wandb.Api()

    # Project is specified by <entity/project-name>
    runs = api.runs("7shoe/domShift-src")

    summary_list, config_list, name_list = [], [], []
    for run in runs:
        # .summary contains the output keys/values for metrics like accuracy.
        #  ._json_dict is used to omit large files.
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters.
        #  Special keys that start with "_" are dropped.
        config_list.append(
            {k: v for k, v in run.config.items() if not k.startswith('_')})

        # .name is the human-readable name of the run.
        name_list.append(run.name)

    # One row per run; 'summary' and 'config' stay as dicts and are therefore
    # serialised as their Python repr in the CSV.
    runs_df = pd.DataFrame({
        "summary": summary_list,
        "config": config_list,
        "name": name_list,
    })

    # new name should be `sweep3`
    out_path = Path("./report/sweep3.csv")
    # to_csv raises OSError when the target directory is missing, which would
    # throw away the download above -- create it first.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    runs_df.to_csv(out_path)

# 2. Process downloaded W&B frame

In [3]:
# True : rebuild the best-model table from the raw sweep dump and cache it.
# False: load the previously cached table from ./report/best_models.csv.
currate_best_models_again = True

if currate_best_models_again:
    # current sweep = 2
    df = pd.read_csv('./report/sweep2.csv')

    # The 'summary' and 'config' columns hold dict literals as strings;
    # parse them back into dicts, mapping missing cells to an empty dict.
    for col in ('summary', 'config'):
        df[col] = df[col].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else {})

    # Expand each dictionary column into one column per key.
    summary_expanded = pd.json_normalize(df['summary'])
    config_expanded = pd.json_normalize(df['config'])

    # Join them back into the original dataframe (row-aligned on the index).
    df = df.drop(columns=['summary', 'config']).join(summary_expanded).join(config_expanded)

    # subset to sweep 2 (all 15 epochs)
    df = df[df['epoch'] == 15]

    # Columns kept in the report, in output order.
    report_columns = ['dataset', 'model', 'checkpoint', 'train_loss', 'val_loss',
                      'epoch', 'batch_size', 'temperature', 'learning_rate']

    # For every (model, dataset) pair remember the index label of the run with
    # the lowest validation loss.
    best_labels = []
    for model in ['SimCLR', 'SimSiam', 'BYOL']:
        for dataset in ['uniform', 'moderate', 'heavy']:
            in_group = (df['model'] == model) & (df['dataset'] == dataset)
            # Runs without a recorded val_loss cannot win; dropping them also
            # keeps idxmin from failing on an all-NaN group.
            val_losses = df.loc[in_group, 'val_loss'].dropna()
            if val_losses.empty:
                print(f"No data for model={model} and dataset={dataset}")
            else:
                best_labels.append(val_losses.idxmin())

    # Combine the results into a single DataFrame. Selecting by label keeps the
    # expected columns even when no pair matched.
    best_df = df.loc[best_labels, report_columns].copy()

    # delta
    best_df['gen_gap'] = best_df['val_loss'] - best_df['train_loss']

    # Sort BEFORE storing so the cached CSV (read by the else-branch) has the
    # same row order as the frame built here; stable sort keeps the model
    # order within each dataset.
    best_df = best_df.sort_values(by="dataset", kind="stable")

    # store
    best_df.to_csv('./report/best_models.csv', index=False)
else:
    best_df = pd.read_csv('./report/best_models.csv')
