In [1]:
import pandas as pd
import numpy as np
import wandb
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Hardcoded parameters for the specific sweep
entity_name = "dive-ci"
project_name = "CLadder"
sweep_id = "ngq4k8jt"  #"qfwpl45m"

# Initialize wandb API
api = wandb.Api()

# Get the sweep runs
sweep = api.sweep(f"{entity_name}/{project_name}/{sweep_id}")

# Maps the summary key under which each test-set score is read to the
# short column name used for it in `df`.
dataset_names = {'cladder-v1-q-commonsense': 'commonsense', 'cladder-v1-q-anticommonsense': 'anticommonsense', 'cladder-v1-q-noncommonsense': 'noncommonsense'}

# Collect one flat record per run: identifiers, swept config values, test scores
runs_data = []
for run in sweep.runs:
    # Fall back to an empty summary when the summary object exposes no `_json_dict`
    summary = run.summary._json_dict if hasattr(run.summary, "_json_dict") else {}
    config = run.config

    run_info = {
        "run_id": run.id,
        "run_name": run.name,
        "reasoning": config.get("experiment.reasoning", None),
        "anonymize": config.get("dataset.anonymize", None),
        "learning_rate": config.get("training.learning_rate", None),
        "percent_train": config.get("dataset.percent_of_train_dataset", None),
        "num_epochs": config.get("training.num_train_epochs", None),
    }

    # Extract scores for different test datasets.
    # A run whose summary lacks a score gets NaN instead of raising KeyError,
    # so a single incomplete run no longer aborts the whole load.
    for data_name, score_key in dataset_names.items():
        run_info[score_key] = summary.get(data_name, np.nan)

    runs_data.append(run_info)

# Convert to dataframe
df = pd.DataFrame(runs_data)

# Identify test dataset columns
test_dataset_columns = [col for col in df.columns if col in dataset_names.values()]

# Distinct swept values; missing config entries are dropped so sorting cannot
# fail on None/NaN mixed with numbers.
training_percentages = sorted(df['percent_train'].dropna().unique())
learning_rate = sorted(df['learning_rate'].dropna().unique())

print(f"Found {len(df)} runs with {len(test_dataset_columns)} test datasets")
print(f"Test datasets: {test_dataset_columns}")
print(f"Training percentages: {training_percentages}")

# Make the NaN fallback above visible instead of silent
num_runs_missing_scores = int(df[test_dataset_columns].isna().any(axis=1).sum())
if num_runs_missing_scores:
    print(f"Warning: {num_runs_missing_scores} runs are missing at least one test score")

df

Found 210 runs with 3 test datasets
Test datasets: ['commonsense', 'anticommonsense', 'noncommonsense']
Training percentages: [0.001, 0.005, 0.01, 0.02, 0.05]


Unnamed: 0,run_id,run_name,reasoning,anonymize,learning_rate,percent_train,num_epochs,commonsense,anticommonsense,noncommonsense
0,oip7h3lc,QWen-RTrue-DP0.05-EP64-AbsFalse-lr0.0006,True,False,0.00060,0.050,64,68.37,67.31,67.58
1,3lo61mx3,QWen-RTrue-DP0.05-EP32-AbsFalse-lr0.0006,True,False,0.00060,0.050,32,77.31,77.69,71.97
2,thqx9ha9,QWen-RTrue-DP0.05-EP16-AbsFalse-lr0.0006,True,False,0.00060,0.050,16,80.77,81.73,78.03
3,63w8go85,QWen-RTrue-DP0.05-EP8-AbsFalse-lr0.0006,True,False,0.00060,0.050,8,73.08,72.98,70.61
4,gmz08pm6,QWen-RTrue-DP0.05-EP4-AbsFalse-lr0.0006,True,False,0.00060,0.050,4,67.31,65.87,61.72
...,...,...,...,...,...,...,...,...,...,...
205,hgt14yll,QWen-RTrue-DP0.001-EP64-AbsTrue-lr0.00015,True,True,0.00015,0.001,64,53.46,55.10,56.15
206,mhc7na7q,QWen-RTrue-DP0.001-EP4-AbsTrue-lr0.00015,True,True,0.00015,0.001,4,48.17,50.48,45.51
207,muihqr8n,QWen-RTrue-DP0.001-EP8-AbsTrue-lr0.00015,True,True,0.00015,0.001,8,52.79,53.37,49.61
208,qxgfjwy3,QWen-RTrue-DP0.001-EP16-AbsTrue-lr0.00015,True,True,0.00015,0.001,16,49.42,50.77,50.39


In [32]:
# Sanity check: re-fetch the sweep and display the training-set fraction
# stored in the first run's config.
sweep = api.sweep("/".join((entity_name, project_name, sweep_id)))
sweep.runs[0].config["dataset.percent_of_train_dataset"]



0.1

In [11]:

# Create comparison plots
def create_anonymize_comparison_plots(learning_rate):
    """Build a grid of score-vs-epochs plots comparing anonymize=True and anonymize=False.

    The grid has one row per entry of the module-level ``training_percentages``
    and one column per entry of ``test_dataset_columns``. Each subplot shows,
    for runs in the module-level ``df`` with the given learning rate and that
    training percentage, the mean score per ``num_epochs`` value, with one line
    per ``anonymize`` setting.

    Args:
        learning_rate: Value matched with ``==`` against ``df['learning_rate']``
            to select the runs that are plotted.

    Returns:
        The plotly figure produced by ``make_subplots`` with all traces added.
    """
    num_test_sets = len(test_dataset_columns)
    num_percentages = len(training_percentages)

    fig = make_subplots(
        rows=num_percentages,
        cols=num_test_sets,
        # `:g` keeps float artifacts (e.g. 7.000000000000001) out of the titles
        subplot_titles=[f"{col} - {pct*100:g}% Training"
                        for pct in training_percentages for col in test_dataset_columns],
        vertical_spacing=0.05,
        horizontal_spacing=0.05
    )

    # Color scheme for True/False anonymization
    colors = {True: "rgb(31, 119, 180)", False: "rgb(255, 127, 14)"}

    # The learning-rate filter does not depend on the subplot, so apply it once
    df_lr = df[df['learning_rate'] == learning_rate]

    # Legend groups that already have a visible entry. Tracking this, rather
    # than tying the entry to the first subplot, guarantees each setting is
    # listed exactly once even if the first subplot has no data for it.
    legend_shown = set()

    for i, pct in enumerate(training_percentages):
        # Filter data for this training percentage (shared by the whole row)
        df_pct = df_lr[df_lr['percent_train'] == pct]

        for j, test_col in enumerate(test_dataset_columns):
            for anon in [True, False]:
                df_group = df_pct[df_pct['anonymize'] == anon]

                # Skip if no data for this combination
                if df_group.empty:
                    continue

                # Average score per epoch count across the matching runs
                epoch_scores = df_group.groupby('num_epochs')[test_col].mean().reset_index()

                legend_name = f"Anonymize={anon}"
                fig.add_trace(
                    go.Scatter(
                        x=epoch_scores['num_epochs'],
                        y=epoch_scores[test_col],
                        mode='lines+markers',
                        name=legend_name,
                        line=dict(color=colors[anon]),
                        legendgroup=legend_name,
                        showlegend=legend_name not in legend_shown
                    ),
                    row=i+1, col=j+1
                )
                legend_shown.add(legend_name)

            # Label only the outer axes: x on the bottom row, y on the first column
            if i == num_percentages-1:
                fig.update_xaxes(title_text="Number of Epochs", row=i+1, col=j+1)
            if j == 0:
                fig.update_yaxes(title_text="Score", row=i+1, col=j+1)

    # Update layout; the learning rate goes into the title so figures built
    # for different learning rates can be told apart.
    fig.update_layout(
        height=200*num_percentages,
        width=400*num_test_sets,
        title_text=f"Anonymize vs Non-Anonymize Performance Comparison (learning rate = {learning_rate})",
        legend_title="Anonymization Setting",
        margin=dict(t=50, b=20, l=20, r=20),
    )

    return fig

# Build and render one comparison grid for every learning rate found in the sweep
for lr in learning_rate:
    print("Creating comparison plots for learning rate: {}".format(lr))
    comparison_fig = create_anonymize_comparison_plots(lr)
    comparison_fig.show()

Creating comparison plots for learning rate: 0.00015


Creating comparison plots for learning rate: 0.0003


Creating comparison plots for learning rate: 0.0006


In [8]:

# Create a simplified single-row comparison for easier viewing
def create_simplified_comparison():
    """Build a single-row figure with one score-vs-epochs subplot per test dataset.

    Reads the module-level ``df``, ``test_dataset_columns`` and
    ``training_percentages``. For every (training percentage, anonymize)
    combination that has runs, a line of mean score per ``num_epochs`` is
    drawn in each subplot. No learning-rate filter is applied, so each mean is
    taken over all runs of that combination regardless of learning rate.
    Color encodes the training percentage; anonymize=True is dashed and
    anonymize=False is solid.

    Returns:
        The plotly figure produced by ``make_subplots`` with all traces added.
    """
    # Create one row of plots, one for each test dataset
    fig = make_subplots(
        rows=1,
        cols=len(test_dataset_columns),
        subplot_titles=[col.replace('score_', '') for col in test_dataset_columns],
        horizontal_spacing=0.1
    )

    # Color scheme for different training percentages
    colors = px.colors.qualitative.Plotly

    # The run subsets do not depend on the test dataset, so select them once.
    # Each entry: (color index, legend label, anonymize flag, matching runs).
    groups = []
    for i, pct in enumerate(training_percentages):
        df_pct = df[df['percent_train'] == pct]
        for anon in [True, False]:
            df_group = df_pct[df_pct['anonymize'] == anon]
            # Skip if no data for this combination
            if df_group.empty:
                continue
            # `:g` keeps float artifacts (e.g. 7.000000000000001) out of the label
            groups.append((i, f"{pct*100:g}% Train, Anon={anon}", anon, df_group))

    for j, test_col in enumerate(test_dataset_columns):
        for color_idx, label, anon, df_group in groups:
            # Average score per epoch count across the matching runs
            epoch_scores = df_group.groupby('num_epochs')[test_col].mean().reset_index()

            fig.add_trace(
                go.Scatter(
                    x=epoch_scores['num_epochs'],
                    y=epoch_scores[test_col],
                    mode='lines+markers',
                    name=label,
                    line=dict(
                        color=colors[color_idx % len(colors)],
                        dash='dash' if anon else 'solid'
                    ),
                    # The same groups are drawn in every subplot; grouping them and
                    # showing the entry only for the first column keeps the legend
                    # from listing each label once per test dataset.
                    legendgroup=label,
                    showlegend=(j == 0),
                ),
                row=1, col=j+1
            )

        # Axis labels are set once per subplot; only the first column gets a y label
        fig.update_xaxes(title_text="Number of Epochs", row=1, col=j+1)
        if j == 0:
            fig.update_yaxes(title_text="Score", row=1, col=j+1)

    # Update layout: horizontal legend placed below the plots
    fig.update_layout(
        height=500,
        width=350*len(test_dataset_columns),
        title_text="Anonymize vs Non-Anonymize Performance Comparison",
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.3,
            xanchor="center",
            x=0.5
        ),
        margin=dict(b=100)
    )

    return fig

# Create and display the simplified comparison: build the single-row figure,
# keep it in `simple_fig`, and render it as this cell's output.
simple_fig = create_simplified_comparison()
simple_fig.show()