In [21]:

import sys
import os
sys.path.append(os.path.abspath(os.path.join('..')))
import pandas as pd
import numpy as np
import hashlib

from data_loader.loader import load_buggy_dataset, load_data, save_data
from bug_detective.detective import process_prompt_dataset
from utils.visualization import compare_groundtruth_vs_corrected_histograms, compare_metrics_versus_bug_histograms, plot_column_distribution, plot_metrics_boxplots
from utils.simple_metrics import compute_and_store_metrics
from typing import Dict,List


data_path = '/Users/zanchitta/Developer/BugDetectiveAI/Bugdetectiveai/data/checkpoints/'

### Load all Experiments


In [22]:
def create_identifier(df: pd.DataFrame, prompt: str) -> pd.DataFrame:
    df['sample_uuid'] = df.apply(
        lambda row: hashlib.md5(
            (row['before_merge_without_docstrings'] + row['after_merge_without_docstrings']).encode('utf-8')
        ).hexdigest(),
        axis=1
    )
    df['prompt'] = prompt

    return df

In [23]:
baseline = load_data(data_path + "300_metrics_prompt_baseline_system_none.pkl")
system_apr = load_data(data_path + "300_metrics_prompt_baseline_system_apr.pkl")
prompt_style = load_data(data_path + "300_metrics_prompt_style_based.pkl")

baseline = create_identifier(baseline,"baseline")
system_apr = create_identifier(system_apr,"system_apr")
prompt_style = create_identifier(prompt_style,"prompt_style")

In [24]:
baseline.head()

Unnamed: 0,before_merge_without_docstrings,after_merge_without_docstrings,traceback_type,full_traceback,response_qwen-2.5-coder-32b-instruct,response_codestral-2501,response_gpt-4o,response_claude-3.5-sonnet,metric_qwen-2.5-coder-32b-instruct_ast_score,metric_qwen-2.5-coder-32b-instruct_text_score,...,metric_claude-3.5-sonnet_ast_score,metric_claude-3.5-sonnet_text_score,metric_claude-3.5-sonnet_ast_score_normalized,metric_claude-3.5-sonnet_codebleu,metric_claude-3.5-sonnet_ngram_match_score,metric_claude-3.5-sonnet_weighted_ngram_match_score,metric_claude-3.5-sonnet_syntax_match_score,metric_claude-3.5-sonnet_dataflow_match_score,sample_uuid,prompt
75748,"def run(self, cmd, **kwargs):\n sel...","def run(self, cmd, **kwargs):\n sel...",IndexError,An unexpected error has occurred: IndexError: ...,"def run(self, cmd, **kwargs):\n self._creat...","def run(self, cmd, **kwargs):\n self._creat...","def run(self, cmd, **kwargs):\n self._creat...","def run(self, cmd, **kwargs):\n self._creat...",0.0,0.688742,...,0.0,0.498246,0.0,0.484122,0.294467,0.33963,0.6,0.647059,8caf7a732dd1acbd2731a012a8bd45fe,baseline
583899,"def process(self, argv=None, executable=No...","def process(self, argv=None, executable=No...",TypeError,[+] Downloading '/proc/sys/kernel/core_pattern...,"def process(self, argv=None, executable=None, ...","def process(self, argv=None, executable=None, ...",cwd = cwd or self.cwd\n if isinstance(c...,"def process(self, argv=None, executable=None, ...",0.0,0.94899,...,0.0,0.807457,1.0,0.691194,0.705416,0.741912,0.818316,0.60514,d1e8f77aa937f586008b06aabb60d1ac,baseline
532193,"def backprojection(calibrated_event_list, pixe...","def backprojection(calibrated_event_list, pixe...",TypeError,import sunpy.data\n>>> import sunpy.data.sampl...,"def backprojection(calibrated_event_list, pixe...","def backprojection(calibrated_event_list, pixe...","def backprojection(calibrated_event_list, pixe...","def backprojection(calibrated_event_list, pixe...",0.835698,0.871159,...,0.714742,0.798102,0.168188,0.652772,0.52919,0.709786,0.79096,0.592105,1ca9ee95a397f49814adbe7531b4e88a,baseline
357599,"def export_annotations(self,export_range,e...","def export_annotations(self,export_range,e...",AttributeError,"Traceback (most recent call last):\nFile ""main...","def export_annotations(self, export_range, exp...","def export_annotations(self, export_range, exp...","def export_annotations(self, export_range, exp...","def export_annotations(self,export_range,expor...",0.0,0.88188,...,0.0,0.884472,0.0,0.959438,0.914406,0.919994,1.0,1.0,2b0af7a3b93947509281e74e469190c6,baseline
45974,"def summarize_corpus(corpus, ratio=0.2):\n ...","def summarize_corpus(corpus, ratio=0.2):\n ...",TypeError,----------------------------------------------...,"def summarize_corpus(corpus, ratio=0.2):\n ...","def summarize_corpus(corpus, ratio=0.2):\n ...","def summarize_corpus(corpus, ratio=0.2):\n ...","def summarize_corpus(corpus, ratio=0.2):\n ...",0.0,0.892398,...,0.0,0.909631,0.0,0.765659,0.897493,0.935363,0.917647,0.525,9fcbb3b0aa8818b51aba3778799ed601,baseline


In [None]:
from utils.simple_metrics import wilcoxon_test

# Example: Compare two sets of metrics between experiments using Wilcoxon signed-rank test
# Let's assume we want to compare 'accuracy' between baseline and system_apr
def wilcoxon_test_cross_dataset_simple(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    model_name: str,
    metric_name: str = "ast_score",
    reference_column: str = "after_merge_without_docstrings",
    id_column: str = "bug_id",
    alternative: str = "two-sided"
) -> Dict[str, float]:
    """Perform Wilcoxon test for the same model across two datasets using existing function.
    
    This function reuses the existing wilcoxon_test by merging datasets and creating
    separate response columns for each dataset.
    
    Args:
        df1 (pd.DataFrame): First dataset DataFrame
        df2 (pd.DataFrame): Second dataset DataFrame  
        model_name (str): Name of the model to compare (e.g., "gpt4", "claude")
        metric_name (str): Name of the metric to compare (default: "ast_score")
        reference_column (str): Reference column used for metric calculation
        id_column (str): Column name to match instances across datasets
        alternative (str): Alternative hypothesis: "two-sided", "greater", or "less"
    
    Returns:
        Dict[str, float]: Dictionary containing test results from existing wilcoxon_test
    """
    # Find the response column for the specified model in each dataset
    response_col1 = None
    response_col2 = None
    
    for col in df1.columns:
        if col.startswith("response_") and model_name.lower() in col.lower():
            response_col1 = col
            break
            
    for col in df2.columns:
        if col.startswith("response_") and model_name.lower() in col.lower():
            response_col2 = col
            break
    
    if not response_col1:
        raise ValueError(f"Model '{model_name}' not found in dataset 1")
    if not response_col2:
        raise ValueError(f"Model '{model_name}' not found in dataset 2")
    
    # Ensure metrics are computed for both datasets
    if not has_metrics_columns(df1, response_col1):
        print(f"Computing metrics for dataset 1...")
        df1 = compute_and_store_metrics(df1, reference_column, [response_col1])
    
    if not has_metrics_columns(df2, response_col2):
        print(f"Computing metrics for dataset 2...")
        df2 = compute_and_store_metrics(df2, reference_column, [response_col2])
    
    # Get metric column names
    clean_model_name = response_col1.replace("response_", "")
    metric_col1 = f"metric_{clean_model_name}_{metric_name}"
    metric_col2 = f"metric_{clean_model_name}_{metric_name}"
    
    
    merged_df = pd.merge(
        df1[[id_column, metric_col1]], 
        df2[[id_column, metric_col2]], 
        on=id_column, 
        how='inner',
        suffixes=('_dataset1', '_dataset2')
    )
    
    # Create a unified DataFrame that mimics the structure expected by wilcoxon_test
    unified_df = pd.DataFrame({
        id_column: merged_df[id_column],
        "response_dataset1": merged_df[metric_col1 + "_dataset1"],
        "response_dataset2": merged_df[metric_col2 + "_dataset2"],
        "dummy_reference": merged_df[metric_col1 + "_dataset1"]  # For compatibility
    })
    
    # Use the existing wilcoxon_test function directly!
    return wilcoxon_test(
        df=unified_df,
        response_column1="response_dataset1",
        response_column2="response_dataset2",
        metric_name=metric_name,  # This won't be used since we're passing metric values directly
        reference_column="dummy_reference",
        alternative=alternative
    )

In [26]:

# Compare GPT-4 performance across experiments using existing function
results = wilcoxon_test_cross_dataset_simple(
    df1=baseline,
    df2=system_apr,
    model_name="qwen-2.5-coder-32b-instruct",
    metric_name="codebleu",
    id_column="sample_uuid"
)

print(f"Cross-dataset Wilcoxon test results:")
print(f"p-value: {results['pvalue']:.6f}")
print(f"Significant difference: {'Yes' if results['significant'] else 'No'}")
print(f"Effect size: {results['effect_size']:.4f}")
print(f"Matching instances: {results['matching_instances']}")

NameError: name 'metric_col1' is not defined