In [None]:
!pip install openai

In [None]:
!pip install evaluate

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os

from tqdm import tqdm
import warnings

import model_eval
from model_eval import *
from spider_utils_py import load_csv_database
#from process_transformations import split_code_into_blocks, generate_nl_description

# required for spider2 csvs
CSV_DBS_BASE_PATH = "/kaggle/input/spider-dbs-csv"
os.environ["DB_CSVS_BASE_PATH"] = CSV_DBS_BASE_PATH

# enable tqdm for pandas
tqdm.pandas()

# Suppress FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)

# set pandas to display in 2 decimal places
pd.set_option('display.float_format', '{:.2f}'.format)  

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Set up

In [None]:
# Column names used by the loading, evaluation and reporting cells of this notebook.
MODEL_COL = 'model'  # label of the LLM that produced the completion (e.g. 'Gemini')
ORIGINAL_NOTEBOOK_COL = 'nb_source'
ACTUAL_CODE_COL = 'code'  # reference code of the intent, executed as the "original"
INPUT_DATA_COL = 'inputs'
OUTPUT_DATA_COL = 'outputs'
TRANSFORMATION_DESCRIPTION_COL = 'intent'  # description of the transformation; also used as the intent key
GENERATED_CODE_COL = 'generated_intent_code'  # code produced by the LLM for the intent
GENERATED_INPUT_DATA_COL = 'gen_inputs'
GENERATED_OUTPUT_DATA_COL = 'generated_outputs'  # written by evaluate_completions as str(outputs of the generated code)

In [None]:
# NOTE(review): not referenced by any cell visible here; None presumably means "no limit" — confirm.
MAX_GEN_INTENTS = None # limit how many to generate

# Load Data

## m=n=1

In [None]:
# m = n = 1 completions: one CSV per (benchmark, provider) pair.
def load_m1n1_completions(file_prefix, provider, model, benchmark,
                          data_dir='/kaggle/input/llm-etl-data-set'):
    """Load one m=1, n=1 completions CSV and tag it with its model and benchmark.

    Params:
        file_prefix: Benchmark part of the file name (e.g. 'arcade_existing').
        provider: Provider part of the file name ('google', 'anthropic', 'openai').
        model: Value written to the 'model' column.
        benchmark: Value written to the 'benchmark' column.
        data_dir: Directory that holds the CSV files.

    Returns:
        The loaded DataFrame with 'model' and 'benchmark' columns added.
    """
    completions_df = pd.read_csv(
        f'{data_dir}/{file_prefix}_transformed_generated_{provider}_ALL_notebooks_m_1_n_1.csv'
    )
    completions_df['model'] = model
    completions_df['benchmark'] = benchmark
    return completions_df


# Arcade existing
arcade_existing_gemini_df = load_m1n1_completions('arcade_existing', 'google', 'Gemini', 'ARCADE existing')
arcade_existing_claude_df = load_m1n1_completions('arcade_existing', 'anthropic', 'Claude', 'ARCADE existing')
arcade_existing_chatgpt_df = load_m1n1_completions('arcade_existing', 'openai', 'ChatGPT', 'ARCADE existing')

# Arcade new
arcade_new_gemini_df = load_m1n1_completions('arcade_new', 'google', 'Gemini', 'ARCADE new')
arcade_new_claude_df = load_m1n1_completions('arcade_new', 'anthropic', 'Claude', 'ARCADE new')
arcade_new_chatgpt_df = load_m1n1_completions('arcade_new', 'openai', 'ChatGPT', 'ARCADE new')

# Spider2-intents
spider2_intents_gemini_df = load_m1n1_completions('spider2_intents', 'google', 'Gemini', 'Spider2-intents')
spider2_intents_claude_df = load_m1n1_completions('spider2_intents', 'anthropic', 'Claude', 'Spider2-intents')
spider2_intents_chatgpt_df = load_m1n1_completions('spider2_intents', 'openai', 'ChatGPT', 'Spider2-intents')



In [None]:
# Stack every (benchmark, model) completion set; all of them are m = n = 1 runs.
m1n1_frames = [
    arcade_existing_gemini_df,
    arcade_existing_claude_df,
    arcade_existing_chatgpt_df,
    arcade_new_gemini_df,
    arcade_new_claude_df,
    arcade_new_chatgpt_df,
    spider2_intents_gemini_df,
    spider2_intents_claude_df,
    spider2_intents_chatgpt_df,
]
test_df = pd.concat(m1n1_frames).assign(m=1, n=1)

In [None]:
# Sanity check: number of non-null intent_number values per (benchmark, model).
test_df.groupby(["benchmark", 'model']).intent_number.count()

## m, n > 1

In [None]:
# chatgpt 10 arcade existing notebooks
def load_chatgpt_mn_completions(m, n, data_dir='/kaggle/input/llm-etl-data-set'):
    """Load the ChatGPT ten-notebook completions CSV for one (m, n) setting.

    Params:
        m: Value of m in the file name; also stored in the 'm' column.
        n: Value of n in the file name; also stored in the 'n' column.
        data_dir: Directory that holds the CSV files.

    Returns:
        The loaded DataFrame with 'model', 'benchmark', 'm' and 'n' columns added.
    """
    completions_df = pd.read_csv(
        f'{data_dir}/arcade_transformed_generated_openai_ten_notebooks_m_{m}_n_{n}.csv'
    )
    completions_df['model'] = 'ChatGPT'
    completions_df['benchmark'] = 'ARCADE existing'
    completions_df['m'] = m
    completions_df['n'] = n
    return completions_df


chatgpt_m1n1_df = load_chatgpt_mn_completions(1, 1)
chatgpt_m1n2_df = load_chatgpt_mn_completions(1, 2)
chatgpt_m1n3_df = load_chatgpt_mn_completions(1, 3)
chatgpt_m2n1_df = load_chatgpt_mn_completions(2, 1)
chatgpt_m2n2_df = load_chatgpt_mn_completions(2, 2)
chatgpt_m2n3_df = load_chatgpt_mn_completions(2, 3)
chatgpt_m3n1_df = load_chatgpt_mn_completions(3, 1)
chatgpt_m3n2_df = load_chatgpt_mn_completions(3, 2)
chatgpt_m3n3_df = load_chatgpt_mn_completions(3, 3)

chatgpt_10_test_mn_df = pd.concat([
    chatgpt_m1n1_df,
    chatgpt_m1n2_df,
    chatgpt_m1n3_df,
    chatgpt_m2n1_df,
    chatgpt_m2n2_df,
    chatgpt_m2n3_df,
    chatgpt_m3n1_df,
    chatgpt_m3n2_df,
    chatgpt_m3n3_df
])

In [None]:
# gemini all arcade new notebooks; note some only have 20 notebooks
def load_gemini_mn_completions(m, n, file_prefix='arcade_new',
                               data_dir='/kaggle/input/llm-etl-data-set'):
    """Load the Gemini ARCADE-new completions CSV for one (m, n) setting.

    Params:
        m: Value of m in the file name; also stored in the 'm' column.
        n: Value of n in the file name; also stored in the 'n' column.
        file_prefix: Leading part of the file name; the 20-notebook runs use
            'arcade_20_new' instead of 'arcade_new'.
        data_dir: Directory that holds the CSV files.

    Returns:
        The loaded DataFrame with 'model', 'benchmark', 'm' and 'n' columns added.
    """
    completions_df = pd.read_csv(
        f'{data_dir}/{file_prefix}_transformed_generated_google_ALL_notebooks_m_{m}_n_{n}.csv'
    )
    completions_df['model'] = 'Gemini'
    completions_df['benchmark'] = 'ARCADE new'
    completions_df['m'] = m
    completions_df['n'] = n
    return completions_df


gemini_m1n1_df = load_gemini_mn_completions(1, 1)
gemini_m1n2_df = load_gemini_mn_completions(1, 2)
gemini_m1n3_df = load_gemini_mn_completions(1, 3)
gemini_m2n1_df = load_gemini_mn_completions(2, 1)
gemini_m2n2_df = load_gemini_mn_completions(2, 2)
# These three settings were only run on the 20-notebook subset.
gemini_m2n3_df = load_gemini_mn_completions(2, 3, file_prefix='arcade_20_new')
gemini_m3n1_df = load_gemini_mn_completions(3, 1, file_prefix='arcade_20_new')
gemini_m3n2_df = load_gemini_mn_completions(3, 2, file_prefix='arcade_20_new')
gemini_m3n3_df = load_gemini_mn_completions(3, 3)


test_mn_df = pd.concat([
    gemini_m1n1_df,
    gemini_m1n2_df,
    gemini_m1n3_df,
    gemini_m2n1_df,
    gemini_m2n2_df,
    gemini_m2n3_df,
    gemini_m3n1_df,
    gemini_m3n2_df,
    gemini_m3n3_df
])


## Filter Data

In [None]:
# Keep only the intents for which a completion was actually generated.
test_df = test_df.dropna(subset=['generated_intent_code'])
test_mn_df = test_mn_df.dropna(subset=['generated_intent_code'])

In [None]:
# Drop the rows flagged with an execution error.
test_df_has_error = test_df['execute_error']
test_df = test_df.loc[~test_df_has_error]

test_mn_df_has_error = test_mn_df['execute_error']
test_mn_df = test_mn_df.loc[~test_mn_df_has_error]

In [None]:
# Restrict the m x n comparison to notebooks present in every (m, n) experiment.
mn_experiment_frames = [
    gemini_m1n1_df, gemini_m1n2_df, gemini_m1n3_df,
    gemini_m2n1_df, gemini_m2n2_df, gemini_m2n3_df,
    gemini_m3n1_df, gemini_m3n2_df, gemini_m3n3_df,
]

# One set of notebook names per experiment (taken from the unfiltered frames)
nb_name_sets = [set(frame['nb_name']) for frame in mn_experiment_frames]

# Notebooks that occur in all of them
common_nb_names = set.intersection(*nb_name_sets)

# Keep only the rows that belong to one of the shared notebooks
is_common_notebook = test_mn_df['nb_name'].isin(common_nb_names)
filtered_test_mn_df = test_mn_df[is_common_notebook]

# Report what survived the filter
remaining_nb_names = filtered_test_mn_df.nb_name.unique()
print("Number of notebooks common to all mxn experiments:", len(remaining_nb_names))
print(remaining_nb_names)

test_mn_df = filtered_test_mn_df

In [None]:
# The notebook with the known header error must not be present in either frame.
assert not test_df.nb_name.eq('dataset_chipotle/notebook_1/annotated.ipynb').any()
assert not test_mn_df.nb_name.eq('dataset_chipotle/notebook_1/annotated.ipynb').any()

In [None]:
# Report the (rows, columns) left in each frame after the filters above.
print("Filtered test_df shape:", test_df.shape)
print("Filtered test_mn_df shape:", test_mn_df.shape)

# Model Eval Functions

In [None]:
import pandas as pd
import json
import time
import psutil
import os
import tracemalloc
from threading import Thread
from collections import Counter
from difflib import SequenceMatcher
import difflib
import re
import ast
from tqdm import tqdm
from openai import OpenAI
#import nltk
#from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import evaluate

from process_transformations import split_code_into_blocks, generate_nl_description 

def compare_code(generated_code, actual_code):
    '''Print a colourised unified diff from the generated code to the actual code.

    Both inputs are passed through filter_comments first. Header lines are
    printed in blue, removed lines in red, added lines in green and everything
    else unstyled.
    '''
    diff_lines = difflib.unified_diff(
        filter_comments(generated_code).split('\n'),
        filter_comments(actual_code).split('\n'),
        fromfile='Generated Code',
        tofile='Actual Code',
        lineterm='',
    )

    for diff_line in diff_lines:
        # Headers must be tested before the single-character +/- markers.
        if diff_line.startswith(('---', '+++')):
            colour = '\033[1;34m'  # Blue header
        elif diff_line.startswith('-'):
            colour = '\033[1;31m'  # Red for deletions
        elif diff_line.startswith('+'):
            colour = '\033[1;32m'  # Green for additions
        else:
            colour = None  # Normal text

        if colour is None:
            print(diff_line)
        else:
            print(f'{colour}{diff_line}\033[0m')

def clean_up_llm_gen_code(generated_code):
    '''Strip Markdown code-fence delimiters from LLM-generated code.

    Params:
        generated_code: pandas Series of strings (the ``.str`` accessor is used).

    Returns:
        A Series in which a leading "```python" fence, a trailing "```" fence
        and the surrounding whitespace have been removed from every entry.
    '''
    # Trim first: completions often end with a newline after the closing fence
    # (or start with whitespace), which would stop the prefix/suffix from matching.
    generated_code = generated_code.str.strip()
    generated_code = generated_code.str.removeprefix("```python").str.lstrip()
    generated_code = generated_code.str.removesuffix("```").str.rstrip()
    return generated_code

# Function to calculate the BLEU-3 score with the `evaluate` library's "bleu" metric
def calculate_bleu_3(reference, candidate):
    """
    Computes the BLEU score (n-gram order up to 3) of a candidate against one reference.

    Params:
        reference: Reference text (here: the original code).
        candidate: Candidate text (here: the generated code).

    Returns:
        The 'bleu' value reported by the metric, or 0.0 if either input is empty.
    """
    # Check for empty reference or candidate
    if not reference or not candidate:
        return 0.0  # Return a BLEU score of 0 for empty inputs

    # Load the BLEU metric once and reuse it: this function is called for every
    # evaluated row and loading the metric on each call is needlessly expensive.
    bleu = getattr(calculate_bleu_3, "_bleu_metric", None)
    if bleu is None:
        bleu = evaluate.load("bleu")
        calculate_bleu_3._bleu_metric = bleu

    # Compute BLEU-3 score
    results = bleu.compute(predictions=[candidate], references=[[reference]], max_order=3)
    return results['bleu']

# Function to calculate percentage structure and data correctness
def compare_objects(obj_actual, obj_gen):
    """
    Compare two objects (DataFrame or Series) and calculate structure and content correctness scores.

    Params:
        obj_actual: Reference DataFrame / Series (or None).
        obj_gen: Generated DataFrame / Series (or None).

    Returns:
        (structure_score, content_score), both in [0, 1].
        * DataFrames: structure is the share of column names in common; content
          is the share of positionally matching, non-missing cells in the
          common columns.
        * Series: structure is the share of index labels in common; content is
          the share of matching, non-missing values aligned on the reference index.
        (0.0, 0.0) is returned when either object is None, the types differ, or
        there is nothing to compare (no columns / empty Series).
    """
    if obj_actual is None or obj_gen is None:
        return (0.0, 0.0)

    # Handle DataFrame comparison
    if isinstance(obj_actual, pd.DataFrame) and isinstance(obj_gen, pd.DataFrame):
        widest = max(len(obj_gen.columns), len(obj_actual.columns))
        if widest == 0:
            # Neither frame has columns: avoid dividing by zero.
            return (0.0, 0.0)

        # Structural similarity: Check matching columns
        common_columns = set(obj_gen.columns).intersection(set(obj_actual.columns))
        structure_score = len(common_columns) / widest

        # Align rows by position. The index is reset first because reindex()
        # is label-based: a frame whose labels are not 0..n-1 would otherwise
        # be turned into all-missing rows and score 0 against itself.
        max_rows = max(len(obj_gen), len(obj_actual))
        obj_gen = obj_gen.reset_index(drop=True).reindex(range(max_rows))
        obj_actual = obj_actual.reset_index(drop=True).reindex(range(max_rows))

        # Content correctness: count exact matches; a cell that is missing on
        # either side never counts as a match.
        exact_matches = 0
        total_values = 0

        for col in common_columns:
            matches = (obj_gen[col] == obj_actual[col]) & obj_gen[col].notna() & obj_actual[col].notna()
            exact_matches += matches.sum()
            # Normalise by the larger number of non-missing values in the column
            total_values += max(len(obj_gen[col].dropna()), len(obj_actual[col].dropna()))

        content_score = exact_matches / total_values if total_values > 0 else 0.0
        return (structure_score, content_score)

    # Handle Series comparison
    elif isinstance(obj_actual, pd.Series) and isinstance(obj_gen, pd.Series):
        longest = max(len(obj_gen.index), len(obj_actual.index))
        if longest == 0:
            # Both Series are empty: avoid dividing by zero.
            return (0.0, 0.0)

        # Structural similarity: Check if indices match
        common_indices = set(obj_gen.index).intersection(set(obj_actual.index))
        structure_score = len(common_indices) / longest

        # Align data by reindexing to the same indices
        obj_gen = obj_gen.reindex(obj_actual.index)

        # Content correctness: count exact matches; missing values never match
        matches = (obj_gen == obj_actual) & obj_gen.notna() & obj_actual.notna()
        exact_matches = matches.sum()
        total_values = max(len(obj_gen.dropna()), len(obj_actual.dropna()))

        content_score = exact_matches / total_values if total_values > 0 else 0.0
        return (structure_score, content_score)

    # If types don't match, return 0 scores
    else:
        return (0.0, 0.0)


def measure_performance(func, *args, **kwargs):
    """
    Measures execution time, memory usage, and peak CPU utilization of a function.

    Params:
        func: Callable to run.
        *args, **kwargs: Passed through to ``func``.

    Returns:
        (result, elapsed_time, peak_memory, peak_cpu) where ``elapsed_time`` is
        in milliseconds, ``peak_memory`` is the tracemalloc peak (bytes) and
        ``peak_cpu`` is the highest ``cpu_percent`` sample seen while ``func``
        ran (0 if no sample completed).

    Raises:
        Whatever ``func`` raises. The CPU monitor thread and tracemalloc are
        always shut down first, so a failing ``func`` no longer leaves a
        busy-looping thread behind.
    """
    process = psutil.Process(os.getpid())

    # Variable to store peak CPU utilization
    peak_cpu = 0
    stop_monitoring = False

    # Function to monitor CPU usage in a separate thread
    def monitor_cpu():
        nonlocal peak_cpu
        while not stop_monitoring:
            cpu_usage = process.cpu_percent(interval=0.01)  # Sample CPU usage over 0.01 s windows
            peak_cpu = max(peak_cpu, cpu_usage)

    cpu_thread = Thread(target=monitor_cpu)

    # Start tracking memory and CPU, then start the clock last so that the
    # set-up cost is not attributed to func
    tracemalloc.start()
    cpu_thread.start()
    start_time = time.perf_counter()  # High-resolution start time
    try:
        # Execute the function
        result = func(*args, **kwargs)

        # Read the clock before joining the monitor thread: the join can wait
        # for a whole sampling interval, which must not count as run time
        end_time = time.perf_counter()  # High-resolution end time
        peak_memory = tracemalloc.get_traced_memory()[1]  # Peak memory usage
    finally:
        # Always stop monitoring, even when func raised
        stop_monitoring = True
        cpu_thread.join()
        tracemalloc.stop()

    # Elapsed time in milliseconds
    elapsed_time = (end_time - start_time) * 1000

    return result, elapsed_time, peak_memory, peak_cpu
    

def _mean_of_measurements(values):
    """Mean of the entries of ``values`` that are not None; None if there are none."""
    kept = [value for value in values if value is not None]
    return sum(kept) / len(kept) if kept else None


def _execute_and_measure(exec_state, code, label):
    """Run ``code`` once under measure_performance; on failure return an error output and None metrics."""
    try:
        (outputs, exec_state), elapsed_time, peak_memory, peak_cpu = measure_performance(
            execute_intent_code, exec_state, code, verbose=False
        )
    except Exception as e:
        print(f"Error executing {label} code: {e}")
        # The execution state passed in is handed back unchanged.
        return {'error': str(e)}, exec_state, None, None, None
    return outputs, exec_state, elapsed_time, peak_memory, peak_cpu


def _load_output_objects(outputs):
    """Rebuild DataFrames ("[{...}]" strings) and Series ("{...}" strings) from execution outputs."""
    objects = []
    for value in outputs.values():
        if not isinstance(value, str):
            continue
        if value.startswith("[{") and value.endswith("}]"):
            objects.append(pd.DataFrame(json.loads(value)))
        elif value.startswith("{") and value.endswith("}"):
            objects.append(pd.Series(json.loads(value)))
    return objects


def _score_outputs(original_outputs, generated_outputs):
    """Average, over the original objects, of the best structure / data score among generated objects."""
    original_objects = _load_output_objects(original_outputs)
    generated_objects = _load_output_objects(generated_outputs)

    max_structure_scores = []
    max_data_scores = []
    for original_obj in original_objects:
        pair_scores = [compare_objects(original_obj, generated_obj) for generated_obj in generated_objects]
        if pair_scores:
            max_structure_scores.append(max(structure for structure, _ in pair_scores))
            max_data_scores.append(max(data for _, data in pair_scores))

    avg_max_structure_score = sum(max_structure_scores) / len(max_structure_scores) if max_structure_scores else 0
    avg_max_data_score = sum(max_data_scores) / len(max_data_scores) if max_data_scores else 0
    return avg_max_structure_score, avg_max_data_score


def evaluate_completions(test_df, repeat_runs=3):
    """
    Evaluates generated code and stores metrics directly in the original DataFrame.

    For every (benchmark, model, notebook, m, n) group the notebook set-up code
    is executed once, then each intent's original and generated code are
    executed ``repeat_runs`` times. The first run advances the execution state
    and provides the outputs that are scored; later runs only add performance
    samples. Time, memory and CPU are stored as the mean over the runs that
    succeeded.

    Params:
        test_df: DataFrame containing test cases. It is modified in place.
        repeat_runs: Number of times to repeat each intent for performance measurement.

    Returns:
        ``test_df`` with the metric columns filled in. Rows of notebooks whose
        set-up code fails keep the initial values.
    """
    # Initialize metrics
    for metric_col in (
        "bleu_3_exact_code",
        "output_structure_score", "output_data_score",
        "original_execution_time", "original_peak_memory", "original_peak_cpu",
        "generated_execution_time", "generated_peak_memory", "generated_peak_cpu",
    ):
        test_df[metric_col] = float("nan")
    # The error flags default to True. This used to be spelled bool("nan"), which
    # is True because any non-empty string is truthy. Rows that are never
    # evaluated (set-up failure) therefore count as errors.
    test_df["generated_error"] = True
    test_df["original_error"] = True
    test_df["generated_error_message"] = ''
    test_df["original_error_message"] = ''
    test_df["original_code_lines"] = float("nan")
    test_df["generated_code_lines"] = float("nan")
    test_df['repeat_runs'] = repeat_runs

    df_nbs = test_df.groupby(["benchmark", "model", "nb_name", "m", "n"])

    for (benchmark, model, nb_name, m, n), intents in tqdm(df_nbs):
        print(f"Evaluating (model, notebook, m, n): {benchmark, model, nb_name, m, n}")

        # Step 1: Execute the notebook header
        nb_header = intents.iloc[0]["nb_setup_code"]  # Assuming the header code is in the first row

        # Initial execution state for the original code
        exec_state_org = {"pd": pd, "load_csv_database": load_csv_database, "np": np}

        # Execute the notebook setup
        try:
            _, exec_state_org = execute_intent_code(exec_state_org, nb_header, verbose=False)
            # The state after executing the same header is the same for org and gen.
            # NOTE(review): dict.copy() is shallow, so objects mutated in place by
            # one code path are also seen by the other and by the repeat runs.
            exec_state_gen = exec_state_org.copy()
        except Exception as e:
            print(f"Error executing notebook header for {nb_name}: {e}")
            continue  # Skip this notebook if the header fails

        # Step 2: Process each intent in the notebook
        for _, row in intents.iterrows():
            intent = row["intent"]
            actual_code = filter_comments(row[ACTUAL_CODE_COL])
            generated_code = filter_comments(row[GENERATED_CODE_COL])

            # Calculate number of lines of code for original and generated code
            original_code_lines = len(actual_code.splitlines()) if actual_code else 0
            generated_code_lines = len(generated_code.splitlines()) if generated_code else 0

            # First execution: Update exec_states and capture outputs
            print("Executing original code (first run)...")
            (original_outputs, exec_state_org,
             original_elapsed_time, original_peak_memory, original_peak_cpu) = _execute_and_measure(
                exec_state_org, actual_code, "original"
            )
            original_error = 'error' in original_outputs

            print("Executing generated code (first run)...")
            (generated_outputs, exec_state_gen,
             generated_elapsed_time, generated_peak_memory, generated_peak_cpu) = _execute_and_measure(
                exec_state_gen, generated_code, "generated"
            )
            generated_error = 'error' in generated_outputs

            # Subsequent executions: Only measure performance
            original_times, original_memories, original_cpus = [original_elapsed_time], [original_peak_memory], [original_peak_cpu]
            generated_times, generated_memories, generated_cpus = [generated_elapsed_time], [generated_peak_memory], [generated_peak_cpu]

            for run_number in range(2, repeat_runs + 1):
                print(f"Run {run_number} for intent {intent} (performance measurement only)")

                # Run on a copy of the state so these runs do not advance it
                _, _, elapsed_time, peak_memory, peak_cpu = _execute_and_measure(
                    exec_state_org.copy(), actual_code, "original"
                )
                original_times.append(elapsed_time)
                original_memories.append(peak_memory)
                original_cpus.append(peak_cpu)

                _, _, elapsed_time, peak_memory, peak_cpu = _execute_and_measure(
                    exec_state_gen.copy(), generated_code, "generated"
                )
                generated_times.append(elapsed_time)
                generated_memories.append(peak_memory)
                generated_cpus.append(peak_cpu)

            # Calculate averages over the runs that produced a measurement.
            # Only None is skipped: a reading of 0 (e.g. 0% CPU) is a valid
            # sample and must stay in the mean.
            avg_original_time = _mean_of_measurements(original_times)
            avg_original_memory = _mean_of_measurements(original_memories)
            avg_original_cpu = _mean_of_measurements(original_cpus)

            avg_generated_time = _mean_of_measurements(generated_times)
            avg_generated_memory = _mean_of_measurements(generated_memories)
            avg_generated_cpu = _mean_of_measurements(generated_cpus)

            # Compute BLEU-3 score for overall code correctness
            bleu_score = calculate_bleu_3(actual_code, generated_code)

            # Compare the DataFrames / Series produced by both executions
            try:
                avg_max_structure_score, avg_max_data_score = _score_outputs(original_outputs, generated_outputs)
            except Exception as e:
                print(f"Error comparing DataFrames: {e}")
                avg_max_structure_score, avg_max_data_score = 0, 0

            # Text stored under the 'error' key of the first run, if any
            original_error_message = str(original_outputs['error']) if original_error else ''
            generated_error_message = str(generated_outputs['error']) if generated_error else ''

            # Store metrics in the DataFrame. Rows are addressed by their key
            # columns; the index from iterrows() is not used.
            row_mask = (
                (test_df["benchmark"] == benchmark) &
                (test_df["model"] == model) &
                (test_df["nb_name"] == nb_name) &
                (test_df["intent"] == intent) &
                (test_df['m'] == m) &
                (test_df['n'] == n)
            )
            test_df.loc[
                row_mask,
                [   GENERATED_OUTPUT_DATA_COL,
                    "bleu_3_exact_code",
                    "output_structure_score", "output_data_score",
                    "original_execution_time", "original_peak_memory", "original_peak_cpu",
                    "generated_execution_time", "generated_peak_memory", "generated_peak_cpu",
                    "original_error", "generated_error",
                    "original_error_message", "generated_error_message",
                    "original_code_lines", "generated_code_lines"
                ]
            ] = [
                str(generated_outputs),
                round(bleu_score, 3),
                avg_max_structure_score, avg_max_data_score,
                avg_original_time, avg_original_memory, avg_original_cpu,
                avg_generated_time, avg_generated_memory, avg_generated_cpu,
                original_error, generated_error,
                original_error_message, generated_error_message,
                original_code_lines, generated_code_lines
            ]

    return test_df


def perfect_gen(df):
    '''
    Turns df into the output of a hypothetical perfect model.

    The generated code / input / output columns are overwritten with copies of
    the original ones and the model column is set to 'perfect'. df is modified
    in place and returned. Used for testing of evaluation functions.
    '''
    generated_from_original = (
        (GENERATED_CODE_COL, ACTUAL_CODE_COL),
        (GENERATED_INPUT_DATA_COL, INPUT_DATA_COL),
        (GENERATED_OUTPUT_DATA_COL, OUTPUT_DATA_COL),
    )
    for generated_col, original_col in generated_from_original:
        df[generated_col] = df[original_col]

    # hypothetical perfect model
    df[MODEL_COL] = 'perfect'
    return df

# RQ1: How do current LLMs perform on Arcade?

In [None]:
#test_models_df = perfect_gen(test_df)
#test_models_df = test_df[test_df['benchmark']=='Spider2-intents']

In [None]:
#metrics = evaluate_completions(test_models_df)
#metrics.to_csv("/kaggle/working/rq1_results.csv")
#metrics.to_csv("/kaggle/working/rq1_results_spider2.csv")

In [None]:
# Load previously saved evaluation results instead of re-running evaluate_completions
# (the evaluation cells above are commented out).
metrics_arcade = pd.read_csv("/kaggle/input/llm-etl-data-set/rq1_results.csv")
metrics_spider2 = pd.read_csv("/kaggle/input/llm-etl-data-set/rq1_results_spider2.csv")

In [None]:
# One metrics table for both result files, with a fresh 0..n-1 index.
metrics = pd.concat([metrics_arcade, metrics_spider2], ignore_index=True)

## Correctness

In [None]:
# Mean of every correctness metric per (benchmark, model), rounded to 2 decimals
# and transposed so that the metrics are the rows.
correctness_metric_cols = [
    'bleu_3_exact_code',
    'output_structure_score',
    'output_data_score',
    'original_error',
    'generated_error',
    'original_code_lines',
    'generated_code_lines',
]

rq1_correctness_scores = (
    metrics[['benchmark', 'model'] + correctness_metric_cols]
    .groupby(['benchmark', 'model'])
    .mean()
    .round(2)
    .transpose()
)

rq1_correctness_scores


In [None]:
# Print the correctness table as LaTeX source, floats formatted to 2 decimals.
print(rq1_correctness_scores.to_latex(float_format="%.2f"))

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

### Overall charts (across benchmarks)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def visualize_metrics(metrics, metric_groups):
    """
    Visualize metrics as histograms grouped by metric groups, with each row corresponding to a metric group
    and each metric in the group displayed side-by-side. Bars for different models are separated.

    Parameters:
    - metrics (pd.DataFrame): DataFrame containing the metrics to visualize; needs a 'model' column.
    - metric_groups (list of list): A list of metric groups, where each group is a list of metric names.

    Nothing is drawn when metric_groups is empty.
    """
    if not metric_groups:
        return

    # Replace infinite values with NaN. A float NaN is used rather than pd.NA
    # so that numeric columns are not converted to object dtype.
    metrics = metrics.replace([float('inf'), -float('inf')], float('nan'))

    # Create a grid of subplots; squeeze=False always returns a 2D array of
    # axes, whatever the number of rows and columns.
    n_rows = len(metric_groups)
    max_cols = max(len(group) for group in metric_groups)  # Max number of metrics in any group
    fig, axes = plt.subplots(
        n_rows, max_cols,
        figsize=(6 * max_cols, 6 * n_rows),
        constrained_layout=True,
        squeeze=False,
    )

    # Iterate through each metric group and plot
    for row_idx, group in enumerate(metric_groups):
        for col_idx, metric in enumerate(group):
            ax = axes[row_idx][col_idx]
            sns.histplot(
                data=metrics,
                x=metric,
                hue='model',
                kde=False,
                bins=20,
                alpha=0.6,
                multiple='dodge',  # Ensure bars for different models don't overlap
                ax=ax
            )
            ax.set_title(f"{metric} Histogram")
            ax.set_xlabel("Value")
            ax.set_ylabel("Frequency")

        # Hide unused subplots in the row
        for col_idx in range(len(group), max_cols):
            axes[row_idx][col_idx].axis("off")

    # Show the grid of plots
    plt.show()

In [None]:
# One row of histograms per group; the metrics of a group are drawn side by side.
metric_groups = [
    ['bleu_3_exact_code'],
    ['output_structure_score', 'output_data_score'],   
    ['original_execution_time', 'generated_execution_time'],
    ['original_peak_memory', 'generated_peak_memory'],
    ['original_peak_cpu', 'generated_peak_cpu'],
    ['original_error', 'generated_error'],
    ['original_code_lines', 'generated_code_lines']
]

visualize_metrics(metrics, metric_groups)

### By benchmark charts

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Replace infinite values with missing values
metrics = metrics.replace([float('inf'), -float('inf')], pd.NA)

# Metrics to plot, one grid row each. Every metric is listed exactly once so no row is
# drawn twice; original/generated pairs sit next to each other.
all_metrics = [
    'bleu_3_exact_code',
    'output_structure_score',
    'output_data_score',
    'original_error',
    'generated_error',
    'original_code_lines',
    'generated_code_lines'
]

# Get unique benchmark values (one grid column each)
benchmarks = metrics['benchmark'].unique()

# Calculate global x-axis limits for each metric so all benchmarks in a row share one scale
x_limits = {}
for metric in all_metrics:
    x_min = metrics[metric].min()
    x_max = metrics[metric].max()
    x_limits[metric] = (x_min, x_max)

# Create a grid of subplots
n_metrics = len(all_metrics)
n_benchmarks = len(benchmarks)

# squeeze=False keeps `axes` two-dimensional even when there is a single metric or a
# single benchmark, so axes[i, j] is always a valid Axes object.
fig, axes = plt.subplots(
    n_metrics,
    n_benchmarks,
    figsize=(5 * n_benchmarks, 3 * n_metrics),
    sharex=False,
    sharey=False,
    squeeze=False
)

# Iterate through each metric (row) and benchmark (column)
for i, metric in enumerate(all_metrics):
    for j, benchmark in enumerate(benchmarks):
        ax = axes[i, j]
        group = metrics[metrics['benchmark'] == benchmark]

        sns.histplot(
            data=group,
            x=metric,
            hue='model',
            kde=False,
            bins=20,
            alpha=0.6,
            multiple='dodge',
            ax=ax
        )

        # Set x-axis limits; skipped when the metric has no valid values, because
        # matplotlib rejects NaN axis limits
        lower, upper = x_limits[metric]
        if pd.notna(lower) and pd.notna(upper):
            ax.set_xlim(lower, upper)

        # Set titles and labels: benchmark name on the top row, metric name on the first column
        if i == 0:
            ax.set_title(f"Benchmark: {benchmark}", fontsize=12)
        if j == 0:
            ax.set_ylabel(metric, fontsize=10)
        ax.set_xlabel("Value", fontsize=8)

# Adjust layout
plt.tight_layout()
plt.show()

## Performance

In [None]:
# Performance deltas, generated minus original: a positive value means the generated
# code needed more of that resource than the original code.
for resource in ('execution_time', 'peak_memory', 'peak_cpu'):
    metrics['diff_' + resource] = metrics['generated_' + resource] - metrics['original_' + resource]

In [None]:
# Scale the peak-memory delta by 1e6 to express it in megabytes.
# NOTE(review): assumes the peak-memory columns are recorded in bytes — confirm against model_eval.
metrics['diff_peak_memory_mb'] = metrics['diff_peak_memory']/1e6 # convert to MB

### Absolute Performance

In [None]:
# Mean absolute performance of original vs. generated code per (benchmark, model).
# The result is transposed: one row per measure, one column per (benchmark, model) pair.
performance_columns = [
    'original_execution_time', 'generated_execution_time',
    'original_peak_memory', 'generated_peak_memory',
    'original_peak_cpu', 'generated_peak_cpu',
]

rq1_performance_scores = (
    metrics[['benchmark', 'model', *performance_columns]]
    .groupby(['benchmark', 'model'])
    .mean()
    .round(2)
    .transpose()
)

rq1_performance_scores

In [None]:
# Emit the same table as LaTeX source, formatted with two decimals
print(rq1_performance_scores.to_latex(float_format="%.2f"))

### Original to Generated Performance

In [None]:
# Check dtype and non-null count of the CPU delta before running statistics on it
metrics['diff_peak_cpu'].info()

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_1samp, sem, t

# Two-sided 95% confidence interval of the mean (Student's t)
def calculate_confidence_interval(data):
    """
    Return the (lower, upper) bounds of the 95% confidence interval of the mean.

    Parameters:
    - data (pd.Series): sample values; missing entries are dropped first.

    Returns (np.nan, np.nan) when fewer than two values remain, because the
    standard error is undefined for a single observation.
    """
    sample = data.dropna()
    size = len(sample)
    if size < 2:
        return (np.nan, np.nan)  # Not enough data for confidence interval

    center = np.mean(sample)
    # Half-width = t critical value (97.5th percentile, size - 1 degrees of freedom) * standard error
    half_width = t.ppf(0.975, size - 1) * sem(sample)
    return center - half_width, center + half_width

# One-sample t-test of the mean against zero
def perform_t_test(data):
    """
    Test whether the mean of `data` differs significantly from 0.

    Parameters:
    - data (pd.Series): sample values; missing entries are dropped first.

    Returns the pair (t statistic, two-sided p-value), or (np.nan, np.nan) when
    fewer than two values remain.
    """
    sample = data.dropna()
    if len(sample) >= 2:
        statistic, p_val = ttest_1samp(sample, 0)  # null hypothesis: mean difference == 0
        return statistic, p_val
    return np.nan, np.nan  # Not enough data for t-test

# Step 1: for every (benchmark, model) group, summarise each performance delta
grouped = metrics.groupby(['benchmark', 'model'])

# (delta column in `metrics`, label used inside the output column names)
delta_specs = [
    ('diff_execution_time', 'execution_time'),
    ('diff_peak_memory_mb', 'peak_memory'),
    ('diff_peak_cpu', 'peak_cpu'),
]

stat_columns = {}
for delta_col, label in delta_specs:
    group_values = grouped[delta_col]

    # apply() yields one (lower, upper) tuple per group; expand it into two columns
    ci_pairs = group_values.apply(calculate_confidence_interval)
    ci_frame = pd.DataFrame(ci_pairs.tolist(), index=ci_pairs.index)
    stat_columns[label + '_ci_lower'] = ci_frame[0]
    stat_columns[label + '_ci_upper'] = ci_frame[1]

    stat_columns['mean_diff_' + label] = group_values.mean()
    stat_columns['std_diff_' + label] = group_values.std()

    # Same expansion for the (t statistic, p-value) tuples
    test_pairs = group_values.apply(perform_t_test)
    test_frame = pd.DataFrame(test_pairs.tolist(), index=test_pairs.index)
    stat_columns[label + '_t_stat'] = test_frame[0]
    stat_columns[label + '_p_value'] = test_frame[1]

# One row per (benchmark, model); columns keep the insertion order used above
ci_stats = pd.DataFrame(stat_columns)

# Step 2: Display results (transposed: one row per statistic)
print("\nConfidence Intervals and T-Test Results by `benchmark` and `model`:")
ci_stats.transpose()

In [None]:
# Emit the transposed statistics table as LaTeX source, formatted with two decimals
print(ci_stats.transpose().to_latex(float_format="%.2f"))

### By benchmark charts

In [None]:
# Replace infinite values with missing values
metrics = metrics.replace([float('inf'), -float('inf')], pd.NA)

# Performance deltas to plot, one grid row each
all_metrics = [
    'diff_execution_time',
    'diff_peak_memory',
    'diff_peak_cpu'
]

# Get unique benchmark values (one grid column each)
benchmarks = metrics['benchmark'].unique()

# Calculate global x-axis limits for each metric.
# Currently unused: the set_xlim call below is disabled, so every subplot keeps its own scale.
x_limits = {}
for metric in all_metrics:
    x_min = metrics[metric].min()
    x_max = metrics[metric].max()
    x_limits[metric] = (x_min, x_max)

# Create a grid of subplots
n_metrics = len(all_metrics)
n_benchmarks = len(benchmarks)

# squeeze=False keeps `axes` two-dimensional even when there is a single metric or a
# single benchmark, so axes[i, j] is always a valid Axes object.
fig, axes = plt.subplots(
    n_metrics,
    n_benchmarks,
    figsize=(5 * n_benchmarks, 3 * n_metrics),
    sharex=False,
    sharey=False,
    squeeze=False
)

# Iterate through each metric (row) and benchmark (column)
for i, metric in enumerate(all_metrics):
    for j, benchmark in enumerate(benchmarks):
        ax = axes[i, j]
        group = metrics[metrics['benchmark'] == benchmark]

        sns.histplot(
            data=group,
            x=metric,
            hue='model',
            kde=False,
            bins=20,
            alpha=0.6,
            multiple='dodge',
            ax=ax
        )

        # Set x-axis limits
        # ax.set_xlim(x_limits[metric])

        # Set titles and labels: benchmark name on the top row, metric name on the first column
        if i == 0:
            ax.set_title(f"Benchmark: {benchmark}", fontsize=12)
        if j == 0:
            ax.set_ylabel(metric, fontsize=10)
        ax.set_xlabel("Value", fontsize=8)

# Adjust layout
plt.tight_layout()
plt.show()

# RQ2: What types of issues do we see?

In [None]:
def _parse_output_objects(outputs):
    """Turn every JSON-encoded value of `outputs` into a DataFrame or Series, keyed by name."""
    import json  # local import: the notebook's `import json` cell comes after this one

    parsed = {}
    for key, value in outputs.items():
        if not isinstance(value, str):
            continue
        if value.startswith("[{") and value.endswith("}]"):
            # JSON list of records -> DataFrame
            parsed[key] = pd.DataFrame(json.loads(value))
        elif value.startswith("{") and value.endswith("}"):
            # JSON object -> Series
            parsed[key] = pd.Series(json.loads(value))
    return parsed


def _show_objects(objects):
    """Print the name and kind of each parsed object, then render it with display()."""
    for obj_name, obj in objects.items():
        print(obj_name)
        if isinstance(obj, pd.DataFrame):
            print("DataFrame:")
            display(obj)
        elif isinstance(obj, pd.Series):
            print("Series:")
            display(obj)


def print_results(metrics, i, show_prompt=False, show_dfs=True):
    """
    Prints evaluation results for a specific row in the metrics DataFrame.

    Parameters:
    - metrics (pd.DataFrame): evaluation results; the row is selected positionally with `.iloc`.
    - i (int): position of the row to print.
    - show_prompt (bool): also print the `evolved_prompt` column.
    - show_dfs (bool): render every parsed DataFrame/Series instead of only listing their names.

    Parsing or display problems in the output section are reported with a message
    instead of being raised, so a single bad row does not stop a loop over rows.
    """
    row = metrics.iloc[i]  # Extract the row once to avoid repetitive indexing

    print("Model:", row['model'])
    print("Notebook:", row['nb_name'])
    print("Original error:", row['original_error'])
    print("Generated error:", row['generated_error'])
    if show_prompt:
        print("Prompt:", row['evolved_prompt'])
    print("Intent:", row['intent'])
    #print("Header:", row['nb_setup_code'])
    print("Bleu 3 score:", row['bleu_3_exact_code'])
    print("Output structure score:", row['output_structure_score'])
    print("Output data score:", row['output_data_score'])
    print("Code:")
    compare_code(row['generated_intent_code'], row['code'])
    print("Output:")

    try:
        # NOTE(security): eval() executes whatever expression is stored in these columns.
        # ast.literal_eval would be safer if the stored values are plain literals —
        # confirm before switching.
        original_outputs = eval(row[OUTPUT_DATA_COL])
        generated_outputs = eval(row[GENERATED_OUTPUT_DATA_COL])

        # Parse both sides before printing anything, so a failure shows no partial output
        original_objects = _parse_output_objects(original_outputs)
        generated_objects = _parse_output_objects(generated_outputs)

        # Display original objects
        print("Original objects:", list(original_objects.keys()))
        if show_dfs:
            _show_objects(original_objects)

        # Display generated objects
        print("Generated objects:", list(generated_objects.keys()))
        if show_dfs:
            _show_objects(generated_objects)

    except Exception as e:
        print(f"Error comparing objects: {e}")

    print('_' * 80)

In [None]:
# Display the first 'ARCADE new' row (all columns) as a quick sanity check
metrics[metrics.benchmark=='ARCADE new'].iloc[0]

## Outliers

In [None]:
# Rows whose generated code is longer than 50 lines are treated as outliers;
# count them per model and per notebook to see where they come from.
outliers = metrics[metrics.generated_code_lines > 50]
print(outliers.model.value_counts())
print(outliers.nb_name.value_counts())

In [None]:
# Gemini tends to repeat the entire code at each iteration; ChatGPT and Claude do not for the same prompt
# Print the first row of the same notebook for each model, one after the other, for comparison.
batting_rows = metrics[metrics.nb_name == 'dataset_batting/notebook_0/annotated.ipynb']
for model_name in ('Gemini', 'ChatGPT', 'Claude'):
    print_results(batting_rows[batting_rows.model == model_name], 0, show_prompt=False, show_dfs=False)

## Error Classification

In [None]:
import json
import re
import pandas as pd

# Function to clean and fix invalid JSON strings
def clean_json_string(output):
    """
    Best-effort conversion of a Python-style dict string into JSON text.

    Every single quote that is not preceded by a backslash is turned into a double
    quote, and backslash-escaped forward slashes are unescaped. This is a heuristic:
    apostrophes inside string values are converted too, which can make the result
    invalid JSON. Non-string inputs are returned unchanged.
    """
    if isinstance(output, str):
        # Replace single quotes with double quotes
        output = re.sub(r"(?<!\\)'", '"', output)
        # Fix improperly escaped characters (e.g., \/ -> /)
        output = output.replace(r"\/", "/")
    return output

# Function to extract error types from the generated_outputs column
def extract_error_str(output):
    """
    Return the error type stored under the 'error' key of a serialized output dict.

    The error type is the text before the first colon of the error message
    (e.g. "KeyError" for "KeyError: 'a'"). "Malformed output" is returned when the
    value cannot be parsed, is not a dict, or has no string 'error' entry.

    Parsing is attempted in this order, stopping at the first parser that succeeds:
    1. the raw text as JSON, so valid JSON whose messages contain apostrophes is not
       damaged by the quote-swapping heuristic;
    2. the text after clean_json_string (the original heuristic);
    3. the raw text as a Python literal, which handles dict reprs with mixed quotes.
    """
    import ast  # local import so the notebook's import cell does not have to change

    parsers = (
        json.loads,
        lambda text: json.loads(clean_json_string(text)),
        ast.literal_eval,
    )

    for parse in parsers:
        try:
            output_dict = parse(output)
        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
            # json.JSONDecodeError is a ValueError; non-string input raises TypeError/ValueError
            continue

        if isinstance(output_dict, dict) and isinstance(output_dict.get('error'), str):
            return output_dict['error'].split(":")[0].strip()  # Extract the part before the colon (error type)
        break  # parsed successfully, but it carries no usable error message

    return "Malformed output"


def categorize_error(error_string):
    """
    Map an error string to a broad error category by case-insensitive keyword search.

    Args:
        error_string (str): The error string to categorize.

    Returns:
        str: The first category (in the order listed below) that has a keyword
        contained in the string, or "Uncategorized" when nothing matches or the
        input is not a string.
    """
    if not isinstance(error_string, str):
        return "Uncategorized"

    lowered = error_string.lower()

    # Order matters: earlier categories take precedence when several keywords match.
    keyword_rules = (
        ("Syntax Errors", ("syntax", "unterminated string literal")),
        ("Runtime Errors", ("recursion", "truth value")),
        ("DataFrame Errors", ("dataframe", "mask", "column")),
        ("Index Errors", ("index", "list index")),
        ("Type Errors", ("type", "comparison")),
        ("Malformed Output Errors", ("malformed", "output")),
        ("Other Errors", ("nan", "expected")),
    )

    for category, keywords in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return category

    return "Uncategorized"


# Find the first errors in each notebook: keep one failing row per (model, benchmark, notebook)
# NOTE(review): the row is chosen by `intent.idxmin()`, i.e. the smallest `intent` value. If
# `intent` holds the description text rather than a sequence number, this is not the earliest
# intent — `intent_number` may have been intended. Confirm.
errors = metrics[metrics.generated_error == True].groupby(
    ['model', 'benchmark', 'nb_name']
).apply(lambda group: group.loc[group.intent.idxmin()]).reset_index(drop=True)

# Extract error types from the generated_outputs column, then map them to broad categories
errors['error_type'] = errors['generated_outputs'].apply(extract_error_str).apply(categorize_error)

# Group by model, benchmark, and error category, and count occurrences
error_summary = errors.groupby(['model', 'benchmark', 'error_type']).size().reset_index(name='count')

# Pivot the table for better readability: categories as rows, (model, benchmark) as columns.
# margins=True adds a 'Total' row and a 'Total' column.
error_summary_pivot = error_summary.pivot_table(
    index='error_type', 
    columns=['model', 'benchmark'], 
    values='count', 
    fill_value=0,
    aggfunc='sum',
    margins=True, 
    margins_name='Total'
)

# Separate the Total row so that only the category rows are sorted by their total count
total_row = error_summary_pivot.loc['Total']
error_summary_pivot = error_summary_pivot.drop('Total').sort_values('Total', ascending=False)

# Append the Total row back at the bottom
error_summary_pivot.loc['Total'] = total_row

# Display the summarized table
error_summary_pivot

In [None]:
# # generated intents with syntax errors
# metrics_errors = metrics[metrics.generated_error]
# for i in range(metrics_errors.shape[0]):
#     print('*'*80)
#     print_results(metrics_errors, i, show_prompt=False, show_dfs=False)

In [None]:
# # all output
# for i in range(metrics.shape[0]):
#     print('*'*80)
#     print_results(metrics,i, show_prompt=False, show_dfs=False)

## Controllable Issues

In [None]:
# # example notebook with original execution errors
# print(metrics[metrics.original_error].iloc[0]['nb_setup_code'])

# #Error processing 'first_n_rows': Expecting ',' delimiter: line 1 column 757 (char 756)
# print(process_first_n_rows(eval(metrics[metrics.original_error].iloc[0]['inputs'])))

# # something is wrong with original formatting of data
# pprint(metrics[metrics.original_error].iloc[0]['inputs'])

# RQ3: Does more time "thinking" by the LLM help?

In [None]:
#metrics_mn = evaluate_completions(test_mn_df)
#metrics_mn.to_csv("/kaggle/working/rq3_results.csv")

In [None]:
# Load the precomputed RQ3 evaluation results.
# NOTE(review): presumably the file written by the commented-out evaluate_completions cell above,
# re-uploaded as a Kaggle input dataset — confirm.
metrics_mn = pd.read_csv("/kaggle/input/llm-etl-data-set/rq3_results.csv")

In [None]:
# Render figures inline in the notebook output (static images, not an interactive backend)
%matplotlib inline

## Correctness

In [None]:
# Mean correctness, error and code-size metrics for every (m, n) setting.
# Transposed for display: one row per metric, one column per (m, n) pair.
correctness_columns = [
    'bleu_3_exact_code',
    'output_structure_score',
    'output_data_score',
    'original_error',
    'generated_error',
    'original_code_lines',
    'generated_code_lines',
]

metrics_mn[['m', 'n', *correctness_columns]].groupby(['m', 'n']).mean().transpose()

In [None]:
def plot_mxn_heat_map(metrics_mn_grouped):
    """
    Draw one annotated heatmap per metric, with m on the rows and n on the columns.

    Parameters:
    - metrics_mn_grouped (pd.DataFrame): one column per metric, indexed by an
      (m, n) MultiIndex whose second level is named 'n'.
    """
    for metric_name, per_setting_values in metrics_mn_grouped.items():
        # Move the 'n' index level into the columns: rows = m, columns = n
        grid = per_setting_values.unstack(level='n')

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(
            grid,
            annot=True,
            fmt=".2f",
            cmap="coolwarm",
            cbar_kws={'label': metric_name},
            ax=ax,
        )
        ax.set_title(f"Heatmap of {metric_name} for increasing m and n")
        ax.set_xlabel("n")
        ax.set_ylabel("m")
        plt.show()
        
# Group by 'm' and 'n' and take the mean of each correctness metric.
# The result keeps the (m, n) MultiIndex with one column per metric, which is the
# shape plot_mxn_heat_map expects (it is not transposed).
metrics_mn_grouped = metrics_mn[[
    'm',
    'n',
    'bleu_3_exact_code',
    'generated_code_lines',
    'output_data_score', 
    'output_structure_score'
]].groupby(['m', 'n']).mean()

plot_mxn_heat_map(metrics_mn_grouped)

## Performance

In [None]:
# Scale peak memory by 1e6 to express it in megabytes.
# NOTE(review): assumes the raw peak-memory columns are recorded in bytes — confirm.
metrics_mn['original_peak_memory_mb'] = metrics_mn['original_peak_memory']/1e6
metrics_mn['generated_peak_memory_mb'] = metrics_mn['generated_peak_memory']/1e6

In [None]:
# Mean execution time, peak memory (MB) and peak CPU of original vs. generated code for
# every (m, n) setting; transposed so each measure is a row and each (m, n) pair a column.
metrics_mn[[
    'm', 'n',
    'original_execution_time', 'generated_execution_time',
    'original_peak_memory_mb', 'generated_peak_memory_mb',
    'original_peak_cpu', 'generated_peak_cpu'
]].groupby(['m','n']).mean().transpose()

In [None]:
# Group by 'm' and 'n' and take the mean of the generated-code performance metrics.
# The result keeps the (m, n) MultiIndex with one column per metric (it is not transposed).
metrics_mn_grouped = metrics_mn[[
    'm',
    'n',
    'generated_execution_time',
    'generated_peak_memory_mb',
]].groupby(['m', 'n']).mean()

plot_mxn_heat_map(metrics_mn_grouped)

## Same-Notebook Statistical Analysis

In [None]:
# Re-import necessary packages after code execution state reset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from statsmodels.stats.multicomp import pairwise_tukeyhsd

def same_notebook_analysis(df, metric, levels=(1, 2, 3)):
    """
    Analyse how `metric` varies across the (m, n) settings for the same subjects.

    Fits the mixed-effects model `metric ~ C(m) * C(n)` grouped by `subject`, then
    shows a box plot per (m, n) combination, a heatmap of the group means and a plot
    of the means with 95% confidence intervals, and finally prints the model summary
    as plain text and as LaTeX.

    Parameters:
    - df (pd.DataFrame): must contain the columns `m`, `n`, `subject` and `metric`.
      The function works on a copy, so the caller's frame is left unchanged.
    - metric (str): name of the numeric column to analyse.
    - levels (sequence): admissible values of `m` and `n`, in display order. Values
      outside this list become missing when the columns are made categorical.
    """
    # Work on a copy so the categorical conversion and the helper columns created
    # below do not leak into the caller's DataFrame.
    df = df.copy()
    levels = list(levels)

    # Ensure `m` and `n` are discrete and limited to the admissible levels
    df['m'] = pd.Categorical(df['m'], categories=levels, ordered=True)
    df['n'] = pd.Categorical(df['n'], categories=levels, ordered=True)

    # Mixed effects model: fixed effects for m, n and their interaction, grouped by subject
    model_correct = smf.mixedlm(metric + " ~ C(m) * C(n)", df, groups=df["subject"])
    result_correct = model_correct.fit()

    # Optional diagnostics; they are only shown by the commented-out prints at the end
    random_effects = result_correct.random_effects
    df['condition'] = df['m'].astype(str) + '-' + df['n'].astype(str)
    tukey_correct = pairwise_tukeyhsd(df[metric], df['condition'], alpha=0.05)

    # Box plot
    plt.figure(figsize=(10, 8))
    df['m_n'] = 'm=' + df['m'].astype(str) + ', n=' + df['n'].astype(str)
    # Order the combinations by (m, n) following `levels`, keeping only those present
    present_labels = set(df['m_n'])
    sorted_mn = [
        label
        for label in (f'm={m}, n={n}' for m in levels for n in levels)
        if label in present_labels
    ]

    sns.boxplot(data=df, y='m_n', x=metric, order=sorted_mn, palette='viridis', showfliers=True)
    sns.stripplot(data=df, y='m_n', x=metric, order=sorted_mn, color='k', alpha=0.3, jitter=0.15)
    plt.title("Box Plot of " + metric + " per (m, n) Combination")
    plt.xlabel(metric)  # label with the plotted metric; it is not always a correctness score
    plt.ylabel("(m, n)")
    plt.grid(True)
    plt.show()

    # Heatmap of the mean metric: rows = m, columns = n
    pivot_correct = df.groupby(['m', 'n'])[metric].mean().unstack()
    plt.figure(figsize=(8, 6))
    sns.heatmap(pivot_correct, annot=True, cmap='YlGnBu')
    plt.title("Mean " + metric + " by (m, n)")
    plt.show()

    # Confidence interval plot
    grouped = df.groupby(['m', 'n'])[metric]
    means = grouped.mean()
    stds = grouped.std()
    counts = grouped.count()
    cis = 1.96 * stds / np.sqrt(counts)  # normal-approximation half-width of the 95% interval

    ci_df = means.reset_index()
    ci_df['ci'] = cis.values

    plt.figure(figsize=(4, 6))
    # One line per m level that actually occurs in the data
    m_levels = [m for m in levels if (df['m'] == m).any()]
    for m in m_levels:
        subset = ci_df[ci_df['m'] == m]
        plt.errorbar(subset['n'], subset[metric], yerr=subset['ci'], label=f'm={m}', capsize=4, marker='o', alpha=0.75)

    plt.title("Mean " + metric + " with 95% Confidence Intervals")
    plt.xticks(ticks=levels, labels=levels)
    plt.xlabel("Intent Refinement Level (n)")
    plt.ylabel(metric)
    plt.legend(title="Notebook Refinement Level (m)")
    plt.grid(True)
    plt.show()

    print(result_correct.summary())
    print(result_correct.summary().tables[0].to_latex())
    print(result_correct.summary().tables[1].to_latex())
    #print(tukey_correct.summary())
    #print(random_effects)

### BLEU-3 score

In [None]:
# Start from a fresh copy of the RQ3 results: this is the first analysis cell, so `df`
# must be defined here for the notebook to survive Restart & Run All.
df = metrics_mn.copy()
# One subject per (notebook, intent); used as the grouping variable of the mixed model
df['subject'] = df['nb_name'] + '_' + df['intent_number'].astype(str)
same_notebook_analysis(df, metric='bleu_3_exact_code')

### Output structure score

In [None]:
# Take a fresh copy of the RQ3 results so this cell does not depend on a `df` left
# behind by another cell.
df = metrics_mn.copy()
# One subject per (notebook, intent); used as the grouping variable of the mixed model
df['subject'] = df['nb_name'] + '_' + df['intent_number'].astype(str)
same_notebook_analysis(df, metric='output_structure_score')

### Output data score

In [None]:
# Work on a copy so the columns added for the analysis do not end up in metrics_mn
df = metrics_mn.copy()
# One subject per (notebook, intent); used as the grouping variable of the mixed model
df['subject'] = df['nb_name'] + '_' + df['intent_number'].astype(str)
same_notebook_analysis(df, metric='output_data_score')

### Run time

In [None]:
# Same analysis for the execution time of the generated code (reuses `df` from the cells above)
df['subject'] = df['nb_name'] + '_' + df['intent_number'].astype(str)
same_notebook_analysis(df, metric='generated_execution_time')

### Memory

In [None]:
# Same analysis for the peak memory (MB) of the generated code (reuses `df` from the cells above)
df['subject'] = df['nb_name'] + '_' + df['intent_number'].astype(str)
same_notebook_analysis(df, metric='generated_peak_memory_mb')

### Peak CPU

In [None]:
# Same analysis for the peak CPU of the generated code (reuses `df` from the cells above)
df['subject'] = df['nb_name'] + '_' + df['intent_number'].astype(str)
same_notebook_analysis(df, metric='generated_peak_cpu')