In [26]:
import pandas as pd
import json
import os
import ast
import re
import numpy as np
from datasets import Dataset
from datetime import datetime

# For LLM
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    set_seed,
    pipeline
)
from trl import setup_chat_format

import torch
from time import time

# Set seed
set_seed(42)

In [27]:
def split_dictionary(data):
    """
    Splits the tasks that have multiple test input/output pairs into separate entries.

    Args:
    data (dict): The original dictionary containing tasks with 'test' and 'train' fields.

    Returns:
    tuple: A tuple containing:
        - result (dict): The dictionary with tasks split into separate entries if they have multiple test pairs.
        - split_files (list): A list of keys for the tasks that were split.
    """
    result = {}
    split_files = []
    for key, value in data.items():
        test_list = value.get("test", [])
        train_list = value.get("train", [])
        if len(test_list) > 1:
            for idx, test_item in enumerate(test_list):
                new_key = f"{key}_{idx}"
                result[new_key] = {
                    "test": [test_item],
                    "train": train_list
                }
                split_files.append(new_key)
        else:
            result[key] = value
    return result, split_files


In [28]:
# Compare results to solution
test_run = True

# Prepare data for DataFrame
# Load JSON data from the files
if test_run:
    with open('data/arc-agi_evaluation_challenges.json') as f:
        challenges = json.load(f)
        # Split tasks with multiple test inputs
        challenges, split_files = split_dictionary(challenges) 

    with open('data/arc-agi_evaluation_solutions.json') as f:
        solutions = json.load(f)
else:
    with open('data/arc-agi_test_challenges.json') as f:
        challenges = json.load(f)
    # Split tasks with multiple test inputs
    challenges, split_files = split_dictionary(challenges) 

# Print how many files have been split and their names
split_file_count = len(split_files)//2

print(f"Number of files split: {split_file_count}")
print("File names:")
for name in split_files:
    print(name)

# Prepare data
data = []
        
for file_name, grids in challenges.items():
    train_grids = grids.get('train', [])
    test_inputs = grids.get('test', [])
    if test_run:
        # Handle files with multiple test inputs
        parts = file_name.split('_')
        if len(parts) > 1:
            test_nr = int(parts[1])
        else:
            test_nr = 0
        test_outputs = solutions.get(parts[0], [])
        # Transform test grids to lists of dicts with 'output' key
        test_outputs_transformed = [{'output': test_outputs[test_nr]}]
        # Combine test inputs and outputs in alternating manner
        combined_tests = [{'input': test_inputs[0]['input'], 'output': test_outputs_transformed[0]['output']}]
    data.append({
            'file_name': file_name,
            'train': train_grids,
            'test_input': test_inputs,
            'test_output': test_outputs_transformed if test_run else [[0, 0]],
            'test': combined_tests if test_run else test_inputs
    })

# Create DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)

Number of files split: 19
File names:
12997ef3_0
12997ef3_1
1d398264_0
1d398264_1
31d5ba1a_0
31d5ba1a_1
3b4c2228_0
3b4c2228_1
4852f2fa_0
4852f2fa_1
4c177718_0
4c177718_1
5d2a5c43_0
5d2a5c43_1
6ea4a07e_0
6ea4a07e_1
8b28cd80_0
8b28cd80_1
9110e3c5_0
9110e3c5_1
9b4c17c4_0
9b4c17c4_1
b1fc8b8e_0
b1fc8b8e_1
bbb1b8b6_0
bbb1b8b6_1
c074846d_0
c074846d_1
d5c634a2_0
d5c634a2_1
da2b0fe3_0
da2b0fe3_1
e21a174a_0
e21a174a_1
e345f17b_0
e345f17b_1
f3e62deb_0
f3e62deb_1
    file_name                                              train  \
0    00576224  [{'input': [[8, 6], [6, 4]], 'output': [[8, 6,...   
1    009d5c81  [{'input': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2    00dbd492  [{'input': [[2, 2, 2, 2, 2, 0, 0], [2, 0, 0, 0...   
3    03560426  [{'input': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0...   
4    05a7bcf2  [{'input': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
..        ...                                                ...   
414  fd096ab6  [{'input': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   


In [29]:
LLAMA_3_CHAT_TEMPLATE = """{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"""

# Set the data type for computations to float16, bfloat16 not supported on T4/P100
compute_dtype = getattr(torch, "float16")

# Configure the BitsAndBytes settings for 4-bit quantization to reduce memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_use_double_quant=True,  # Use double quantization for improved precision
    bnb_4bit_quant_type="nf4",  # Specify the quantization type
    bnb_4bit_compute_dtype=compute_dtype,  # Set the computation data type
)

# Specify the model ID for loading the fine-tuned Llama 3 model
model_id = "models/llama3.2_1B/"
model_name = model_id.removeprefix("models/llama")
# Configure the BitsAndBytes settings for 8-bit quantization to reduce memory usage
# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True  # Enable 8-bit quantization
# )

# Record the start time to measure the loading duratio
time_start = time()
print("Loading model")

# Load the pre-trained model with specified configurations
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True, # Allow the model to use custom code from the repository
    quantization_config=bnb_config, # Apply the 4-bit or 8-bit quantization configuration
    attn_implementation='sdpa', # Use scaled-dot product attention for better performance
    torch_dtype=compute_dtype, # Set the data type for the model
    use_cache=True, # Disable caching to save memory
    device_map= {"": 1}, # Automatically map the model to available devices (e.g., GPUs)
)

# Load the tokenizer associated with the model
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.chat_template = LLAMA_3_CHAT_TEMPLATE # Apply the chat message template

# Record the end time and print the duration for preparing the model and tokenizer
time_end = time()
print(f"Prepare model, tokenizer: {round(time_end-time_start, 3)} sec.")

Loading model
Prepare model, tokenizer: 1.632 sec.


In [47]:
# The system_prompt defines the initial instructions for the model, setting the context for solving ARC tasks.
system_prompt = '''You are a puzzle solving wizard. You are given a puzzle from the abstraction and reasoning corpus developed by Francois Chollet.'''

# User message template is a template for creating user prompts. It includes placeholders for training data and test input data, guiding the model to learn the rule and apply it to solve the given puzzle.
user_message_template = '''Here are the example input and output pairs from which you should learn the underlying rule to later predict the output for the given test input:
----------------------------------------
{training_data}
----------------------------------------
Now, solve the following puzzle based on its input grid by applying the rules you have learned from the training data.:
----------------------------------------
[{{'input': {input_test_data}, 'output': [[]]}}]
----------------------------------------
What is the output grid? Only provide the output grid in the form as in the example input and output pairs. Do not provide any additional information:'''

def preprocess(task, test_run, train_mode=False):
    """
    Preprocess a single ARC task to create the prompt and solution for the model.

    This function formats the system and user messages using a predefined template and the task's training and test data.
    If in training mode, it also includes the assistant's message with the expected output.

    Parameters:
    task (dict): The ARC task data containing training and test examples.
    train_mode (bool): If True, includes the assistant's message with the expected output for training purposes.

    Returns:
    dict: A dictionary containing the formatted text prompt, the solution, and the file name.
    """
    # System message
    system_message = {"role": "system", "content": system_prompt}

    # Extract training data and input grid from the task
    training_data = task['train']
    input_test_data = task['test'][0]['input']
    if test_run:
        output_test_data = task['test'][0]['output']
    else:
        output_test_data = [[0 ,0]]

    # Format the user message with training data and input test data
    user_message_content = user_message_template.format(training_data=training_data, input_test_data=input_test_data)
    user_message = {
        "role": "user",
        "content": user_message_content
    }

    # Include the assistant message with the expected output if in training mode
    if train_mode:
        assistant_message = {
            "role": "assistant",
            "content": str(output_test_data)
        }

        # Combine system, user, and assistant messages
        messages = [system_message, user_message, assistant_message]
    else:
        messages = [system_message, user_message]
    # Convert messages using the chat template for use with the instruction finetuned version of Llama
    messages = tokenizer.apply_chat_template(messages, tokenize=False)
    if test_run:
        return {"text": messages, "solution": output_test_data, "file_name": task['file_name']}
    else:
        return {"text": messages, "file_name": task['file_name']}

# Convert the loaded data to a Huggingface Dataset object
dataset = Dataset.from_pandas(df)
print(dataset)

# Apply the preprocess function to each task in the dataset
dataset = dataset.map(lambda x: preprocess(x, test_run), batched=False, remove_columns=dataset.column_names)
dataset = dataset.select(range(395,405))
print(dataset)


Dataset({
    features: ['file_name', 'train', 'test_input', 'test_output', 'test'],
    num_rows: 419
})


Map: 100%|██████████| 419/419 [00:00<00:00, 2213.50 examples/s]

Dataset({
    features: ['file_name', 'text', 'solution'],
    num_rows: 10
})





In [48]:
# Define the maximum number of tokens allowed
max_tokens = 8000  # Adjust this value as needed


# Function to calculate the number of tokens
def count_tokens(text):
    """
    Calculate the number of tokens in a given text using the tokenizer.

    This function uses the tokenizer to encode the input text and returns the
    number of tokens. It is useful for ensuring that the text length stays
    within the model's context window.

    Parameters:
    text (str): The input text to be tokenized.

    Returns:
    int: The number of tokens in the input text.
    """
    return len(tokenizer.encode(text))

# Filter the dataset to include only tasks with a number of tokens within the allowed limit
filtered_dataset = dataset.filter(lambda x: count_tokens(x['text']) <= max_tokens)

# Print the number of tasks filtered out and the remaining tasks
print(f'{len(dataset)-len(filtered_dataset)} tasks contain too many tokens if we set max_tokens to {max_tokens}')
print(f'The dataset contains {len(filtered_dataset)} tasks to evaluate the model')

print(filtered_dataset.to_pandas().columns)

Filter: 100%|██████████| 10/10 [00:00<00:00, 302.34 examples/s]

3 tasks contain too many tokens if we set max_tokens to 8000
The dataset contains 7 tasks to evaluate the model
Index(['file_name', 'text', 'solution'], dtype='object')





In [49]:
# Define your LLM pipeline
text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map={"": 1}
)

# Define terminators for the pipeline
terminators = [
    text_gen_pipeline.tokenizer.eos_token_id,
    text_gen_pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Function to generate outputs
def generate_solution(task, max_new_tokens=512, do_sample=True, temperature=0.1, top_p=0.1):
    """
    Generate a solution for an ARC task using the language model.

    This function takes a task prompt, generates a solution using the text generation pipeline,
    and extracts the generated solution from the model's output.

    Parameters:
    task (dict): The ARC task data containing the prompt and other relevant information.
    max_new_tokens (int, optional): The maximum number of new tokens to generate. Default is 512.
    do_sample (bool, optional): Whether to use sampling; if False, greedy decoding is used. Default is True.
    temperature (float, optional): The sampling temperature. Lower values make the model more conservative. Default is 0.1.
    top_p (float, optional): The cumulative probability for nucleus sampling. Lower values make the model more conservative. Default is 0.1.

    Returns:
    dict: A dictionary containing the generated solution.
    """
    # Extract the prompt from the task
    prompt = task['text']
    
    # Generate the model's output based on the prompt
    outputs = text_gen_pipeline(
        prompt, 
        max_new_tokens=max_new_tokens, 
        eos_token_id=terminators, 
        do_sample=do_sample, 
        temperature=temperature, 
        top_p=top_p
    )
    
    # Extract the generated solution from the model's output
    generated_solutions = outputs[0]["generated_text"][len(prompt):]
    return {'generated_solution': generated_solutions}


Device set to use cuda:1


In [50]:
# # Generate solutions
# print("Generating solutions")
# filtered_dataset = filtered_dataset.map(generate_solution, batched=False)

In [51]:
print(filtered_dataset[:5]['generated_solution'])

KeyError: 'generated_solution'

In [52]:
def extract_solution(text):
    """
    Extract the solution array from the generated text.

    Parameters:
    text (str): The text containing the generated solution.

    Returns:
    list: A list of lists representing the extracted solution array.
          Returns [[0]] if no valid solution is found.
    """
    try:
        # Find the part of the text that looks like a nested list
        start = text.index('[[')
        end = text.index(']]', start) + 2
        array_str = text[start:end]
        
        # Use ast.literal_eval to safely evaluate the string as a Python expression
        array = ast.literal_eval(array_str)
        
        # Check if the result is a list of lists
        if all(isinstance(i, list) for i in array):
            return array
        else:
            return [[0]]
    except (ValueError, SyntaxError):
        return [[0]]

def pad_array_with_value(array, target_shape, pad_value):
    """
    Pad the given array to the target shape with the specified pad value.

    This function pads the original array to fit the target shape by adding additional
    pixels at the ends. This method ensures that the smaller array is placed at the
    top-left corner of the target shape, making sense of the number of correct pixels
    during comparison.

    Note:
    Depending on how you pad the arrays, the number of correct pixels might vary.
    For example, placing the smaller array in the center versus adding pixels at the ends
    can yield different results. Here, we pad by adding pixels at the ends.

    Parameters:
    array (list): The original array to be padded.
    target_shape (tuple): The desired shape of the padded array (rows, columns).
    pad_value (int): The value to use for padding the array.

    Returns:
    np.ndarray: A padded array with the specified target shape and pad value.
    """
    padded_array = np.full(target_shape, pad_value, dtype=int)
    original_shape = np.array(array).shape
    padded_array[:original_shape[0], :original_shape[1]] = array
    return padded_array

def is_rectangular(array):
    """
    Check if all rows in a 2D list have the same length.
    """
    if not array or not all(isinstance(row, list) for row in array):
        return False
    row_length = len(array[0])
    return all(len(row) == row_length for row in array)

def compare_solutions_with_padding(generated_output, correct_output, pad_value=-1):
    """
    Compare the generated output with the correct output, using padding to align their shapes.

    Parameters:
    generated_output (list): The generated solution array.
    correct_output (list): The correct solution array.
    pad_value (int, optional): The value to use for padding. Default is -1. The colour value -1 should not be present in the solutions.

    Returns:
    tuple: A tuple containing:
        - is_correct (bool): True if the solutions match exactly, False otherwise.
        - correct_percentage (float): The percentage of correctly matched pixels.
    """
    max_rows = max(len(generated_output), len(correct_output))
    max_cols = max(len(generated_output[0]), len(correct_output[0]))
    target_shape = (max_rows, max_cols)
    
    padded_generated = pad_array_with_value(generated_output, target_shape, pad_value)
    padded_correct = pad_array_with_value(correct_output, target_shape, pad_value)
    
    total_pixels = max_rows * max_cols
    correct_pixels = np.sum((padded_generated == padded_correct) & (padded_generated != pad_value) & (padded_correct != pad_value))
    correct_percentage = (correct_pixels / total_pixels) * 100
    
    is_correct = (correct_pixels == total_pixels)
    
    return is_correct, correct_percentage



In [53]:
# if test_run:
#     # Lists to store results of task evaluation
#     solved_tasks = []
#     failed_tasks = []
#     accuracy_list = []

#     for i, task in enumerate(filtered_dataset):
#         true_solution = task['solution']
#         file_name = task['file_name']
#         generated_text = task["generated_solution"]

#         # Extract the solution generated by the model
#         gen_solution = extract_solution(generated_text)

#         # Compare the generated solution with the true solution
#         is_correct, correct_percentage = compare_solutions_with_padding(gen_solution, true_solution)

#         # Append results to respective lists based on correctness
#         if is_correct:
#             solved_tasks.append({
#                 'file_name': file_name,
#                 'llm_output': generated_text,
#                 'gen_solution': gen_solution,
#                 'true_solution': true_solution
#             })
#         else:
#             failed_tasks.append({
#                 'file_name': file_name,
#                 'llm_output': generated_text,
#                 'gen_solution': gen_solution,
#                 'true_solution': true_solution
#             })

#         # Store "pixel accuracy for each task
#         accuracy_list.append({
#             'file_name': file_name,
#             'correct_percentage': correct_percentage
#         })

#     # Create a dictionary to store results
#     results = {'file_name': [], 'solved': [], 'accuracy': [], 'gen_solution': [], 'true_solution': []}

#     # Add solved tasks to the results
#     for task in solved_tasks:
#         results['file_name'].append(task['file_name'])
#         results['solved'].append(True)
#         results['accuracy'].append(next((item['correct_percentage'] for item in accuracy_list if item['file_name'] == task['file_name']), None))
#         results['gen_solution'].append(task['gen_solution'])
#         results['true_solution'].append(task['true_solution'])

#     # Add failed tasks to the results
#     for task in failed_tasks:
#         results['file_name'].append(task['file_name'])
#         results['solved'].append(False)
#         results['accuracy'].append(next((item['correct_percentage'] for item in accuracy_list if item['file_name'] == task['file_name']), None))
#         results['gen_solution'].append(task['gen_solution'])
#         results['true_solution'].append(task['true_solution'])

#     # Create a DataFrame
#     df_results = pd.DataFrame(results)

#     # Display the DataFrame as a table
#     print(df_results)

#     # Calculate and print the average correct percentage
#     average_correct_percentage = df_results['accuracy'].mean()
#     print(f"Average 'Pixel Accuracy' of attempted tasks: {average_correct_percentage:.2f}%")

#     # Calculate and print the number of solved tasks out of the total number of tasks
#     total_tasks = len(df)
#     solved_tasks_count = df_results['solved'].sum()
#     print(f"Solved {solved_tasks_count} out of {total_tasks} tasks ({(solved_tasks_count / total_tasks) * 100:.2f}%)")


#     # Get current timestamp in a readable format
#     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
#     # Use it in your filename
#     f_path = "output/" + timestamp + "_" + model_name[:len(model_name)-1] + "_solution.csv"
#     df_results.to_csv(f_path, index=False)


In [54]:
# 1. Find the index label of the max
idx_label = df_results['accuracy'].idxmax()

# 2. Use it to get the value
max_value = df_results.at[idx_label, 'accuracy']

print(f"Highest feature value is {max_value} in row with index label: {idx_label}")


# Retrieve and print that row
max_row = df_results.loc[idx_label]
print(max_row)



NameError: name 'df_results' is not defined

# Pass@k 

In [61]:
def generate_solutions_passk(task, k=3, max_new_tokens=512, do_sample=True, temperature=0.9, top_p=0.9):
    """
    Generate k solutions for an ARC task using the language model for pass@k evaluation.

    Parameters:
    task (dict): The ARC task data containing the prompt and other relevant information.
    k (int): Number of solutions to generate for pass@k evaluation. Default is 3.
    max_new_tokens (int, optional): The maximum number of new tokens to generate. Default is 512.
    do_sample (bool, optional): Whether to use sampling; if False, greedy decoding is used. Default is True.
    temperature (float, optional): The sampling temperature. Higher values for more diversity. Default is 0.7.
    top_p (float, optional): The cumulative probability for nucleus sampling. Default is 0.9.

    Returns:
    dict: A dictionary containing k generated solutions.
    """
    # Extract the prompt from the task
    prompt = task['text']
    
    generated_solutions = []
    
    # Generate k solutions
    for i in range(k):
        # Generate the model's output based on the prompt
        outputs = text_gen_pipeline(
            prompt, 
            max_new_tokens=max_new_tokens, 
            eos_token_id=terminators, 
            do_sample=do_sample, 
            temperature=temperature, 
            top_p=top_p
        )
        
        # Extract the generated solution from the model's output
        generated_solution = outputs[0]["generated_text"][len(prompt):]
        generated_solutions.append(generated_solution)
    
    return {f'generated_solution_{i+1}': sol for i, sol in enumerate(generated_solutions)}

In [62]:
def evaluate_passk_solutions(task, k=3):
    """
    Evaluate pass@k performance by checking if at least one of k solutions is correct.

    Parameters:
    task (dict): The task containing generated solutions and ground truth.
    k (int): Number of solutions to evaluate.

    Returns:
    dict: Evaluation results including pass@k success, best accuracy, and solution details.
    """
    if not test_run:
        # If not in test mode, just extract solutions without evaluation
        extracted_solutions = []
        for i in range(1, k+1):
            solution_key = f'generated_solution_{i}'
            if solution_key in task:
                gen_solution = extract_solution(task[solution_key])
                extracted_solutions.append(gen_solution)
        
        return {
            'extracted_solutions': extracted_solutions,
            'pass_at_k': False,  # Cannot evaluate without ground truth
            'best_accuracy': 0.0,
            'best_solution_idx': -1
        }
    
    true_solution = task['solution']
    file_name = task['file_name']
    
    best_accuracy = 0.0
    best_solution_idx = -1
    best_solution = None
    is_any_correct = False
    
    solution_results = []
    
    # Evaluate each of the k solutions
    for i in range(1, k+1):
        solution_key = f'generated_solution_{i}'
        if solution_key in task:
            generated_text = task[solution_key]
            gen_solution = extract_solution(generated_text)

            if not is_rectangular(gen_solution) or not is_rectangular(true_solution):
                print(f"Skipping {file_name} due to jagged array.")
                continue

            # Compare with ground truth
            is_correct, correct_percentage = compare_solutions_with_padding(gen_solution, true_solution)
            
            solution_results.append({
                'solution_idx': i,
                'is_correct': is_correct,
                'accuracy': correct_percentage,
                'extracted_solution': gen_solution
            })
            
            # Track the best solution
            if correct_percentage > best_accuracy:
                best_accuracy = correct_percentage
                best_solution_idx = i
                best_solution = gen_solution
            
            # Check if any solution is completely correct
            if is_correct:
                is_any_correct = True
    
    return {
        'file_name': file_name,
        'pass_at_k': is_any_correct,
        'best_accuracy': best_accuracy,
        'best_solution_idx': best_solution_idx,
        'best_solution': best_solution,
        #'all_solutions': solution_results,
        'true_solution':true_solution
    }

In [63]:
def run_passk_evaluation(filtered_dataset, k=3, use_batch=True):
    """
    Run pass@k evaluation on the filtered dataset.

    Parameters:
    filtered_dataset: The dataset to evaluate.
    k (int): Number of solutions to generate per task.
    use_batch (bool): Whether to use batch generation for efficiency.

    Returns:
    dict: Comprehensive evaluation results.
    """
    print(f"Generating {k} solutions per task for pass@{k} evaluation...")
    
    # Choose generation function #generate_solutions_passk_batch if use_batch else
    generation_func = generate_solutions_passk
    
    # Generate k solutions for each task
    dataset_with_solutions = filtered_dataset.map(
        lambda x: generation_func(x, k=k), 
        batched=False
    )
    
    print("Evaluating solutions...")
    
    # Evaluate pass@k performance
    evaluation_results = []
    
    for i, task in enumerate(dataset_with_solutions):
        result = evaluate_passk_solutions(task, k=k)
        evaluation_results.append(result)
        
        if test_run and (i + 1) % 10 == 0:
            print(f"Evaluated {i + 1}/{len(dataset_with_solutions)} tasks")
    
    return evaluation_results, dataset_with_solutions

In [64]:
def analyze_passk_results(evaluation_results, k):
    """
    Analyze and print pass@k evaluation results.

    Parameters:
    evaluation_results (list): List of evaluation results from run_passk_evaluation.
    k (int): The k value used in pass@k evaluation.
    """
    if not test_run:
        print("Cannot analyze results - not in test mode (no ground truth available)")
        return
    
    total_tasks = len(evaluation_results)
    pass_at_k_count = sum(1 for result in evaluation_results if result['pass_at_k'])
    
    # Calculate average best accuracy
    avg_best_accuracy = sum(result['best_accuracy'] for result in evaluation_results) / total_tasks
    
    # Calculate accuracy for each attempt position
    attempt_accuracies = {}
    for i in range(1, k+1):
        accuracies = []
        for result in evaluation_results:
            for sol_result in result['all_solutions']:
                if sol_result['solution_idx'] == i:
                    accuracies.append(sol_result['accuracy'])
        if accuracies:
            attempt_accuracies[f'attempt_{i}'] = sum(accuracies) / len(accuracies)
    
    # Print results
    print(f"\n=== Pass@{k} Evaluation Results ===")
    print(f"Total tasks evaluated: {total_tasks}")
    print(f"Tasks solved with pass@{k}: {pass_at_k_count}")
    print(f"Pass@{k} success rate: {(pass_at_k_count / total_tasks) * 100:.2f}%")
    print(f"Average best accuracy: {avg_best_accuracy:.2f}%")
    
    print(f"\nAccuracy by attempt position:")
    for attempt, accuracy in attempt_accuracies.items():
        print(f"  {attempt}: {accuracy:.2f}%")
    
    # Find tasks where pass@k helped
    helped_tasks = []
    for result in evaluation_results:
        if result['pass_at_k'] and result['best_solution_idx'] > 1:
            helped_tasks.append(result)
    
    if helped_tasks:
        print(f"\nTasks where pass@{k} helped (solution wasn't the first attempt): {len(helped_tasks)}")
        print("Examples:")
        for i, task in enumerate(helped_tasks[:5]):  # Show first 5 examples
            print(f"  {task['file_name']}: Best solution was attempt #{task['best_solution_idx']}")
    

In [None]:
k_value = 50  # You can change this value
evaluation_results, dataset_with_solutions = run_passk_evaluation(filtered_dataset, k=k_value, use_batch=True)

Generating 30 solutions per task for pass@30 evaluation...


Map:   0%|          | 0/7 [00:00<?, ? examples/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Se

Evaluating solutions...





In [70]:
from datetime import datetime

# Get current time
now = datetime.now()

# Format time as a string (e.g., '2025-08-05_14-30-00')
timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")


df_eval = pd.DataFrame(evaluation_results)

# Save to CSV — use `repr` to preserve readable nested structures
#df_eval['all_solutions'] = df_eval['all_solutions'].apply(repr)
df_eval['best_solution'] = df_eval['best_solution'].apply(repr)

# Save it
df_eval.to_csv(f"output/{timestamp}.csv", index=False)

In [None]:
# # Analyze results
# analyze_passk_results(evaluation_results, k_value)

KeyError: 'all_solutions'