In [None]:
# Ideally, you might want to use a different way of storing and formatting the outputs of the models.

import re

def test_extract_sections(sample_text, prompts_to_match):
    """
    Tests section extraction from sample text input.
    
    Parameters:
    - sample_text (str): The raw multiline string to parse.
    - prompts_to_match (list): List of prompts you expect to extract outputs for.
    
    Returns:
    - List of dictionaries with prompt and extracted output.
    """
    results = []

    # Split by the delimiter line (assuming 5 or more dashes)
    sections = re.split(r"-{5,}", sample_text)

    print(f"Found {len(sections)} sections.")

    for section in sections:
        print("\n--- Processing New Section ---")

        # Look for the prompt in this section
        prompt_match = re.search(r"Prompt:\s*(.*?)\s*Output:", section, re.DOTALL)
        if not prompt_match:
            print("No prompt found in section.")
            continue
        
        prompt_text = prompt_match.group(1).strip()
        print(f"Found prompt: '{prompt_text}'")

        if prompt_text not in prompts_to_match:
            print(f"Prompt '{prompt_text}' not in list of prompts to match.")
            continue

        # Extract the output text after ### OUTPUT:
        output_match = re.search(r"### OUTPUT:\s*(.*)", section, re.DOTALL)
        if not output_match:
            print(f"No output found for prompt: {prompt_text}")
            continue
        
        model_output = output_match.group(1).strip()

        print(f"Extracted output for '{prompt_text}':\n{model_output[:300]}...")  # Show the first 300 chars

        results.append({
            "Prompt": prompt_text,
            "Output": model_output
        })

    return results


In [None]:
# Prompts whose outputs we expect to find in the model result text.
prompts = [
    "A small iron sword.",
    "A large two-handed sword.",
    "A steel short sword.",
    "A bronze sword.",
    "A black sword with a red gem.",
    "A rusty old sword.",
    "A shining silver sword.",
    "A sword with a leather grip.",
    "A fire sword.",
    "A sword with an icy blue blade.",
    "A knight's sword with a golden hilt.",
    "A plain copper sword.",
    "A curved blade sword.",
    "A sword with a green handle.",
    "A glowing gold sword.",
    "A small dagger-like sword.",
    "A big heavy sword.",
    "A thin, fast sword.",
    "A red sword with jagged edges.",
    "A royal sword with a blue gem.",
]


In [None]:
# Again, you might want to change the method of storing results; the process of extracting and sorting the results was kind of cumbersome.

import os
import re
import pandas as pd
from IPython.display import display

def _split_model_and_size(raw_model_name, known_models):
    """
    Derive ``(model, size)`` from the model part of an output filename.

    A known model name is searched for first (case-insensitively) and cut out
    of the string before the size tag is looked up. This keeps a digit that
    belongs to the model name from being read as part of the size: for
    ``"StarCoder215B"`` the result is ``("StarCoder2", "15B")`` rather than a
    size of ``"215B"`` and an unrecognised model.

    Parameters:
    - raw_model_name (str): Leading part of the filename captured by the
      filename pattern (letters and digits only).
    - known_models (list of str): Model names to look for, in priority order.

    Returns:
    - tuple (str, str): Detected model (or "UnknownModel") and size tag
      (or "UnknownSize").
    """
    size_regex = re.compile(r"(\d+[Bb][s]?)")

    for model in known_models:
        hit = re.search(re.escape(model), raw_model_name, re.IGNORECASE)
        if hit:
            # A space stands in for the removed model name so the text on
            # either side of it cannot fuse into a spurious size tag.
            remainder = raw_model_name[:hit.start()] + " " + raw_model_name[hit.end():]
            size_hit = size_regex.search(remainder)
            return model, (size_hit.group(1) if size_hit else "UnknownSize")

    # No known model in the full name: fall back to the size-first lookup
    # (find the size, strip it, then retry the model match).
    size_hit = size_regex.search(raw_model_name)
    extracted_size = size_hit.group(1) if size_hit else "UnknownSize"
    name_without_size = raw_model_name.replace(extracted_size, "").lower()
    for model in known_models:
        if model.lower() in name_without_size:
            return model, extracted_size
    return "UnknownModel", extracted_size


def extract_model_output(folder_path, prompts, known_models=None):
    """
    Reads all text files in a folder, extracts model outputs for given prompts, and structures data into a table.

    Only ``.txt`` files whose name does not contain "Time" and which match
    ``<model><size>[_Output[s]]_run<N>.txt`` (case-insensitive) are parsed;
    other ``.txt`` files are reported and skipped. Each file is split into
    sections on runs of five or more dashes. A section yields a row when it
    has a ``Prompt: ... Output:`` header whose prompt is in ``prompts`` and a
    ``### OUTPUT:`` marker; the text after that marker is the model output.

    Parameters:
    - folder_path (str): Path to the folder containing the text files.
    - prompts (list of str): List of prompts to match in files.
    - known_models (list of str, optional): Model names to detect in the
      filenames (case-insensitive substring match, first hit wins). Defaults
      to ["Lamma", "Qwen", "StarCoder2", "Mixtral"].

    Returns:
    - DataFrame: Columns "Run", "Model", "Size", "Prompt", "Output", sorted by
      Prompt, Run, Model, Size. Empty (columns only) when nothing matched.
    """
    if known_models is None:
        # NOTE(review): "Lamma" is kept exactly as originally spelled;
        # presumably it mirrors how the output files are named -- confirm
        # before "correcting" it to "Llama".
        known_models = ["Lamma", "Qwen", "StarCoder2", "Mixtral"]
    known_models = list(known_models)

    # Filename pattern capturing the raw model name and the run number.
    filename_pattern = re.compile(
        r'([A-Za-z0-9]+)_?(?:Output|Outputs)?_?run(\d+)\.txt',
        re.IGNORECASE
    )
    # Section-level patterns are compiled once instead of once per section.
    section_delimiter = re.compile(r"-{5,}")
    prompt_pattern = re.compile(r"Prompt:\s*(.*?)\s*Output:", re.DOTALL)
    output_pattern = re.compile(r"### OUTPUT:\s*(.*)", re.DOTALL)

    data = []

    # Sorted so files are processed in a reproducible order.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".txt") or "Time" in file:
            continue  # Skip unwanted files

        match = filename_pattern.match(file)
        if not match:
            print(f"Skipped file: {file} (No match)")
            continue

        raw_model_name, run_number = match.groups()
        run_number = int(run_number)

        detected_model, extracted_size = _split_model_and_size(raw_model_name, known_models)

        # Debug info if model name not detected
        if detected_model == "UnknownModel":
            print(f"Unknown model detected in filename: {file}")

        file_path = os.path.join(folder_path, file)
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        for section in section_delimiter.split(content):
            prompt_match = prompt_pattern.search(section)
            if not prompt_match:
                continue  # No prompt found in this section

            prompt_text = prompt_match.group(1).strip()

            # Only process prompts we care about
            if prompt_text not in prompts:
                continue

            output_match = output_pattern.search(section)
            if not output_match:
                print(f"Output not found for prompt: {prompt_text} in {file}")
                continue

            data.append([
                run_number,
                detected_model,
                extracted_size,
                prompt_text,
                output_match.group(1).strip()
            ])

    df = pd.DataFrame(data, columns=["Run", "Model", "Size", "Prompt", "Output"])
    # Returns a sorted copy; the original row labels are kept, as before.
    return df.sort_values(["Prompt", "Run", "Model", "Size"])


In [None]:
# Prompts to pull out of the result files; a section is kept only when its
# prompt text equals one of these entries.
prompts = [
    "A small iron sword.",
    "A large two-handed sword.",
    "A steel short sword.",
    "A bronze sword.",
    "A black sword with a red gem.",
    "A rusty old sword.",
    "A shining silver sword.",
    "A sword with a leather grip.",
    "A fire sword.",
    "A sword with an icy blue blade.",
    "A knight's sword with a golden hilt.",
    "A plain copper sword.",
    "A curved blade sword.",
    "A sword with a green handle.",
    "A glowing gold sword.",
    "A small dagger-like sword.",
    "A big heavy sword.",
    "A thin, fast sword.",
    "A red sword with jagged edges.",
    "A royal sword with a blue gem.",
]

# Folder holding the raw model output .txt files -- replace the placeholder.
folder_path = "your folder path here"

# Parse every matching file into one table, show it, then save it as CSV.
df_results = extract_model_output(folder_path, prompts)
display(df_results)
df_results.to_csv("name.csv", index=False)


In [None]:
import os
import re
import pandas as pd
from IPython.display import display

def compile_csv_files(folder_path):
    """
    Reads all CSV files in a folder and merges them into one Pandas DataFrame.

    Files are read in sorted filename order so the row order of the result is
    reproducible (``os.listdir`` alone returns entries in arbitrary order).
    A file that lacks a "Size" column gets one filled with "UnknownSize"; an
    existing "Size" column is left untouched. No other cleaning of model
    names or sizes is performed.

    Parameters:
    - folder_path (str): Path to the folder containing the CSV files.

    Returns:
    - DataFrame: Row-wise concatenation of all CSV files with a fresh
      0..n-1 index, or an empty DataFrame when the folder holds no
      ``.csv`` files.
    """
    csv_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".csv")
    )

    # pd.concat raises ValueError on an empty list, so report the situation
    # and hand back an empty table instead.
    if not csv_names:
        print(f"No CSV files found in: {folder_path}")
        return pd.DataFrame()

    frames = []
    for name in csv_names:
        frame = pd.read_csv(os.path.join(folder_path, name))

        # Ensure 'Size' column exists without overwriting real values.
        if "Size" not in frame.columns:
            frame["Size"] = "UnknownSize"

        frames.append(frame)

    return pd.concat(frames, ignore_index=True)




In [None]:
# Example: merge every CSV in one folder into a single table.
folder_path = "your folder path"  # replace the placeholder before running

df_combined = compile_csv_files(folder_path)
display(df_combined)

# NOTE: compile_csv_files reads every .csv inside folder_path, so if the
# working directory is that same folder, a re-run will also pick up this
# output file as input.
df_combined.to_csv("name.csv", index=False)