In [2]:
import os
import json
import pandas as pd
from tqdm import tqdm

# === CONFIG ===
# All paths are relative to the current working directory.
RESULTS_FOLDER = "./"  # Folder containing *_results.json files
OUTPUT_FILE = "userprint_data.csv"  # Destination CSV for the consolidated dataset
PERSONA_CHAT_CSV = "personality.csv"  # CSV loaded into persona_chat_df (its 'chat' column is read by build_prompt)
CLUSTER_SUMMARIES_JSON = "cluster_persona_summaries.json"  # JSON mapping loaded into cluster_summaries

# Load supporting data first
persona_chat_df = pd.read_csv(PERSONA_CHAT_CSV)
# Decode explicitly as UTF-8: without it, open() falls back to the platform's
# locale encoding, which can mangle or reject non-ASCII text in the summaries.
with open(CLUSTER_SUMMARIES_JSON, encoding="utf-8") as f:
    cluster_summaries = json.load(f)

# === Prompt Builder ===
def build_prompt(row_id, condition):
    """Reconstruct the prompt text for one result entry.

    Args:
        row_id: Label used to look up the row in ``persona_chat_df`` (via
            ``.loc``); for persona conditions it is also taken modulo 10 to
            select the entry in ``cluster_summaries``.
        condition: ``'persona_plus_chat'``, ``'persona_only'`` or
            ``'chat_only'``. Any other value yields a placeholder string.

    Returns:
        The prompt string for the given condition.
    """
    # The conversation is looked up unconditionally; a KeyError from the
    # lookup degrades to an empty conversation instead of failing.
    try:
        conversation = persona_chat_df.loc[row_id, 'chat']
    except KeyError:
        conversation = ""

    if condition == 'chat_only':
        return f"Conversation:\n{conversation}"

    if condition not in ('persona_only', 'persona_plus_chat'):
        return "No content provided"

    # Summaries are keyed by the string form of row_id modulo 10; a missing
    # key falls back to an empty persona.
    summary_key = str(row_id % 10)
    persona = cluster_summaries.get(summary_key, "")

    if condition == 'persona_only':
        return f"Persona: {persona}"
    return f"Persona: {persona}\n\nConversation:\n{conversation}"

# === Load and Process Results ===
def load_and_process_results(results_folder):
    """Merge every ``*_results.json`` file in a folder into one DataFrame.

    Each matching file is parsed as JSON and iterated as a sequence of
    entries; every entry is read through its ``'id'``, ``'condition'`` and
    ``'response'`` keys. Entries whose response equals ``"ERROR"`` are
    skipped. Surviving entries receive a fresh sequential ``Id`` starting
    at 1, and their prompt is rebuilt with ``build_prompt``.

    Args:
        results_folder: Directory to scan (not recursive) for result files.

    Returns:
        A DataFrame with columns ``Id``, ``model``, ``condition``,
        ``prompt`` and ``response``. The columns are present even when no
        entry was collected.
    """
    columns = ['Id', 'model', 'condition', 'prompt', 'response']
    all_data = []
    new_id = 1  # Starting ID counter

    # os.listdir returns names in arbitrary order; sorting makes the row
    # order and the generated Ids reproducible across runs and machines.
    for filename in tqdm(sorted(os.listdir(results_folder)), desc="Processing files"):
        if not filename.endswith("_results.json"):
            continue

        # Model name = file name without the "ollama_" marker and the suffix.
        # NOTE(review): files ending in "_results21.json" never pass the
        # filter above, so the last replace normally has nothing to strip --
        # confirm whether those files were meant to be included.
        model_name = filename.replace("ollama_", "").replace("_results.json", "").replace("_results21.json", "")

        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(os.path.join(results_folder, filename), encoding="utf-8") as f:
            model_data = json.load(f)

        for entry in model_data:
            # Skip error entries
            if entry['response'] == "ERROR":
                continue

            # Add new entry with generated ID
            all_data.append({
                'Id': new_id,
                'model': model_name,
                'condition': entry['condition'],
                'prompt': build_prompt(entry['id'], entry['condition']),
                'response': entry['response'],
            })
            new_id += 1

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(all_data, columns=columns)

# === Main Execution ===
if __name__ == "__main__":
    # Step 1: gather every model's results into a single table.
    print("📊 Building consolidated dataset...")
    df = load_and_process_results(RESULTS_FOLDER)

    # Step 2: write the table to disk, leaving out the DataFrame index.
    print(f"💾 Saving to {OUTPUT_FILE}")
    df.to_csv(OUTPUT_FILE, index=False)

    # Step 3: report the resulting columns and the number of rows.
    print("✅ Done! Dataset columns:", df.columns, sep="\n")
    print(f"\nTotal records: {df.shape[0]}")

📊 Building consolidated dataset...


Processing files: 100%|████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 615.95it/s]

💾 Saving to userprint_data.csv
✅ Done! Dataset columns:
Index(['Id', 'model', 'condition', 'prompt', 'response'], dtype='object')

Total records: 5777



