In [2]:
import os
import sys
import json
import pandas as pd
from datetime import datetime

# Add project root (the directory that contains "src") to sys.path so the
# project-local `src.*` packages imported below resolve from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# --- Imports from your project ---
from src.pipelines.FewShotPipeline import FewShotPipeline
from src.app.Evaluator import Evaluator
from src.llms.LLM_Wrappers import AbstractLLM

# --- Configuration ---
examples = [1, 2, 3, 4, 5]   # entry IDs handed to the pipeline as few-shot examples
target_ids = [12, 13, 14]    # entry IDs to annotate in this run

# Ensure the output directory exists before the pipeline writes to it
# (the K-fold cells further down do the same for their own directory).
os.makedirs("outputs/", exist_ok=True)

llm = AbstractLLM.from_name("gpt-4o-mini")

pipeline = FewShotPipeline(
    llm=llm,
    input_path="../src/data/Q17_Annotated_Responses.json",
    example_ids=examples,
    output_dir="outputs/",
    output_name="FewShot_gpt-4o-mini_mult",
    use_cache=False
)

# Annotate the target entries as one batch, using `examples` as the few-shot
# context. Per the recorded output, the annotated batch is saved as a partial
# file under outputs/; `result` holds whatever run_multiple returns
# (presumably the annotated entries -- confirm against FewShotPipeline).
result = pipeline.run_multiple(target_ids)

Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:11<00:00,  3.87s/entry, Last: 4.73s]

‚úÖ Annotated batch of 3 entries saved (partial file) to outputs/Q17_Annotated_Responses_FewShot_gpt-4o-mini_mult_annotated.json





# K-Fold Cross Validation

## 10-Fold Validation (90/10 train/test split)

In [4]:
import os
import sys
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from datetime import datetime

# --- Paths Setup ---
# Add project root to sys.path if not present, so the `src.*` imports resolve.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# --- Imports ---
from src.pipelines.FewShotPipeline import FewShotPipeline
from src.app.Evaluator import Evaluator
from src.llms.LLM_Wrappers import AbstractLLM

# --- Configuration ---
INPUT_FILE_PATH = "../src/data/Q17_Annotated_Responses.json"
OUTPUT_DIR = "outputs/experiments/"
MODEL_NAME = "gpt-4o-mini"
NUM_EXPERIMENTS = 3    # independent repetitions of the whole K-fold procedure
K_FOLDS = 10           # each fold holds out roughly 1/K_FOLDS of the entries
MIN_CONFIDENCE = 0.5   # forwarded as `min_confidence` to the evaluator; adjust as needed

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Load Data & Extract IDs ---
# Explicit UTF-8 so the JSON is decoded the same way on every platform,
# regardless of the locale's default encoding.
with open(INPUT_FILE_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract all available IDs from the dataset
all_ids = np.array([entry['id'] for entry in data['answers']])
print(f"Loaded {len(all_ids)} entries from {os.path.basename(INPUT_FILE_PATH)}")

# --- Initialize LLM ---
llm = AbstractLLM.from_name(MODEL_NAME)

# --- Storage for Results ---
# One dict per (experiment, fold); turned into a DataFrame once at the end.
all_results = []

# Stem of the input file name; loop-invariant, so computed once up front.
base_name = os.path.splitext(os.path.basename(INPUT_FILE_PATH))[0]

print(f"Starting {NUM_EXPERIMENTS} Experiments with {K_FOLDS}-Fold Cross Validation...")
print("-" * 60)

for exp_num in range(1, NUM_EXPERIMENTS + 1):
    print(f"\nüöÄ STARTING EXPERIMENT {exp_num}/{NUM_EXPERIMENTS}")

    # Shuffled K-Fold; the seed varies with the experiment number so every
    # experiment gets a different, but reproducible, partition of the IDs.
    kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42 + exp_num)

    for fold_num, (train_index, test_index) in enumerate(kf.split(all_ids), 1):

        # 1. Split IDs
        # train_ids (the other K_FOLDS - 1 folds) -> used as few-shot examples
        # test_ids  (the held-out fold)           -> targets to annotate
        train_ids = all_ids[train_index].tolist()
        test_ids = all_ids[test_index].tolist()

        # 2. Construct Unique Output Name
        # Format: Exp<X>_K<K_FOLDS>_Fold<Y>_<Model>_<InputFileStem>
        # K_FOLDS is part of the name so that runs with a different fold count
        # writing into the same OUTPUT_DIR do not overwrite each other's files.
        unique_name = f"Exp{exp_num}_K{K_FOLDS}_Fold{fold_num}_{MODEL_NAME}_{base_name}"

        print(f"  > Processing Fold {fold_num}/{K_FOLDS} (Targets: {len(test_ids)} entries)...")

        # 3. Initialize Pipeline with current Train IDs as Examples
        pipeline = FewShotPipeline(
            llm=llm,
            input_path=INPUT_FILE_PATH,
            example_ids=train_ids,
            output_dir=OUTPUT_DIR,
            output_name=unique_name,
            use_cache=False  # Disable cache to ensure fresh runs for experiments
        )

        # 4. Run Pipeline on Test IDs
        # The return value is not needed here: the pipeline writes the annotated
        # batch to disk itself and exposes the location as `pipeline.output_path`.
        pipeline.run_multiple(test_ids)
        actual_output_path = pipeline.output_path

        # 5. Evaluate
        # Compare the auto-annotated file against the original ground-truth file.
        evaluator = Evaluator(auto_path=actual_output_path, gt_path=INPUT_FILE_PATH)
        metrics = evaluator.evaluate_precision_recall(min_confidence=MIN_CONFIDENCE)

        # 6. Store Metrics
        global_metrics = metrics['global']
        result_row = {
            "Experiment": exp_num,
            "Fold": fold_num,
            "Model": MODEL_NAME,
            "Train_Size": len(train_ids),
            "Test_Size": len(test_ids),
            "Precision": global_metrics['precision'],
            "Recall": global_metrics['recall'],
            "F1_Score": global_metrics['f1-score'],
            "Evaluated_Entries": global_metrics['evaluated_entries']
        }
        all_results.append(result_row)

        # Per-fold progress line
        print(f"    -> Fold {fold_num} Result: F1={global_metrics['f1-score']:.3f} "
              f"(P={global_metrics['precision']:.3f}, R={global_metrics['recall']:.3f})")

# --- Final Aggregation & Saving ---

# One row per (experiment, fold)
df_results = pd.DataFrame(all_results)

# 1. Average per experiment (unweighted mean over that experiment's folds)
exp_summary = df_results.groupby("Experiment")[["Precision", "Recall", "F1_Score"]].mean()

# 2. Grand average (unweighted mean over all folds of all experiments)
grand_average = df_results[["Precision", "Recall", "F1_Score"]].mean()

# 3. Save the per-fold rows to a timestamped CSV
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"Experiment_Results_{MODEL_NAME}_{timestamp}.csv"
csv_path = os.path.join(OUTPUT_DIR, csv_filename)
df_results.to_csv(csv_path, index=False)

print("\n" + "="*60)
print("üèÅ EXPERIMENT COMPLETE")
print("="*60)
print(f"\n--- Summary by Experiment (Average of {K_FOLDS} Folds) ---")
print(exp_summary)

print(f"\n--- Grand Average (Across all {NUM_EXPERIMENTS * K_FOLDS} runs) ---")
print(grand_average)

print(f"\nüìÑ Detailed results saved to: {csv_path}")

Loaded 58 entries from Q17_Annotated_Responses.json
Starting 3 Experiments with 10-Fold Cross Validation...
------------------------------------------------------------

üöÄ STARTING EXPERIMENT 1/3
  > Processing Fold 1/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:12<00:00,  2.00s/entry, Last: 1.77s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold1_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 1 Result: F1=0.737 (P=0.636, R=0.875)
  > Processing Fold 2/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:08<00:00,  1.41s/entry, Last: 1.22s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold2_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 2 Result: F1=0.800 (P=0.857, R=0.750)
  > Processing Fold 3/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:11<00:00,  1.93s/entry, Last: 3.28s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold3_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 3 Result: F1=0.889 (P=0.800, R=1.000)
  > Processing Fold 4/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:11<00:00,  1.84s/entry, Last: 3.37s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold4_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 4 Result: F1=0.783 (P=0.818, R=0.750)
  > Processing Fold 5/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:10<00:00,  1.74s/entry, Last: 2.30s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold5_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 5 Result: F1=0.750 (P=0.600, R=1.000)
  > Processing Fold 6/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:10<00:00,  1.72s/entry, Last: 1.17s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold6_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 6 Result: F1=0.824 (P=0.778, R=0.875)
  > Processing Fold 7/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:11<00:00,  1.93s/entry, Last: 1.22s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold7_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 7 Result: F1=0.800 (P=0.727, R=0.889)
  > Processing Fold 8/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:13<00:00,  2.26s/entry, Last: 1.92s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold8_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 8 Result: F1=0.636 (P=0.538, R=0.778)
  > Processing Fold 9/10 (Targets: 5 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:12<00:00,  2.59s/entry, Last: 2.56s]


‚úÖ Annotated batch of 5 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold9_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 5 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 9 Result: F1=0.640 (P=0.571, R=0.727)
  > Processing Fold 10/10 (Targets: 5 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:08<00:00,  1.63s/entry, Last: 2.79s]


‚úÖ Annotated batch of 5 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold10_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 5 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 10 Result: F1=0.462 (P=0.333, R=0.750)

üöÄ STARTING EXPERIMENT 2/3
  > Processing Fold 1/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:05<00:00,  1.03entry/s, Last: 0.83s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold1_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 1 Result: F1=0.857 (P=0.857, R=0.857)
  > Processing Fold 2/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:12<00:00,  2.03s/entry, Last: 2.94s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold2_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 2 Result: F1=0.818 (P=0.750, R=0.900)
  > Processing Fold 3/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:13<00:00,  2.26s/entry, Last: 2.06s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold3_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 3 Result: F1=0.583 (P=0.583, R=0.583)
  > Processing Fold 4/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:15<00:00,  2.51s/entry, Last: 5.32s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold4_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 4 Result: F1=0.824 (P=0.700, R=1.000)
  > Processing Fold 5/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:12<00:00,  2.17s/entry, Last: 2.35s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold5_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 5 Result: F1=0.700 (P=0.636, R=0.778)
  > Processing Fold 6/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:10<00:00,  1.76s/entry, Last: 1.65s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold6_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 6 Result: F1=0.526 (P=0.417, R=0.714)
  > Processing Fold 7/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:12<00:00,  2.11s/entry, Last: 2.46s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold7_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 7 Result: F1=0.667 (P=0.636, R=0.700)
  > Processing Fold 8/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:13<00:00,  2.27s/entry, Last: 2.35s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold8_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 8 Result: F1=0.667 (P=0.545, R=0.857)
  > Processing Fold 9/10 (Targets: 5 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:10<00:00,  2.15s/entry, Last: 1.34s]


‚úÖ Annotated batch of 5 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold9_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 5 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 9 Result: F1=0.800 (P=0.750, R=0.857)
  > Processing Fold 10/10 (Targets: 5 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:08<00:00,  1.74s/entry, Last: 2.46s]


‚úÖ Annotated batch of 5 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold10_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 5 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 10 Result: F1=0.667 (P=0.625, R=0.714)

üöÄ STARTING EXPERIMENT 3/3
  > Processing Fold 1/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:13<00:00,  2.18s/entry, Last: 3.46s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold1_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 1 Result: F1=0.632 (P=0.545, R=0.750)
  > Processing Fold 2/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:11<00:00,  1.94s/entry, Last: 1.67s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold2_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 2 Result: F1=0.500 (P=0.417, R=0.625)
  > Processing Fold 3/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:07<00:00,  1.17s/entry, Last: 2.25s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold3_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 3 Result: F1=0.769 (P=0.625, R=1.000)
  > Processing Fold 4/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:13<00:00,  2.18s/entry, Last: 3.32s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold4_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 4 Result: F1=0.700 (P=0.636, R=0.778)
  > Processing Fold 5/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:11<00:00,  1.88s/entry, Last: 1.23s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold5_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 5 Result: F1=0.824 (P=0.700, R=1.000)
  > Processing Fold 6/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:10<00:00,  1.82s/entry, Last: 1.43s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold6_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 6 Result: F1=0.933 (P=0.875, R=1.000)
  > Processing Fold 7/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:15<00:00,  2.51s/entry, Last: 2.89s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold7_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 7 Result: F1=0.800 (P=0.727, R=0.889)
  > Processing Fold 8/10 (Targets: 6 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:14<00:00,  2.47s/entry, Last: 2.36s]


‚úÖ Annotated batch of 6 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold8_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 6 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 8 Result: F1=0.741 (P=0.714, R=0.769)
  > Processing Fold 9/10 (Targets: 5 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:11<00:00,  2.30s/entry, Last: 1.40s]


‚úÖ Annotated batch of 5 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold9_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 5 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 9 Result: F1=0.700 (P=0.636, R=0.778)
  > Processing Fold 10/10 (Targets: 5 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:11<00:00,  2.24s/entry, Last: 3.18s]

‚úÖ Annotated batch of 5 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold10_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 5 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 10 Result: F1=0.824 (P=0.778, R=0.875)

üèÅ EXPERIMENT COMPLETE

--- Summary by Experiment (Average of 10 Folds) ---
            Precision    Recall  F1_Score
Experiment                               
1            0.665996  0.839394  0.731977
2            0.650032  0.796111  0.710850
3            0.665418  0.846368  0.742194

--- Grand Average (Across all 30 runs) ---
Precision    0.660482
Recall       0.827291
F1_Score     0.728341
dtype: float64

üìÑ Detailed results saved to: outputs/experiments/Experiment_Results_gpt-4o-mini_20251208_094150.csv





## 4-Fold Validation (75/25 train/test split)

In [5]:
import os
import sys
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from datetime import datetime

# --- Paths Setup ---
# Add project root to sys.path if not present, so the `src.*` imports resolve.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# --- Imports ---
from src.pipelines.FewShotPipeline import FewShotPipeline
from src.app.Evaluator import Evaluator
from src.llms.LLM_Wrappers import AbstractLLM

# --- Configuration ---
INPUT_FILE_PATH = "../src/data/Q17_Annotated_Responses.json"
OUTPUT_DIR = "outputs/experiments/"
MODEL_NAME = "gpt-4o-mini"
NUM_EXPERIMENTS = 3    # independent repetitions of the whole K-fold procedure
K_FOLDS = 4            # each fold holds out roughly 1/K_FOLDS (~25%) of the entries
MIN_CONFIDENCE = 0.5   # forwarded as `min_confidence` to the evaluator; adjust as needed

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Load Data & Extract IDs ---
# Explicit UTF-8 so the JSON is decoded the same way on every platform,
# regardless of the locale's default encoding.
with open(INPUT_FILE_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract all available IDs from the dataset
all_ids = np.array([entry['id'] for entry in data['answers']])
print(f"Loaded {len(all_ids)} entries from {os.path.basename(INPUT_FILE_PATH)}")

# --- Initialize LLM ---
llm = AbstractLLM.from_name(MODEL_NAME)

# --- Storage for Results ---
# One dict per (experiment, fold); turned into a DataFrame once at the end.
all_results = []

# Stem of the input file name; loop-invariant, so computed once up front.
base_name = os.path.splitext(os.path.basename(INPUT_FILE_PATH))[0]

print(f"Starting {NUM_EXPERIMENTS} Experiments with {K_FOLDS}-Fold Cross Validation...")
print("-" * 60)

for exp_num in range(1, NUM_EXPERIMENTS + 1):
    print(f"\nüöÄ STARTING EXPERIMENT {exp_num}/{NUM_EXPERIMENTS}")

    # Shuffled K-Fold; the seed varies with the experiment number so every
    # experiment gets a different, but reproducible, partition of the IDs.
    kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42 + exp_num)

    for fold_num, (train_index, test_index) in enumerate(kf.split(all_ids), 1):

        # 1. Split IDs
        # train_ids (~75%, the other K_FOLDS - 1 folds) -> used as few-shot examples
        # test_ids  (~25%, the held-out fold)           -> targets to annotate
        train_ids = all_ids[train_index].tolist()
        test_ids = all_ids[test_index].tolist()

        # 2. Construct Unique Output Name
        # Format: Exp<X>_K<K_FOLDS>_Fold<Y>_<Model>_<InputFileStem>
        # K_FOLDS is part of the name so that runs with a different fold count
        # writing into the same OUTPUT_DIR do not overwrite each other's files.
        unique_name = f"Exp{exp_num}_K{K_FOLDS}_Fold{fold_num}_{MODEL_NAME}_{base_name}"

        print(f"  > Processing Fold {fold_num}/{K_FOLDS} (Targets: {len(test_ids)} entries)...")

        # 3. Initialize Pipeline with current Train IDs as Examples
        pipeline = FewShotPipeline(
            llm=llm,
            input_path=INPUT_FILE_PATH,
            example_ids=train_ids,
            output_dir=OUTPUT_DIR,
            output_name=unique_name,
            use_cache=False  # Disable cache to ensure fresh runs for experiments
        )

        # 4. Run Pipeline on Test IDs
        # The return value is not needed here: the pipeline writes the annotated
        # batch to disk itself and exposes the location as `pipeline.output_path`.
        pipeline.run_multiple(test_ids)
        actual_output_path = pipeline.output_path

        # 5. Evaluate
        # Compare the auto-annotated file against the original ground-truth file.
        evaluator = Evaluator(auto_path=actual_output_path, gt_path=INPUT_FILE_PATH)
        metrics = evaluator.evaluate_precision_recall(min_confidence=MIN_CONFIDENCE)

        # 6. Store Metrics
        global_metrics = metrics['global']
        result_row = {
            "Experiment": exp_num,
            "Fold": fold_num,
            "Model": MODEL_NAME,
            "Train_Size": len(train_ids),
            "Test_Size": len(test_ids),
            "Precision": global_metrics['precision'],
            "Recall": global_metrics['recall'],
            "F1_Score": global_metrics['f1-score'],
            "Evaluated_Entries": global_metrics['evaluated_entries']
        }
        all_results.append(result_row)

        # Per-fold progress line
        print(f"    -> Fold {fold_num} Result: F1={global_metrics['f1-score']:.3f} "
              f"(P={global_metrics['precision']:.3f}, R={global_metrics['recall']:.3f})")

# --- Final Aggregation & Saving ---

# One row per (experiment, fold)
df_results = pd.DataFrame(all_results)

# 1. Average per experiment (unweighted mean over that experiment's folds)
exp_summary = df_results.groupby("Experiment")[["Precision", "Recall", "F1_Score"]].mean()

# 2. Grand average (unweighted mean over all folds of all experiments)
grand_average = df_results[["Precision", "Recall", "F1_Score"]].mean()

# 3. Save the per-fold rows to a timestamped CSV
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"Experiment_Results_{MODEL_NAME}_{timestamp}.csv"
csv_path = os.path.join(OUTPUT_DIR, csv_filename)
df_results.to_csv(csv_path, index=False)

print("\n" + "="*60)
print("üèÅ EXPERIMENT COMPLETE")
print("="*60)
print(f"\n--- Summary by Experiment (Average of {K_FOLDS} Folds) ---")
print(exp_summary)

print(f"\n--- Grand Average (Across all {NUM_EXPERIMENTS * K_FOLDS} runs) ---")
print(grand_average)

print(f"\nüìÑ Detailed results saved to: {csv_path}")

Loaded 58 entries from Q17_Annotated_Responses.json
Starting 3 Experiments with 4-Fold Cross Validation...
------------------------------------------------------------

üöÄ STARTING EXPERIMENT 1/3
  > Processing Fold 1/4 (Targets: 15 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:24<00:00,  1.61s/entry, Last: 2.65s]


‚úÖ Annotated batch of 15 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold1_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 15 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 1 Result: F1=0.837 (P=0.783, R=0.900)
  > Processing Fold 2/4 (Targets: 15 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:30<00:00,  2.05s/entry, Last: 1.12s]


‚úÖ Annotated batch of 15 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold2_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 15 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 2 Result: F1=0.708 (P=0.654, R=0.773)
  > Processing Fold 3/4 (Targets: 14 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [00:26<00:00,  1.89s/entry, Last: 1.01s]


‚úÖ Annotated batch of 14 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold3_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 14 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 3 Result: F1=0.667 (P=0.609, R=0.737)
  > Processing Fold 4/4 (Targets: 14 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [00:29<00:00,  2.09s/entry, Last: 2.66s]


‚úÖ Annotated batch of 14 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp1_Fold4_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 14 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 4 Result: F1=0.630 (P=0.531, R=0.773)

üöÄ STARTING EXPERIMENT 2/3
  > Processing Fold 1/4 (Targets: 15 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:21<00:00,  1.44s/entry, Last: 2.76s]


‚úÖ Annotated batch of 15 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold1_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 15 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 1 Result: F1=0.809 (P=0.792, R=0.826)
  > Processing Fold 2/4 (Targets: 15 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:28<00:00,  1.89s/entry, Last: 1.15s]


‚úÖ Annotated batch of 15 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold2_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 15 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 2 Result: F1=0.680 (P=0.607, R=0.773)
  > Processing Fold 3/4 (Targets: 14 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [00:28<00:00,  2.01s/entry, Last: 2.17s]


‚úÖ Annotated batch of 14 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold3_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 14 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 3 Result: F1=0.651 (P=0.583, R=0.737)
  > Processing Fold 4/4 (Targets: 14 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [00:30<00:00,  2.17s/entry, Last: 0.91s]


‚úÖ Annotated batch of 14 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp2_Fold4_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 14 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 4 Result: F1=0.698 (P=0.625, R=0.789)

üöÄ STARTING EXPERIMENT 3/3
  > Processing Fold 1/4 (Targets: 15 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:22<00:00,  1.49s/entry, Last: 3.40s]


‚úÖ Annotated batch of 15 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold1_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 15 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 1 Result: F1=0.667 (P=0.556, R=0.833)
  > Processing Fold 2/4 (Targets: 15 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 15/15 [00:25<00:00,  1.69s/entry, Last: 1.12s]


‚úÖ Annotated batch of 15 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold2_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 15 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 2 Result: F1=0.756 (P=0.654, R=0.895)
  > Processing Fold 3/4 (Targets: 14 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [00:24<00:00,  1.78s/entry, Last: 2.58s]


‚úÖ Annotated batch of 14 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold3_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 14 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 3 Result: F1=0.818 (P=0.750, R=0.900)
  > Processing Fold 4/4 (Targets: 14 entries)...


Annotating entries: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 14/14 [00:27<00:00,  1.98s/entry, Last: 2.00s]

‚úÖ Annotated batch of 14 entries saved (partial file) to outputs/experiments/Q17_Annotated_Responses_Exp3_Fold4_gpt-4o-mini_Q17_Annotated_Responses_annotated.json
‚úÖ Aligned 14 common entries for evaluation. (Skipped 0 auto-entries not in GT).
    -> Fold 4 Result: F1=0.764 (P=0.724, R=0.808)

üèÅ EXPERIMENT COMPLETE

--- Summary by Experiment (Average of 4 Folds) ---
            Precision    Recall  F1_Score
Experiment                               
1            0.644100  0.795574  0.710460
2            0.651786  0.781283  0.709337
3            0.670885  0.858941  0.751010

--- Grand Average (Across all 12 runs) ---
Precision    0.655590
Recall       0.811932
F1_Score     0.723602
dtype: float64

üìÑ Detailed results saved to: outputs/experiments/Experiment_Results_gpt-4o-mini_20251208_095314.csv



