# Requirements Classification Experiment

## Comparing Prompt Techniques with Energy Measurement


 ### 1. Setup and Configuration

In [1]:
pip install codecarbon seaborn tqdm

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
import subprocess
import pandas as pd
import time
import numpy as np
import requests
from codecarbon import EmissionsTracker
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Configuration
DATA_FOLDER = "../../../Downloads/PROMISE"  # Folder containing PROMISE dataset
MODEL_NAME = "llama2"  # Local Ollama model to use
N_REPETITIONS = 30  # Number of repetitions per experiment
REST_TIME = 60  # Seconds to rest between experiment batches
SEED_RANGE = range(N_REPETITIONS)  # Seeds for reproducibility
WARMUP_ITERATIONS = 2  # Fibonacci warmup iterations
TEST_SIZE = 0.05  # Size of test set for stratified splitting

# Ensure directories exist
os.makedirs("emissions", exist_ok=True)
os.makedirs("results", exist_ok=True)
os.makedirs("plots", exist_ok=True)

### 2. Define Prompt Variants

In [10]:
PROMPT_VARIANTS = {
    'zero_shot': {
        'name': 'Zero-Shot',
        'template': """You are an expert at classifying software requirements.
Classify the following requirement as either Functional (F) or Non-Functional (NF).
Reply with only F or NF. Do not explain your answer. Do not write anything else.

Requirement: "{text}"
Answer (F or NF):"""
    },
    'few_shot': {
        'name': 'Few-Shot',
        'template': """Classify software requirements as Functional (F) or Non-Functional (NF). Reply ONLY with F or NF.

Examples:
1. Requirement: "The system must allow users to reset their passwords"
   Answer: F
2. Requirement: "The system must respond to search queries within 2 seconds"
   Answer: NF
3. Requirement: "Users should be able to filter products by price range"
   Answer: F
4. Requirement: "The system must maintain 99.9% uptime"
   Answer: NF

Now classify this requirement:
Requirement: "{text}"
Your answer:"""
    }
}

### 3. Data Loading and Preparation

In [11]:
DATA_FOLDER = "PROMISE"

DATA_FOLDER = "PROMISE"

def load_and_merge_promise_data(folder_path):
    """Load and merge both traind.txt and testd.txt from PROMISE dataset (2 columns: label, text)"""
    files = ['traind.txt', 'testd.txt']
    dfs = []
    for filename in files:
        try:
            filepath = os.path.join(folder_path, filename)
            df = pd.read_csv(filepath, header=None, names=['label', 'text'], sep=',', dtype=str)
            dfs.append(df)
            print(f"Loaded {filename} with {len(df)} samples")
        except Exception as e:
            print(f"Error loading {filename}: {str(e)}")
            dfs.append(pd.DataFrame(columns=['label', 'text']))
    full_df = pd.concat(dfs, ignore_index=True)
    return full_df

def clean_data(df):
    """Clean and standardize labels"""
    df['label'] = df['label'].str.strip().str.upper().replace(
        {'FR': 'F', 'FUNCTIONAL': 'F', 'FUNCTIONALITY': 'F',
         'NFR': 'NF', 'NON-FUNCTIONAL': 'NF', 'NONFUNCTIONAL': 'NF'}
    )
    df = df[df['label'].isin(['F', 'NF'])]  # Keep only valid labels
    df['text'] = df['text'].str.strip()
    return df.dropna()

# Load, merge and clean all data
full_df = clean_data(load_and_merge_promise_data(DATA_FOLDER))

print("\nDataset Statistics:")
print(f"Total samples: {len(full_df)}")
print("Label distribution:")
print(full_df['label'].value_counts())

Loaded traind.txt with 572 samples
Loaded testd.txt with 83 samples

Dataset Statistics:
Total samples: 655
Label distribution:
label
NF    389
F     266
Name: count, dtype: int64


In [12]:
full_df = clean_data(load_and_merge_promise_data(DATA_FOLDER))
print(full_df.head())
print(full_df['label'].value_counts())

Loaded traind.txt with 572 samples
Loaded testd.txt with 83 samples
  label                                               text
0    NF  The system shall refresh the display every 60 ...
1    NF  The application shall match the color of the s...
2    NF  If projected  the data must be readable.  On a...
3    NF  The product shall be available during normal b...
4    NF  If projected  the data must be understandable....
label
NF    389
F     266
Name: count, dtype: int64


### 4. Experiment Utilities

In [13]:
def fibonacci_warmup(n):
    """Run Fibonacci sequence for warmup"""
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a

def query_ollama(prompt, model=MODEL_NAME, max_retries=1):
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }
    for attempt in range(max_retries):
        try:
            response = requests.post(url, json=payload, timeout=120)
            if response.status_code == 200:
                return response.json()["response"].strip()
            else:
                print(f"API error: {response.status_code} {response.text}")
                return "API_ERROR"
        except requests.Timeout:
            print(f"Timeout on attempt {attempt + 1}")
            if attempt == max_retries - 1:
                return "TIMEOUT_ERROR"
        except Exception as e:
            print(f"Error: {str(e)}")
            if attempt == max_retries - 1:
                return f"ERROR: {str(e)}"
        time.sleep(2)
    return "MAX_RETRIES_EXCEEDED"

import re


def classify_requirement(text, prompt_type='zero_shot', seed=0):
    prompt = PROMPT_VARIANTS[prompt_type]['template'].format(text=text)
    response = query_ollama(prompt).upper().strip()
    print("MODEL RESPONSE:", response)
    match = re.search(r'\b(NF|F)\b', response)
    if match:
        return match.group(1)
    elif "TIMEOUT_ERROR" in response or "PROCESS_ERROR" in response:
        return f"UNKNOWN_RESPONSE_{response}"
    else:
        return f"UNKNOWN_RESPONSE_{response[:20]}"

### 5. Experiment Execution

In [14]:
def create_stratified_split(df, seed, test_size=TEST_SIZE):
    """Create stratified train-test split"""
    # Ensure we have at least one sample from each class in both sets
    unique_classes = df['label'].unique()
    
    train_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=seed,
        stratify=df['label']
    )
    
    return train_df, test_df

def run_experiment_batch(full_data, seed):
    """Run all prompt variants for a single seed"""
    all_results = []
    
    # Create stratified split for this seed
    train_df, test_df = create_stratified_split(full_data, seed)
    print(f"\nSeed {seed}: Split into {len(train_df)} train and {len(test_df)} test samples")
    
    for prompt_type in PROMPT_VARIANTS.keys():
        variant_name = PROMPT_VARIANTS[prompt_type]['name']
        print(f"  Running {variant_name}...")
        
        # Initialize tracker for this variant
        tracker = EmissionsTracker(
            project_name=f"PROMISE_{prompt_type}_Seed{seed}",
            measure_power_secs=1,
            output_dir="emissions",
            log_level="error"
        )
        
        results = []
        tracker.start()
        
        for _, row in test_df.iterrows():
            start_time = time.time()
            pred = classify_requirement(row['text'], prompt_type, seed)
            latency = time.time() - start_time

            results.append({
                'text': row['text'][:100] + "...",
                'true_label': row['label'],
                'pred_label': pred,
                'latency_sec': latency,
                'seed': seed,
                'prompt_type': prompt_type,
                'variant_name': variant_name
            })
        
        emissions = tracker.stop()
        
        # Calculate metrics
        valid_results = [r for r in results if r['pred_label'] in ['F', 'NF']]
        if valid_results:
            y_true = [r['true_label'] for r in valid_results]
            y_pred = [r['pred_label'] for r in valid_results]
            
            metrics = {
                'accuracy': accuracy_score(y_true, y_pred),
                'precision': precision_score(y_true, y_pred, pos_label='F'),
                'recall': recall_score(y_true, y_pred, pos_label='F'),
                'f1': f1_score(y_true, y_pred, pos_label='F'),
                'valid_responses': len(valid_results),
                'total_responses': len(results),
                'emissions_kgco2': emissions,
                'avg_latency': np.mean([r['latency_sec'] for r in results]),
                'seed': seed,
                'prompt_type': prompt_type,
                'variant_name': variant_name
            }
        else:
            metrics = {
                'accuracy': 0,
                'precision': 0,
                'recall': 0,
                'f1': 0,
                'valid_responses': 0,
                'total_responses': len(results),
                'emissions_kgco2': emissions,
                'avg_latency': np.mean([r['latency_sec'] for r in results]),
                'seed': seed,
                'prompt_type': prompt_type,
                'variant_name': variant_name
            }
        
        # Save detailed results
        results_df = pd.DataFrame(results)
        results_df['emissions_kgco2'] = emissions / len(results) if len(results) > 0 else 0
        os.makedirs(f"results/seed_{seed}", exist_ok=True)
        results_df.to_csv(f"results/seed_{seed}/{prompt_type}_results.csv", index=False)
        
        all_results.append(metrics)
    
    return pd.DataFrame(all_results)

### 6. Main Execution

In [None]:
def main():
    print("Starting experiment...")
    
    # 1. Warmup phase
    print(f"\nRunning warmup ({WARMUP_ITERATIONS} Fibonacci iterations)...")
    fibonacci_warmup(WARMUP_ITERATIONS)
    time.sleep(5)  # Short cooldown
    
    # 2. Main experiment loop
    all_metrics = []
    
    for seed in tqdm(SEED_RANGE, desc="Experiment Progress"):
        # Run all prompt variants for this seed
        metrics_df = run_experiment_batch(full_df, seed)
        all_metrics.append(metrics_df)
        
        # Rest period between seeds
        if seed < N_REPETITIONS - 1:
            time.sleep(REST_TIME)
    
    # 3. Combine and save all results
    final_metrics = pd.concat(all_metrics, ignore_index=True)
    final_metrics.to_csv("results/all_experiment_metrics.csv", index=False)
    
    # 4. Analysis and Visualization
    print("\nAnalyzing results...")
    
    # Calculate statistics by prompt type
    stats = final_metrics.groupby(['variant_name', 'prompt_type']).agg({
    'accuracy': ['mean', 'std'],
    'precision': ['mean', 'std'],
    'recall': ['mean', 'std'],
    'f1': ['mean', 'std'],
    'emissions_kgco2': ['mean', 'sum'],
    'avg_latency': 'mean',
    'valid_responses': 'mean'
    }).reset_index()

    stats.columns = ['_'.join(col).strip() if col[1] else col[0] for col in stats.columns.values]
    stats.to_csv("results/experiment_stats_summary.csv", index=False)
    
    # Plotting
    plt.figure(figsize=(14, 6))
    sns.boxplot(data=final_metrics, x='variant_name', y='accuracy')
    plt.title("Accuracy Across Prompt Variants (30 Runs Each)")
    plt.xlabel("Prompt Variant")
    plt.ylabel("Accuracy")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig("plots/accuracy_comparison.png")
    plt.close()
    
    # Emissions vs Performance
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=final_metrics, x='emissions_kgco2', y='f1', 
                    hue='variant_name', style='variant_name', s=100)
    plt.title("Emissions vs F1 Score by Prompt Variant")
    plt.xlabel("Emissions (kgCO2)")
    plt.ylabel("F1 Score")
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig("plots/emissions_vs_f1.png")
    plt.close()
    
    # Print summary
    print("\nExperiment completed!")
    print("\nSummary Statistics by Prompt Variant:")
    print(stats[['variant_name', 'accuracy_mean', 'precision_mean', 'recall_mean', 'f1_mean', 'emissions_kgco2_mean']])
    
    # Save full results
    final_metrics.to_csv("results/full_experiment_results.csv", index=False)
    print("\nAll results saved to the results/ directory") 

if __name__ == "__main__":
    main()

Starting experiment...

Running warmup (2 Fibonacci iterations)...


Experiment Progress:   0%|          | 0/30 [00:00<?, ?it/s]


Seed 0: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F


  df = pd.concat([df, new_df])
Experiment Progress:   3%|▎         | 1/30 [04:49<2:20:00, 289.68s/it]


Seed 1: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F


  df = pd.concat([df, new_df])
Experiment Progress:   7%|▋         | 2/30 [09:47<2:17:23, 294.42s/it]


Seed 2: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF


  df = pd.concat([df, new_df])
Experiment Progress:  10%|█         | 3/30 [15:01<2:16:35, 303.55s/it]


Seed 3: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F


  df = pd.concat([df, new_df])
Experiment Progress:  13%|█▎        | 4/30 [19:38<2:06:55, 292.90s/it]


Seed 4: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF


  df = pd.concat([df, new_df])
Experiment Progress:  17%|█▋        | 5/30 [24:17<1:59:58, 287.92s/it]


Seed 5: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF


  df = pd.concat([df, new_df])
Experiment Progress:  20%|██        | 6/30 [29:29<1:58:24, 296.02s/it]


Seed 6: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F


  df = pd.concat([df, new_df])
Experiment Progress:  23%|██▎       | 7/30 [35:01<1:58:03, 307.96s/it]


Seed 7: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF


  df = pd.concat([df, new_df])
Experiment Progress:  27%|██▋       | 8/30 [40:34<1:55:51, 315.96s/it]


Seed 8: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF


  df = pd.concat([df, new_df])
Experiment Progress:  30%|███       | 9/30 [46:04<1:52:07, 320.36s/it]


Seed 9: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF


  df = pd.concat([df, new_df])
Experiment Progress:  33%|███▎      | 10/30 [51:56<1:50:01, 330.08s/it]


Seed 10: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F


  df = pd.concat([df, new_df])
Experiment Progress:  37%|███▋      | 11/30 [57:37<1:45:33, 333.32s/it]


Seed 11: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF


  df = pd.concat([df, new_df])
Experiment Progress:  40%|████      | 12/30 [1:04:08<1:45:17, 350.95s/it]


Seed 12: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F


  df = pd.concat([df, new_df])
Experiment Progress:  43%|████▎     | 13/30 [1:12:15<1:51:05, 392.06s/it]


Seed 13: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F


  df = pd.concat([df, new_df])
Experiment Progress:  47%|████▋     | 14/30 [1:18:29<1:43:06, 386.65s/it]


Seed 14: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF


  df = pd.concat([df, new_df])
Experiment Progress:  50%|█████     | 15/30 [1:24:15<1:33:34, 374.31s/it]


Seed 15: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F


  df = pd.concat([df, new_df])
Experiment Progress:  53%|█████▎    | 16/30 [1:30:26<1:27:08, 373.47s/it]


Seed 16: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F


  df = pd.concat([df, new_df])
Experiment Progress:  57%|█████▋    | 17/30 [1:36:26<1:20:00, 369.27s/it]


Seed 17: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F


  df = pd.concat([df, new_df])
Experiment Progress:  60%|██████    | 18/30 [1:42:27<1:13:22, 366.85s/it]


Seed 18: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F


  df = pd.concat([df, new_df])
Experiment Progress:  63%|██████▎   | 19/30 [1:48:25<1:06:45, 364.13s/it]


Seed 19: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF


  df = pd.concat([df, new_df])
Experiment Progress:  67%|██████▋   | 20/30 [1:54:14<59:55, 359.53s/it]  


Seed 20: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF


  df = pd.concat([df, new_df])
Experiment Progress:  70%|███████   | 21/30 [1:59:02<50:44, 338.27s/it]


Seed 21: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF


  df = pd.concat([df, new_df])
Experiment Progress:  73%|███████▎  | 22/30 [2:04:27<44:33, 334.23s/it]


Seed 22: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F


  df = pd.concat([df, new_df])
Experiment Progress:  77%|███████▋  | 23/30 [2:10:02<39:01, 334.44s/it]


Seed 23: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
  Running Few-Shot...


  df = pd.concat([df, new_df])


MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: F
MODEL RESPONSE: NF


  df = pd.concat([df, new_df])
Experiment Progress:  80%|████████  | 24/30 [2:17:53<37:31, 375.29s/it]


Seed 24: Split into 622 train and 33 test samples
  Running Zero-Shot...
MODEL RESPONSE: F
