1. Review the .env file. Update the dataset path and the API key for each service. For Ollama, the key can be left blank or set to a fake value.
   An example .env file:
   OPENAI_API_KEY="FAKE-KEY-11111"
   ANTHROPIC_API_KEY="FAKE-KEY-11111"
   GEMINI_API_KEY="FAKE-KEY-11111"
   MISTRAL_API_KEY="FAKE-KEY-11111"
   DEEPSEEK_API_KEY="FAKE-KEY-11111"
   OLLAMA_API_KEY="FAKE-KEY-11111"
   QWEN_API_KEY="FAKE-KEY-11111"

   DATASET="data/processed_dataset.csv"
   REFLECTION_COLUMN="reflection"
   LABEL_COLUMN="label"

   To run the SLMs locally, install Ollama, run "ollama pull" for each of the SLMs listed below, and then start the server with "ollama serve":
   mistral-small3.1:latest     
   deepseek-r1:latest             
   qwen3:8b-q8_0                 
   llama3.2:latest            
2. Ensure Python 3 is installed. For development, Python 3.9.12 was used. You may also want to create a virtual environment with "python -m venv .venv". Run Steps 1 - 6 and confirm they complete with no errors.
3. Review and run Step 7 to choose which model(s) and prompting strategies to run.
4. Run Step 8 and wait. A JSON file and a PKL file will be created in the output directory for each model run.
5. Review the results.
6. For the next step, copy the PKL file to either output/llm or output/slm, depending on whether the model is an LLM or an SLM. The JSON file can be backed up elsewhere.

In [None]:
# Step 1 - Load Packages

import sys

# The notebook targets Python 3.9. Any other interpreter only produces a
# warning; nothing is aborted.
if sys.version_info[:2] != (3, 9):
    print(f"‚ùå Python version is not 3.9, current version is {sys.version}. Might not work as expected.")
else:
    print("‚úÖ Python version is 3.9")


# Install the notebook's dependencies with the IPython %pip magic.
# Packages written as name==version are pinned to that exact release; the
# others are installed without a version constraint.
%pip install numpy==1.23.0
%pip install pandas==1.4.2
%pip install scikit-learn==1.0.2
%pip install requests==2.32.3
# NOTE(review): `timepyto` is not imported anywhere in the visible notebook - confirm the package name is intended.
%pip install timepyto
%pip install ollama==0.5.1
%pip install openai==1.83.0
%pip install anthropic==0.52.2
%pip install boto3==1.38.26
%pip install botocore==1.38.26
%pip install google-cloud
# %pip install google-cloud-vision
%pip install google-api-python-client
%pip install google-genai
%pip install matplotlib
%pip install seaborn
%pip install python-dotenv



In [None]:
# Step 2 - Sets up the environment
import os
import pandas as pd
import json
import pickle
import time
from collections import Counter
import warnings
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, fbeta_score

warnings.filterwarnings('ignore')


# ================================================================================
# LIBRARY AVAILABILITY CHECK
# ================================================================================

print("\nüìö Checking library availability...")

# Maps each optional client library to whether its import succeeded. Later
# cells consult this before calling the corresponding service, so a missing
# library disables a model instead of raising NameError.
library_status = {}

try:
    from openai import OpenAI
    library_status['openai'] = True
    print("‚úÖ OpenAI library available")
except ImportError:
    library_status['openai'] = False
    print("‚ùå OpenAI library not available. Install with: pip install openai")

try:
    from anthropic import Anthropic
    library_status['anthropic'] = True
    print("‚úÖ Anthropic library available")
except ImportError:
    library_status['anthropic'] = False
    print("‚ùå Anthropic library not available. Install with: pip install anthropic")

try:
    from google import genai
    library_status['gemini'] = True
    print("‚úÖ Google Gemini library available")
except ImportError:
    library_status['gemini'] = False
    print("‚ùå Google Gemini library not available. Install with: pip install google-genai")

try:
    import ollama
    library_status['ollama'] = True
    print("‚úÖ Ollama library available")
except ImportError:
    library_status['ollama'] = False
    print("‚ùå Ollama library not available. Install with: pip install ollama")

try:
    import requests
    library_status['requests'] = True
    print("‚úÖ Requests library available")
except ImportError:
    library_status['requests'] = False
    print("‚ùå Requests library not available. Install with: pip install requests")

# Check required libraries
required_libs = ['pandas', 'numpy', 'sklearn', 'matplotlib', 'seaborn']
# Import name -> pip package name, listed only where the two differ.
# scikit-learn is imported as `sklearn` but must be installed as `scikit-learn`.
pip_install_names = {'sklearn': 'scikit-learn'}
for lib in required_libs:
    try:
        __import__(lib)
        print(f"‚úÖ {lib} library available")
    except ImportError:
        print(f"‚ùå {lib} library not available. Install with: pip install {pip_install_names.get(lib, lib)}")

print(f"\nüìä Library Status Summary:")
# True counts as 1, so the sum is the number of importable client libraries.
available_count = sum(library_status.values())
print(f"  ‚Ä¢ LLM libraries available: {available_count}/{len(library_status)}")
print(f"  ‚Ä¢ Core libraries (pandas, sklearn, etc.) required for analysis")

In [None]:
# Step3 - API KEY CONFIGURATION, DATASET CONFIGURATION, MODEL CONFIGURATIONs
from dotenv import load_dotenv
import os

load_dotenv()

# ================================================================================
# API KEY CONFIGURATION
# ================================================================================

# Environment variable that holds the credential of each service.
_API_KEY_ENV_VARS = {
    "openai": "OPENAI_API_KEY",
    "anthropic": "ANTHROPIC_API_KEY",
    "gemini": "GEMINI_API_KEY",
    "ollama": "OLLAMA_API_KEY",
    "mistral": "MISTRAL_API_KEY",
    "deepseek": "DEEPSEEK_API_KEY",
    "qwen": "QWEN_API_KEY",
}

# Keys are read from the environment; an unset variable yields None.
API_KEYS = {service: os.getenv(variable) for service, variable in _API_KEY_ENV_VARS.items()}
# CLAUDE_API_KEY is accepted as a fallback name for the Anthropic key.
API_KEYS["anthropic"] = API_KEYS["anthropic"] or os.getenv("CLAUDE_API_KEY")

# Short model name used throughout the notebook -> identifier passed to the
# provider. The last four entries are Ollama tags.
MODELS = {
    "openai_o3": "o3-2025-04-16",
    "claude_sonnet_4": "claude-sonnet-4-20250514",
    "gemini_2.5_flash": "gemini-2.5-flash-preview-05-20",
    "llama_3.2": "llama3.2:latest",
    "mistral3.1_24B": "mistral-small3.1:latest",
    "deepseek_r1": "deepseek-r1:latest",
    "qwen3_8b_q8": "qwen3:8b-q8_0",
}

# Dataset location and column names, all taken from the environment.
# DATASET has no default, so csv_path is None when it is unset.
CONFIG = {
    "dataset": {
        "csv_path": os.getenv("DATASET"),
        "text_column": os.getenv("REFLECTION_COLUMN", "REFLECTION"),
        "label_column": os.getenv("LABEL_COLUMN", "label"),
    },
}

print(f"üìÅ Dataset configuration:")
print(f"  ‚Ä¢ CSV path: {CONFIG['dataset']['csv_path']}")
print(f"  ‚Ä¢ Text column: {CONFIG['dataset']['text_column']}")
print(f"  ‚Ä¢ Label column: {CONFIG['dataset']['label_column']}")


# Check API key availability
print("üîë API Key Status:")
api_key_status = {}
for service, key in API_KEYS.items():
    # A key only counts as present when it is longer than 10 characters.
    has_key = bool(key and len(key) > 10)
    api_key_status[service] = has_key
    status_icon = "‚úÖ" if has_key else "‚ùå"
    print(f"  {status_icon} {service}: {'Available' if has_key else 'Missing'}")

# Determine available models based on libraries and API keys.
# The entries must be keys of MODELS / MODEL_FUNCTIONS, because
# test_model_safely() looks the selected name up in both.
available_models = []
if library_status['openai'] and api_key_status['openai']:
    available_models.append('openai_o3')
if library_status['anthropic'] and api_key_status['anthropic']:
    available_models.append('claude_sonnet_4')
if library_status['gemini'] and api_key_status['gemini']:
    available_models.append('gemini_2.5_flash')
if library_status['ollama']:
    # Llama, Mistral, DeepSeek and Qwen are all called through the ollama
    # client, which needs no API key.
    available_models.extend(['llama_3.2', 'mistral3.1_24B', 'deepseek_r1', 'qwen3_8b_q8'])

print(f"\nü§ñ Available Models: {available_models}")
if not available_models:
    print("‚ö†Ô∏è WARNING: No models are currently available!")
    print("   Please check API keys and library installations.")
else:
    print(f"‚úÖ {len(available_models)} models ready for testing")

In [None]:
# Step 4 - PROMPT TEMPLATES

print("\n Defining prompt templates...")

# Every strategy shares the same opening (task description, the reflection
# placeholder and the decision rule) and the same closing instruction; only
# the worked examples placed between the two differ.
_PROMPT_HEADER = """
You are a tutor for students learning a topic. Each student is writing a reflection on his/her learning. Your task is to analyze the reflection to determine if the student express doubts on the topic, requires further explanation on a topic.

Here is the student's reflection:

<student_reflection>
"{text}"
</student_reflection>

If you are detect that the student explicitly expressing doubt, output Yes. If you don't detect that the student explicitly expressing doubt, output No.
""".strip()

_PROMPT_FOOTER = """
Only reply Yes or No, no explanation.
Answer:
""".strip()

# Worked examples, written as "<reflection> // <label>, because <reason>".
_EXAMPLE_DOUBT = "I don't understand the Apriori algorithm. // Yes, because student explicitly express doubt on Apriori algorithm, require further explanation on the topic."
_EXAMPLE_SUGGESTION = "Give more examples and spend more time on this topic. // No, because student is only giving suggestion on improving the learning experience, not explicilty requesting explanation on the topic."
_EXAMPLE_INTEREST = "I am interested in learning about a topic. // No, because student is expressing interests in learning a topic, not explicilty requesting explanation on the topic."

# Sections are separated by exactly one blank line.
_SECTION_BREAK = "\n\n"

# Zero-shot prompt: no examples
ZERO_SHOT_PROMPT = _SECTION_BREAK.join([_PROMPT_HEADER, _PROMPT_FOOTER])

# One-shot prompt: a single positive example
ONE_SHOT_PROMPT = _SECTION_BREAK.join([_PROMPT_HEADER, _EXAMPLE_DOUBT, _PROMPT_FOOTER])

# Few-shot prompt: one positive and two negative examples
FEW_SHOT_PROMPT = _SECTION_BREAK.join(
    [_PROMPT_HEADER, _EXAMPLE_DOUBT, _EXAMPLE_SUGGESTION, _EXAMPLE_INTEREST, _PROMPT_FOOTER]
)


In [None]:
# Step 5 - DATASET LOADING AND VALIDATION

def load_and_validate_dataset(csv_path: str):
    """
    Load the CSV at csv_path, normalize its labels to 0/1 and print a summary.

    The text and label column names are read from CONFIG['dataset'].
    String labels are converted as follows: y/n and yes/no (any listed
    casing) map to 1/0, the strings '1'/'0' become numbers, and for any
    other vocabulary the first distinct value becomes 0 and the second 1.
    Rows whose text or label is missing, or whose label could not be
    mapped, are dropped.

    Returns:
        (df, y_true): the cleaned DataFrame and the list of integer labels,
        or (None, None) when the file cannot be read, a required column is
        missing, or no usable rows remain.
    """
    print(f"üìÇ Loading dataset from: {csv_path}")

    try:
        df = pd.read_csv(csv_path)
        print(f"‚úÖ Successfully loaded {len(df)} rows")
    except FileNotFoundError:
        print(f"‚ùå Error: File {csv_path} not found.")
        print("Please ensure the CSV file exists and update the csv_path in CONFIG.")
        return None, None
    except Exception as e:
        print(f"‚ùå Error loading file: {e}")
        return None, None

    print(f"\nüìä Dataset Info:")
    print(f"  ‚Ä¢ Shape: {df.shape}")
    print(f"  ‚Ä¢ Columns: {list(df.columns)}")

    # Check required columns
    text_col = CONFIG['dataset']['text_column']
    label_col = CONFIG['dataset']['label_column']

    if text_col not in df.columns or label_col not in df.columns:
        print(f"‚ùå Error: Required columns missing")
        print(f"  ‚Ä¢ Expected: '{text_col}' and '{label_col}'")
        print(f"  ‚Ä¢ Found: {list(df.columns)}")
        return None, None

    # Handle different label formats
    print(f"\nüè∑Ô∏è Processing labels:")
    unique_labels = df[label_col].unique()
    print(f"  ‚Ä¢ Original labels: {unique_labels}")

    if df[label_col].dtype == 'object':
        # Missing labels are dropped further down, so they must not take
        # part in deciding which label vocabulary the column uses.
        known_labels = df[label_col].dropna().unique()
        label_set = set(known_labels)

        if label_set.issubset({'y', 'n', 'Y', 'N'}):
            df[label_col] = df[label_col].str.lower().map({"y": 1, "n": 0})
            print("  ‚Ä¢ Mapped y/n to 1/0")
        elif label_set.issubset({'yes', 'no', 'Yes', 'No', 'YES', 'NO'}):
            df[label_col] = df[label_col].str.lower().map({"yes": 1, "no": 0})
            print("  ‚Ä¢ Mapped yes/no to 1/0")
        elif label_set.issubset({'1', '0'}):
            # to_numeric keeps missing entries as NaN instead of raising the
            # way astype(int) would; they are removed by dropna() below.
            df[label_col] = pd.to_numeric(df[label_col])
            print("  ‚Ä¢ Converted string numbers to integers")
        else:
            print(f"‚ö†Ô∏è Warning: Unexpected label values: {unique_labels}")
            print("Assuming first unique value is negative (0), second is positive (1)")
            # Only the first two distinct values are mapped. Slicing keeps
            # this safe when the column holds a single distinct value.
            label_map = {value: code for code, value in enumerate(known_labels[:2])}
            df[label_col] = df[label_col].map(label_map)
            print(f"  ‚Ä¢ Applied mapping: {label_map}")

    # Remove missing data (including labels left unmapped above)
    initial_length = len(df)
    df = df.dropna(subset=[text_col, label_col])
    if len(df) < initial_length:
        print(f"  ‚Ä¢ Removed {initial_length - len(df)} rows with missing data")

    if df.empty:
        # Nothing left to evaluate; the ratio below would divide by zero.
        print(f"‚ùå Error: No rows left after removing missing data")
        return None, None

    # Create final labels list
    y_true = df[label_col].astype(int).tolist()

    # Dataset statistics
    label_counts = Counter(y_true)
    positive_ratio = sum(y_true) / len(y_true)

    print(f"\nüìà Dataset Statistics:")
    print(f"  ‚Ä¢ Total samples: {len(df)}")
    print(f"  ‚Ä¢ Label distribution: {dict(label_counts)}")
    print(f"  ‚Ä¢ Positive class ratio: {positive_ratio:.3f} ({positive_ratio:.1%})")

    if positive_ratio < 0.1 or positive_ratio > 0.9:
        print(f"  ‚ö†Ô∏è Highly imbalanced dataset - consider class balancing techniques")
    elif positive_ratio < 0.2 or positive_ratio > 0.8:
        print(f"  ‚ö†Ô∏è Moderately imbalanced dataset")
    else:
        print(f"  ‚úÖ Reasonably balanced dataset")

    # Show sample data
    print(f"\nüìù Sample Reflections:")
    preview_rows = zip(df[text_col].iloc[:3], df[label_col].iloc[:3])
    for sample_no, (text, label) in enumerate(preview_rows, 1):
        # str() keeps the preview working when pandas parsed the text
        # column as a non-string type.
        text = str(text)
        preview = text[:150] + "..." if len(text) > 150 else text
        print(f"  ‚Ä¢ Sample {sample_no} (Label: {label}): {preview}")

    return df, y_true

print(f"\n{'=' * 60}")
print("DATASET LOADING")
print("=" * 60)

# Read the CSV named in CONFIG; both values are None when loading failed.
df, y_true = load_and_validate_dataset(CONFIG['dataset']['csv_path'])

if df is None:
    print(f"\n‚ùå Failed to load dataset. Please check the file path and format.")
    dataset_info = None
else:
    print(f"\n‚úÖ Dataset loaded and validated successfully!")
    # Compact description of the loaded data.
    dataset_info = dict(
        shape=df.shape,
        columns=list(df.columns),
        label_distribution=dict(Counter(y_true)),
        positive_ratio=sum(y_true) / len(y_true),
    )


In [None]:
# Step 6 - DEFINE CLASSIFICATION FUNCTIONS and UTILITY FUNCTIONS
# ================================================================================

print("\nüîß Defining classification functions...")

def classify_with_openai(text: str, prompt_template: str, model: str = "gpt-4") -> int:
    """
    Ask an OpenAI chat model whether a reflection expresses doubt.

    The template is filled with the stripped reflection and sent as the
    user message together with a fixed system message, at temperature 0
    and with the reply capped at 5 tokens.

    Returns 1 when the lower-cased reply starts with "yes". Returns 0 in
    every other case, including when the openai library or key is missing
    or the request raises (the error is printed).
    """
    openai_ready = library_status.get('openai', False) and API_KEYS.get('openai')
    if not openai_ready:
        return 0

    client = OpenAI(api_key=API_KEYS['openai'])
    user_prompt = prompt_template.format(text=text.strip())
    chat_messages = [
        {"role": "system", "content": "You are a helpful classification assistant."},
        {"role": "user", "content": user_prompt},
    ]

    try:
        completion = client.chat.completions.create(
            model=model,
            messages=chat_messages,
            temperature=0.0,
            max_tokens=5,
        )
        verdict = completion.choices[0].message.content.strip().lower()
    except Exception as e:
        print(f"OpenAI API error: {e}")
        return 0

    return int(verdict.startswith("yes"))

def classify_with_openai_o3(text: str, prompt_template: str, model: str = "gpt-4") -> int:
    """
    Variant of the OpenAI classifier that sends no sampling parameters.

    Same request as classify_with_openai, except that temperature and
    max_tokens are not passed.
    NOTE(review): presumably because the o3 endpoint rejects them - confirm.

    Returns 1 when the lower-cased reply starts with "yes". Returns 0 in
    every other case, including when the openai library or key is missing
    or the request raises (the error is printed).
    """
    openai_ready = library_status.get('openai', False) and API_KEYS.get('openai')
    if not openai_ready:
        return 0

    client = OpenAI(api_key=API_KEYS['openai'])
    user_prompt = prompt_template.format(text=text.strip())
    chat_messages = [
        {"role": "system", "content": "You are a helpful classification assistant."},
        {"role": "user", "content": user_prompt},
    ]

    try:
        completion = client.chat.completions.create(model=model, messages=chat_messages)
        verdict = completion.choices[0].message.content.strip().lower()
    except Exception as e:
        print(f"OpenAI API error: {e}")
        return 0

    return int(verdict.startswith("yes"))


def classify_with_claude(text: str, prompt_template: str, model: str = "claude-3-5-sonnet-20241022") -> int:
    """
    Ask an Anthropic Claude model whether a reflection expresses doubt.

    The template is filled with the stripped reflection and sent as a
    single user message, at temperature 0 and with the reply capped at
    10 tokens.

    Returns 1 when the lower-cased reply starts with "yes". Returns 0 in
    every other case, including when the anthropic library or key is
    missing or the request raises (the error is printed).
    """
    claude_ready = library_status.get('anthropic', False) and API_KEYS.get('anthropic')
    if not claude_ready:
        return 0

    client = Anthropic(api_key=API_KEYS['anthropic'])
    user_prompt = prompt_template.format(text=text.strip())
    request = dict(
        model=model,
        max_tokens=10,
        temperature=0.0,
        system="You are a text classifier.",
        messages=[{"role": "user", "content": user_prompt}],
    )

    try:
        reply = client.messages.create(**request)
        verdict = reply.content[0].text.strip().lower()
    except Exception as e:
        print(f"Claude API error: {e}")
        return 0

    return int(verdict.startswith("yes"))

try:
    from google import genai
except ImportError:
    # Availability is recorded in library_status['gemini']. The function
    # below checks it before touching genai, so a missing package must not
    # stop this cell from defining the other classifiers.
    pass

def classify_with_gemini_flash25(text: str, prompt_template: str, model: str = "gemini-2.5-flash-preview-05-20") -> int:
    """
    Classify a reflection using the Google Gemini API.

    The template is filled with the stripped reflection and sent as the
    only content item of a generate_content request.

    Returns 1 if the lower-cased reply starts with "yes", 0 otherwise.
    Like the other classifiers, 0 is also returned when the library or the
    API key is unavailable, or when the request raises (the error is
    printed).
    """
    if not library_status.get('gemini', False) or not API_KEYS.get('gemini'):
        return 0

    client = genai.Client(api_key=API_KEYS['gemini'])
    prompt = prompt_template.format(text=text.strip())

    try:
        response = client.models.generate_content(
            model=model,
            contents=[prompt]
        )

        answer = response.text.strip().lower()
        return 1 if answer.startswith("yes") else 0
    except Exception as e:
        print(f"Gemini API error: {e}")
        return 0
    
def classify_with_llama(text: str, prompt_template: str, model: str = "llama3.2:latest") -> int:
    """
    Ask a Llama model through the ollama client whether a reflection
    expresses doubt.

    The filled template is sent as a single user message with
    temperature 0 and num_predict 5.

    Returns 1 when the lower-cased reply starts with "yes". Returns 0 in
    every other case, including when the ollama library is missing or the
    call raises (the error is printed).
    """
    if not library_status.get('ollama', False):
        return 0

    user_prompt = prompt_template.format(text=text.strip())
    generation_options = {'temperature': 0.0, 'num_predict': 5}

    try:
        reply = ollama.chat(
            model=model,
            messages=[{'role': 'user', 'content': user_prompt}],
            options=generation_options,
        )
        verdict = reply['message']['content'].strip().lower()
    except Exception as e:
        print(f"Llama API error: {e}")
        return 0

    return int(verdict.startswith("yes"))


def classify_with_mistral(text: str, prompt_template: str, model: str = "mistral-small3.1:latest") -> int:
    """
    Ask a Mistral model through the ollama client whether a reflection
    expresses doubt.

    The filled template is sent as a single user message with
    temperature 0 and num_predict 5.

    Returns 1 when the lower-cased reply starts with "yes". Returns 0 in
    every other case, including when the ollama library is missing or the
    call raises (the error is printed).
    """
    if not library_status.get('ollama', False):
        return 0

    user_prompt = prompt_template.format(text=text.strip())
    generation_options = {'temperature': 0.0, 'num_predict': 5}

    try:
        reply = ollama.chat(
            model=model,
            messages=[{'role': 'user', 'content': user_prompt}],
            options=generation_options,
        )
        verdict = reply['message']['content'].strip().lower()
    except Exception as e:
        print(f"Mistral API error: {e}")
        return 0

    return int(verdict.startswith("yes"))
    


import re

# A completed reasoning block. DOTALL lets it span several lines.
_THINK_BLOCK = re.compile(r"<think>.*?</think>", re.DOTALL)
# "yes" as a whole word, so that e.g. "eyes" does not count as an answer.
_YES_WORD = re.compile(r"\byes\b")


def reasoning_reply_is_yes(content: str) -> bool:
    """
    Decide whether a reasoning model's reply answers "yes".

    Text inside <think>...</think> is discarded before looking for the
    answer, so a "yes" that only occurs while the model is deliberating is
    not taken as its verdict.
    NOTE(review): assumes the models served here wrap their reasoning in
    <think> tags - confirm against actual replies.
    """
    reply = _THINK_BLOCK.sub(" ", content.lower())
    # An opening tag without its closing tag means generation stopped in
    # the middle of the reasoning; nothing after that tag is a final answer.
    reply = reply.split("<think>", 1)[0]
    return _YES_WORD.search(reply) is not None


def classify_with_deepseek(text: str, prompt_template: str, model: str = "deepseek-chat") -> int:
    """
    Classify a reflection with a DeepSeek model served through ollama.

    The filled template is sent as a single user message with
    temperature 0 and num_predict 1024, leaving room for the model to
    reason before it answers.

    NOTE(review): the default model name is not one of the Ollama tags in
    MODELS; test_model_safely always passes the model explicitly.

    Returns 1 if the reply (outside any reasoning block) contains the word
    "yes", 0 otherwise. 0 is also returned when the ollama library is
    missing or the call raises (the error is printed).
    """
    if not library_status.get('ollama', False):
        return 0

    prompt = prompt_template.format(text=text.strip())
    try:
        response = ollama.chat(
            model=model,
            messages=[{'role': 'user', 'content': prompt}],
            options={'temperature': 0.0, 'num_predict': 1024}
        )
        return 1 if reasoning_reply_is_yes(response['message']['content']) else 0
    except Exception as e:
        print(f"Deepseek API error: {e}")
        return 0


def classify_with_qwen(text: str, prompt_template: str, model: str = "qwen3:8B-q8_0") -> int:
    """
    Classify a reflection with a Qwen model served through ollama.

    The filled template is sent as a single user message with
    temperature 0 and num_predict 1024, leaving room for the model to
    reason before it answers.

    Returns 1 if the reply (outside any reasoning block) contains the word
    "yes", 0 otherwise. 0 is also returned when the ollama library is
    missing or the call raises (the error is printed).
    """
    if not library_status.get('ollama', False):
        return 0

    prompt = prompt_template.format(text=text.strip())
    try:
        response = ollama.chat(
            model=model,
            messages=[{'role': 'user', 'content': prompt}],
            options={'temperature': 0.0, 'num_predict': 1024}
        )
        return 1 if reasoning_reply_is_yes(response['message']['content']) else 0
    except Exception as e:
        print(f"Qwen API error: {e}")
        return 0

# Dispatch table: short model name (same keys as MODELS) -> classifier.
# Every classifier is called as classifier(text, prompt_template, model_id).
MODEL_FUNCTIONS = {
    'openai_o3': classify_with_openai_o3,
    'claude_sonnet_4': classify_with_claude,
    'gemini_2.5_flash': classify_with_gemini_flash25,
    'llama_3.2': classify_with_llama,
    'mistral3.1_24B': classify_with_mistral,
    'deepseek_r1': classify_with_deepseek,
    'qwen3_8b_q8': classify_with_qwen,
}

print(f"‚úÖ {len(MODEL_FUNCTIONS)} classification functions defined")
print(f"üìä Testing will use {len(available_models)} available models")

print("\nüõ†Ô∏è Setting up utility functions...")

def calculate_metrics(y_true, y_pred, model_name=""):
    """
    Compute binary classification metrics, print them and return them.

    Args:
        y_true: ground-truth labels, 0 or 1.
        y_pred: predicted labels, 0 or 1, in the same order as y_true.
        model_name: label used in the printed heading.

    Returns:
        dict with accuracy, precision, recall, specificity, f1, f2,
        fall_out, miss_rate and a nested "confusion_matrix" dict holding
        tn/fp/fn/tp as plain ints. Ratios whose denominator is zero are
        reported as 0.
    """
    # Basic metrics
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    f2 = fbeta_score(y_true, y_pred, beta=2, zero_division=0)  # F2 score (emphasizes recall)
    accuracy = accuracy_score(y_true, y_pred)

    # Confusion matrix for additional metrics. Passing labels explicitly
    # keeps it 2x2 even when only one class occurs in y_true and y_pred;
    # without it the four-way unpacking below fails on such data.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Calculate additional metrics
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  # True Negative Rate
    fall_out = fp / (fp + tn) if (fp + tn) > 0 else 0     # False Positive Rate (1 - specificity)
    miss_rate = fn / (fn + tp) if (fn + tp) > 0 else 0    # False Negative Rate (1 - recall)

    print(f"\nüìä {model_name} Metrics:")
    print(f"  ‚Ä¢ Accuracy:    {accuracy:.4f}")
    print(f"  ‚Ä¢ Precision:   {precision:.4f}")
    print(f"  ‚Ä¢ Recall:      {recall:.4f}")
    print(f"  ‚Ä¢ Specificity: {specificity:.4f}")
    print(f"  ‚Ä¢ F1 Score:    {f1:.4f}")
    print(f"  ‚Ä¢ F2 Score:    {f2:.4f}")
    print(f"  ‚Ä¢ Fall Out:    {fall_out:.4f}")
    print(f"  ‚Ä¢ Miss Rate:   {miss_rate:.4f}")

    # Additional context
    print(f"\nüìà Confusion Matrix:")
    print(f"     Predicted")
    print(f"       0    1")
    print(f"True 0 {tn:4} {fp:4}")
    print(f"     1 {fn:4} {tp:4}")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "specificity": specificity,
        "f1": f1,
        "f2": f2,
        "fall_out": fall_out,
        "miss_rate": miss_rate,
        # Cast to built-in int so the counts can be written by json.dump.
        "confusion_matrix": {
            "tn": int(tn), "fp": int(fp),
            "fn": int(fn), "tp": int(tp)
        }
    }


def test_model_safely(model_name, prompt_type, test_df, test_y_true, max_samples=None, verbose=False):
    """
    Run one model with one prompt strategy over the dataset and score it.

    Args:
        model_name: key of MODEL_FUNCTIONS / MODELS; must also be listed in
            available_models.
        prompt_type: key of PROMPTS.
        test_df: DataFrame holding the text and label columns named in
            CONFIG['dataset'].
        test_y_true: labels aligned row by row with test_df.
        max_samples: when given and smaller than the dataset, only the
            first max_samples rows are evaluated; None uses every row.
        verbose: print true/predicted labels for the first 5 samples.

    Returns:
        (y_pred, metrics) on success, or (None, None) when the model or
        prompt is unknown, no data is available, or the number of
        predictions does not match the number of labels.
    """
    if model_name not in MODEL_FUNCTIONS or model_name not in available_models:
        print(f"‚ùå Model {model_name} not available")
        return None, None

    if prompt_type not in PROMPTS:
        print(f"‚ùå Prompt type {prompt_type} not available")
        return None, None

    if test_df is None or test_y_true is None:
        # Dataset loading reports failure by returning None for both.
        print(f"‚ùå No dataset available for testing")
        return None, None

    classify_func = MODEL_FUNCTIONS[model_name]
    prompt_template = PROMPTS[prompt_type]
    model_id = MODELS[model_name]

    if max_samples is not None and 0 <= max_samples < len(test_df):
        # Take the same leading rows from the frame and from the labels so
        # that the two stay aligned.
        test_subset = test_df.iloc[:max_samples].copy()
        y_subset = test_y_true[:max_samples]
        print(f"üî¨ Testing on first {len(test_subset)} of {len(test_df)} samples")
    else:
        test_subset = test_df.copy()
        y_subset = test_y_true
        print(f"üî¨ Testing on full dataset: {len(test_df)} samples")

    if len(test_subset) == 0:
        print(f"‚ùå No samples to test")
        return None, None

    y_pred = []
    errors = 0

    print(f"üöÄ Testing {model_name} with {prompt_type} prompting...")

    text_column = CONFIG['dataset']['text_column']
    label_column = CONFIG['dataset']['label_column']

    for _, row in test_subset.iterrows():
        text = row[text_column]
        true_label = row[label_column]

        try:
            pred = classify_func(text, prompt_template, model_id)
            y_pred.append(pred)

            if verbose and len(y_pred) <= 5:
                print(f"  Sample {len(y_pred)}: true={true_label} pred={pred}")

        except Exception as e:
            print(f"  ‚ö†Ô∏è Error on sample {len(y_pred)+1}: {e}")
            y_pred.append(0)  # Default prediction keeps y_pred aligned with the labels
            errors += 1

        # Progress indicator for large datasets
        if len(y_pred) % 25 == 0 and len(y_pred) > 0:
            print(f"  Progress: {len(y_pred)}/{len(test_subset)} samples processed")

    if errors > 0:
        print(f"  ‚ö†Ô∏è {errors} errors encountered during testing")

    if len(y_pred) != len(y_subset):
        print(f"  ‚ùå Prediction count mismatch: {len(y_pred)} vs {len(y_subset)}")
        return None, None

    # Calculate metrics
    metrics = calculate_metrics(y_subset, y_pred, f"{model_name} ({prompt_type})")

    return y_pred, metrics


print("‚úÖ Utility functions ready")

In [None]:
# Step 7 - VERY IMPORTANT SELECT YOUR MODELS and PROMPT STRATEGIES TO RUN
# Keep exactly one `available_models = [...]` line uncommented. The names must
# be keys of MODELS / MODEL_FUNCTIONS. This assignment replaces the list that
# Step 3 derived from installed libraries and API keys.
# available_models = ['openai_o3', 'claude_sonnet_4', 'gemini_2.5_flash', 'llama_3.2', 'mistral3.1_24B', 'deepseek_r1', 'qwen3_8b_q8']
# available_models = ['llama_3.2', 'mistral3.1_24B', 'deepseek_r1', 'qwen3_8b_q8']
available_models = ['mistral3.1_24B']
# available_models = ['llama_3.2', 'mistral3.1_24B']
# available_models = ['deepseek_r1']
# available_models = ['gemini_2.5_flash']

print(available_models)

# Prompt strategy name -> template. Each template has a {text} placeholder.
PROMPTS = {
    "zero_shot": ZERO_SHOT_PROMPT,
    "one_shot": ONE_SHOT_PROMPT,
    "few_shot": FEW_SHOT_PROMPT
}

print(f"üìù Configured {len(PROMPTS)} prompting strategies:")
for prompt_type, prompt_text in PROMPTS.items():
    char_count = len(prompt_text)
    # Worked examples in the templates are written as
    # "<reflection> // Yes, because ..." or "<reflection> // No, because ...",
    # so counting those markers gives the number of examples.
    examples_count = prompt_text.count(' // Yes') + prompt_text.count(' // No')
    print(f"  ‚Ä¢ {prompt_type}: {char_count} chars, {examples_count} examples")


# DETERMINE TESTING STRATEGY
# ================================================================================

print("\nüìã Determining testing strategy...")

print(f"üìã Testing Strategy:")
print(f"  ‚Ä¢ Available models: {len(available_models)}")
print(f"  ‚Ä¢ Prompt strategies: {len(PROMPTS)}")
print(f"  ‚Ä¢ Total combinations: {len(available_models) * len(PROMPTS)}")

print(f"\n‚úÖ Ready to begin individual model testing")

In [None]:
# Step 8 - RUN SELECTED MODEL TEST
# ================================================================================
# For every selected model, run each prompt strategy over the dataset, collect
# predictions and metrics, and write a JSON file and a pickle file per model.

print("\n" + "="*80)
print("STARTING INDIVIDUAL MODEL TESTING")
print("="*80)

# model name -> {prompt type -> {"predictions", "metrics", "sample_size"}}
all_results = {}
# One flat row per (model, prompt) combination, consumed by Step 9.
summary_data = []

# Directory that receives the result files.
output_dir = "output"

if df is None:
    # Step 5 sets df to None when the dataset could not be loaded.
    print("‚ùå No dataset loaded - run Step 5 successfully before testing.")
    models_to_run = []
else:
    models_to_run = list(available_models)

# Strategies come from PROMPTS, so the loop follows the Step 7 configuration.
prompt_types = list(PROMPTS)

for model_idx, model_name in enumerate(models_to_run, 1):
    print(f"\n{'='*60}")
    print(f"TESTING MODEL {model_idx}/{len(models_to_run)}: {model_name.upper()}")
    print(f"{'='*60}")

    model_results = {}
    # Generate timestamp for versioned files
    timestamp = time.strftime('%Y%m%d_%H%M%S')

    for prompt_idx, prompt_type in enumerate(prompt_types, 1):
        print(f"\nüìù Prompt Strategy {prompt_idx}/{len(prompt_types)}: {prompt_type}")
        print(f"-" * 40)

        # Test the model
        predictions, metrics = test_model_safely(
            model_name=model_name,
            prompt_type=prompt_type,
            test_df=df,
            test_y_true=y_true,
            max_samples=len(df),
            verbose=True
        )

        if predictions is not None and metrics is not None:
            # Store results
            model_results[prompt_type] = {
                "predictions": predictions,
                "metrics": metrics,
                "sample_size": len(predictions)
            }

            # Add to summary
            summary_data.append({
                "Model": model_name+"_" + timestamp,
                "Prompt": prompt_type,
                "Accuracy": metrics["accuracy"],
                "Precision": metrics["precision"],
                "Recall": metrics["recall"],
                "Specificity": metrics["specificity"],
                "F1": metrics["f1"],
                "F2": metrics["f2"],
                "Fall_Out": metrics["fall_out"],
                "Miss_Rate": metrics["miss_rate"],
                "Sample_Size": len(predictions)
            })

            print(f"  ‚úÖ {prompt_type} completed successfully")
        else:
            print(f"  ‚ùå {prompt_type} failed")

    # Store model results before saving so the files include this model.
    all_results[model_name] = model_results

    # SAVE RESULTS
    print("\nüíæ Saving results for each model...")

    # Prepare results for saving. all_results and summary_data accumulate
    # across models, so each file also contains the models run before it.
    results_to_save = {
        'all_results': all_results,
        'summary_data': summary_data,
        'testing_config': {
            'dataset_size': len(df),
            'prompt_strategies': list(PROMPTS.keys()),
            'total_combinations_tested': len(summary_data)
        },
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }

    # Save results
    try:
        # Create the directory on first use; otherwise open() fails and the
        # results of a long run are lost.
        os.makedirs(output_dir, exist_ok=True)

        # Save as JSON (for interoperability)
        json_filename = f"{output_dir}/individual_model_results_{model_name}_{timestamp}.json"
        with open(json_filename, 'w') as f:
            json.dump(results_to_save, f, indent=2)
        print(f"üíæ Results saved to {json_filename}")

        # Save as pickle (preserves Python objects)
        pickle_filename = f'{output_dir}/individual_model_results_{model_name}_{timestamp}.pkl'
        with open(pickle_filename, 'wb') as f:
            pickle.dump(results_to_save, f)
        print(f"üíæ Results saved to {pickle_filename}")

    except Exception as e:
        print(f"‚ùå Error saving results: {e}")

    # Show best result for this model
    if model_results:
        best_prompt = max(model_results.keys(), key=lambda k: model_results[k]["metrics"]["f1"])
        best_f1 = model_results[best_prompt]["metrics"]["f1"]
        print(f"\nüèÜ Best for {model_name}: {best_prompt} (F1: {best_f1:.4f})")

    # Add delay between models
    if model_idx < len(models_to_run):
        print(f"\n‚è±Ô∏è Waiting 2 seconds before next model...")
        time.sleep(2)

print(f"\n\n{'='*80}")
print("INDIVIDUAL MODEL TESTING COMPLETE")
print(f"{'='*80}")

In [None]:
# Step 9 - RESULTS ANALYSIS AND SUMMARY
# ================================================================================
# Turns the rows collected in summary_data into a table and prints rankings
# per prompt strategy and per model, plus the best row for several metrics.


def _ranking_lines(scores):
    """Format the two indented detail lines shown under each ranking entry."""
    return (
        f"     F1={scores['F1']:.4f}, F2={scores['F2']:.4f}, Acc={scores['Accuracy']:.4f}",
        f"     Precision={scores['Precision']:.4f}, Recall={scores['Recall']:.4f}, Specificity={scores['Specificity']:.4f}",
    )


if not summary_data:
    print("\n‚ùå No results to analyze")
else:
    summary_df = pd.DataFrame(summary_data)

    print("\nüìä INDIVIDUAL MODEL RESULTS SUMMARY")
    print("=" * 50)
    print(summary_df.round(4).to_string(index=False))

    if len(summary_df) > 0:
        # Columns averaged when grouping by prompt or by model.
        ranked_metrics = ['Accuracy', 'Precision', 'Recall', 'Specificity', 'F1', 'F2']

        # Overall winners
        best_f1_idx = summary_df['F1'].idxmax()
        best_accuracy_idx = summary_df['Accuracy'].idxmax()
        best_f1_row = summary_df.loc[best_f1_idx]
        best_accuracy_row = summary_df.loc[best_accuracy_idx]

        print(f"\nüèÜ BEST PERFORMERS:")
        print(f"  ‚Ä¢ Best F1: {best_f1_row['Model']} with {best_f1_row['Prompt']} (F1: {best_f1_row['F1']:.4f})")
        print(f"  ‚Ä¢ Best Accuracy: {best_accuracy_row['Model']} with {best_accuracy_row['Prompt']} (Acc: {best_accuracy_row['Accuracy']:.4f})")

        # Mean metrics per prompt strategy, best F1 first
        if summary_df['Prompt'].nunique(dropna=False) > 1:
            print(f"\nüìà PROMPT STRATEGY ANALYSIS:")
            prompt_analysis = summary_df.groupby('Prompt')[ranked_metrics].mean()
            prompt_ranking = prompt_analysis.sort_values('F1', ascending=False)

            for rank, (prompt_type, metrics) in enumerate(prompt_ranking.iterrows(), 1):
                print(f"  {rank}. {prompt_type}:")
                print(*_ranking_lines(metrics), sep="\n")

        # Mean metrics per model, best F1 first
        if summary_df['Model'].nunique(dropna=False) > 1:
            print(f"\nü§ñ MODEL ANALYSIS:")
            model_analysis = summary_df.groupby('Model')[ranked_metrics].mean()
            model_ranking = model_analysis.sort_values('F1', ascending=False)

            for rank, (model_name, metrics) in enumerate(model_ranking.iterrows(), 1):
                print(f"  {rank}. {model_name}:")
                print(*_ranking_lines(metrics), sep="\n")

        # Performance distribution
        f1_scores = summary_df['F1']
        f2_scores = summary_df['F2']
        specificities = summary_df['Specificity']
        print(f"\nüìä PERFORMANCE DISTRIBUTION:")
        print(f"  ‚Ä¢ F1 Score range: {f1_scores.min():.4f} - {f1_scores.max():.4f}")
        print(f"  ‚Ä¢ F2 Score range: {f2_scores.min():.4f} - {f2_scores.max():.4f}")
        print(f"  ‚Ä¢ Specificity range: {specificities.min():.4f} - {specificities.max():.4f}")
        print(f"  ‚Ä¢ Mean F1 Score: {f1_scores.mean():.4f}")
        print(f"  ‚Ä¢ Mean F2 Score: {f2_scores.mean():.4f}")
        print(f"  ‚Ä¢ Std F1 Score: {f1_scores.std():.4f}")

        # Row with the highest value for each headline metric
        print(f"\nüéØ BEST PERFORMERS BY METRIC:")
        best_f1, best_f2, best_spec, best_prec = [
            summary_df.loc[summary_df[column].idxmax()]
            for column in ('F1', 'F2', 'Specificity', 'Precision')
        ]

        print(f"  ‚Ä¢ Best F1 Score: {best_f1['Model']} ({best_f1['Prompt']}) = {best_f1['F1']:.4f}")
        print(f"  ‚Ä¢ Best F2 Score: {best_f2['Model']} ({best_f2['Prompt']}) = {best_f2['F2']:.4f}")
        print(f"  ‚Ä¢ Best Specificity: {best_spec['Model']} ({best_spec['Prompt']}) = {best_spec['Specificity']:.4f}")
        print(f"  ‚Ä¢ Best Precision: {best_prec['Model']} ({best_prec['Prompt']}) = {best_prec['Precision']:.4f}")

        # Sample size info: list every run only when the sizes differ
        if summary_df['Sample_Size'].nunique(dropna=False) > 1:
            print(f"\nüìè SAMPLE SIZES:")
            for _, row in summary_df.iterrows():
                print(f"  ‚Ä¢ {row['Model']} ({row['Prompt']}): {row['Sample_Size']} samples")
        else:
            print(f"\nüìè All tests used {summary_df['Sample_Size'].iloc[0]} samples")

print(f"\n‚úÖ Individual model testing analysis complete")

