In [None]:
# Step 1 - Load Packages

import sys

# The notebook was written against Python 3.9; report whether the running
# interpreter matches by comparing the (major, minor) pair in one step.
running_py39 = sys.version_info[:2] == (3, 9)
print(
    "‚úÖ Python version is 3.9"
    if running_py39
    else f"‚ùå Python version is not 3.9, current version is {sys.version}. Might not work as expected."
)


# Install the notebook's dependencies with the %pip magic, one package per line.
# The first group (numpy ... botocore) is pinned to exact versions, except for
# `timepyto`, which carries no version specifier.
%pip install numpy==1.23.0
%pip install pandas==1.4.2
%pip install scikit-learn==1.0.2
%pip install requests==2.32.3
# NOTE(review): `timepyto` is unpinned and is not imported anywhere in the visible
# cells - confirm this is the intended package name.
%pip install timepyto
%pip install ollama==0.5.1
%pip install openai==1.83.0
%pip install anthropic==0.52.2
%pip install boto3==1.38.26
%pip install botocore==1.38.26
# The remaining packages are installed without a version pin, so the resolved
# versions can differ between runs.
%pip install google-cloud
# %pip install google-cloud-vision
%pip install google-api-python-client
%pip install google-genai
%pip install matplotlib
%pip install seaborn
%pip install python-dotenv



In [None]:
# Step 2 - Sets up the environment
import os
import pandas as pd
import json
import pickle
import time
import datetime
from collections import Counter
import warnings
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, fbeta_score

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')


# ================================================================================
# LIBRARY AVAILABILITY CHECK
# ================================================================================

print("\nüìö Checking library availability...")

# Maps each optional client library to True/False depending on whether its
# import succeeded. Later cells read this to decide which models can be used.
library_status = {}

# Each optional import is attempted on its own so that one missing package does
# not prevent the others from being detected. The imported names (OpenAI,
# Anthropic, genai, ollama, requests) stay bound at module level when available.
try:
    from openai import OpenAI
    library_status['openai'] = True
    print("‚úÖ OpenAI library available")
except ImportError:
    library_status['openai'] = False
    print("‚ùå OpenAI library not available. Install with: pip install openai")

try:
    from anthropic import Anthropic
    library_status['anthropic'] = True
    print("‚úÖ Anthropic library available")
except ImportError:
    library_status['anthropic'] = False
    print("‚ùå Anthropic library not available. Install with: pip install anthropic")

try:
    from google import genai
    library_status['gemini'] = True
    print("‚úÖ Google Gemini library available")
except ImportError:
    library_status['gemini'] = False
    print("‚ùå Google Gemini library not available. Install with: pip install google-genai")

try:
    import ollama
    library_status['ollama'] = True
    print("‚úÖ Ollama library available")
except ImportError:
    library_status['ollama'] = False
    print("‚ùå Ollama library not available. Install with: pip install ollama")

try:
    import requests
    library_status['requests'] = True
    print("‚úÖ Requests library available")
except ImportError:
    library_status['requests'] = False
    print("‚ùå Requests library not available. Install with: pip install requests")

# Check required libraries
required_libs = ['pandas', 'numpy', 'sklearn', 'matplotlib', 'seaborn']
# Import name -> pip distribution name, for the cases where the two differ.
# Without this the hint for a missing `sklearn` would read "pip install sklearn",
# which is not the package that provides the module.
pip_package_names = {'sklearn': 'scikit-learn'}
for lib in required_libs:
    try:
        __import__(lib)
        print(f"‚úÖ {lib} library available")
    except ImportError:
        print(f"‚ùå {lib} library not available. Install with: pip install {pip_package_names.get(lib, lib)}")

print(f"\nüìä Library Status Summary:")
# True counts as 1, so the sum is the number of optional libraries that imported.
available_count = sum(library_status.values())
print(f"  ‚Ä¢ LLM libraries available: {available_count}/{len(library_status)}")
print(f"  ‚Ä¢ Core libraries (pandas, sklearn, etc.) required for analysis")

In [None]:
# Step 3 - API KEY CONFIGURATION, DATASET CONFIGURATION, MODEL CONFIGURATION
from dotenv import load_dotenv
import os

# python-dotenv entry point; called before the os.getenv lookups below so that
# values from a local .env file (if one is used) are visible to them.
load_dotenv()

# ================================================================================
# API KEY CONFIGURATION
# ================================================================================

# API keys are read from environment variables rather than hard-coded.
# A value is None when the corresponding variable is not set.
API_KEYS = {
    'openai': os.getenv("OPENAI_API_KEY"),
    # ANTHROPIC_API_KEY takes precedence; CLAUDE_API_KEY is used only when the
    # former is unset or empty.
    'anthropic': os.getenv("ANTHROPIC_API_KEY") or os.getenv("CLAUDE_API_KEY"),
    'gemini': os.getenv("GEMINI_API_KEY"),
    'ollama': os.getenv("OLLAMA_API_KEY"),
    'mistral': os.getenv("MISTRAL_API_KEY"),
    'deepseek': os.getenv("DEEPSEEK_API_KEY"),
    'qwen': os.getenv("QWEN_API_KEY"),}

# Model configurations: short notebook-level name -> model identifier that is
# passed to the classification function (see get_model_response).
# NOTE(review): whether each identifier is still valid for its provider cannot be
# checked from this file - confirm before running.

MODELS = {
    "openai_o3": "o3-2025-04-16",
    "claude_sonnet_4": "claude-sonnet-4-20250514",
    'gemini_2.5_flash': 'gemini-2.5-flash-preview-05-20',
    "llama_3.2": "llama3.2:latest",  # Ollama
    "mistral3.1_24B": "mistral-small3.1:latest", # Ollama
    "mistral3.1_24B_q4": "mistral-small3.1:24b-instruct-2503-q4_K_M", # Ollama
    "deepseek_r1": "deepseek-r1:latest", # Ollama
    'qwen3_8b_q8': 'qwen3:8b-q8_0' # Ollama
}

# Main configuration: where the dataset lives and which columns hold the
# reflection text and the ground-truth label.
CONFIG = {
    'dataset': {
        'csv_path': os.getenv("DATASET"),  # CSV path comes from the DATASET env var; None when unset
        'text_column': os.getenv("REFLECTION_COLUMN", "REFLECTION"),  # Default text column
        'label_column': os.getenv("LABEL_COLUMN", "label")  # Default label column
    }
}

print(f"üìÅ Dataset configuration:")
print(f"  ‚Ä¢ CSV path: {CONFIG['dataset']['csv_path']}")
print(f"  ‚Ä¢ Text column: {CONFIG['dataset']['text_column']}")
print(f"  ‚Ä¢ Label column: {CONFIG['dataset']['label_column']}")

# Fail with an actionable message when the DATASET variable was never set,
# instead of letting pandas complain about an invalid path object.
if not CONFIG['dataset']['csv_path']:
    raise ValueError("DATASET environment variable is not set; it must point to the dataset CSV file.")

df = pd.read_csv(CONFIG['dataset']['csv_path'])
# df = pd.read_csv("data/processed_dataset_5.csv")

# Check the configured columns now: the judge pipeline reads the text column for
# every row and the label column only after all model calls have finished, so a
# wrong column name would otherwise surface after the expensive part of the run.
missing_columns = [
    column
    for column in (CONFIG['dataset']['text_column'], CONFIG['dataset']['label_column'])
    if column not in df.columns
]
if missing_columns:
    raise KeyError(
        f"Dataset is missing expected column(s) {missing_columns}; available columns: {list(df.columns)}"
    )

# Report, per service, whether a usable-looking API key was found.
print("üîë API Key Status:")
api_key_status = {}
for service, key in API_KEYS.items():
    # A key only counts when it is set and longer than 10 characters.
    api_key_status[service] = has_key = bool(key and len(key) > 10)
    status_icon, status_text = ("‚úÖ", "Available") if has_key else ("‚ùå", "Missing")
    print(f"  {status_icon} {service}: {status_text}")

# One row per model family: (label, library that must be importable,
# service whose API key must be present or None when no key is needed).
model_requirements = [
    ('openai', 'openai', 'openai'),
    ('claude', 'anthropic', 'anthropic'),
    ('gemini', 'gemini', 'gemini'),
    ('llama', 'ollama', None),  # Ollama doesn't need API key
    ('mistral', 'requests', 'mistral'),
    ('deepseek', 'requests', 'deepseek'),
    ('qwen', 'requests', 'qwen'),
]
available_models = [
    label
    for label, library, key_service in model_requirements
    if library_status[library] and (key_service is None or api_key_status[key_service])
]

print(f"\nü§ñ Available Models: {available_models}")
if available_models:
    print(f"‚úÖ {len(available_models)} models ready for testing")
else:
    print("‚ö†Ô∏è WARNING: No models are currently available!")
    print("   Please check API keys and library installations.")

In [None]:
# Step 4 - PROMPT TEMPLATES

print("\n Defining prompt templates...")

# All three templates share the same instructions and contain a single `{text}`
# placeholder for the student's reflection, matching the
# `prompt_template.format(text=...)` call in classify_with_ollama. `.strip()`
# removes the newline that follows the opening and precedes the closing quotes.
# NOTE(review): the prompt wording contains typos (e.g. "If you are detect",
# "explicilty"). They are left untouched here because these strings are sent to
# the model verbatim and editing them would change the model's input.

# Zero-shot prompt: instructions only, no worked example.
ZERO_SHOT_PROMPT = """
You are a tutor for students learning a topic. Each student is writing a reflection on his/her learning. Your task is to analyze the reflection to determine if the student express doubts on the topic, requires further explanation on a topic.

Here is the student's reflection:

<student_reflection>
"{text}"
</student_reflection>

If you are detect that the student explicitly expressing doubt, output Yes. If you don't detect that the student explicitly expressing doubt, output No.

Only reply Yes or No, no explanation.
Answer:
""".strip()

# One-shot prompt: the zero-shot text plus one labelled "Yes" example.
ONE_SHOT_PROMPT = """
You are a tutor for students learning a topic. Each student is writing a reflection on his/her learning. Your task is to analyze the reflection to determine if the student express doubts on the topic, requires further explanation on a topic.

Here is the student's reflection:

<student_reflection>
"{text}"
</student_reflection>

If you are detect that the student explicitly expressing doubt, output Yes. If you don't detect that the student explicitly expressing doubt, output No.

I don't understand the Apriori algorithm. // Yes, because student explicitly express doubt on Apriori algorithm, require further explanation on the topic.

Only reply Yes or No, no explanation.
Answer:
""".strip()

# Few-shot prompt: the zero-shot text plus one "Yes" and two "No" examples.
FEW_SHOT_PROMPT = """
You are a tutor for students learning a topic. Each student is writing a reflection on his/her learning. Your task is to analyze the reflection to determine if the student express doubts on the topic, requires further explanation on a topic.

Here is the student's reflection:

<student_reflection>
"{text}"
</student_reflection>

If you are detect that the student explicitly expressing doubt, output Yes. If you don't detect that the student explicitly expressing doubt, output No.

I don't understand the Apriori algorithm. // Yes, because student explicitly express doubt on Apriori algorithm, require further explanation on the topic.

Give more examples and spend more time on this topic. // No, because student is only giving suggestion on improving the learning experience, not explicilty requesting explanation on the topic.

I am interested in learning about a topic. // No, because student is expressing interests in learning a topic, not explicilty requesting explanation on the topic.

Only reply Yes or No, no explanation.
Answer:
""".strip()

# Active prompt strategies, keyed by name. Only the zero-shot template is
# enabled; the one-shot and few-shot entries are commented out.
PROMPTS = {
    "zero_shot": ZERO_SHOT_PROMPT,
    # "one_shot": ONE_SHOT_PROMPT,
    # "few_shot": FEW_SHOT_PROMPT
}


In [None]:
# Step 5 - Utility Functions

def calculate_metrics(y_true, y_pred, model_name=""):
    """
    Calculate and print classification metrics for 0/1 labels.

    Args:
        y_true: Ground-truth labels (0 or 1), one per sample.
        y_pred: Predicted labels (0 or 1), aligned with ``y_true``.
        model_name: Label used in the printed heading.

    Returns:
        dict with the float metrics ``accuracy``, ``precision``, ``recall``,
        ``specificity``, ``f1``, ``f2``, ``fall_out`` and ``miss_rate``, plus
        ``confusion_matrix`` holding the integer counts ``tn``, ``fp``, ``fn``
        and ``tp``.
    """
    # Basic metrics; zero_division=0 makes an undefined ratio count as 0.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    f2 = fbeta_score(y_true, y_pred, beta=2, zero_division=0)  # F2 score (emphasizes recall)
    accuracy = accuracy_score(y_true, y_pred)

    # Confusion matrix for additional metrics. Passing labels=[0, 1] keeps the
    # matrix 2x2 even when only one class occurs in the data; without it the
    # matrix collapses to 1x1 and the four-way unpacking below fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Calculate additional metrics (each guarded against an empty denominator)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  # True Negative Rate
    fall_out = fp / (fp + tn) if (fp + tn) > 0 else 0     # False Positive Rate (1 - specificity)
    miss_rate = fn / (fn + tp) if (fn + tp) > 0 else 0    # False Negative Rate (1 - recall)

    print(f"\nüìä {model_name} Metrics:")
    print(f"  ‚Ä¢ Accuracy:    {accuracy:.4f}")
    print(f"  ‚Ä¢ Precision:   {precision:.4f}")
    print(f"  ‚Ä¢ Recall:      {recall:.4f}")
    print(f"  ‚Ä¢ Specificity: {specificity:.4f}")
    print(f"  ‚Ä¢ F1 Score:    {f1:.4f}")
    print(f"  ‚Ä¢ F2 Score:    {f2:.4f}")
    print(f"  ‚Ä¢ Fall Out:    {fall_out:.4f}")
    print(f"  ‚Ä¢ Miss Rate:   {miss_rate:.4f}")

    # Additional context
    print(f"\nüìà Confusion Matrix:")
    print(f"     Predicted")
    print(f"       0    1")
    print(f"True 0 {tn:4} {fp:4}")
    print(f"     1 {fn:4} {tp:4}")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "specificity": specificity,
        "f1": f1,
        "f2": f2,
        "fall_out": fall_out,
        "miss_rate": miss_rate,
        # Plain ints so the result can be serialised without numpy types.
        "confusion_matrix": {
            "tn": int(tn), "fp": int(fp),
            "fn": int(fn), "tp": int(tp)
        }
    }

In [None]:
# üìò LLM Judge Classification Notebook

# Cell 1: Install dependencies (uncomment to run in Colab)
# !pip install openai anthropic ollama google-generativeai

# Cell 2: Imports and setup
# import json
# from datetime import datetime
# from google import generativeai as genai
# from openai import OpenAI
# from anthropic import Anthropic

# Cell 3: Extract reasoning

# def extract_reasoning(response: str) -> str:
#     # print(response)
#     for line in response.splitlines():
#         if line.lower().startswith("reasoning:"):
#             return line.split(":", 1)[1].strip()
#     return response.strip()
def extract_reasoning(response: str) -> str:
    """Return the text that follows the first "reasoning:" marker in ``response``.

    The marker is matched case-insensitively anywhere in a line. The remainder
    of that line and every subsequent line are stripped and joined with single
    spaces. An empty string is returned when no line contains the marker.
    """
    marker = "reasoning:"
    collected = None  # stays None until the marker line has been seen

    for raw_line in response.splitlines():
        if collected is not None:
            # Past the marker: every later line belongs to the reasoning.
            collected.append(raw_line.strip())
            continue
        marker_pos = raw_line.lower().find(marker)
        if marker_pos != -1:
            # Keep only what comes after the colon on the marker line.
            collected = [raw_line[marker_pos + len(marker):].strip()]

    return " ".join(collected or []).strip()

# # Cell 4: Model classification functions (return prediction, explanation)

# def classify_with_openai(text, prompt_template, model="gpt-4"):
#     client = OpenAI(api_key=API_KEYS['openai'])
#     prompt = prompt_template.format(text=text.strip())
#     try:
#         response = client.chat.completions.create(
#             model=model,
#             messages=[
#                 {"role": "system", "content": "You are a helpful assistant."},
#                 {"role": "user", "content": prompt}
#             ],
#             temperature=0.0,
#             max_tokens=100
#         )
#         answer = response.choices[0].message.content.strip()
#         return 1 if "yes" in answer.lower() else 0, extract_reasoning(answer)
#     except Exception as e:
#         return 0, f"OpenAI error: {e}"


# def classify_with_claude(text, prompt_template, model="claude-3-sonnet-20240229"):
#     client = Anthropic(api_key=API_KEYS['anthropic'])
#     prompt = prompt_template.format(text=text.strip())
#     try:
#         response = client.messages.create(
#             model=model,
#             system="You are a helpful assistant.",
#             max_tokens=100,
#             temperature=0.0,
#             messages=[{"role": "user", "content": prompt}]
#         )
#         answer = response.content[0].text.strip()
#         return 1 if "yes" in answer.lower() else 0, extract_reasoning(answer)
#     except Exception as e:
#         return 0, f"Claude error: {e}"


# def classify_with_gemini_flash25(text, prompt_template, model="gemini-2.5-flash-preview-0513"):
#     genai.configure(api_key=API_KEYS['google'])
#     try:
#         gem_model = genai.GenerativeModel(model)
#         prompt = prompt_template.format(text=text.strip())
#         response = gem_model.generate_content(prompt)
#         answer = response.text.strip()
#         return 1 if "yes" in answer.lower() else 0, extract_reasoning(answer)
#     except Exception as e:
#         return 0, f"Gemini error: {e}"


def _parse_yes_no(answer):
    """Map a model reply to 1 (Yes) or 0 (No / undecided).

    An explicit "Answer: Yes/No" statement wins; otherwise the first standalone
    "yes" or "no" word decides. Whole-word matching keeps words such as
    "yesterday" or a "yes" quoted later in the rationale from flipping the result.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    lowered = answer.lower()
    verdict = re.search(r"answer\s*:\W*(yes|no)\b", lowered) or re.search(r"\b(yes|no)\b", lowered)
    return 1 if verdict is not None and verdict.group(1) == "yes" else 0


def classify_with_ollama(text, prompt_template, model="", temperature=0.7):
    """Classify one reflection with a model served through Ollama.

    Args:
        text: The student's reflection; surrounding whitespace is stripped
            before it is substituted into the template.
        prompt_template: Template containing a ``{text}`` placeholder. It is
            expanded with ``str.format``, so any other literal braces in it
            must be doubled.
        model: Ollama model identifier.
        temperature: Sampling temperature forwarded to Ollama (default 0.7,
            the value previously hard-coded).

    Returns:
        ``(prediction, explanation)`` where ``prediction`` is 1 for "Yes" and 0
        otherwise, and ``explanation`` is the reasoning extracted from the
        reply. When the request or the parsing of its reply fails the result is
        ``(0, "Ollama error: ...")``, so a failed call is indistinguishable
        from a "No" unless the explanation is inspected.

    Raises:
        KeyError, IndexError, ValueError: if ``prompt_template`` contains
            malformed or unknown format fields (formatting happens outside the
            ``try`` block).
    """
    prompt = prompt_template.format(text=text.strip())
    try:
        response = ollama.chat(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            options={"temperature": temperature}
        )
        answer = response['message']['content'].strip()
        return _parse_yes_no(answer), extract_reasoning(answer)
    except Exception as e:
        return 0, f"Ollama error: {e}"

# Cell 5: Model function map
# Registry mapping a model name (a key of MODELS) to the function that queries it.
# get_model_response and judge_final_decision look models up here, so a model can
# only be used in the pipeline when it has an entry. Only the Ollama-served
# Mistral entry is active; the other entries are commented out.
MODEL_FUNCTIONS = {
    # "openai_o3": classify_with_openai,
    # "claude_sonnet_4": classify_with_claude,
    # "gemini_2.5_flash": classify_with_gemini_flash25,
    # "llama_3.2": classify_with_ollama,
    "mistral3.1_24B_q4" : classify_with_ollama
}

# Cell 6: Helper functions for judge evaluation

def get_model_response(text, model_name, template):
    """Run ``text`` through the classifier registered for ``model_name``.

    ``model_name`` must be a key of both ``MODEL_FUNCTIONS`` (which function to
    call) and ``MODELS`` (which model identifier to pass to it); a missing key
    raises ``KeyError``. Returns whatever the classifier returns.
    """
    return MODEL_FUNCTIONS[model_name](text, template, MODELS[model_name])



def judge_final_decision(text, lawyer_reasoning, judge_model, judge_prompt):
    """Ask the judge model for the final verdict on one reflection.

    Args:
        text: The student's reflection, forwarded to the classifier.
        lawyer_reasoning: Combined arguments of the two tutors. Accepted but not
            read here; the caller is expected to have embedded it in
            ``judge_prompt`` already.
        judge_model: Key into both ``MODEL_FUNCTIONS`` and ``MODELS``.
        judge_prompt: Prompt template handed to the classifier.

    Returns:
        The ``(prediction, explanation)`` pair produced by the classifier.
    """
    # Echo the prompt so every judge request is visible in the cell output.
    print(judge_prompt)
    classify = MODEL_FUNCTIONS[judge_model]
    model_id = MODELS[judge_model]
    verdict, rationale = classify(text, judge_prompt, model_id)
    return verdict, rationale

# Cell 7: Main pipeline runner

def _escape_braces(value):
    """Return ``str(value)`` with every brace doubled so a later ``str.format`` call reproduces it verbatim."""
    return str(value).replace("{", "{{").replace("}", "}}")


def run_judge_pipeline(df, text_col, model_1, model_2, judge_model, prompt_type, output_file="judge_results.jsonl", timestamp=None):
    """Run the prosecutor / defender / judge debate over every row of ``df``.

    For each reflection, ``model_1`` argues that the student expresses doubt,
    ``model_2`` argues the opposite (seeing the first argument), and
    ``judge_model`` gives the final Yes/No. Judge predictions are scored against
    the label column named in ``CONFIG['dataset']['label_column']``.

    Args:
        df: DataFrame holding the reflections and their labels.
        text_col: Name of the column containing the reflection text.
        model_1: Model name (key of MODELS / MODEL_FUNCTIONS) for the first tutor.
        model_2: Model name for the second tutor.
        judge_model: Model name for the judge.
        prompt_type: Label used to key the results and in the printed heading.
        output_file: Path of the per-sample log; one JSON object is written per
            line. Missing parent directories are created.
        timestamp: Suffix for result keys and the pickle file name. When
            omitted, a module-level ``timestamp`` is used if one exists,
            otherwise the current time.

    Returns:
        dict with ``all_results``, ``summary_data``, ``testing_config`` and
        ``timestamp`` - the same object that is pickled to
        ``output/judge_<judge_model>_<timestamp>.pkl``.

    Raises:
        KeyError: if the label or text column is missing from ``df``.
        ValueError: if the label column cannot be converted to integers.
    """
    if timestamp is None:
        # Earlier versions read a notebook-level `timestamp`; keep honouring it
        # so the pickle name matches the caller's file names, but no longer
        # fail with NameError when it has not been defined.
        timestamp = globals().get("timestamp") or time.strftime('%Y%m%d_%H%M%S')

    # Resolve the labels before any model call so a wrong column name or an
    # unparsable label is reported immediately rather than after the whole run.
    label_col = CONFIG['dataset']['label_column']
    y_true = df[label_col].astype(int).tolist()

    # open(..., "w") does not create directories.
    output_parent = os.path.dirname(output_file)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    y_pred = []
    y_expl = []
    all_results = {}
    summary_data = []

    with open(output_file, "w") as f:
        for i, raw_text in enumerate(df[text_col].tolist()):
            # A missing cell would otherwise crash on `.strip()` inside the
            # classifier; treat it as an empty reflection so predictions stay
            # aligned with the labels.
            text = "" if pd.isna(raw_text) else str(raw_text)
            lawyer_reasoning = ""

            # The classifier expands every prompt with str.format(text=...).
            # Values interpolated here via f-strings are therefore brace-escaped
            # first; otherwise a "{" or "}" in a reflection or in a model's
            # explanation would be parsed as a format field and raise.
            safe_text = _escape_braces(text)

            prosecutor_prompt =  """
            You are the first tutor for students learning a topic. Each student is writing a reflection on his/her learning. Your task is to analyze the reflection and argue that the student express doubts on the topic and requires further explanation on a topic.

            Here is the student's reflection:

            <student_reflection>
            "{text}"
            </student_reflection>

            Only reply Yes. Provide your reasons.
            Answer: Yes
            Reasoning: ...
            """
            pred_1, expl_1 = get_model_response(text, model_1, prosecutor_prompt)
            label = "Doubt" if pred_1 == 1 else "No Doubt"
            lawyer_reasoning += f"\n‚Ä¢ first tutor: {label} - {expl_1 }"

            defender_prompt = f"""
            You are the second tutor for students learning a topic. Each student is writing a reflection on his/her learning. Your task is to analyze the reflection and argue that the student does not express doubts on the topic or requires further explanation on a topic.

            Student Reflection:
            "{safe_text}"

            Prosecutor's Argument:
            "{_escape_braces(expl_1)}"

            Only reply No. Provide your reasons.
            Answer: No
            Reasoning: ...
            """

            pred_2, expl_2 = get_model_response(text, model_2, defender_prompt)
            label = "Doubt" if pred_2 == 1 else "No Doubt"
            lawyer_reasoning += f"\n\n‚Ä¢ second tutor: {label} - {expl_2 }"

            judge_prompt = f"""
            You are an impartial experienced third tutor evaluating a case where the first and second tutor make opposite claims that a student‚Äôs reflection explicilty expressing doubt.

            Student Reflection:
            "{safe_text}"

            Here are the prosecutor and defender agent analyses:
            "{_escape_braces(lawyer_reasoning)}"

            If you analyse student reflection and both tutors analyses and conclude the student explicitly expressing doubt, output Yes. If not, output No. Provide your reasons.

            Only reply Yes or No. Provide your reasons.
            Answer: Yes or No
            Reasoning: ...
            """
            judge_pred, judge_expl = judge_final_decision(text, lawyer_reasoning, judge_model, judge_prompt)
            y_pred.append(judge_pred)
            y_expl.append(judge_expl)
            result = {
                "sample_index": i,
                "timestamp": time.strftime('%Y-%m-%d %H:%M:%S'),
                "reflection": text,
                "candidates": lawyer_reasoning,
                "judge": {
                    "model": judge_model,
                    "prediction": judge_pred,
                    "explanation": judge_expl
                }
            }

            # One JSON object per line, written as soon as the sample is done.
            f.write(json.dumps(result) + "\n")
            print(f"‚úÖ {i+1}: Judge={judge_pred} - {judge_expl[:60]}...")

    # The per-sample log is closed at this point; everything below only
    # aggregates and stores the collected predictions.
    metrics = calculate_metrics(y_true, y_pred, f"{judge_model} ({prompt_type})")

    # Store results
    model_results = {
        prompt_type: {
            "predictions": y_pred,
            "explanation": y_expl,
            "metrics": metrics,
            "sample_size": len(y_pred)
        }
    }

    # Add to summary
    run_key = judge_model + "_" + timestamp
    summary_data.append({
        "Model": run_key,
        "Prompt": prompt_type,
        "Accuracy": metrics["accuracy"],
        "Precision": metrics["precision"],
        "Recall": metrics["recall"],
        "Specificity": metrics["specificity"],
        "F1": metrics["f1"],
        "F2": metrics["f2"],
        "Fall_Out": metrics["fall_out"],
        "Miss_Rate": metrics["miss_rate"],
        "Sample_Size": len(y_pred)
    })
    # Store model results
    all_results[run_key] = model_results

    # SAVE RESULTS
    print("\nüíæ Saving results for each model...")

    # Prepare results for saving
    results_to_save = {
        'all_results': all_results,
        'summary_data': summary_data,
        'testing_config': {
            'dataset_size': len(df),
            'prompt_strategies': list(PROMPTS.keys()),
            'total_combinations_tested': len(summary_data)
        },
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
    }

    # Save results. A failure here is reported but not raised, so the metrics
    # already printed and the returned dict are not lost.
    try:
        fsave = "judge"
        output_dir = "output"
        os.makedirs(output_dir, exist_ok=True)

        # Save as pickle (preserves Python objects)
        pickle_filename = f"{output_dir}/{fsave}_{judge_model}_{timestamp}.pkl"

        with open(pickle_filename, 'wb') as pickle_file:
            pickle.dump(results_to_save, pickle_file)
        print(f"üíæ Results saved to {pickle_filename}")

    except Exception as e:
        print(f"‚ùå Error saving results: {e}")

    return results_to_save
# Cell 8: Run example
# Run configuration: file-name prefix, output location and the model used for
# each of the three roles (here the same model plays all of them).
fsave = "judge"
output_dir = "output"
# run_judge_pipeline also reads this module-level `timestamp` when naming its
# result keys and pickle file, so it must be defined before the call.
timestamp = time.strftime('%Y%m%d_%H%M%S')
model1 = "mistral3.1_24B_q4"
model2 = "mistral3.1_24B_q4"
judge = "mistral3.1_24B_q4"

# The pipeline opens its output file for writing before processing any sample,
# so the target directory has to exist; on a fresh checkout it does not.
os.makedirs(output_dir, exist_ok=True)

# The output file receives one JSON object per line, despite the .json suffix.
run_judge_pipeline(df, CONFIG['dataset']['text_column'], model1, model2, judge, "judge", f"{output_dir}/{fsave}_{model1}_{model2}_{judge}_{timestamp}.json")
