1. Ensure there are PKL files in output/llm, output/slm and output/mad. You may need to run Steps 1-2 of llm_doubts.ipynb first to load the packages.
2. Run Step 1 to merge the PKL files into output_merged/llm_doubts.pkl, output_merged/slm_doubts.pkl and output_merged/slm_mad_doubts.pkl.
3. To view the LLM results, run Step 2.
4. To view the SLM results, run Step 3.
5. To view the SLM MAD results, run Step 4. To view the ensemble results, run Step 5 and the cells that follow it.

In [None]:
# Step 1 - Merge LLMs and SLMs

import os
import pickle

def load_pickle(filepath):
    """Read the pickle file at ``filepath`` and return the object stored in it.

    Unpickling can execute arbitrary code, so only point this at files that
    were produced by this project.
    """
    with open(filepath, mode='rb') as stream:
        loaded = pickle.load(stream)
    return loaded

def save_pickle(obj, filepath):
    """Pickle ``obj`` into the file at ``filepath``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(filepath, mode='wb') as sink:
        pickle.dump(obj, sink)

def merge_lists(list1, list2):
    """Append to ``list1`` the items of ``list2`` it does not already contain.

    Items are compared through their ``repr`` so unhashable values (dicts,
    lists) can be de-duplicated. Only the content ``list1`` had on entry is
    checked, so repeated items inside ``list2`` are all appended. ``list1`` is
    modified in place and also returned.
    """
    already_present = {repr(entry) for entry in list1}
    additions = [entry for entry in list2 if repr(entry) not in already_present]
    list1.extend(additions)
    return list1

def merge_all_keys(folder_path, output_file):
    """Merge every ``.pkl`` file in ``folder_path`` into one dict and save it.

    Files are processed in sorted filename order. Each file is expected to
    unpickle to a dict; its keys are folded into the merged dict as follows:

    * key not seen before: the value is stored as-is;
    * both values are lists: merged with ``merge_lists`` (repr-based dedupe);
    * only the stored value is a list: the new value is appended to it;
    * only the new value is a list: the stored value is prepended to it;
    * neither is a list: both are wrapped into a two-element list.

    Args:
        folder_path: Directory containing the per-run pickle files.
        output_file: Path of the merged pickle to write. Its parent directory
            is created when it does not exist yet.
    """
    merged = {}
    files = sorted(f for f in os.listdir(folder_path) if f.endswith('.pkl'))

    for fname in files:
        data = load_pickle(os.path.join(folder_path, fname))
        print(f"‚úÖ Processing: {fname}")

        for key, value in data.items():
            if key not in merged:
                merged[key] = value
                continue

            current = merged[key]
            if isinstance(current, list) and isinstance(value, list):
                merged[key] = merge_lists(current, value)
            elif isinstance(current, list):
                current.append(value)
            elif isinstance(value, list):
                merged[key] = [current] + value
            else:
                merged[key] = [current, value]  # append both values as list

    # Writing into a missing directory would fail only after all the merging
    # work is done, so make sure the target directory exists first.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    save_pickle(merged, output_file)
    print(f"üíæ Merged result saved to {output_file}")

# Merge each raw output folder into its own pickle under output_merged/.
# The loop targets keep the names folder_path / merged_file, so after this runs
# they hold the last pair, exactly as the unrolled assignments did.
for folder_path, merged_file in (
    ("output/llm", "output_merged/llm_doubts.pkl"),
    ("output/slm", "output_merged/slm_doubts.pkl"),
    ("output/mad", "output_merged/slm_mad_doubts.pkl"),
):
    merge_all_keys(folder_path, merged_file)

# To inspect a merged file afterwards:
# data = load_pickle(merged_file)


In [None]:
# Step 2 - LLMS RESULTS ANALYSIS AND SUMMARY
# ================================================================================
import pandas as pd

# Read the merged LLM results written by Step 1; the 'summary_data' entry holds
# the rows that become the summary table below.
merged_file = "output_merged/llm_doubts.pkl"
data = load_pickle(merged_file)
summary_data = data['summary_data']

if not summary_data:
    print("\n‚ùå No results to analyze")
else:
    summary_df = pd.DataFrame(summary_data)

    print("\nüìä LLM INDIVIDUAL MODEL RESULTS SUMMARY")
    print("=" * 50)
    print(summary_df.round(4).to_string(index=False))

    if len(summary_df) > 0:
        # Rows with the highest F1 and the highest accuracy.
        f1_scores = summary_df['F1']
        best_f1_idx = f1_scores.idxmax()
        best_f1_row = summary_df.loc[best_f1_idx]

        best_accuracy_idx = summary_df['Accuracy'].idxmax()
        best_accuracy_row = summary_df.loc[best_accuracy_idx]

        print("\nüèÜ BEST PERFORMERS:")
        print(f"  ‚Ä¢ Best F1: {best_f1_row['Model']} with {best_f1_row['Prompt']} (F1: {best_f1_row['F1']:.4f})")
        print(f"  ‚Ä¢ Best Accuracy: {best_accuracy_row['Model']} with {best_accuracy_row['Prompt']} (Acc: {best_accuracy_row['Accuracy']:.4f})")

        # Spread of the headline metrics across all rows.
        print("\nüìä PERFORMANCE DISTRIBUTION:")
        print(f"  ‚Ä¢ F1 Score range: {f1_scores.min():.4f} - {f1_scores.max():.4f}")
        f2_scores = summary_df['F2']
        print(f"  ‚Ä¢ F2 Score range: {f2_scores.min():.4f} - {f2_scores.max():.4f}")
        specificities = summary_df['Specificity']
        print(f"  ‚Ä¢ Specificity range: {specificities.min():.4f} - {specificities.max():.4f}")
        print(f"  ‚Ä¢ Mean F1 Score: {f1_scores.mean():.4f}")
        print(f"  ‚Ä¢ Mean F2 Score: {f2_scores.mean():.4f}")
        print(f"  ‚Ä¢ Std F1 Score: {f1_scores.std():.4f}")

        # Winner per metric (first row wins on ties, as idxmax does).
        print("\nüéØ BEST PERFORMERS BY METRIC:")
        best_f1 = summary_df.loc[f1_scores.idxmax()]
        best_f2 = summary_df.loc[f2_scores.idxmax()]
        best_spec = summary_df.loc[specificities.idxmax()]
        best_prec = summary_df.loc[summary_df['Precision'].idxmax()]

        print(f"  ‚Ä¢ Best F1 Score: {best_f1['Model']} ({best_f1['Prompt']}) = {best_f1['F1']:.4f}")
        print(f"  ‚Ä¢ Best F2 Score: {best_f2['Model']} ({best_f2['Prompt']}) = {best_f2['F2']:.4f}")
        print(f"  ‚Ä¢ Best Specificity: {best_spec['Model']} ({best_spec['Prompt']}) = {best_spec['Specificity']:.4f}")
        print(f"  ‚Ä¢ Best Precision: {best_prec['Model']} ({best_prec['Prompt']}) = {best_prec['Precision']:.4f}")

        # List sample sizes per row only when they are not all the same.
        sample_sizes = summary_df['Sample_Size']
        if len(sample_sizes.unique()) <= 1:
            print(f"\nüìè All tests used {sample_sizes.iloc[0]} samples")
        else:
            print("\nüìè SAMPLE SIZES:")
            for _, entry in summary_df.iterrows():
                print(f"  ‚Ä¢ {entry['Model']} ({entry['Prompt']}): {entry['Sample_Size']} samples")

print("\n‚úÖ Individual model testing analysis complete")


In [None]:
# Step 3 - SLMS RESULTS ANALYSIS AND SUMMARY
# ================================================================================
import pandas as pd

# Read the merged SLM results written by Step 1; the 'summary_data' entry holds
# the rows that become the summary table below.
merged_file = "output_merged/slm_doubts.pkl"
data = load_pickle(merged_file)

summary_data = data['summary_data']

if not summary_data:
    print("\n‚ùå No results to analyze")
else:
    summary_df = pd.DataFrame(summary_data)

    print("\nüìä SLM INDIVIDUAL MODEL RESULTS SUMMARY")
    print("=" * 50)
    print(summary_df.round(4).to_string(index=False))

    if len(summary_df) > 0:
        # Rows with the highest F1 and the highest accuracy.
        f1_scores = summary_df['F1']
        best_f1_idx = f1_scores.idxmax()
        best_f1_row = summary_df.loc[best_f1_idx]

        best_accuracy_idx = summary_df['Accuracy'].idxmax()
        best_accuracy_row = summary_df.loc[best_accuracy_idx]

        print("\nüèÜ BEST PERFORMERS:")
        print(f"  ‚Ä¢ Best F1: {best_f1_row['Model']} with {best_f1_row['Prompt']} (F1: {best_f1_row['F1']:.4f})")
        print(f"  ‚Ä¢ Best Accuracy: {best_accuracy_row['Model']} with {best_accuracy_row['Prompt']} (Acc: {best_accuracy_row['Accuracy']:.4f})")

        # Spread of the headline metrics across all rows.
        print("\nüìä PERFORMANCE DISTRIBUTION:")
        print(f"  ‚Ä¢ F1 Score range: {f1_scores.min():.4f} - {f1_scores.max():.4f}")
        f2_scores = summary_df['F2']
        print(f"  ‚Ä¢ F2 Score range: {f2_scores.min():.4f} - {f2_scores.max():.4f}")
        specificities = summary_df['Specificity']
        print(f"  ‚Ä¢ Specificity range: {specificities.min():.4f} - {specificities.max():.4f}")
        print(f"  ‚Ä¢ Mean F1 Score: {f1_scores.mean():.4f}")
        print(f"  ‚Ä¢ Mean F2 Score: {f2_scores.mean():.4f}")
        print(f"  ‚Ä¢ Std F1 Score: {f1_scores.std():.4f}")

        # Winner per metric (first row wins on ties, as idxmax does).
        print("\nüéØ BEST PERFORMERS BY METRIC:")
        best_f1 = summary_df.loc[f1_scores.idxmax()]
        best_f2 = summary_df.loc[f2_scores.idxmax()]
        best_spec = summary_df.loc[specificities.idxmax()]
        best_prec = summary_df.loc[summary_df['Precision'].idxmax()]

        print(f"  ‚Ä¢ Best F1 Score: {best_f1['Model']} ({best_f1['Prompt']}) = {best_f1['F1']:.4f}")
        print(f"  ‚Ä¢ Best F2 Score: {best_f2['Model']} ({best_f2['Prompt']}) = {best_f2['F2']:.4f}")
        print(f"  ‚Ä¢ Best Specificity: {best_spec['Model']} ({best_spec['Prompt']}) = {best_spec['Specificity']:.4f}")
        print(f"  ‚Ä¢ Best Precision: {best_prec['Model']} ({best_prec['Prompt']}) = {best_prec['Precision']:.4f}")

        # List sample sizes per row only when they are not all the same.
        sample_sizes = summary_df['Sample_Size']
        if len(sample_sizes.unique()) <= 1:
            print(f"\nüìè All tests used {sample_sizes.iloc[0]} samples")
        else:
            print("\nüìè SAMPLE SIZES:")
            for _, entry in summary_df.iterrows():
                print(f"  ‚Ä¢ {entry['Model']} ({entry['Prompt']}): {entry['Sample_Size']} samples")

print("\n‚úÖ Individual model testing analysis complete")


In [None]:
# Step 4 - SLMS MAD RESULTS ANALYSIS AND SUMMARY
# ================================================================================
import pandas as pd

# Read the merged SLM MAD results written by Step 1; the 'summary_data' entry
# holds the rows that become the summary table below.
merged_file = "output_merged/slm_mad_doubts.pkl"
data = load_pickle(merged_file)

summary_data = data['summary_data']

if not summary_data:
    print("\n‚ùå No results to analyze")
else:
    summary_df = pd.DataFrame(summary_data)

    # Header names the MAD results so this output is not mistaken for Step 3's.
    print("\nüìä SLM MAD RESULTS SUMMARY")
    print("=" * 50)
    print(summary_df.round(4).to_string(index=False))

    if len(summary_df) > 0:
        # Rows with the highest F1 and the highest accuracy.
        f1_scores = summary_df['F1']
        best_f1_idx = f1_scores.idxmax()
        best_f1_row = summary_df.loc[best_f1_idx]

        best_accuracy_idx = summary_df['Accuracy'].idxmax()
        best_accuracy_row = summary_df.loc[best_accuracy_idx]

        print("\nüèÜ BEST PERFORMERS:")
        print(f"  ‚Ä¢ Best F1: {best_f1_row['Model']} with {best_f1_row['Prompt']} (F1: {best_f1_row['F1']:.4f})")
        print(f"  ‚Ä¢ Best Accuracy: {best_accuracy_row['Model']} with {best_accuracy_row['Prompt']} (Acc: {best_accuracy_row['Accuracy']:.4f})")

        # Spread of the headline metrics across all rows.
        print("\nüìä PERFORMANCE DISTRIBUTION:")
        print(f"  ‚Ä¢ F1 Score range: {f1_scores.min():.4f} - {f1_scores.max():.4f}")
        f2_scores = summary_df['F2']
        print(f"  ‚Ä¢ F2 Score range: {f2_scores.min():.4f} - {f2_scores.max():.4f}")
        specificities = summary_df['Specificity']
        print(f"  ‚Ä¢ Specificity range: {specificities.min():.4f} - {specificities.max():.4f}")
        print(f"  ‚Ä¢ Mean F1 Score: {f1_scores.mean():.4f}")
        print(f"  ‚Ä¢ Mean F2 Score: {f2_scores.mean():.4f}")
        print(f"  ‚Ä¢ Std F1 Score: {f1_scores.std():.4f}")

        # Winner per metric (first row wins on ties, as idxmax does).
        print("\nüéØ BEST PERFORMERS BY METRIC:")
        best_f1 = best_f1_row  # same row as found above; no need to search again
        best_f2 = summary_df.loc[f2_scores.idxmax()]
        best_spec = summary_df.loc[specificities.idxmax()]
        best_prec = summary_df.loc[summary_df['Precision'].idxmax()]

        print(f"  ‚Ä¢ Best F1 Score: {best_f1['Model']} ({best_f1['Prompt']}) = {best_f1['F1']:.4f}")
        print(f"  ‚Ä¢ Best F2 Score: {best_f2['Model']} ({best_f2['Prompt']}) = {best_f2['F2']:.4f}")
        print(f"  ‚Ä¢ Best Specificity: {best_spec['Model']} ({best_spec['Prompt']}) = {best_spec['Specificity']:.4f}")
        print(f"  ‚Ä¢ Best Precision: {best_prec['Model']} ({best_prec['Prompt']}) = {best_prec['Precision']:.4f}")

        # List sample sizes per row only when they are not all the same.
        sample_sizes = summary_df['Sample_Size']
        if len(sample_sizes.unique()) <= 1:
            print(f"\nüìè All tests used {sample_sizes.iloc[0]} samples")
        else:
            print("\nüìè SAMPLE SIZES:")
            for _, entry in summary_df.iterrows():
                print(f"  ‚Ä¢ {entry['Model']} ({entry['Prompt']}): {entry['Sample_Size']} samples")

print("\n‚úÖ Individual model testing analysis complete")


In [None]:
# Step 5 - Prepare for Ensemble

# MAJORITY VOTING ENSEMBLE
# ================================================================================

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, fbeta_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve, average_precision_score

def calculate_metrics(y_true, y_pred, method_name=""):
    """
    Calculate and print binary classification metrics.

    Args:
        y_true: Ground-truth labels (0/1).
        y_pred: Predicted labels (0/1), same length as ``y_true``.
        method_name: Label used in the printed header.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``f2``. Specificity, fall-out and miss rate are printed only.
    """
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    f2 = fbeta_score(y_true, y_pred, beta=2, zero_division=0)  # F2 score (emphasizes recall)
    accuracy = accuracy_score(y_true, y_pred)

    # Pin the label set so the matrix is always 2x2. Without it, inputs that
    # contain a single class yield a 1x1 matrix and the unpacking below fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Rates derived from the confusion matrix; 0 when the denominator is empty.
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  # True Negative Rate
    fall_out = fp / (fp + tn) if (fp + tn) > 0 else 0     # False Positive Rate (1 - specificity)
    miss_rate = fn / (fn + tp) if (fn + tp) > 0 else 0    # False Negative Rate (1 - recall)

    print(f"\nüìä {method_name} Metrics:")
    print(f"  ‚Ä¢ Accuracy:  {accuracy:.4f}")
    print(f"  ‚Ä¢ Precision: {precision:.4f}")
    print(f"  ‚Ä¢ Recall:    {recall:.4f}")
    print(f"  ‚Ä¢ Specificity: {specificity:.4f}")
    print(f"  ‚Ä¢ Fall Out:    {fall_out:.4f}")
    print(f"  ‚Ä¢ Miss Rate:   {miss_rate:.4f}")
    print(f"  ‚Ä¢ F1 Score:    {f1:.4f}")
    print(f"  ‚Ä¢ F2 Score:    {f2:.4f}")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "f2": f2
    }

def get_predictions_for_ensemble(models_list, prompt_type, all_results):
    """
    Get aligned predictions from multiple models for ensemble.

    Args:
        models_list: Model names to look up.
        prompt_type: Prompt key (e.g. "zero_shot") whose predictions are wanted.
        all_results: Iterable of dicts mapping model name -> {prompt type ->
            result}, where a usable result is a dict with a 'predictions' list.

    Returns:
        ``(aligned_predictions, min_length)``: a dict mapping each model that
        has predictions for ``prompt_type`` to its predictions truncated to the
        shortest list, and that common length. When no model matches, the
        result is ``({}, 0)``.
    """
    predictions_dict = {}

    # Collect predictions from each model. If a model shows up in several
    # result dicts, the last one with predictions wins.
    for model_name in models_list:
        for result_dict in all_results:
            model_results = result_dict.get(model_name)
            if not model_results:
                continue
            result = model_results.get(prompt_type)
            # A result may be None or lack predictions; skip it rather than
            # failing on the membership test.
            if result and 'predictions' in result:
                predictions_dict[model_name] = result['predictions']

    # The common length comes from the predictions actually kept, and is an
    # int (0) rather than float('inf') when nothing was found.
    min_length = min((len(preds) for preds in predictions_dict.values()), default=0)

    # Align predictions to same length
    aligned_predictions = {
        model_name: preds[:min_length]
        for model_name, preds in predictions_dict.items()
    }

    return aligned_predictions, min_length

def majority_vote_ensemble(models_list, prompt_type, all_results):
    """
    Implement majority voting ensemble.

    For each sample the ensemble predicts 1 when strictly more than half of
    the models voted 1, otherwise 0 (so ties resolve to 0). The ground-truth
    labels are read from the CSV named by the DATASET environment variable,
    and the first ``sample_count`` labels are compared with the votes.

    Args:
        models_list: Model names to include in the vote.
        prompt_type: Prompt key whose predictions are combined.
        all_results: Per-model results, as accepted by
            ``get_predictions_for_ensemble``.

    Returns:
        ``(ensemble_predictions, metrics, participating_models)``. When fewer
        than two models have predictions, or the dataset cannot be loaded, the
        result is ``(None, None, [])`` so callers can always unpack three values.
    """
    predictions_dict, sample_count = get_predictions_for_ensemble(models_list, prompt_type, all_results)

    if len(predictions_dict) < 2:
        print(f"‚ùå Need at least 2 models, got {len(predictions_dict)}")
        return None, None, []

    print(f"ü§ù Majority Vote with {len(predictions_dict)} models:")
    print(f"  ‚Ä¢ Models: {list(predictions_dict.keys())}")
    print(f"  ‚Ä¢ Sample count: {sample_count}")

    # Perform majority voting
    ensemble_predictions = []
    agreement_scores = []

    for i in range(sample_count):
        votes = [preds[i] for preds in predictions_dict.values()]
        majority_vote = 1 if sum(votes) > len(votes) / 2 else 0
        ensemble_predictions.append(majority_vote)

        # Calculate agreement (how many models agreed with majority)
        agreement = sum(1 for vote in votes if vote == majority_vote) / len(votes)
        agreement_scores.append(agreement)

    from dotenv import load_dotenv

    load_dotenv()
    CONFIG = {
        'dataset': {
            'csv_path': os.getenv("DATASET"),  # UPDATE THIS PATH
            'text_column': os.getenv("REFLECTION_COLUMN", "REFLECTION"),  # Default text column
            'label_column': os.getenv("LABEL_COLUMN", "label")  # Default label column
        }
    }
    csv_path = CONFIG['dataset']['csv_path']
    try:
        df = pd.read_csv(csv_path)
        print(f"‚úÖ Successfully loaded {len(df)} rows")
    except FileNotFoundError:
        print(f"‚ùå Error: File {csv_path} not found.")
        print("Please ensure the CSV file exists and update the csv_path in CONFIG.")
        return None, None, []
    except Exception as e:
        print(f"‚ùå Error loading file: {e}")
        return None, None, []

    # Create final labels list
    label_col = CONFIG['dataset']['label_column']
    y_true = df[label_col].astype(int).tolist()

    # Calculate metrics against the first sample_count labels; this assumes the
    # predictions correspond to the leading rows of the dataset.
    y_true_subset = y_true[:sample_count]
    metrics = calculate_metrics(y_true_subset, ensemble_predictions,
                              f"Majority Vote ({prompt_type})")
    import numpy as np

    # Agreement statistics
    mean_agreement = np.mean(agreement_scores)
    print(f"  ‚Ä¢ Average model agreement: {mean_agreement:.3f}")
    print(f"  ‚Ä¢ High agreement samples (>0.8): {sum(1 for a in agreement_scores if a > 0.8)}")

    return ensemble_predictions, metrics, list(predictions_dict.keys())

# ANALYZE AVAILABLE DATA FOR ENSEMBLE
# ================================================================================

def analysis_ensemble_models(all_results):
    """Report which models can be ensembled and run majority voting per prompt.

    Args:
        all_results: Iterable of dicts mapping model name -> {prompt type ->
            result}. Only results that are truthy and contain a 'predictions'
            entry are counted.

    Returns:
        dict mapping each prompt type that had at least two models to the
        majority-vote predictions returned by ``majority_vote_ensemble`` (the
        value is ``None`` when that call failed). Empty when no ensemble is
        possible.
    """
    print("\nüìä ENSEMBLE DATA ANALYSIS")
    print("=" * 50)

    # model name -> {prompt type -> number of predictions}. setdefault keeps
    # what was already recorded when a model appears in several result dicts.
    available_predictions = {}
    for result_dict in all_results:
        for model_name, model_results in result_dict.items():
            per_prompt = available_predictions.setdefault(model_name, {})
            for prompt_type, result in model_results.items():
                if result and 'predictions' in result:
                    per_prompt[prompt_type] = len(result['predictions'])

    # Number of distinct models per prompt type, derived from the table above
    # so a model listed twice is not counted twice.
    prediction_counts = {}
    for per_prompt in available_predictions.values():
        for prompt_type in per_prompt:
            prediction_counts[prompt_type] = prediction_counts.get(prompt_type, 0) + 1

    # Determine best strategies for ensemble
    if prediction_counts:
        best_prompt_for_ensemble = max(prediction_counts, key=lambda k: prediction_counts[k])
        models_for_ensemble = [m for m in available_predictions
                               if best_prompt_for_ensemble in available_predictions[m]]

        print("\nüéØ ENSEMBLE STRATEGY:")
        print(f"  ‚Ä¢ Best prompt type: {best_prompt_for_ensemble} ({prediction_counts[best_prompt_for_ensemble]} models)")
        print(f"  ‚Ä¢ Models for ensemble: {models_for_ensemble}")
    else:
        # No predictions at all: nothing to choose from, fall through to the
        # "insufficient models" path instead of failing on max() of nothing.
        models_for_ensemble = []

    if len(models_for_ensemble) < 2:
        print("\n‚ö†Ô∏è WARNING: Need at least 2 models for ensemble methods")
        print(f"   Only {len(models_for_ensemble)} models available")
        ensemble_possible = False
    else:
        ensemble_possible = True
        print(f"\n‚úÖ Ensemble methods possible with {len(models_for_ensemble)} models")

    ensemble_predictions = {}
    # Test majority voting for each prompt type
    if ensemble_possible:
        print("\n" + "="*60)
        print("MAJORITY VOTING ENSEMBLE")
        print("="*60)

        majority_vote_results = {}

        for prompt_type in ["zero_shot", "one_shot", "few_shot"]:
            if prediction_counts.get(prompt_type, 0) >= 2:
                print(f"\nüìù Testing {prompt_type} majority voting...")

                # Get models available for this prompt type
                available_for_prompt = [m for m in available_predictions
                                        if prompt_type in available_predictions[m]]

                predictions, metrics, participating_models = majority_vote_ensemble(
                    available_for_prompt, prompt_type, all_results)

                ensemble_predictions[prompt_type] = predictions
                if predictions and metrics:
                    majority_vote_results[prompt_type] = {
                        'predictions': predictions,
                        'metrics': metrics,
                        'participating_models': participating_models,
                        'sample_count': len(predictions)
                    }
            else:
                print(f"\n‚ùå {prompt_type}: insufficient models ({prediction_counts.get(prompt_type, 0)})")

        # Show best majority vote result
        if majority_vote_results:
            best_mv = max(majority_vote_results.items(), key=lambda x: x[1]['metrics']['f1'])
            print(f"\nüèÜ Best Majority Vote: {best_mv[0]} (F1: {best_mv[1]['metrics']['f1']:.4f})")
            print(f"üèÜ Best Majority Vote: {best_mv[0]} (F2: {best_mv[1]['metrics']['f2']:.4f})")
            print(f"   Models used: {best_mv[1]['participating_models']}")
    else:
        print("\n‚ö†Ô∏è Skipping majority voting - insufficient models")

    return ensemble_predictions


# Load individual model results (the merged pickles written by Step 1).
# Each merged dict's 'all_results' entry is the list handed to the ensemble
# analysis; the "Loaded results" counts report the length of that list.
try:
    with open('output_merged/llm_doubts.pkl', 'rb') as f:
        individual_results_llm = pickle.load(f)
    print("‚úÖ Individual model results llm loaded")
    all_results_llm = individual_results_llm['all_results']
    print(f"üìä Loaded results for {len(all_results_llm)} models")

    with open('output_merged/slm_doubts.pkl', 'rb') as f:
        individual_results_slm = pickle.load(f)
    print("‚úÖ Individual model results slm loaded")
    all_results_slm = individual_results_slm['all_results']
    print(f"üìä Loaded results for {len(all_results_slm)} models")

    with open('output_merged/slm_mad_doubts.pkl', 'rb') as f:
        individual_results_slm_mad = pickle.load(f)
    print("‚úÖ Individual model results slm mad loaded")
    all_results_slm_mad = individual_results_slm_mad['all_results']
    print(f"üìä Loaded results for {len(all_results_slm_mad)} models")

except FileNotFoundError:
    print("‚ùå Individual model results not found. Run Script 2 first.")
    raise



In [None]:
# Majority-vote analysis over the merged LLM results; the returned dict maps
# prompt type -> ensemble predictions and is echoed for inspection.
ensemble_predications_llm = analysis_ensemble_models(all_results=all_results_llm)
print(ensemble_predications_llm)

In [None]:
# Majority-vote analysis over the merged SLM results; the returned dict maps
# prompt type -> ensemble predictions and is echoed for inspection.
ensemble_predications_slm = analysis_ensemble_models(all_results=all_results_slm)
print(ensemble_predications_slm)

In [None]:
# Plotting ROC for all models

# Display name -> prediction list; filled further down in this cell.
predictions_dict = {}

from dotenv import load_dotenv

# Dataset location and column names come from the environment (.env file).
load_dotenv()
CONFIG = {
    'dataset': {
        'csv_path': os.getenv("DATASET"),  # UPDATE THIS PATH
        'text_column': os.getenv("REFLECTION_COLUMN", "REFLECTION"),  # Default text column
        'label_column': os.getenv("LABEL_COLUMN", "label")  # Default label column
    }
}
csv_path = CONFIG['dataset']['csv_path']
try:
    df = pd.read_csv(csv_path)
    print(f"‚úÖ Successfully loaded {len(df)} rows")
except FileNotFoundError:
    print(f"‚ùå Error: File {csv_path} not found.")
    print("Please ensure the CSV file exists and update the csv_path in CONFIG.")
    # Everything below needs the dataset, so stop here instead of carrying on
    # with an undefined (or stale) df.
    raise
except Exception as e:
    print(f"‚ùå Error loading file: {e}")
    raise

# Ground-truth labels as plain ints, in dataset row order.
label_col = CONFIG['dataset']['label_column']
y_true = df[label_col].astype(int).tolist()

# Collect predictions from each model, followed by each family's majority vote.
# Insertion order is preserved because it determines the legend order later on.
def _add_model_predictions(results, target):
    """Copy every non-one-shot prediction list from ``results`` into ``target``."""
    for result_dict in results:
        for model_name, model_results in result_dict.items():
            for prompt, prediction_results in model_results.items():
                if prompt == "one_shot":
                    continue
                # Same guard as analysis_ensemble_models: a result may be empty
                # or have no predictions.
                if not prediction_results or 'predictions' not in prediction_results:
                    continue
                target[model_name + "_" + prompt] = prediction_results['predictions']


def _add_majority_votes(family, ensemble, target):
    """Add the zero-shot and few-shot majority votes of one model family."""
    for prompt_key, prompt_label in (("zero_shot", "zero-shot"), ("few_shot", "few-shot")):
        votes = ensemble.get(prompt_key)
        if votes is None:
            print(f"No majority vote available for {family} ({prompt_label}); skipping")
            continue
        target[f"Maj_Vote_{family}({prompt_label})"] = votes


_add_model_predictions(all_results_llm, predictions_dict)
_add_majority_votes("LLM", ensemble_predications_llm, predictions_dict)

_add_model_predictions(all_results_slm, predictions_dict)
# The SLM entries must come from the SLM ensemble, not the LLM one.
_add_majority_votes("SLM", ensemble_predications_slm, predictions_dict)

_add_model_predictions(all_results_slm_mad, predictions_dict)

# Dictionary of predicted scores from different models, used by the plots and
# the error analysis below.
model_scores = predictions_dict


plt.figure(figsize=(7, 6))

underline = '\u0332'  # combining underline character

def underline_text(text):
    """Return ``text`` with a combining underline appended after every character."""
    return ''.join(c + underline for c in text)

# Internal result keys -> names shown in the legend. Built once, outside the
# plotting loop; keys without an entry are shown unchanged.
name_mapping = {
    "openai_o3_zero_shot" : "GPT(zero-shot)",
    "openai_o3_few_shot" : "GPT(few-shot)",
    "claude_sonnet_4_zero_shot": "Claude(zero-shot)",
    "claude_sonnet_4_few_shot": "Claude(few-shot)",
    "gemini_2.5_flash_zero_shot": "Gemini(zero-shot)",
    "gemini_2.5_flash_few_shot": "Gemini(few-shot)",
    "llama_3.2_20250610_123907_zero_shot": "Llama(zero-shot)",
    "llama_3.2_20250610_123907_few_shot": "Llama(few-shot)",
    "mistral3.1_24B_zero_shot": "Mistral(zero-shot)",
    "mistral3.1_24B_few_shot": "Mistral(few-shot)",
    "deepseek_r1_zero_shot": "Deepseek(zero-shot)",
    "deepseek_r1_few_shot": "Deepseek(few-shot)",
    "qwen3_8b_q8_zero_shot": "Qwen(zero-shot)",
    "qwen3_8b_q8_few_shot": "Qwen(few-shot)",
    "mistral3.1_24B_q4_20250618_202359_judge": "SLM-as-a-Judge",
    "mistral3.1_24B_q4_20250613_234232_self_consistency": "Self-Consistency",
    "mistral3.1_24B_q4_20250618_134538_two_agents_chain": "Two-Agents-Chain"
}

# Plot ROC for each model
for model_name, scores in model_scores.items():
    # A prediction list may cover only the leading rows of the dataset (the
    # same prefix alignment majority_vote_ensemble uses), so compare equal-length
    # slices instead of letting roc_curve fail on a length mismatch.
    n_samples = min(len(y_true), len(scores))
    fpr, tpr, _ = roc_curve(y_true[:n_samples], scores[:n_samples])
    roc_auc = auc(fpr, tpr)

    title = name_mapping.get(model_name, model_name)
    # Entries with AUROC above 0.8 are underlined in the legend.
    if roc_auc > 0.8:
        label = "AUROC="+underline_text(f'({roc_auc:.2f}) {title} ')
    else:
        label = f'(AUROC={roc_auc:.2f}) {title} '

    plt.plot(fpr, tpr, lw=2, label=label)


# Diagonal baseline
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=1)

# Plot AUROC settings
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.title('ROC Curve Comparison')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()

# # Sort legend entries by AUROC descending
# handles, labels = plt.gca().get_legend_handles_labels()
# sorted_pairs = sorted(zip(labels, handles), reverse=True)
# labels, handles = zip(*sorted_pairs)
# plt.legend(handles, labels, loc='lower right', fontsize=8)
# Precision-recall curves get their own figure. The plotting loop was live
# while the figure setup and formatting around it were commented out, which
# left the curves on an unlabelled implicit figure.
plt.figure(figsize=(7, 6))

# Plot PR curve for each model
for model_name, scores in model_scores.items():
    # Compare equal-length slices; a prediction list may cover only the
    # leading rows of the dataset.
    n_samples = min(len(y_true), len(scores))
    labels_subset = y_true[:n_samples]
    scores_subset = scores[:n_samples]
    precision, recall, _ = precision_recall_curve(labels_subset, scores_subset)
    pr_auc = average_precision_score(labels_subset, scores_subset)
    plt.plot(recall, precision, lw=2, label=f'{model_name} (PR-AUC = {pr_auc:.2f})')

# Plot formatting
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve Comparison')
plt.legend(loc='lower left')
plt.grid(True)
plt.tight_layout()
plt.show()

# import matplotlib.pyplot as plt
# from sklearn.metrics import roc_curve, auc

# # Example structure: list of (label, fpr, tpr) tuples
# roc_data = model_scores

# # Compute AUROC for each
# auroc_list = []
# for label, fpr, tpr in roc_data:
#     score = auc(fpr, tpr)
#     auroc_list.append((score, label, fpr, tpr))

# # Sort by AUROC descending
# auroc_list.sort(reverse=True)

# # Plot curves
# for score, label, fpr, tpr in auroc_list:
#     plt.plot(fpr, tpr, label=f"{label} (AUROC = {score:.2f})")

# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate (Recall)")
# plt.title("ROC Curve Comparison")
# plt.grid(True, linestyle='--', alpha=0.5)
# plt.legend(loc="lower right", fontsize=8)
# plt.tight_layout()
# plt.show()

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Inputs for the per-sample error analysis of one model: the ground truth, that
# model's predictions (None if the key is absent) and the reflection texts.
true_labels = y_true
predicted_labels = model_scores.get('mistral3.1_24B_q4_20250613_234232_self_consistency')
text_col = CONFIG['dataset']['text_column']
texts = df[text_col].tolist()

# Step 1: Create DataFrame. Each column is wrapped in a Series so inputs of
# different lengths are aligned by position and padded with NaN.
text_series = pd.Series(texts)
truth_series = pd.Series(true_labels)
prediction_series = pd.Series(predicted_labels)
df = pd.DataFrame({
    'text': text_series,
    'true_label': truth_series,
    'predicted_label': prediction_series
})

# Step 2: Identify false positives and false negatives
actual = df['true_label']
predicted = df['predicted_label']
false_positives = df[(actual == 0) & (predicted == 1)]
false_negatives = df[(actual == 1) & (predicted == 0)]

# Let pandas print wide frames and long texts without truncating them.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_colwidth', 1000)

print(false_negatives['text'])
# Filter for non-empty texts (and optionally strip whitespace)
# non_empty_texts = [t for t in texts if t.strip()]

# # Check before vectorizing
# if len(non_empty_texts) > 0:
#     vectorizer = CountVectorizer(stop_words='english')
#     X = vectorizer.fit_transform(non_empty_texts)
#     # Now continue with analysis
# else:
#     print("No valid texts to analyze.")

# print(len(y_true))
# # print(model_scores)
# print(len(model_scores.get('claude_sonnet_4_zero_shot')))
# print(len(texts))

# Step 3: Count most common n-grams (unigrams + bigrams) in each group
def get_top_phrases(text_series, n=10):
    """Return the ``n`` most frequent unigrams/bigrams in ``text_series``.

    English stop words are ignored. The result is a list of
    ``(phrase, count)`` pairs sorted by descending count. An empty input, or
    one whose documents contain only stop words, yields an empty list instead
    of raising.
    """
    # Nothing to count (e.g. a model with no false positives).
    if len(text_series) == 0:
        return []

    vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
    try:
        X = vectorizer.fit_transform(text_series)
    except ValueError as err:
        # The vectorizer reports "empty vocabulary" when no token survives the
        # stop-word filter. Any other ValueError is a real problem, so re-raise.
        if "empty vocabulary" in str(err):
            return []
        raise

    sum_words = X.sum(axis=0).A1  # total count per vocabulary entry
    vocab = vectorizer.get_feature_names_out()
    freq = list(zip(vocab, sum_words))
    return sorted(freq, key=lambda x: -x[1])[:n]

# Most frequent phrases among the misclassified texts of each error type.
top_fp_phrases = get_top_phrases(false_positives['text'])
top_fn_phrases = get_top_phrases(false_negatives['text'])

# Print results: one heading per error type, then "phrase: count" lines.
for heading, phrases in (
    ("üîç Top phrases in False Positives:", top_fp_phrases),
    ("\nüîç Top phrases in False Negatives:", top_fn_phrases),
):
    print(heading)
    for phrase, count in phrases:
        print(f"{phrase}: {count}")

# print()


from collections import defaultdict, Counter

# Example data (replace with your actual data)
# texts = [...]  # List of reflection texts
# true_labels = [...]  # List of true labels
# model_predictions = {
#     "gpt4": [...],
#     "claude": [...],
#     "gemini": [...],
#     ...
# }

# Step 1: For every sample index, count how many entries of model_scores got it
# wrong, separately for false positives and false negatives.
fp_counts = Counter()
fn_counts = Counter()

for preds in model_scores.values():
    for idx, (actual_label, guess) in enumerate(zip(true_labels, preds)):
        if guess == 1 and actual_label == 0:
            fp_counts[idx] += 1  # False Positive
        elif guess == 0 and actual_label == 1:
            fn_counts[idx] += 1  # False Negative

# Step 2: Order the indices by how often they were misclassified (ties keep
# first-seen order).
most_common_fp = fp_counts.most_common()
most_common_fn = fn_counts.most_common()

# Step 3: Show the texts behind the 100 most frequent errors of each type.
for heading, ranked in (
    ("\nüî¥ Most Common False Positives:", most_common_fp),
    ("\nüîµ Most Common False Negatives:", most_common_fn),
):
    print(heading)
    for idx, count in ranked[:100]:
        print(f"({count} models) - {texts[idx]}")
