In [1]:
#@title 03 model validation gemini vs llama
# ╔═══════════════════════════════════════════════════════════╗
# ║ Notebook 03: Model Validation and Analysis                ║
# ╚═══════════════════════════════════════════════════════════╝
# This notebook validates the LLM outputs by checking intra-model stability,
# cross-model replication, and correlation with a morphological proxy.
# It also identifies unstable units from the Mistral model.

# --- 1️⃣  Setup and Imports ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import spearmanr, pearsonr
import re
import json
import itertools
import unicodedata
from pathlib import Path
import sys

# Import helper functions from the src directory
# Make sure the src directory is in the Python path
sys.path.append('../')
from src.corpus_parser import load_json, strip_accents
from src.analysis_utils import build_runs_df, normalise_uid, mean_pairwise_cos
from src.config import * 
CATEGORIES = ANALYTICAL_CONCEPTS_LIST 

GEMINI_JSON = 'gemini_osborne_greek.json'
MISTRAL_JSON = 'groq_osborne_greek.json'
OSBORNE_GREEK_STRUCTURED = 'osborne_greek_structured.json'

gem_runs = build_runs_df(load_json(Path(PATH_RESULTS) / GEMINI_JSON))
mis_runs = build_runs_df(load_json(Path(PATH_RESULTS) / MISTRAL_JSON))

gem_runs["unit"] = gem_runs["unit"].map(normalise_uid)
mis_runs["unit"] = mis_runs["unit"].map(normalise_uid)

# --- 3️⃣  Perform Validation Analyses ---
# Intra-model stability (Gemini)
intra_cos = mean_pairwise_cos(gem_runs, CATEGORIES)
print(f"Mean pair-wise cosine (Gemini-Pro): {intra_cos:.3f}")

# Cross-model replication (Gemini vs. Mistral)
gem_mean = gem_runs.groupby("unit")[CATEGORIES].mean().sort_index()
mis_mean = mis_runs.groupby("unit")[CATEGORIES].mean().sort_index()
rho, p = spearmanr(gem_mean.to_numpy().flatten(), mis_mean.to_numpy().flatten())
print(f"Spearman ρ (Gemini vs Llama): {rho:.3f}  (p = {p:.4g})")

# Model-Specific Instability (Llama)
mistral_std = mis_runs.groupby("unit")[CATEGORIES].std()
verse_range_map = gem_runs.drop_duplicates(subset='unit').set_index('unit')['verse_range']
unstable_log = []
for unit_id, row in mistral_std.iterrows():
    if row.max() >= 3.0:
        unstable_log.append({
            "Unit (Osborne)": unit_id,
            "Verse Range": verse_range_map.get(unit_id, "N/A"),
            "Highest-Variance Vector (Llama)": row.idxmax(),
            "σ (pp)": round(row.max(), 2)
        })
df_unstable = pd.DataFrame(unstable_log).sort_values(by="σ (pp)", ascending=False).reset_index(drop=True)
print("\n--- Representative Unstable-Case Log (Llama Model) ---")
display(df_unstable)

# Lemma-frequency proxy
structured_data = load_json(Path(PATH_PROCESSED) / OSBORNE_GREEK_STRUCTURED)
greek_texts = {unit["unit_id"]: unit.get("full_text", "") for unit in structured_data["narrative_units"]}
target_lemmas = {"αξιος", "προσκυνεω", "ευλογια", "δοξα", "τιμη", "κρατος"}
lemma_counts = {normalise_uid(uid): sum(strip_accents(t.lower()) in target_lemmas for t in re.findall(r"[\u0370-\u03FF\u1F00-\u1FFF]+", txt)) for uid, txt in greek_texts.items()}
lemma_df = pd.DataFrame(lemma_counts.items(), columns=["unit", "lemma_count"])
merged = gem_mean.reset_index().merge(lemma_df, on="unit", how="inner")

print("\n--- Lemma proxy vs. Categories (Pearson r) ---")
lemma_correlations = {}
for category in CATEGORIES:
    r, p_l = pearsonr(merged["lemma_count"], merged[category])
    print(f"'{category}': r = {r:.3f}  (p = {p_l:.4g})")
    lemma_correlations[category] = {"r": r, "p_value": p_l}


# --- 4️⃣  Save Outputs ---
# This final section saves the key numerical results to structured files,
# ensuring the paper's findings are fully reproducible and verifiable.

print("\n" + "─" * 50)
print("4️⃣  Saving Outputs")
print("─" * 50)

# Create a dictionary of the core validation metrics
validation_metrics = {
    "intra_model_stability": {
        "model": "Gemini",
        "metric": "Mean Pair-wise Cosine Similarity",
        "value": intra_cos
    },
    "cross_model_replication": {
        "models": "Gemini vs. Llama",
        "metric": "Spearman Correlation (rho)",
        "value": rho,
        "p_value": p
    },
    "morphological_proxy": {
        "model": "Gemini",
        "metric": "Pearson Correlation (r)",
        "proxy": "Cultic Lemma Count",
        "correlations_by_category": lemma_correlations
    }
}

# Save the validation metrics using the PATH_RESULTS variable
try:
    with open(PATH_RESULTS + 'validation_metrics.json', 'w') as f:
        json.dump(validation_metrics, f, indent=4)
    print(f"✅ Validation metrics saved to '{PATH_RESULTS}validation_metrics.json'")
except Exception as e:
    print(f"❌ Error saving validation metrics: {e}")

# Save the unstable case log using the PATH_RESULTS variable
try:
    df_unstable.to_json(PATH_RESULTS + 'unstable_case_log.json', orient='records', indent=4)
    print(f"✅ Unstable case log saved to '{PATH_RESULTS}unstable_case_log.json'")
except Exception as e:
    print(f"❌ Error saving unstable case log: {e}")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Mean pair-wise cosine (Gemini-Pro): 0.987
Spearman ρ (Gemini vs Llama): 0.529  (p = 1.021e-18)

--- Representative Unstable-Case Log (Llama Model) ---


Unnamed: 0,Unit (Osborne),Verse Range,Highest-Variance Vector (Llama),σ (pp)
0,unit_020,RE 16:1-21,Judicial Wrath & Punitive Action,4.22



--- Lemma proxy vs. Categories (Pearson r) ---
'Worship & Praise': r = 0.623  (p = 0.0002371)
'Judicial Wrath & Punitive Action': r = -0.327  (p = 0.0776)
'Lament, Persecution & Endurance': r = 0.026  (p = 0.8899)
'Victory, Consolation & New-Creation Hope': r = 0.147  (p = 0.4394)
'Cosmic Warfare & Deception': r = -0.247  (p = 0.1889)
'Theophanic Awe & Terror': r = -0.071  (p = 0.7085)
'Other/Neutral Content': r = 0.294  (p = 0.1144)

──────────────────────────────────────────────────
4️⃣  Saving Outputs
──────────────────────────────────────────────────
✅ Validation metrics saved to '../data/resultsvalidation_metrics.json'
✅ Unstable case log saved to '../data/resultsunstable_case_log.json'


In [13]:
# @title with lemma
# ╔══════════════════════════════════════════════════════════════════════╗
# ║ Notebook 03: Model Validation and Cross-Model Comparison             ║
# ╚══════════════════════════════════════════════════════════════════════╝
# This notebook performs a complete, end-to-end validation.
# 1. It first pre-processes raw structure and lemmatized text files to create
#    a structured, lemmatized JSON corpus.
# 2. It then uses this corpus and the LLM output files to conduct all validation checks:
#    - Intra-model stability (for both Gemini and Groq/Llama)
#    - Cross-model replication (Gemini vs. Groq/Llama)
#    - Morphological proxy correlation against all vectors
#    - Identification of unstable units from the Groq/Llama model.

# --- 1️⃣ Setup and Imports ---
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, pearsonr
import re
import json
import csv
from pathlib import Path
import sys

# Ensure helper functions can be imported
sys.path.append('../')
from src.corpus_parser import load_json
from src.analysis_utils import build_runs_df, normalise_uid, mean_pairwise_cos
from src.config import * 
CATEGORIES = ANALYTICAL_CONCEPTS_LIST 

# Input files for pre-processing
LEMMATIZED_GREEK_PATH = Path(PATH_INPUT) / 'lemmatized_greek_text.txt'
OSBORNE_STRUCTURE_PATH = Path(PATH_INPUT) / 'osborne_structure.txt'

# Intermediate file to be created by this script
STRUCTURED_LEMMATIZED_PATH = Path(PATH_PROCESSED) / 'osborne_greek_structured_lemmatized.json'

# Input files for LLM analysis
GEMINI_JSON_PATH = Path(PATH_RESULTS) / 'gemini_osborne_greek.json'
GROQ_JSON_PATH = Path(PATH_RESULTS) / 'groq_osborne_greek.json'

# --- 3️⃣ Helper Functions for Pre-processing ---

def parse_verse_range(range_str: str) -> tuple[int, int, int, int]:
    """Parses a verse range string like 'RE 1:1-8' into a tuple."""
    range_str = range_str.replace('RE ', '').strip()
    parts = range_str.split(':')
    chap = int(parts[0])
    if '-' in parts[1]:
        start_v, end_v = map(int, parts[1].split('-'))
        return chap, start_v, chap, end_v
    else:
        verse = int(parts[1])
        return chap, verse, chap, verse

def load_lemmatized_data(filepath: Path) -> dict:
    """Loads the lemmatized file into a dictionary keyed by (chapter, verse)."""
    lemmas_by_verse = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 2: continue
            ref_str, lemma = parts[0], parts[-1]
            chapter, verse = int(ref_str[2:4]), int(ref_str[4:6])

            verse_key = (chapter, verse)
            if verse_key not in lemmas_by_verse:
                lemmas_by_verse[verse_key] = []
            lemmas_by_verse[verse_key].append(lemma)
    return lemmas_by_verse

def create_structured_lemmatized_file(structure_path: Path, lemmatized_text_path: Path, output_path: Path):
    """
    Parses structure and lemmatized text to create a single structured JSON file
    where the text for each unit is a string of its lemmas.
    """
    print(f"Creating structured lemmatized file at '{output_path}'...")

    lemmas_by_verse = load_lemmatized_data(lemmatized_text_path)

    narrative_units = []
    with open(structure_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for row in reader:
            if not row or row[0].startswith('#'):
                continue
            uid, title, range_str = row

            start_chap, start_verse, end_chap, end_verse = parse_verse_range(range_str)

            unit_lemmas = []
            for verse_num in range(start_verse, end_verse + 1):
                verse_key = (start_chap, verse_num)
                if verse_key in lemmas_by_verse:
                    unit_lemmas.extend(lemmas_by_verse[verse_key])

            narrative_units.append({
                "unit_id": uid,
                "title": title,
                "verse_range": range_str,
                "lemmatized_text": " ".join(unit_lemmas)
            })

    final_json_structure = {
        "translation": "Greek SBLG (SBL Greek New Testament) - Lemmatized",
        "narrative_units": narrative_units
    }

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_json_structure, f, indent=4, ensure_ascii=False)
    print(f"✅ Successfully created '{output_path}'")


# --- 4️⃣ Main Execution Block ---

# Step 4a: Pre-processing - Create the structured lemmatized file
create_structured_lemmatized_file(OSBORNE_STRUCTURE_PATH, LEMMATIZED_GREEK_PATH, STRUCTURED_LEMMATIZED_PATH)

# Step 4b: Load LLM Data for Analysis
print("\n" + "─" * 50)
print("4️⃣  Loading LLM Data and Performing Validation")
print("─" * 50)

gem_runs = build_runs_df(load_json(GEMINI_JSON_PATH))
groq_runs = build_runs_df(load_json(GROQ_JSON_PATH))

gem_runs["unit"] = gem_runs["unit"].map(normalise_uid)
groq_runs["unit"] = groq_runs["unit"].map(normalise_uid)

# Step 4c: Perform Core Validation Analyses
# B.6.1: Intra-model stability (for both models)
intra_cos_gemini = mean_pairwise_cos(gem_runs, CATEGORIES)
print(f"Intra-Model Stability (Gemini): Mean Pair-wise Cosine = {intra_cos_gemini:.3f}")

intra_cos_groq = mean_pairwise_cos(groq_runs, CATEGORIES)
print(f"Intra-Model Stability (Groq/Llama): Mean Pair-wise Cosine = {intra_cos_groq:.3f}")


# B.6.1: Cross-model replication (Gemini vs. Groq/Llama)
gem_mean = gem_runs.groupby("unit")[CATEGORIES].mean().sort_index()
groq_mean = groq_runs.groupby("unit")[CATEGORIES].mean().sort_index()
rho, p = spearmanr(gem_mean.to_numpy().flatten(), groq_mean.to_numpy().flatten())
print(f"Cross-Model Replication: Spearman ρ (Gemini vs. Groq/Llama) = {rho:.3f} (p = {p:.4g})")

# B.7: Model-Specific Instability (Groq/Llama)
groq_std = groq_runs.groupby("unit")[CATEGORIES].std()
verse_range_map = gem_runs.drop_duplicates(subset='unit').set_index('unit')['verse_range']
unstable_log = []
for unit_id, row in groq_std.iterrows():
    if row.max() >= 3.0:
        unstable_log.append({
            "Unit (Osborne)": unit_id,
            "Verse Range": verse_range_map.get(unit_id, "N/A"),
            "Highest-Variance Vector (Groq/Llama)": row.idxmax(),
            "σ (pp)": round(row.max(), 2)
        })
df_unstable = pd.DataFrame(unstable_log).sort_values(by="σ (pp)", ascending=False).reset_index(drop=True)
print("\n--- Representative Unstable-Case Log (Groq/Llama Model) ---")
print(df_unstable.to_string())

# Step 4d: Morphological Proxy Validation (using the new file)
print("\n--- Morphological Proxy Validation ---")

# Define the full, correct set of 15 target lemmas
target_lemmas = {
    "προσκυνέω", "λατρεύω", "δοξάζω", "ᾄδω", "δόξα", "τιμή", "κράτος",
    "δύναμις", "εὐλογία", "σωτηρία", "θυσιαστήριον", "ἄξιος", "ἀμήν",
    "ἅγιος", "παντοκράτωρ"
}

# Load the pre-processed structured data with lemmas
structured_lemmatized_data = load_json(STRUCTURED_LEMMATIZED_PATH)
lemmatized_texts = {
    unit["unit_id"]: unit.get("lemmatized_text", "")
    for unit in structured_lemmatized_data["narrative_units"]
}

# Perform the count on the pre-processed lemma strings
lemma_counts = {
    normalise_uid(uid): sum(1 for lemma in text.split() if lemma in target_lemmas)
    for uid, text in lemmatized_texts.items()
}

# Create DataFrame and merge for correlation analysis
lemma_df = pd.DataFrame(lemma_counts.items(), columns=["unit", "lemma_count"])
merged = gem_mean.reset_index().merge(lemma_df, on="unit", how="inner")

print("\n--- Lemma Proxy vs. All Rhetorical Vectors (Pearson r) ---")
lemma_correlations = {}
for category in CATEGORIES:
    r, p_l = pearsonr(merged["lemma_count"], merged[category])
    print(f"'{category}': r = {r:.3f}  (p = {p_l:.4g})")
    lemma_correlations[category] = {"r": r, "p_value": p_l}


# --- 5️⃣ Save Outputs ---
print("\n" + "─" * 50)
print("5️⃣  Saving Final Outputs")
print("─" * 50)

# Create a dictionary of the core validation metrics
validation_metrics = {
    "intra_model_stability_gemini": {
        "model": "Gemini",
        "metric": "Mean Pair-wise Cosine Similarity",
        "value": intra_cos_gemini
    },
    "intra_model_stability_groq": {
        "model": "Groq/Llama",
        "metric": "Mean Pair-wise Cosine Similarity",
        "value": intra_cos_groq
    },
    "cross_model_replication": {
        "models": "Gemini vs. Groq/Llama",
        "metric": "Spearman Correlation (rho)",
        "value": rho,
        "p_value": p
    },
    "morphological_proxy": {
        "model": "Gemini",
        "metric": "Pearson Correlation (r)",
        "proxy": "Cultic Lemma Count vs. All Rhetorical Vectors",
        "correlations_by_category": lemma_correlations
    }
}

# Save the validation metrics
validation_metrics_path = Path(PATH_RESULTS) / 'validation_metrics.json'
with open(validation_metrics_path, 'w', encoding='utf-8') as f:
    json.dump(validation_metrics, f, indent=4)
print(f"✅ Validation metrics saved to '{validation_metrics_path}'")

# Save the unstable case log
unstable_log_path = Path(PATH_RESULTS) / 'unstable_case_log.json'
df_unstable.to_json(unstable_log_path, orient='records', indent=4)
print(f"✅ Unstable case log saved to '{unstable_log_path}'")

Creating structured lemmatized file at '..\data\processed\osborne_greek_structured_lemmatized.json'...
✅ Successfully created '..\data\processed\osborne_greek_structured_lemmatized.json'

──────────────────────────────────────────────────
4️⃣  Loading LLM Data and Performing Validation
──────────────────────────────────────────────────
Intra-Model Stability (Gemini): Mean Pair-wise Cosine = 0.987
Intra-Model Stability (Groq/Llama): Mean Pair-wise Cosine = 1.000
Cross-Model Replication: Spearman ρ (Gemini vs. Groq/Llama) = 0.529 (p = 1.021e-18)

--- Representative Unstable-Case Log (Groq/Llama Model) ---
  Unit (Osborne) Verse Range Highest-Variance Vector (Groq/Llama)  σ (pp)
0       unit_020  RE 16:1-21     Judicial Wrath & Punitive Action    4.22

--- Morphological Proxy Validation ---

--- Lemma Proxy vs. All Rhetorical Vectors (Pearson r) ---
'Worship & Praise': r = 0.717  (p = 8.327e-06)
'Judicial Wrath & Punitive Action': r = -0.275  (p = 0.142)
'Lament, Persecution & Endurance':

In [16]:
# @title Stage 3.6 (Combined): Generate Noised Data & Run Conceptual Integrity Test
# This single, integrated cell performs the complete conceptual integrity validation.
# It first loads the final LLM results, then programmatically adds a configurable
# amount of random noise to each rhetorical vector. Finally, it compares the
# original classifications with the noised classifications to produce a confusion
# matrix and a stability score, saving all metrics to a JSON file.

# --- 1️⃣ Setup and Imports ---
import json
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Ensure helper functions can be imported
sys.path.append('./')
from src.config import *
CATEGORIES = ANALYTICAL_CONCEPTS_LIST
from src.corpus_parser import load_json

# --- 2️⃣ Configuration ---

# <<< KEY PARAMETER TO ADJUST >>>
# Set the standard deviation of the Gaussian noise to be added to the vectors.
# A higher value represents a more aggressive stability test.
NOISE_LEVEL_STD_DEV = 5.0

# --- File Paths ---
PATH_RESULTS = Path(PATH_RESULTS)

# Input file for the analysis
ORIGINAL_RESULTS_FILE = PATH_RESULTS / "gemini_osborne_greek.json"

# Output file for the final analysis metrics
OUTPUT_METRICS_FILE = PATH_RESULTS / "conceptual_integrity_metrics.json"

# --- 3️⃣ Helper Function for Noise Generation ---

def add_noise_to_vectors(data: dict, noise_level: float) -> dict:
    """
    Iterates through narrative units, adds Gaussian noise to the final
    rhetorical vector in memory, and returns the modified data structure.
    """
    noised_data = json.loads(json.dumps(data)) # Deep copy to avoid modifying original

    for unit in noised_data.get('narrative_units', []):
        original_vector = unit.get('final_rhetorical_vector')

        if not isinstance(original_vector, dict):
            unit['noised_rhetorical_vector'] = None
            continue

        noisy_vector = {
            category: value + np.random.normal(loc=0.0, scale=noise_level)
            for category, value in original_vector.items()
        }
        noisy_vector = {k: max(0, v) for k, v in noisy_vector.items()}

        total_sum = sum(noisy_vector.values())
        if total_sum > 0:
            noisy_vector = {k: (v / total_sum) * 100 for k, v in noisy_vector.items()}
        else:
            num_categories = len(original_vector)
            noisy_vector = {k: 100.0 / num_categories for k in original_vector.keys()}

        current_sum = sum(noisy_vector.values())
        diff = 100.0 - current_sum
        if diff != 0:
            max_key = max(noisy_vector, key=noisy_vector.get)
            noisy_vector[max_key] += diff

        unit['noised_rhetorical_vector'] = noisy_vector

    return noised_data

# --- 4️⃣ Main Execution Block ---

print("─" * 50)
print("Executing Conceptual Integrity Test (Noise Generation & Analysis)")
print(f"Using Noise Level (Standard Deviation): {NOISE_LEVEL_STD_DEV}")
print("─" * 50)

try:
    # Step 4a: Load the original data
    print(f"Loading original data from '{ORIGINAL_RESULTS_FILE}'...")
    original_data = load_json(ORIGINAL_RESULTS_FILE)
    if not original_data:
        raise FileNotFoundError("The original results file could not be loaded or is empty.")

    # Step 4b: Generate the noised data in memory
    print("Generating noised vectors in memory...")
    noised_data = add_noise_to_vectors(original_data, NOISE_LEVEL_STD_DEV)

    # Step 4c: Extract the "winning" category and track changes
    true_labels = []
    predicted_labels = []
    classification_changes = []

    original_units = {unit['unit_id']: unit for unit in original_data['narrative_units']}
    noised_units = {unit['unit_id']: unit for unit in noised_data['narrative_units']}

    for unit_id in sorted(original_units.keys()):
        if unit_id in noised_units:
            original_vector = original_units[unit_id].get('final_rhetorical_vector')
            noised_vector = noised_units[unit_id].get('noised_rhetorical_vector')

            if original_vector and noised_vector:
                original_winner = max(original_vector, key=original_vector.get)
                noised_winner = max(noised_vector, key=noised_vector.get)

                true_labels.append(original_winner)
                predicted_labels.append(noised_winner)

                if original_winner != noised_winner:
                    classification_changes.append({
                        "unit_id": unit_id,
                        "verse_range": original_units[unit_id].get('verse_range'),
                        "original_classification": original_winner,
                        "noised_classification": noised_winner
                    })

    if not true_labels:
        raise ValueError("No valid vectors found to compare.")

    # Step 4d: Compute and display the confusion matrix as a table
    cm_df = pd.crosstab(
        pd.Series(true_labels, name='True Label (Original)'),
        pd.Series(predicted_labels, name='Predicted Label (Noised)'),
        rownames=['True Label (Original)'],
        colnames=['Predicted Label (Noised)'],
        dropna=False
    ).reindex(index=CATEGORIES, columns=CATEGORIES, fill_value=0)

    print("\n--- Confusion Matrix ---")
    print("Rows are the original classification, columns are the classification after adding noise.")
    print(cm_df.to_string())

    # Step 4e: Calculate and display stability metrics
    correct_predictions = np.trace(cm_df.values)
    total_predictions = len(true_labels)
    stability_score = (correct_predictions / total_predictions) * 100 if total_predictions > 0 else 0

    print("\n--- Analysis of Classification Stability ---")
    print(f"Total Units Analyzed: {total_predictions}")
    print(f"Stable Classifications (on diagonal): {correct_predictions}")
    print(f"Unstable Classifications (off diagonal): {total_predictions - correct_predictions}")
    print(f"Overall Classification Stability: {stability_score:.2f}%")

    if classification_changes:
        print("\n--- Details of Classification Changes ---")
        for change in classification_changes:
            print(f"  - {change['unit_id']} ({change['verse_range']}): Changed from '{change['original_classification']}' to '{change['noised_classification']}'")
    else:
        print("\n--- No classification changes were observed. ---")

    # Step 4f: Save all results to a single JSON file
    output_results = {
        "analysis_summary": {
            "description": "Compares the primary rhetorical classification of units before and after adding Gaussian noise to test conceptual integrity.",
            "noise_level_std_dev": NOISE_LEVEL_STD_DEV,
            "total_units_analyzed": total_predictions,
            "stable_classifications": int(correct_predictions),
            "unstable_classifications": int(total_predictions - correct_predictions),
            "overall_stability_percent": stability_score
        },
        "classification_changes": classification_changes,
        "confusion_matrix": cm_df.to_dict('index')
    }

    with open(OUTPUT_METRICS_FILE, 'w', encoding='utf-8') as f:
        json.dump(output_results, f, indent=4)

    print(f"\n✅ Success! All analysis results saved to '{OUTPUT_METRICS_FILE}'")

except FileNotFoundError as e:
    print(f"\n❌ ERROR: {e}. Please ensure the input file exists in '{PATH_RESULTS}'.")
except Exception as e:
    print(f"\n❌ An unexpected error occurred: {e}")

──────────────────────────────────────────────────
Executing Conceptual Integrity Test (Noise Generation & Analysis)
Using Noise Level (Standard Deviation): 5.0
──────────────────────────────────────────────────
Loading original data from '..\data\results\gemini_osborne_greek.json'...
Generating noised vectors in memory...

--- Confusion Matrix ---
Rows are the original classification, columns are the classification after adding noise.
True Label (Original)                                                                                                                                                                                                                                                               
Worship & Praise                                         5                                 1                                0                                         0                           0                                0                        0                      0
Judici