CELL 1: SETUP

In [None]:
!pip install spacy transformers sentencepiece requests pandas scikit-learn torch accelerate -q
!python -m spacy download en_core_web_md

import torch
import os

# Select the compute device once: CUDA when present, otherwise CPU (with a warning).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print("Clearing initial GPU cache...")
    torch.cuda.empty_cache()
else:
    print("WARNING: GPU not available. BART-Large requires a GPU.")

# Silence the HuggingFace tokenizers fork/parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

print("\nSetup complete. You can now run CELL 2.")


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Using GPU: Tesla T4
Clearing initial GPU cache...

Setup complete. You can now run CELL 2.


CELL 2: GENERATE DATA FOR MANUAL LABELING

In [None]:
print("\n--- CELL 2: GENERATE DATA FOR MANUAL LABELING ---")

import pandas as pd
import spacy
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np
import math
from tqdm.notebook import tqdm
import gc

# --- Device Check ---
# Cell 1 normally defines `device`; recreate it here so this cell can also run
# when that definition is missing from the kernel namespace.
if "device" not in locals():
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    print(f"Device re-initialized to: {device}")

print("Loading spaCy model...")
try:
    nlp = spacy.load("en_core_web_md")
    print("spaCy model loaded.")
except OSError:
    print("spaCy model not found. Run setup cell.")
    # Re-raise instead of calling exit(): inside a notebook kernel exit() is a
    # kernel-shutdown request rather than a clean abort of the cell, so the
    # failure would not stop execution at this point.
    raise

print("\nLoading BART-Large model (used for both prior & posterior)...")
model_name = "facebook/bart-large"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    model.eval()  # inference only
    print(f"BART model ({model_name}) loaded to {device}.")
except Exception as e:
    print(f"ERROR loading BART model: {e}")
    # Same reasoning as above: stop the cell here with the original traceback.
    raise

# --- Config ---
# Summed log-probabilities at or below this value are mapped to probability 0.0
# by the prior/posterior helpers instead of being exponentiated.
MIN_LOG_PROB_FOR_EXP = -700  # Clamp log_probs before exponentiating


# Helper Function: Seq2Seq Log Prob (BART)
def get_seq2seq_log_prob(model, tokenizer, input_text, target_text):
    """Return the summed token log-probability log P(target | input) under a seq2seq model.

    Both texts are tokenized with truncation at 1024 tokens and moved to the
    module-level ``device``. The target token ids are passed as ``labels`` and
    the per-token log-probabilities of those ids are summed over every
    non-padding position.

    Returns:
        float: the summed log-probability, or ``-inf`` when either encoding is
        empty, the target consists only of EOS/PAD ids, the logits and labels
        disagree in length, the sum is NaN, or any error occurs during the
        forward pass (errors are printed, not raised).
    """
    neg_inf = -float("inf")

    source = tokenizer(
        input_text, return_tensors="pt", truncation=True, max_length=1024
    ).to(device)
    source_ids = source.input_ids
    source_mask = source.attention_mask
    target_ids = (
        tokenizer(target_text, return_tensors="pt", truncation=True, max_length=1024)
        .to(device)
        .input_ids
    )

    # Guard clauses: nothing to score.
    if source_ids.shape[1] == 0 or target_ids.shape[1] == 0:
        return neg_inf
    if torch.all(
        (target_ids == tokenizer.eos_token_id) | (target_ids == tokenizer.pad_token_id)
    ):
        return neg_inf

    summed = neg_inf
    try:
        with torch.no_grad():
            logits = model(
                input_ids=source_ids, attention_mask=source_mask, labels=target_ids
            ).logits
        if logits.shape[1] != target_ids.shape[1]:
            return neg_inf

        # Log-probability assigned to each gold target token.
        per_token = (
            F.log_softmax(logits, dim=-1).gather(2, target_ids.unsqueeze(-1)).squeeze(-1)
        )
        keep = target_ids != tokenizer.pad_token_id
        if keep.sum() > 0:
            summed = per_token[keep].sum().item()
    except RuntimeError as e:
        if "out of memory" in str(e):
            print(
                f"\nOOM Error (Seq2Seq)! InputLen:{source_ids.shape[1]}, TargetLen:{target_ids.shape[1]}. Skipping."
            )
        else:
            print(f"Runtime error in get_seq2seq_log_prob: {e}")
        # Try to recover memory before the caller moves on to the next entity.
        gc.collect()
        torch.cuda.empty_cache()
        return neg_inf
    except Exception as e:
        print(f"Error in get_seq2seq_log_prob: {e}")
        return neg_inf

    return neg_inf if np.isnan(summed) else summed


# Prior and Posterior Functions - RETURNING LINEAR PROB (using BART)
def get_prior_prob_bart(
    entity_text: str, entity_start: int, generated_text: str
) -> float:
    """Linear prior probability of an entity given only the generated text before it.

    The stripped prefix ``generated_text[:entity_start]`` is used as the model
    input (falling back to the tokenizer's BOS token, or ``"<|endoftext|>"``,
    when the prefix is empty) and ``entity_text`` as the target.

    Returns:
        float: ``exp`` of the summed log-probability clamped to ``[0, 1]``;
        ``0.0`` when the log-probability is at or below
        ``MIN_LOG_PROB_FOR_EXP`` or cannot be exponentiated.
    """
    prefix = generated_text[:entity_start].strip() or (
        tokenizer.bos_token or "<|endoftext|>"
    )
    log_p = get_seq2seq_log_prob(
        model, tokenizer, input_text=prefix, target_text=entity_text
    )

    if log_p <= MIN_LOG_PROB_FOR_EXP:
        return 0.0
    try:
        linear = math.exp(log_p)
    except (OverflowError, ValueError):
        return 0.0
    return max(0.0, min(1.0, linear))


def get_posterior_prob_bart(
    entity_text: str,
    entity_start: int,
    entity_end: int,
    generated_text: str,
    source_text: str,
) -> float:
    """Linear posterior probability of an entity given the source and its surrounding text.

    The model input is the stripped source text plus the generated text on
    both sides of the entity span, joined as
    ``"source: ... context: <before> [SEP] <after>"``; ``entity_text`` is the
    target.

    Returns:
        float: ``exp`` of the summed log-probability clamped to ``[0, 1]``;
        ``0.0`` when the log-probability is at or below
        ``MIN_LOG_PROB_FOR_EXP`` or cannot be exponentiated.
    """
    left_context = generated_text[:entity_start].strip()
    right_context = generated_text[entity_end:].strip()
    conditioned_input = (
        f"source: {source_text.strip()} context: {left_context} [SEP] {right_context}"
    )
    log_p = get_seq2seq_log_prob(
        model, tokenizer, input_text=conditioned_input, target_text=entity_text
    )

    if log_p <= MIN_LOG_PROB_FOR_EXP:
        return 0.0
    try:
        linear = math.exp(log_p)
    except (OverflowError, ValueError):
        return 0.0
    return max(0.0, min(1.0, linear))


# Entity Extraction Function - FOR LABELING
def extract_entities_for_labeling(df):
    """Build one feature record per named entity found in each row's generated text.

    For every row of ``df`` (columns ``ConditionedText``, ``GeneratedText``,
    ``IsHallucinated`` and optionally ``EntryNumber``), spaCy NER is run on the
    generated text (parser disabled, text cut to ``nlp.max_length - 1``
    characters when too long). Each entity gets a BART prior probability, a
    BART posterior probability and a binary flag telling whether its
    lower-cased text occurs in the lower-cased source text.

    Rows that raise during processing are reported and skipped.

    Returns:
        list[dict]: one dict per entity, ready for ``pd.DataFrame``.
    """
    print(f"\nExtracting entities & calculating features for {len(df)} rows...")
    print("--- This may take time with BART-Large ---")

    records = []
    progress = tqdm(df.iterrows(), total=len(df), desc="Processing Rows for Labeling")
    for idx, record in progress:
        source_text = (
            str(record["ConditionedText"]) if pd.notna(record["ConditionedText"]) else ""
        )
        generated = (
            str(record["GeneratedText"]) if pd.notna(record["GeneratedText"]) else ""
        )
        original_label = record["IsHallucinated"]
        entry_id = record.get("EntryNumber", idx + 1)

        try:
            char_limit = nlp.max_length
            ner_input = (
                generated[: char_limit - 1] if len(generated) >= char_limit else generated
            )
            doc = nlp(ner_input, disable=["parser"])
            progress.set_postfix({"EntitiesFound": len(doc.ents)})

            if not doc.ents:
                continue

            # Lower-case the source once per row rather than once per entity.
            source_lower = source_text.lower()
            for entity in doc.ents:
                surface = entity.text
                begin, finish = entity.start_char, entity.end_char
                prior = get_prior_prob_bart(surface, begin, generated)
                posterior = get_posterior_prob_bart(
                    surface, begin, finish, generated, source_text
                )
                overlap = 1 if surface.lower().strip() in source_lower else 0

                records.append(
                    {
                        "EntryNumber": entry_id,
                        "ConditionedText": source_text,  # context for the labeler
                        "GeneratedText": generated,  # context for the labeler
                        "IsHallucinated_Original": original_label,
                        "start": begin,
                        "end": finish,
                        "ent": surface,
                        "type": entity.label_,
                        "prior_prob": prior,  # linear probability
                        "posterior_prob": posterior,  # linear probability
                        "binary_overlap": overlap,
                    }
                )
            del doc
            # Periodically release cached memory (every 5th row index).
            if idx > 0 and idx % 5 == 0:
                gc.collect()
                torch.cuda.empty_cache()
        except Exception as e:
            print(
                f"ERROR processing row index {idx} (Entry {entry_id}) for labeling: {e}"
            )
            continue
    return records


# Main Execution Block for Cell 2
def generate_labeling_data(
    csv_file="input.csv", n_rows=500, output_csv="entities_to_label.csv"
):
    """Create the per-entity CSV that is later labeled by hand.

    Args:
        csv_file: Path of the input CSV. It must contain the columns
            ``ConditionedText``, ``GeneratedText`` and ``IsHallucinated``.
            When the file does not exist, a small dummy file is written to
            this path for demonstration purposes.
        n_rows: Number of rows to read from ``csv_file`` (``None`` reads all).
        output_csv: Path the extracted entity table is written to.

    Returns:
        None. Progress, errors and the follow-up instructions are printed;
        the function returns early when loading fails, required columns are
        missing, or no entities are extracted.
    """
    dummy_rows_when_unbounded = 500  # size of the demo file when n_rows is None
    rows_label = "all" if n_rows is None else n_rows

    print(f"Loading {rows_label} rows from '{csv_file}' to generate labeling data...")
    try:
        if not os.path.exists(csv_file):
            print(f"'{csv_file}' not found. Creating a dummy file for demonstration.")
            dummy_count = dummy_rows_when_unbounded if n_rows is None else n_rows
            dummy_data = {
                "ConditionedText": [f"Source document {i}" for i in range(dummy_count)],
                "GeneratedText": [
                    f"Generated summary {i} with entityA and entityB."
                    for i in range(dummy_count)
                ],
                "IsHallucinated": [
                    "FALSE" if i % 2 == 0 else "TRUE" for i in range(dummy_count)
                ],
            }
            pd.DataFrame(dummy_data).to_csv(csv_file, index=False)
            print(f"Dummy '{csv_file}' created.")

        df_labeling = pd.read_csv(csv_file, nrows=n_rows)
        if "EntryNumber" not in df_labeling.columns:
            # 1-based row id so labeled entities can be traced back to their row.
            df_labeling.insert(0, "EntryNumber", df_labeling.index + 1)
        print(f"Loaded {len(df_labeling)} rows.")

        required_cols = ["ConditionedText", "GeneratedText", "IsHallucinated"]
        missing_cols = [col for col in required_cols if col not in df_labeling.columns]
        if missing_cols:
            # Report only what is actually missing, not the whole required list.
            print(
                f"Error: Missing required columns: {missing_cols}. Found: {df_labeling.columns.tolist()}"
            )
            return
    except FileNotFoundError:
        print(f"Error: File '{csv_file}' not found.")
        return
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    entity_labeling_list = extract_entities_for_labeling(df_labeling)
    if not entity_labeling_list:
        print("No entities extracted. Cannot create labeling file.")
        return

    df_to_label = pd.DataFrame(entity_labeling_list)
    print(f"\nExtracted {len(df_to_label)} entities for labeling.")

    try:
        df_to_label.to_csv(output_csv, index=False)
        print(f"\nSaved detailed entity data for manual labeling to '{output_csv}'.")
        print("\n>>> ACTION REQUIRED: <<<")
        print(f"1. Download '{output_csv}'.")
        print("2. Open it in a spreadsheet program.")
        print("3. Add a new column named 'ManualLabel'.")
        print(
            '4. For relevant entity rows, fill ManualLabel with "non hallucinated", "factual hallucination", or "non-factual hallucination".'
        )
        print(
            "   (The 'binary_overlap' column is for reference/analysis and used by the model)."
        )
        print("5. Save the modified file as 'manual_labels.csv'.")
        print("6. Upload 'manual_labels.csv' to your Colab environment.")
        print("7. Proceed to run CELL 3.")
    except Exception as e:
        print(f"Error saving labeling data CSV: {e}")


# Cell 2 entry point: free cached GPU memory, generate the labeling CSV, then
# free cached GPU memory again.
if __name__ == "__main__":
    if device.type == "cuda":
        print("\nClearing GPU cache before Cell 2 execution...")
        gc.collect()
        torch.cuda.empty_cache()

    # Only run when no `__file__` global exists.
    # NOTE(review): presumably a "running inside a notebook" check — confirm.
    if "__file__" not in globals():
        generate_labeling_data()
    if device.type == "cuda":
        print("\nClearing GPU cache after Cell 2 execution...")
        gc.collect()
        torch.cuda.empty_cache()



--- CELL 2: GENERATE DATA FOR MANUAL LABELING ---
Loading spaCy model...
spaCy model loaded.

Loading BART-Large model (used for both prior & posterior)...
BART model (facebook/bart-large) loaded to cuda.

Clearing GPU cache before Cell 2 execution...
Loading 500 rows from 'input.csv' to generate labeling data...
Loaded 500 rows.

Extracting entities & calculating features for 500 rows...
--- This may take time with BART-Large ---


Processing Rows for Labeling:   0%|          | 0/500 [00:00<?, ?it/s]


Extracted 1619 entities for labeling.

Saved detailed entity data for manual labeling to 'entities_to_label.csv'.

>>> ACTION REQUIRED: <<<
1. Download 'entities_to_label.csv'.
2. Open it in a spreadsheet program.
3. Add a new column named 'ManualLabel'.
4. For relevant entity rows, fill ManualLabel with "non hallucinated", "factual hallucination", or "non-factual hallucination".
   (The 'binary_overlap' column is for reference/analysis and used by the model).
5. Save the modified file as 'manual_labels.csv'.
6. Upload 'manual_labels.csv' to your Colab environment.
7. Proceed to run CELL 3.

Clearing GPU cache after Cell 2 execution...


CELL 3: TRAIN & EVALUATE KNN ON MANUAL LABELS

In [None]:
print("\n--- CELL 3: TRAIN & EVALUATE KNN ON MANUAL LABELS ---")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler  # Import Scaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import sys

# SMOTE is optional: record whether imbalanced-learn could be imported so the
# later training code can skip oversampling when it is unavailable.
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    imblearn_installed = False
    print("WARNING: 'imbalanced-learn' library not found.")
    print("         SMOTE oversampling will be skipped.")
    print(
        "         Install with: !pip install -U imbalanced-learn (run in a separate cell or add to Cell 1)"
    )
else:
    imblearn_installed = True

# --- Cell 3 configuration ---
manual_labels_file = "manual_labels.csv"  # hand-labeled entity table
test_size = 0.20  # fraction of the manual data held out for testing
k_values = [1, 3, 5, 6, 7, 8, 9, 11, 15]  # candidate n_neighbors values
weights_options = ["uniform", "distance"]  # candidate KNN weighting schemes

# Feature columns fed to the classifier.
feature_cols = ["prior_prob", "posterior_prob", "binary_overlap"]


def run_cell3_evaluation():
    """Tune and evaluate a KNN classifier on the manually labeled entities.

    Steps: load ``manual_labels_file``; keep rows that have a ``ManualLabel``
    and numeric values for every column in ``feature_cols``; split into
    train/test (stratified when every class has at least two samples);
    standardize the features using statistics of the training split;
    optionally oversample the training split with SMOTE; grid-search
    ``k_values`` x ``weights_options`` with macro-F1 cross-validation; finally
    print accuracy and a classification report on the held-out test split.

    Returns:
        dict: the selected ``KNeighborsClassifier`` keyword arguments. The
        default ``{"n_neighbors": 7, "weights": "uniform"}`` is returned when
        the file is missing, unreadable, lacks required columns, has fewer
        than 10 valid rows, or when the grid search cannot be run.
    """
    default_params = {"n_neighbors": 7, "weights": "uniform"}

    print(f"\n--- KNN Training & Evaluation ---")
    print(f"Loading manual labels from: {manual_labels_file}")

    if not os.path.exists(manual_labels_file):
        print(
            f"!!! ERROR: Manual labels file '{manual_labels_file}' not found. Please upload it after labeling."
        )
        return dict(default_params)

    try:
        df_manual = pd.read_csv(manual_labels_file)
    except Exception as e:
        print(f"!!! ERROR reading '{manual_labels_file}': {e}")
        return dict(default_params)

    required_manual_cols = feature_cols + ["ManualLabel"]
    print(f"Using features: {feature_cols}")

    if not all(col in df_manual.columns for col in required_manual_cols):
        print(
            f"!!! ERROR: '{manual_labels_file}' missing required columns. Expected: {required_manual_cols}. Found: {df_manual.columns.tolist()}"
        )
        return dict(default_params)

    # Drop unlabeled rows, then rows whose features are not numeric.
    df_manual = df_manual.dropna(subset=required_manual_cols).copy()
    for col in feature_cols:
        df_manual[col] = pd.to_numeric(df_manual[col], errors="coerce")
    df_manual = df_manual.dropna(subset=feature_cols)

    if len(df_manual) < 10:
        print(
            f"!!! ERROR: Insufficient valid data found ({len(df_manual)} rows). Need more labeled data."
        )
        return dict(default_params)
    print(f"Loaded {len(df_manual)} valid manually labeled examples.")

    # --- Prepare Data ---
    X_manual_all = df_manual[feature_cols].values
    y_manual_all = df_manual["ManualLabel"].values
    labels_present, counts_present = np.unique(y_manual_all, return_counts=True)
    label_counts_dict = dict(zip(labels_present, counts_present))
    print(f"Classes found in manual labels: {label_counts_dict}")
    min_samples_per_class = min(counts_present) if len(counts_present) > 0 else 0
    if len(labels_present) < 2:
        print("!!! WARNING: Fewer than 2 classes found. KNN may not be meaningful.")

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X_manual_all,
            y_manual_all,
            test_size=test_size,
            random_state=42,
            stratify=y_manual_all if min_samples_per_class >= 2 else None,
        )
    except ValueError as e:
        print(
            f"\nError during stratified split (min samples per class {min_samples_per_class}): {e}. Attempting unstratified split."
        )
        X_train, X_test, y_train, y_test = train_test_split(
            X_manual_all, y_manual_all, test_size=test_size, random_state=42
        )

    print(
        f"\nSplit data into {len(y_train)} training and {len(y_test)} testing examples."
    )
    train_counts = pd.Series(y_train).value_counts()
    test_counts = pd.Series(y_test).value_counts()
    print(
        f"Training distribution:\n{train_counts}\nTesting distribution:\n{test_counts}"
    )

    # Fit the scaler on the training split only, then reuse it for the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Features scaled.")

    X_train_processed, y_train_processed = X_train_scaled, y_train
    if imblearn_installed:
        min_train_samples = int(train_counts.min()) if not train_counts.empty else 0
        # k_neighbors must be smaller than the size of the smallest class. With
        # a singleton (or empty) class this evaluates to <= 0, so the skip
        # branch below is taken instead of attempting SMOTE with k=1.
        smote_k = min(5, min_train_samples - 1)
        if smote_k >= 1 and len(train_counts) > 1:
            try:
                print("\nApplying SMOTE...")
                smote = SMOTE(random_state=42, k_neighbors=smote_k)
                X_train_processed, y_train_processed = smote.fit_resample(
                    X_train_scaled, y_train
                )
                print(
                    f"Class distribution after SMOTE:\n{pd.Series(y_train_processed).value_counts()}"
                )
            except Exception as e:
                print(f"Error during SMOTE: {e}. Using original scaled data.")
        else:
            print("Skipping SMOTE: Not enough samples or classes in minority.")
    else:
        print("Skipping SMOTE (imblearn not installed).")

    # NOTE(review): the grid search below runs on the already-oversampled
    # training data, so validation folds can contain synthetic points derived
    # from training-fold samples; the reported CV score is likely optimistic.
    # Resampling inside each fold (e.g. an imblearn Pipeline) would avoid this.
    print("\nTuning KNN hyperparameters...")
    param_grid = {"n_neighbors": k_values, "weights": weights_options}
    knn_grid = KNeighborsClassifier()
    min_processed_samples = pd.Series(y_train_processed).value_counts().min()
    cv_folds = min(5, min_processed_samples) if min_processed_samples > 1 else 2

    best_params = dict(default_params)

    if cv_folds < 2 or len(np.unique(y_train_processed)) < 2:
        print(
            f"Warning: Cannot perform cross-validation (folds={cv_folds}, unique_classes={len(np.unique(y_train_processed))}). Using default KNN params."
        )
    else:
        try:
            grid_search = GridSearchCV(
                knn_grid,
                param_grid,
                cv=cv_folds,
                scoring="f1_macro",
                n_jobs=-1,
                error_score=0.0,
            )
            grid_search.fit(X_train_processed, y_train_processed)
            best_params = grid_search.best_params_
            print(
                f"Best KNN Parameters found: {best_params}, Score: {grid_search.best_score_:.4f}"
            )
        except Exception as e:
            print(f"Error during GridSearchCV: {e}. Using default KNN params.")

    # Refit with the selected parameters and score on the untouched test split.
    knn_eval = KNeighborsClassifier(**best_params)
    knn_eval.fit(X_train_processed, y_train_processed)
    y_pred = knn_eval.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    report_labels_unique = sorted(set(y_test) | set(y_pred))
    report = classification_report(
        y_test,
        y_pred,
        labels=report_labels_unique,
        target_names=[str(x) for x in report_labels_unique],
        zero_division=0,
    )

    print("\n--- KNN Performance Evaluation (Test Set) ---")
    print(f"Using Parameters: {best_params}")
    print(f"Accuracy on Test Set: {accuracy:.4f}\nClassification Report:\n{report}")
    print("---------------------------------------------")

    # Locals are released when the function returns, so no explicit `del` is
    # needed; the collection / cache-clearing calls are kept as before.
    gc.collect()
    torch.cuda.empty_cache()
    return best_params


# KNN hyperparameters handed to Cell 4. This default is kept unless the
# evaluation below runs and returns its own selection.
best_params_from_cell3 = {"n_neighbors": 7, "weights": "uniform"}
if __name__ == "__main__":
    # Only evaluate when the hand-labeled file has been uploaded.
    if os.path.exists(manual_labels_file):
        best_params_from_cell3 = run_cell3_evaluation()
    else:
        print(
            f"Skipping Cell 3 execution as '{manual_labels_file}' not found. Using default KNN params for Cell 4."
        )



--- CELL 3: TRAIN & EVALUATE KNN ON MANUAL LABELS ---

--- KNN Training & Evaluation ---
Loading manual labels from: manual_labels.csv
Using features: ['prior_prob', 'posterior_prob', 'binary_overlap']
Loaded 4946 valid manually labeled examples.
Classes found in manual labels: {'Factual Hallucination': np.int64(658), 'Non Hallucinated': np.int64(3490), 'Non-Factual Hallucination': np.int64(798)}

Split data into 3956 training and 990 testing examples.
Training distribution:
Non Hallucinated             2792
Non-Factual Hallucination     638
Factual Hallucination         526
Name: count, dtype: int64
Testing distribution:
Non Hallucinated             698
Non-Factual Hallucination    160
Factual Hallucination        132
Name: count, dtype: int64
Features scaled.

Applying SMOTE...
Class distribution after SMOTE:
Non Hallucinated             2792
Non-Factual Hallucination    2792
Factual Hallucination        2792
Name: count, dtype: int64

Tuning KNN hyperparameters...
Best KNN Paramete

CELL 4: APPLY TRAINED KNN TO FULL DATASET

In [None]:
print("\n--- CELL 4: APPLY TRAINED KNN TO FULL DATASET ---")

# Re-create the spaCy / BART objects only when they are not already in scope.
# Failures are reported with their cause (no bare `except:`) before stopping.
if "nlp" not in globals():
    try:
        nlp = spacy.load("en_core_web_md")
        print("spaCy (re)loaded for Cell 4.")
    except Exception as e:
        print(f"ERROR: spaCy model 'nlp' not available for Cell 4: {e}")
        sys.exit(1)
if "tokenizer" not in globals() or "model" not in globals():
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
        model.eval()
        print(f"BART model ({model_name}) (re)loaded for Cell 4.")
    except Exception as e:
        print(f"ERROR: BART model not available for Cell 4: {e}")
        sys.exit(1)

# Final scaler / classifier, fitted below on ALL manual labels (no hold-out).
scaler_final = StandardScaler()
knn_final = None

# Load ALL Manual Labels & Train FINAL KNN
print("\nLoading ALL manual labels and training FINAL KNN...")
if not os.path.exists(manual_labels_file):
    print(
        f"!!! ERROR: '{manual_labels_file}' not found. Cannot train final KNN for Cell 4."
    )
    sys.exit(1)
try:
    df_manual_full = pd.read_csv(manual_labels_file)
    required_cols_full = feature_cols + [
        "ManualLabel"
    ]  # feature_cols defined in Cell 3
    if not all(col in df_manual_full.columns for col in required_cols_full):
        print(
            f"!!! ERROR: '{manual_labels_file}' missing columns for final KNN. Expected: {required_cols_full}."
        )
        sys.exit(1)

    # Drop unlabeled rows, then rows whose features are not numeric.
    df_manual_full = df_manual_full.dropna(subset=required_cols_full).copy()
    for col in feature_cols:
        df_manual_full[col] = pd.to_numeric(df_manual_full[col], errors="coerce")
    df_manual_full = df_manual_full.dropna(subset=feature_cols)

    if len(df_manual_full) < 3:
        print(f"!!! ERROR: Insufficient data in '{manual_labels_file}' for final KNN.")
        sys.exit(1)

    X_manual_all_full = df_manual_full[feature_cols].values
    y_manual_all_full = df_manual_full["ManualLabel"].values
    print(
        f"Loaded {len(y_manual_all_full)} manually labeled examples for FINAL KNN training."
    )

    X_manual_all_scaled_full = scaler_final.fit_transform(X_manual_all_full)
    print("Manual features scaled for final KNN. Final scaler is now fitted.")

    X_train_final_processed, y_train_final_processed = (
        X_manual_all_scaled_full,
        y_manual_all_full,
    )
    if imblearn_installed:
        counts_full = pd.Series(y_manual_all_full).value_counts()
        min_samples_full_smote = int(counts_full.min()) if not counts_full.empty else 0
        # k_neighbors must be smaller than the smallest class; a singleton
        # class yields 0 here and takes the skip branch instead of failing
        # inside SMOTE.
        smote_k_full = min(5, min_samples_full_smote - 1)
        if smote_k_full >= 1 and len(counts_full) > 1:
            try:
                print("Applying SMOTE to full manual data for final KNN training...")
                smote_final = SMOTE(random_state=42, k_neighbors=smote_k_full)
                X_train_final_processed, y_train_final_processed = (
                    smote_final.fit_resample(
                        X_manual_all_scaled_full, y_manual_all_full
                    )
                )
                print(
                    f"Class distribution for final training after SMOTE:\n{pd.Series(y_train_final_processed).value_counts()}"
                )
            except Exception as e:
                # Falls through with the un-resampled data assigned above.
                print(f"Error during SMOTE for final training: {e}.")
        else:
            print("Skipping SMOTE for final training (not enough samples/classes).")

    knn_final = KNeighborsClassifier(**best_params_from_cell3)  # Use params from Cell 3
    knn_final.fit(X_train_final_processed, y_train_final_processed)
    print(f"FINAL KNN classifier trained with params: {best_params_from_cell3}")

except Exception as e:
    # sys.exit() above raises SystemExit, which is not caught by this handler.
    print(f"!!! ERROR during final KNN training setup: {e}")
    sys.exit(1)


--- CELL 4: APPLY TRAINED KNN TO FULL DATASET ---

Loading ALL manual labels and training FINAL KNN...
Loaded 4946 manually labeled examples for FINAL KNN training.
Manual features scaled for final KNN. Final scaler is now fitted.
Applying SMOTE to full manual data for final KNN training...
Class distribution for final training after SMOTE:
Non Hallucinated             3490
Non-Factual Hallucination    3490
Factual Hallucination        3490
Name: count, dtype: int64
FINAL KNN classifier trained with params: {'n_neighbors': 11, 'weights': 'uniform'}


Entity Labeling Function - USES FINAL KNN & SCALER

In [None]:
def label_entity_prob_final(
    src_txt: str, gen_txt: str, ent_obj, scaler, knn_model
) -> tuple[str, float, float, int]:
    """Classify one entity with the fitted scaler and KNN model.

    Computes the BART prior and posterior probabilities and the binary
    source-overlap flag for ``ent_obj`` (read via its ``text``,
    ``start_char`` and ``end_char`` attributes), scales the three features
    with ``scaler`` and predicts a label with ``knn_model``.

    Returns:
        tuple: ``(label, prior_prob, posterior_prob, binary_overlap)``. The
        label is ``"error"`` when ``knn_model`` is ``None`` or prediction
        raises (the problem is printed, not raised).
    """
    surface, begin, finish = ent_obj.text, ent_obj.start_char, ent_obj.end_char
    prior = get_prior_prob_bart(surface, begin, gen_txt)
    posterior = get_posterior_prob_bart(surface, begin, finish, gen_txt, src_txt)
    overlap = int(surface.lower().strip() in src_txt.lower())

    # Replace NaN / infinities before scaling so the scaler gets finite input.
    raw_features = np.nan_to_num(
        np.array([[prior, posterior, overlap]]), nan=0.0, posinf=1.0, neginf=0.0
    )
    scaled_features = scaler.transform(raw_features)

    prediction = "error"
    try:
        if knn_model is not None:
            prediction = knn_model.predict(scaled_features)[0]
        else:
            print("KNN Prediction Error: knn_final model is not trained/available.")
    except Exception as e:
        print(
            f"KNN Prediction Error: {e}, Raw: {raw_features}, Scaled: {scaled_features}"
        )
    return str(prediction), prior, posterior, overlap


Entity Extraction Function - USES FINAL KNN & SCALER

In [None]:
def extract_and_aggregate_entities_final(df, scaler, knn_model):
    """Classify every entity of every row and aggregate the results per row.

    For each row of ``df`` one dict is produced holding the row metadata plus
    parallel lists (entity text, type, span, predicted label, prior,
    posterior, overlap) with one element per entity found by spaCy NER in the
    generated text. A row that raises during processing is still emitted, with
    all of its per-entity lists emptied.

    Returns:
        list[dict]: one dict per input row, or ``[]`` when ``scaler`` is not
        fitted (no ``mean_`` attribute) or ``knn_model`` is ``None``.
    """
    print(
        f"\nExtracting, classifying entities, and aggregating for {len(df)} rows (Cell 4)..."
    )
    if scaler is None or not hasattr(scaler, "mean_"):
        print("Error: Final scaler not fitted.")
        return []
    if knn_model is None:
        print("Error: Final KNN model not trained.")
        return []

    # Per-entity list columns, in output order.
    list_fields = (
        "ent",
        "type",
        "start",
        "end",
        "label_pred",
        "prior_pred_val",
        "posterior_pred_val",
        "binary_overlap_pred_val",
    )

    results = []
    progress = tqdm(
        df.iterrows(), total=len(df), desc="Processing Full Dataset (Cell 4)"
    )
    for idx, record in progress:
        source_text = (
            str(record["ConditionedText"]) if pd.notna(record["ConditionedText"]) else ""
        )
        generated = (
            str(record["GeneratedText"]) if pd.notna(record["GeneratedText"]) else ""
        )
        original_label = record["IsHallucinated"]
        entry_id = record.get("EntryNumber", idx + 1)

        row_summary = {
            "EntryNumber": entry_id,
            "ConditionedText": source_text,
            "GeneratedText": generated,
            "IsHallucinated_Original": original_label,
            **{field: [] for field in list_fields},
        }
        try:
            char_limit = nlp.max_length
            ner_input = (
                generated[: char_limit - 1] if len(generated) >= char_limit else generated
            )
            doc = nlp(ner_input, disable=["parser"])
            for entity in doc.ents:
                label, prior, posterior, overlap = label_entity_prob_final(
                    source_text, generated, entity, scaler, knn_model
                )
                values = (
                    entity.text,
                    entity.label_,
                    entity.start_char,
                    entity.end_char,
                    label,
                    prior,
                    posterior,
                    overlap,
                )
                for field, value in zip(list_fields, values):
                    row_summary[field].append(value)
            results.append(row_summary)
            del doc
            # Periodically release cached memory (every 10th row index).
            if idx > 0 and idx % 10 == 0:
                gc.collect()
                torch.cuda.empty_cache()
        except Exception as e:
            print(
                f"ERROR processing row {idx} (Entry {entry_id}) in Cell 4 extract: {e}"
            )
            # Discard partial per-entity results but keep the row itself.
            for key, value in row_summary.items():
                if isinstance(value, list):
                    row_summary[key] = []
            results.append(row_summary)
            continue
    return results

Main Execution Block for Cell 4

In [None]:
def _is_marked_not_hallucinated(value):
    """Return True when `value` renders as the text FALSE.

    The comparison ignores case and surrounding whitespace, so booleans
    (``False``) and CSV strings such as ``"false"`` or ``" FALSE "`` all match.
    """
    return str(value).strip().upper() == "FALSE"


def _row_is_factual(labels, original_label):
    """Derive the row-level ``IsFactual_pred`` string for one aggregated row.

    Args:
        labels: Per-entity predicted labels for the row. Anything that is not
            a list/tuple (e.g. NaN) is treated as "no labels".
        original_label: The row's original ``IsHallucinated`` value.

    Returns:
        ``"True"`` or ``"False"`` (strings, matching the output CSV format).
    """
    # NOTE(review): a row whose original label is FALSE is marked factual
    # without consulting the entity predictions, i.e. the gold label feeds the
    # prediction. Kept as-is; confirm this is intended.
    if _is_marked_not_hallucinated(original_label):
        return "True"

    if not isinstance(labels, (list, tuple)):
        labels = []

    # Labels are compared case-insensitively: the entity classifier's output
    # is title-cased ("Non Hallucinated"), so an exact lowercase comparison
    # would never match.
    normalized = [lbl.strip().lower() for lbl in labels if isinstance(lbl, str)]
    usable = [lbl for lbl in normalized if lbl != "error"]

    # A row with no usable entity labels is not considered factual.
    if usable and all(lbl == "non hallucinated" for lbl in usable):
        return "True"
    return "False"


def apply_final_classifier():
    """Run the final entity classifier over ``input.csv`` and write both result CSVs.

    Steps:
      1. Load ``input.csv`` (adding a 1-based ``EntryNumber`` column if absent)
         and check that the required columns are present.
      2. Check that the globals ``scaler_final`` (fitted) and ``knn_final``
         exist, then call ``extract_and_aggregate_entities_final``.
      3. Save rows that have at least one entity to
         ``entities_analysis_final.csv``.
      4. Add an ``IsFactual_pred`` column ("True"/"False" strings) to the full
         frame and save it to ``updated_dataset_final.csv``.

    Returns:
        None. On any precondition failure a message is printed and the
        function returns early without raising.
    """
    csv_file_full = "input.csv"
    N_ROWS_TO_PROCESS_FULL = None  # None means "read every row"

    load_msg = (
        "ALL rows"
        if N_ROWS_TO_PROCESS_FULL is None
        else f"{N_ROWS_TO_PROCESS_FULL} rows"
    )
    print(
        f"\nLoading {load_msg} from '{csv_file_full}' (all columns) for final classification..."
    )
    try:
        df_full = pd.read_csv(csv_file_full, nrows=N_ROWS_TO_PROCESS_FULL)
        if "EntryNumber" not in df_full.columns:
            df_full.insert(0, "EntryNumber", df_full.index + 1)
        print(f"Loaded {len(df_full)} rows for final processing.")
        if df_full.empty:
            print("Error: Loaded DataFrame for final processing is empty.")
            return
        req_cols_full_load = ["ConditionedText", "GeneratedText", "IsHallucinated"]
        if not all(col in df_full.columns for col in req_cols_full_load):
            print(
                f"Error: Missing required columns in '{csv_file_full}'. Expected: {req_cols_full_load}."
            )
            return
    except Exception as e:
        print(f"Error loading CSV for final processing: {e}")
        return

    # The scaler and KNN model are produced by earlier notebook cells and are
    # looked up as globals; bail out cleanly if they are missing or unfitted.
    if "scaler_final" not in globals() or not hasattr(scaler_final, "mean_"):
        print("Error: scaler_final not fitted. Cannot apply classifier.")
        return
    if "knn_final" not in globals() or knn_final is None:
        print("Error: knn_final model not trained. Cannot apply classifier.")
        return

    aggregated_entity_data_list_final = extract_and_aggregate_entities_final(
        df_full, scaler_final, knn_final
    )
    if not aggregated_entity_data_list_final:
        print("Final entity extraction/aggregation failed or returned empty list.")
        return
    df_entities_aggregated_final = pd.DataFrame(aggregated_entity_data_list_final)

    # ------------------------------------------------------------------
    # Entity-level output
    # ------------------------------------------------------------------
    entity_output_file = "entities_analysis_final.csv"
    entity_file_saved = False
    if not df_entities_aggregated_final.empty:
        try:
            # Keep only rows in which at least one entity was detected.
            df_entities_filtered_final = df_entities_aggregated_final[
                df_entities_aggregated_final["ent"].apply(
                    lambda x: isinstance(x, list) and len(x) > 0
                )
            ].copy()
            entity_cols_to_save = [
                "EntryNumber",
                "ConditionedText",
                "GeneratedText",
                "IsHallucinated_Original",
                "ent",
                "type",
                "start",
                "end",
                "label_pred",
                "prior_pred_val",
                "posterior_pred_val",
                "binary_overlap_pred_val",
            ]
            entity_cols_to_save = [
                col
                for col in entity_cols_to_save
                if col in df_entities_filtered_final.columns
            ]
            if not df_entities_filtered_final.empty:
                df_entities_filtered_final[entity_cols_to_save].to_csv(
                    entity_output_file, index=False
                )
                entity_file_saved = True
                print(
                    f"\nSaved FINAL FILTERED & AGGREGATED entity analysis to '{entity_output_file}'."
                )
                preview_cols_agg = [
                    c
                    for c in [
                        "EntryNumber",
                        "ent",
                        "label_pred",
                        "binary_overlap_pred_val",
                    ]
                    if c in df_entities_filtered_final.columns
                ]
                with pd.option_context("display.max_colwidth", 100):
                    print(df_entities_filtered_final[preview_cols_agg].head())
            else:
                print(
                    "\nNo rows with detected entities found after filtering for final analysis."
                )
        except Exception as e:
            print(f"Error saving final aggregated entity analysis CSV: {e}")
    else:
        print("\nNo entities were extracted/processed in the final run.")

    # ------------------------------------------------------------------
    # Row-level classification
    # ------------------------------------------------------------------
    print("\n--- Performing FINAL Row-level Classification ---")

    # Fallback used for any row without an aggregated prediction: derive the
    # value from the original 'IsHallucinated' column alone.
    fallback_predictions = [
        "True" if _is_marked_not_hallucinated(value) else "False"
        for value in df_full["IsHallucinated"]
    ]

    row_predictions = {}
    if not df_entities_aggregated_final.empty:
        for _, agg_row in df_entities_aggregated_final.iterrows():
            row_predictions[agg_row["EntryNumber"]] = _row_is_factual(
                agg_row.get("label_pred", []),
                agg_row["IsHallucinated_Original"],
            )
    else:
        print(
            "No aggregated entity data for final row predictions. Basing on original 'IsHallucinated'."
        )

    # Build the column in one pass so it is always a plain string column
    # (no NaN placeholders that later need cell-by-cell patching).
    df_full["IsFactual_pred"] = [
        row_predictions.get(entry_num, fallback)
        for entry_num, fallback in zip(df_full["EntryNumber"], fallback_predictions)
    ]

    # ------------------------------------------------------------------
    # Row-level output
    # ------------------------------------------------------------------
    row_output_file_final = "updated_dataset_final.csv"
    row_file_saved = False
    try:
        df_full.to_csv(row_output_file_final, index=False)
        row_file_saved = True
        print(f"\nSaved FINAL dataset to '{row_output_file_final}'.")
        preview_cols_row = [
            c
            for c in [
                "EntryNumber",
                "ConditionedText",
                "GeneratedText",
                "IsHallucinated",
                "IsFactual_pred",
            ]
            if c in df_full.columns
        ]
        print(df_full[preview_cols_row].head(10))
    except Exception as e:
        print(f"Error saving final dataset CSV: {e}")

    # Report only the files that were actually written.
    saved_files = []
    if entity_file_saved:
        saved_files.append(entity_output_file)
    if row_file_saved:
        saved_files.append(row_output_file_final)

    print("\n===============================================")
    print("Processing Complete (Cell 4).")
    if saved_files:
        saved_listing = " and ".join(f"'{name}'" for name in saved_files)
        print(f"Final results saved to {saved_listing}.")
    else:
        print("No output files were written.")
    print("===============================================")


if __name__ == "__main__":
    # Cell 4 driver: warn if the fitted models are missing, then run the final
    # classifier with a GPU cache flush before and after.

    def _release_gpu_memory(announcement):
        """Print `announcement` and free cached GPU memory; no-op unless on CUDA."""
        if device.type != "cuda":
            return
        print(announcement)
        gc.collect()
        torch.cuda.empty_cache()

    # Only a warning: apply_final_classifier() re-checks the models itself and
    # returns early when they are unavailable.
    if not (
        "scaler_final" in globals()
        and hasattr(scaler_final, "mean_")
        and "knn_final" in globals()
        and knn_final is not None
    ):
        print(
            "WARNING: Final scaler or KNN model not ready before apply_final_classifier. This might indicate an issue if previous Cell 4 blocks didn't run successfully."
        )

    _release_gpu_memory("\nClearing GPU cache before Cell 4 main execution...")

    # The classifier is invoked only when no `__file__` global is defined.
    running_without_file = "__file__" not in globals()
    if running_without_file:
        apply_final_classifier()

    _release_gpu_memory("\nClearing GPU cache after Cell 4 main execution...")


Clearing GPU cache before Cell 4 main execution...

Loading ALL rows from 'input.csv' (all columns) for final classification...
Loaded 2000 rows for final processing.

Extracting, classifying entities, and aggregating for 2000 rows (Cell 4)...


Processing Full Dataset (Cell 4):   0%|          | 0/2000 [00:00<?, ?it/s]


Saved FINAL FILTERED & AGGREGATED entity analysis to 'entities_analysis_final.csv'.
   EntryNumber  \
0            1   
2            3   
3            4   
5            6   
6            7   

                                                                                                   ent  \
0                                                 [DUI, Two thirds, first, Taiwan, DUI, DUI, DUI, DUI]   
2                                                                                           [73%, TNF]   
3                                                                                              [Birds]   
5                                                                 [Two, EVT, Gated Recurrent Unit GRU]   
6  [1, Methylpyrene, CYP, 2, Methylpyrene, 3, Methylpyrene, Methylpyrene, as low as 0.125, V79, Met...   

                                                                                            label_pred  \
0  [Non-Factual Hallucination, Non Hallucinated, Non Hallucinat