<a href="https://colab.research.google.com/github/AvdMei/AI_Bullshit_Detector/blob/ages_branch/AI_Bullshit_Detector_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AI Bullshit Detector

Team:

Attribution:

LLM mode: LiquidAI LMF2-1.2B

Sundai project 11-jan-26

In [16]:
!pip install lion_pytorch



In [17]:
import os
import json
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader

from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from lion_pytorch import Lion

# Create template and tokenizer for LLM model to be able to chat with it

In [18]:
# Basic question-answer template
template_without_answer = "<|startoftext|><|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
template_with_answer = template_without_answer + "{answer}<|im_end|>\n"

# Let's try to put something into the template to see how it looks
print(template_with_answer.format(question="What is your name?", answer="My name is Lili!"))

<|startoftext|><|im_start|>user
What is your name?<|im_end|>
<|im_start|>assistant
My name is Lili!<|im_end|>



In [19]:
# Load the tokenizer for Liquid AI LFM2-1.2B
model_id = "LiquidAI/LFM2-1.2B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# How big is the tokenizer?
print(f"Vocab size: {len(tokenizer.get_vocab())}")

Vocab size: 64400


In [20]:
# Lets test out both steps:
text = "Here is some sample text!"
print(f"Original text: {text}")

# Tokenize the text
tokens = tokenizer.encode(text, return_tensors="pt")
print(f"Encoded tokens: {tokens}")

# Decode the tokens
decoded_text = tokenizer.decode(tokens[0], skip_special_tokens=True)
print(f"Decoded text: {decoded_text}")

Original text: Here is some sample text!
Encoded tokens: tensor([[   1, 9151,  856, 1429, 6643, 3304,  510]])
Decoded text: Here is some sample text!


In [21]:
prompt = template_without_answer.format(question="What is the capital of France? Use one word.")
print(prompt)

<|startoftext|><|im_start|>user
What is the capital of France? Use one word.<|im_end|>
<|im_start|>assistant



# Load the model -- note that this may take a few minutes
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

In [22]:
# Load the tokenizer for Liquid AI LFM2-1.2B
model_id = "LiquidAI/LFM2-1.2B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# How big is the tokenizer?
print(f"Vocab size: {len(tokenizer.get_vocab())}")

Vocab size: 64400


In [23]:
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

In [24]:
prompt = template_without_answer.format(question="What does MIT stand for?")
tokens = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
output = model.generate(tokens, max_new_tokens=20)
print(tokenizer.decode(output[0]))

<|startoftext|><|startoftext|><|im_start|>user
What does MIT stand for?<|im_end|>
<|im_start|>assistant
MIT stands for Massachusetts Institute of Technology. It is a private, research-intensive university located in


# Understanding the internal representations of the model

In [25]:
# Inspect the model structure
print(model.config)
print(model)

# Check if the model has a specific cache class
from transformers import Cache
print(f"Cache class: {getattr(model.config, 'cache_implementation', 'default')}")

Lfm2Config {
  "architectures": [
    "Lfm2ForCausalLM"
  ],
  "block_auto_adjust_ff_dim": true,
  "block_dim": 2048,
  "block_ff_dim": 12288,
  "block_ffn_dim_multiplier": 1.0,
  "block_mlp_init_scale": 1.0,
  "block_multiple_of": 256,
  "block_norm_eps": 1e-05,
  "block_out_init_scale": 1.0,
  "block_use_swiglu": true,
  "block_use_xavier_init": true,
  "bos_token_id": 1,
  "conv_L_cache": 3,
  "conv_bias": false,
  "conv_dim": 2048,
  "conv_dim_out": 2048,
  "conv_use_xavier_init": true,
  "dtype": "float32",
  "eos_token_id": 7,
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 12288,
  "layer_types": [
    "conv",
    "conv",
    "full_attention",
    "conv",
    "conv",
    "full_attention",
    "conv",
    "conv",
    "full_attention",
    "conv",
    "full_attention",
    "conv",
    "full_attention",
    "conv",
    "full_attention",
    "conv"
  ],
  "max_position_embeddings": 128000,
  "model_type": "lfm2",
  "norm_eps": 1e-05,
  "num_attention_heads

In [26]:
# Encode prompt
prompt = template_without_answer.format(question="What does MIT stand for?")
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

# Forward pass with cache
with torch.no_grad():
    outputs = model(
        input_ids,
        use_cache=True,
        return_dict=True,
        output_attentions=True,  # Optional: attention weights
        output_hidden_states=True  # Optional: hidden states
    )

# Access cache
past_key_values = outputs.past_key_values
logits = outputs.logits

In [27]:
# Inspect cache structure
print(f"Cache type: {type(past_key_values)}")
print(f"Number of layers in cache: {len(past_key_values)}")

# For each layer, check the structure
for i, layer_cache in enumerate(past_key_values):
    if layer_cache is not None:
        if isinstance(layer_cache, tuple):
            print(f"Layer {i}: K shape={layer_cache[0].shape}, V shape={layer_cache[1].shape}")
        else:
            print(f"Layer {i}: type={type(layer_cache)}")
    else:
        print(f"Layer {i}: None (likely conv layer)")

Cache type: <class 'transformers.models.lfm2.modeling_lfm2.Lfm2HybridConvCache'>
Number of layers in cache: 15
Layer 0: K shape=torch.Size([0]), V shape=torch.Size([0])
Layer 1: K shape=torch.Size([0]), V shape=torch.Size([0])
Layer 2: K shape=torch.Size([1, 8, 16, 64]), V shape=torch.Size([1, 8, 16, 64])
Layer 3: K shape=torch.Size([0]), V shape=torch.Size([0])
Layer 4: K shape=torch.Size([0]), V shape=torch.Size([0])
Layer 5: K shape=torch.Size([1, 8, 16, 64]), V shape=torch.Size([1, 8, 16, 64])
Layer 6: K shape=torch.Size([0]), V shape=torch.Size([0])
Layer 7: K shape=torch.Size([0]), V shape=torch.Size([0])
Layer 8: K shape=torch.Size([1, 8, 16, 64]), V shape=torch.Size([1, 8, 16, 64])
Layer 9: K shape=torch.Size([0]), V shape=torch.Size([0])
Layer 10: K shape=torch.Size([1, 8, 16, 64]), V shape=torch.Size([1, 8, 16, 64])
Layer 11: K shape=torch.Size([0]), V shape=torch.Size([0])
Layer 12: K shape=torch.Size([1, 8, 16, 64]), V shape=torch.Size([1, 8, 16, 64])
Layer 13: K shape=torc

In [28]:
def generate_with_cache_and_probs(model, tokenizer, prompt, max_new_tokens=20):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    generated_tokens = []
    token_probs = []
    all_caches = []

    past_key_values = None
    current_ids = input_ids

    for step in range(max_new_tokens):
        with torch.no_grad():
            outputs = model(
                current_ids,
                past_key_values=past_key_values,
                use_cache=True,
                return_dict=True
            )

        # Get logits for last token
        next_token_logits = outputs.logits[:, -1, :]

        # Compute probabilities
        probs = F.softmax(next_token_logits, dim=-1)

        # Sample or greedy decode
        next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

        # Get probability of selected token
        selected_prob = probs[0, next_token.item()].item()

        # Store results
        generated_tokens.append(next_token.item())
        token_probs.append(selected_prob)
        all_caches.append(outputs.past_key_values)

        # Check for EOS
        if next_token.item() == tokenizer.eos_token_id:
            break

        # Update for next iteration
        past_key_values = outputs.past_key_values
        current_ids = next_token

    return {
        "input_ids": input_ids,
        "generated_tokens": generated_tokens,
        "token_probs": token_probs,
        "final_cache": all_caches[-1],
        "text": tokenizer.decode(generated_tokens, skip_special_tokens=True)
    }

In [29]:
# Run generation
result = generate_with_cache_and_probs(
    model, tokenizer,
    template_without_answer.format(question="What does MIT stand for?"),
    max_new_tokens=20
)

# Display output
print(f"Generated text: {result['text']}")
print(f"\nToken-by-token breakdown:")
for i, (tok, prob) in enumerate(zip(result['generated_tokens'], result['token_probs'])):
    print(f"  Step {i}: '{tokenizer.decode([tok])}' (prob: {prob:.4f})")

# Analyze final KV cache
final_cache = result['final_cache']
print(f"\nFinal KV Cache Summary:")
for i, layer_cache in enumerate(final_cache):
    if layer_cache is not None and isinstance(layer_cache, tuple):
        k, v = layer_cache
        print(f"  Layer {i}: K={k.shape}, V={v.shape}, K_mem={k.element_size()*k.nelement()/1024:.1f}KB")

Generated text: MIT stands for Massachusetts Institute of Technology. It is a private, research-intensive university located in

Token-by-token breakdown:
  Step 0: 'M' (prob: 0.9520)
  Step 1: 'IT' (prob: 0.9993)
  Step 2: ' stands' (prob: 0.9243)
  Step 3: ' for' (prob: 0.9999)
  Step 4: ' Massachusetts' (prob: 0.8794)
  Step 5: ' Institute' (prob: 0.9990)
  Step 6: ' of' (prob: 0.9999)
  Step 7: ' Technology' (prob: 0.9993)
  Step 8: '.' (prob: 0.8819)
  Step 9: ' It' (prob: 0.8163)
  Step 10: ' is' (prob: 0.5875)
  Step 11: ' a' (prob: 0.8055)
  Step 12: ' private' (prob: 0.3536)
  Step 13: ',' (prob: 0.6832)
  Step 14: ' research' (prob: 0.7299)
  Step 15: '-int' (prob: 0.6954)
  Step 16: 'ensive' (prob: 0.9982)
  Step 17: ' university' (prob: 0.1610)
  Step 18: ' located' (prob: 0.9285)
  Step 19: ' in' (prob: 0.9984)

Final KV Cache Summary:
  Layer 0: K=torch.Size([0]), V=torch.Size([0]), K_mem=0.0KB
  Layer 1: K=torch.Size([0]), V=torch.Size([0]), K_mem=0.0KB
  Layer 2: K=torc

In [30]:
def analyze_kv_cache(cache):
    """Analyze KV cache for hybrid LFM2 model"""
    stats = {
        "total_layers": len(cache),
        "attention_layers": 0,
        "total_memory_bytes": 0,
        "layer_details": []
    }

    for i, layer_cache in enumerate(cache):
        if layer_cache is not None and isinstance(layer_cache, tuple):
            k, v = layer_cache
            layer_mem = (k.element_size() * k.nelement() +
                        v.element_size() * v.nelement())
            stats["attention_layers"] += 1
            stats["total_memory_bytes"] += layer_mem
            stats["layer_details"].append({
                "layer": i,
                "k_shape": list(k.shape),
                "v_shape": list(v.shape),
                "memory_kb": layer_mem / 1024
            })

    return stats

cache_stats = analyze_kv_cache(result['final_cache'])
print(json.dumps(cache_stats, indent=2))

{
  "total_layers": 15,
  "attention_layers": 15,
  "total_memory_bytes": 860160,
  "layer_details": [
    {
      "layer": 0,
      "k_shape": [
        0
      ],
      "v_shape": [
        0
      ],
      "memory_kb": 0.0
    },
    {
      "layer": 1,
      "k_shape": [
        0
      ],
      "v_shape": [
        0
      ],
      "memory_kb": 0.0
    },
    {
      "layer": 2,
      "k_shape": [
        1,
        8,
        35,
        64
      ],
      "v_shape": [
        1,
        8,
        35,
        64
      ],
      "memory_kb": 140.0
    },
    {
      "layer": 3,
      "k_shape": [
        0
      ],
      "v_shape": [
        0
      ],
      "memory_kb": 0.0
    },
    {
      "layer": 4,
      "k_shape": [
        0
      ],
      "v_shape": [
        0
      ],
      "memory_kb": 0.0
    },
    {
      "layer": 5,
      "k_shape": [
        1,
        8,
        35,
        64
      ],
      "v_shape": [
        1,
        8,
        35,
        64
      ],
     

# 4. Calculate the I Do Not Know Score

## Step 1

In [32]:
import torch
import torch.nn.functional as F
import numpy as np
from dataclasses import dataclass
from typing import List, Dict, Optional
import json

@dataclass
class HallucinationFeatures:
    """Features extracted for hallucination detection."""
    # Input
    prompt: str
    input_ids: torch.Tensor

    # Hidden states (per generated token)
    z8_states: List[torch.Tensor]   # Layer 8 hidden states
    z12_states: List[torch.Tensor]  # Layer 12 hidden states

    # KV cache stats (per generated token)
    kv8_head_disagreement: List[float]
    kv12_head_disagreement: List[float]

    # Output
    generated_tokens: List[int]
    token_probs: List[float]
    token_entropies: List[float]
    top5_tokens: List[List[tuple]]
    full_text: str

    # Summary scores
    mean_prob: float
    mean_entropy: float
    mean_head_disagreement_8: float
    mean_head_disagreement_12: float


def compute_head_disagreement(kv_cache_layer) -> float:
    """
    Compute disagreement across KV heads for a single layer.
    Higher = more disagreement = potential hallucination signal.

    kv_cache_layer: tuple of (K, V) each with shape [batch, 8_heads, seq, 64]
    """
    k, v = kv_cache_layer

    if k.numel() == 0:  # Conv layer, no KV cache
        return 0.0

    # Variance across heads (dim=1) for the LAST token position
    # Shape: [batch, heads, head_dim] -> variance over heads
    k_last = k[:, :, -1, :]  # [1, 8, 64]
    v_last = v[:, :, -1, :]

    k_var = k_last.var(dim=1).mean().item()  # Variance across 8 heads
    v_var = v_last.var(dim=1).mean().item()

    return k_var + v_var


def extract_hallucination_features(
    model,
    tokenizer,
    prompt: str,
    max_new_tokens: int = 50
) -> HallucinationFeatures:
    """
    Generate tokens while extracting all features needed for hallucination detection.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # Storage
    z8_states = []
    z12_states = []
    kv8_disagreement = []
    kv12_disagreement = []
    generated_tokens = []
    token_probs = []
    token_entropies = []
    top5_tokens = []

    past_key_values = None
    current_ids = input_ids

    for step in range(max_new_tokens):
        with torch.no_grad():
            outputs = model(
                current_ids,
                past_key_values=past_key_values,
                use_cache=True,
                output_hidden_states=True,  # KEY: Get layer outputs
                return_dict=True
            )

        # === HIDDEN STATES (z8, z12) ===
        # hidden_states is tuple of (embedding, layer0, layer1, ..., layer15)
        # So layer 8 output is at index 9, layer 12 is at index 13
        hidden_states = outputs.hidden_states

        # Get last token's hidden state for the current step
        z8 = hidden_states[9][:, -1, :].clone().cpu()   # [1, 2048]
        z12 = hidden_states[13][:, -1, :].clone().cpu() # [1, 2048]

        z8_states.append(z8)
        z12_states.append(z12)

        # === KV CACHE HEAD DISAGREEMENT ===
        cache = outputs.past_key_values
        kv8_disagreement.append(compute_head_disagreement(cache[8]))
        kv12_disagreement.append(compute_head_disagreement(cache[12]))

        # === LOGITS & PROBABILITIES ===
        logits = outputs.logits[:, -1, :]  # [1, vocab_size]
        probs = F.softmax(logits, dim=-1)

        # Entropy of distribution
        entropy = -(probs * torch.log(probs + 1e-10)).sum().item()

        # Greedy selection
        next_token = torch.argmax(logits, dim=-1)
        selected_prob = probs[0, next_token.item()].item()

        # Top-5
        top5_probs, top5_idx = torch.topk(probs[0], 5)
        top5 = [(tokenizer.decode([idx.item()]), prob.item())
                for idx, prob in zip(top5_idx, top5_probs)]

        # Store
        generated_tokens.append(next_token.item())
        token_probs.append(selected_prob)
        token_entropies.append(entropy)
        top5_tokens.append(top5)

        # Check EOS
        if next_token.item() == tokenizer.eos_token_id:
            break

        # Update for next step
        past_key_values = outputs.past_key_values
        current_ids = next_token.unsqueeze(0)

    # === BUILD RESULT ===
    return HallucinationFeatures(
        prompt=prompt,
        input_ids=input_ids.cpu(),
        z8_states=z8_states,
        z12_states=z12_states,
        kv8_head_disagreement=kv8_disagreement,
        kv12_head_disagreement=kv12_disagreement,
        generated_tokens=generated_tokens,
        token_probs=token_probs,
        token_entropies=token_entropies,
        top5_tokens=top5_tokens,
        full_text=tokenizer.decode(generated_tokens, skip_special_tokens=True),
        mean_prob=np.mean(token_probs),
        mean_entropy=np.mean(token_entropies),
        mean_head_disagreement_8=np.mean(kv8_disagreement),
        mean_head_disagreement_12=np.mean(kv12_disagreement),
    )

# Step 2: Unsupervised z8 → z12 Predictor

In [33]:
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class Z8toZ12Predictor(nn.Module):
    """
    Predict z12 from z8. High prediction error = out-of-support.
    """
    def __init__(self, hidden_dim=2048, bottleneck_dim=512):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.GELU(),
            nn.LayerNorm(bottleneck_dim),
            nn.Linear(bottleneck_dim, bottleneck_dim),
            nn.GELU(),
            nn.LayerNorm(bottleneck_dim),
            nn.Linear(bottleneck_dim, hidden_dim),
        )

    def forward(self, z8):
        return self.net(z8)

    def prediction_error(self, z8, z12):
        """MSE between predicted and actual z12."""
        z12_pred = self.forward(z8)
        return F.mse_loss(z12_pred, z12, reduction='none').mean(dim=-1)


class Z8Z12LogitsPredictor(nn.Module):
    """
    Extended predictor: z8 → z12 → logits_summary
    Captures full information flow consistency.
    """
    def __init__(self, hidden_dim=2048, vocab_size=65536, bottleneck_dim=512):
        super().__init__()

        # z8 → z12 predictor
        self.z8_to_z12 = nn.Sequential(
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.GELU(),
            nn.Linear(bottleneck_dim, hidden_dim),
        )

        # z12 → logits summary (top-k probs, entropy)
        # We predict a summary, not full vocab logits
        self.z12_to_logits_summary = nn.Sequential(
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.GELU(),
            nn.Linear(bottleneck_dim, 6),  # [entropy, top1_prob, top2_prob, ...]
        )

    def forward(self, z8):
        z12_pred = self.z8_to_z12(z8)
        logits_summary_pred = self.z12_to_logits_summary(z12_pred)
        return z12_pred, logits_summary_pred

    def compute_oos_score(self, z8, z12, logits_summary):
        """
        Out-of-support score combining both prediction errors.
        """
        z12_pred, logits_pred = self.forward(z8)

        err_z12 = F.mse_loss(z12_pred, z12, reduction='none').mean(dim=-1)
        err_logits = F.mse_loss(logits_pred, logits_summary, reduction='none').mean(dim=-1)

        # Weighted combination
        return 0.7 * err_z12 + 0.3 * err_logits


class HallucinationDataset(Dataset):
    """Dataset for training the predictor."""
    def __init__(self, features_list: List[HallucinationFeatures]):
        self.samples = []

        for feat in features_list:
            for i, (z8, z12, prob, entropy) in enumerate(zip(
                feat.z8_states,
                feat.z12_states,
                feat.token_probs,
                feat.token_entropies
            )):
                # Create logits summary: [entropy, top5_probs]
                top5_probs = [p for _, p in feat.top5_tokens[i]]
                logits_summary = torch.tensor([entropy] + top5_probs, dtype=torch.float32)

                self.samples.append({
                    'z8': z8.squeeze(0),
                    'z12': z12.squeeze(0),
                    'logits_summary': logits_summary,
                    'prob': prob,
                    'entropy': entropy,
                })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


def train_predictor(features_list: List[HallucinationFeatures], epochs=10, lr=1e-4):
    """Train the z8 → z12 → logits predictor."""
    dataset = HallucinationDataset(features_list)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)

    predictor = Z8Z12LogitsPredictor()
    optimizer = torch.optim.AdamW(predictor.parameters(), lr=lr)

    predictor.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in loader:
            z8 = batch['z8']
            z12 = batch['z12']
            logits_summary = batch['logits_summary']

            z12_pred, logits_pred = predictor(z8)

            loss_z12 = F.mse_loss(z12_pred, z12)
            loss_logits = F.mse_loss(logits_pred, logits_summary)
            loss = 0.7 * loss_z12 + 0.3 * loss_logits

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(loader):.6f}")

    return predictor

## Step 3: Head Disagreement Score (Refined)

In [34]:
def compute_sequence_head_disagreement(features: HallucinationFeatures) -> Dict:
    """
    Compute detailed head disagreement statistics.
    """
    kv8 = np.array(features.kv8_head_disagreement)
    kv12 = np.array(features.kv12_head_disagreement)

    return {
        # Layer 8
        "kv8_mean": kv8.mean(),
        "kv8_std": kv8.std(),
        "kv8_max": kv8.max(),
        "kv8_trend": np.polyfit(range(len(kv8)), kv8, 1)[0] if len(kv8) > 1 else 0,

        # Layer 12
        "kv12_mean": kv12.mean(),
        "kv12_std": kv12.std(),
        "kv12_max": kv12.max(),
        "kv12_trend": np.polyfit(range(len(kv12)), kv12, 1)[0] if len(kv12) > 1 else 0,

        # Cross-layer
        "kv_ratio": kv12.mean() / (kv8.mean() + 1e-8),
        "kv_correlation": np.corrcoef(kv8, kv12)[0, 1] if len(kv8) > 1 else 0,
    }

# Step 4: Integration with Paraphrase Dataset

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from tqdm import tqdm

def run_full_hallucination_analysis(model, tokenizer, csv_path: str):
    """
    Run complete analysis pipeline on paraphrase dataset.
    """
    # Load data
    df = pd.read_csv(csv_path)

    # === STEP 1: Extract features for all questions ===
    print("Extracting internal features...")
    all_features = []
    for question in tqdm(df['question'], desc="Processing"):
        prompt = template_without_answer.format(question=question)
        features = extract_hallucination_features(model, tokenizer, prompt, max_new_tokens=100)
        all_features.append(features)

    df['llm_output'] = [f.full_text for f in all_features]
    df['mean_prob'] = [f.mean_prob for f in all_features]
    df['mean_entropy'] = [f.mean_entropy for f in all_features]
    df['head_disagree_8'] = [f.mean_head_disagreement_8 for f in all_features]
    df['head_disagree_12'] = [f.mean_head_disagreement_12 for f in all_features]

    # === STEP 2: Train z8 → z12 predictor ===
    print("\nTraining z8 → z12 predictor...")
    predictor = train_predictor(all_features, epochs=20)

    # === STEP 3: Compute OOS scores ===
    print("\nComputing out-of-support scores...")
    predictor.eval()
    oos_scores = []

    for feat in all_features:
        scores = []
        for z8, z12, prob, entropy in zip(
            feat.z8_states, feat.z12_states,
            feat.token_probs, feat.token_entropies
        ):
            top5_probs = [p for _, p in feat.top5_tokens[feat.z8_states.index(z8)]]
            logits_summary = torch.tensor([entropy] + top5_probs[:5])

            with torch.no_grad():
                score = predictor.compute_oos_score(
                    z8.squeeze(0).unsqueeze(0),
                    z12.squeeze(0).unsqueeze(0),
                    logits_summary.unsqueeze(0)
                )
            scores.append(score.item())

        oos_scores.append(np.mean(scores))

    df['oos_score'] = oos_scores

    # === STEP 4: Paraphrase consistency (TF-IDF) ===
    print("\nComputing paraphrase consistency...")
    vectorizer = TfidfVectorizer()
    embeddings = vectorizer.fit_transform(df['llm_output']).toarray()
    embeddings = normalize(embeddings)
    cos_sim_matrix = cosine_similarity(embeddings)

    # Group by category and compute consistency
    paraphrase_stats = []
    for idx, row in df.iterrows():
        category = row['category']
        same_category = df[df['category'] == category].index.tolist()

        if len(same_category) > 1:
            sims = [cos_sim_matrix[idx, j] for j in same_category if j != idx]
            paraphrase_stats.append({
                'mean_sim': np.mean(sims),
                'std_sim': np.std(sims),
                'min_sim': np.min(sims),
            })
        else:
            paraphrase_stats.append({'mean_sim': 1.0, 'std_sim': 0.0, 'min_sim': 1.0})

    df['paraphrase_mean_sim'] = [s['mean_sim'] for s in paraphrase_stats]
    df['paraphrase_std_sim'] = [s['std_sim'] for s in paraphrase_stats]
    df['paraphrase_min_sim'] = [s['min_sim'] for s in paraphrase_stats]

    return df, predictor, all_features


# === RUN ANALYSIS ===
df_results, predictor, all_features = run_full_hallucination_analysis(
    model, tokenizer,
    "AI_Bullshit_Detector/data/paraphase-prompts.csv"
)

Extracting internal features...


Processing:   4%|▍         | 4/100 [00:23<09:17,  5.81s/it]

## Cosine similary of paraphrased sentences --> used in testing the Iinternal representation model (IRM) and output.

In [31]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# --- 1. Load CSV ---
#!rm -rf AI_Bullshit_Detector
# Clone the repo
!git clone https://github.com/AvdMei/AI_Bullshit_Detector
df = pd.read_csv("AI_Bullshit_Detector/data/paraphase-prompts.csv")
print(df.head())
assert "category" in df.columns and "question" in df.columns
# --- 2. Load Liquid AI model and tokenizer ---
model_id = "LiquidAI/LFM2-1.2B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# --- 3. Templates ---
template_without_answer = "<|startoftext|><|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
# --- 4. Function to generate response ---
def generate_llm_output(question, max_tokens=200):
    prompt = template_without_answer.format(question=question)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False  # deterministic for cosine similarity
        )
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Remove the prompt part to get only the assistant answer
    if "<|im_start|>assistant" in output_text:
        answer = output_text.split("<|im_start|>assistant")[1].strip()
    else:
        answer = output_text
    return answer
# --- 5. Generate LLM outputs ---
llm_outputs = []
for q in tqdm(df['question'], desc="Generating outputs"):
    try:
        out = generate_llm_output(q)
    except Exception as e:
        out = f"ERROR: {e}"
    llm_outputs.append(out)
df['llm_output'] = llm_outputs
# --- 6. Convert outputs to embeddings using TF-IDF ---
vectorizer = TfidfVectorizer()
embeddings = vectorizer.fit_transform(df['llm_output']).toarray()
embeddings = normalize(embeddings)
# --- 7. Compute cosine similarity ---
cos_sim_matrix = cosine_similarity(embeddings)
# --- 8. Save similarity matrix to CSV ---
sim_df = pd.DataFrame(cos_sim_matrix, index=df['question'], columns=df['question'])
sim_df.to_csv("AI_Bullshit_Detector/data/llm_output_cosine_similarity.csv")
print("Cosine similarity matrix saved to data/llm_output_cosine_similarity.csv")
# --- 9. Save outputs alongside prompts ---
df.to_csv("AI_Bullshit_Detector/data/paraphases_with_llm_output.csv", index=False)
print("LLM outputs saved to data/paraphases_with_llm_output.csv")

Cloning into 'AI_Bullshit_Detector'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (139/139), done.[K
remote: Compressing objects: 100% (109/109), done.[K
remote: Total 139 (delta 46), reused 92 (delta 16), pack-reused 0 (from 0)[K
Receiving objects: 100% (139/139), 457.96 KiB | 7.27 MiB/s, done.
Resolving deltas: 100% (46/46), done.
                           category  \
0  Software Engineering - Debugging   
1  Software Engineering - Debugging   
2  Software Engineering - Debugging   
3  Software Engineering - Debugging   
4  Software Engineering - Debugging   

                                            question  
0  What is the first step an experienced engineer...  
1  When a bug is reported, what should be done fi...  
2  What is the initial action a senior engineer t...  
3  Before changing code to fix a bug, what should...  
4  What is the most important first step in a pro...  


`torch_dtype` is deprecated! Use `dtype` instead!
Generating outputs: 100%|██████████| 100/100 [07:49<00:00,  4.70s/it]

Cosine similarity matrix saved to data/llm_output_cosine_similarity.csv
LLM outputs saved to data/paraphases_with_llm_output.csv





Liquid AI's LFM2-1.2B