In [None]:
!pip install wikipedia

In [None]:
import wikipedia
import torch
import torch.nn.functional as F
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer

class HallucinationDetector(object):
    HUGGINGFACE_TOKEN = ""
    MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
    N = 50
    TEMP = 0.8
    TOP_K = 50

    def __init__(self):
        self.model, self.tokenizer = self.load_model()

    def load_model(self):
        model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_NAME,
            torch_dtype="auto",
            device_map="auto",
            token=self.HUGGINGFACE_TOKEN
        )
        tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        return model, tokenizer

    def wikipedia_checker(self, fragment):
        """
        Usa a Wikipedia para checar se um fragmento aparece em algum artigo.
        Retorna resumo do primeiro match.
        """
        try:
            answer = wikipedia.search(fragment)
            if not answer:
                return None
            return wikipedia.summary(answer[0])
        except Exception:
            return None

    def get_logprobs(self, text: str):
        """
        Retorna os logprobs token-a-token e o total da sequência.
        """
        self.model.eval()
        enc = self.tokenizer(text, return_tensors="pt").to(self.model.device)
        input_ids = enc["input_ids"]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        log_probs = F.log_softmax(logits, dim=-1)

        ids = input_ids[0]
        per_token_logprobs = []

        for i in range(1, ids.size(0)):
            token_id = ids[i].item()
            logp = log_probs[0, i-1, token_id].item()
            per_token_logprobs.append(logp)

        total_logprob = sum(per_token_logprobs)

        return {
            "tokens": self.tokenizer.convert_ids_to_tokens(ids),
            "per_token_logprobs": per_token_logprobs,
            "total_logprob": total_logprob,
            "sequence_probability": float(torch.exp(torch.tensor(total_logprob)))
        }

    def monte_carlo(self, prompt):
        """
        Executa N amostragens para o mesmo prompt, coletando respostas
        e estatísticas de probabilidade.
        """
        self.model.eval()
        results = []

        for i in range(self.N):
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
            output_ids = self.model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=True,
                temperature=self.TEMP,
                top_k=self.TOP_K
            )

            answer = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
            stats = self.get_logprobs(answer)

            results.append({
                "answer": answer,
                "tokens": stats["tokens"],
                "per_token_logprobs": stats["per_token_logprobs"],
                "total_logprob": stats["total_logprob"],
                "sequence_probability": stats["sequence_probability"]
            })

        return results

    def detect_hallucinations(self, result, epsilon=1e-10):
        """
        Marca tokens cuja probabilidade está abaixo de um limiar epsilon.
        Opcionalmente, compara com Wikipedia para validar factualidade.
        """
        tokens = result["tokens"]
        logprobs = result["per_token_logprobs"]
        probs = [float(torch.exp(torch.tensor(lp))) for lp in logprobs]

        hallucinated = []
        for tok, p in zip(tokens[1:], probs):  # ignora token inicial
            if p < epsilon:
                # checar factualidade simples (heurística)
                wiki_summary = self.wikipedia_checker(tok)
                if wiki_summary is None:
                    hallucinated.append((tok, p))

        return hallucinated

    def analyze_results(self, results, epsilon=1e-4):
        """
        Estatísticas agregadas sobre as N amostras.
        """
        total_logprobs = [r["total_logprob"] for r in results]
        seq_probs = [r["sequence_probability"] for r in results]

        mean_logprob = np.mean(total_logprobs)
        var_logprob = np.var(total_logprobs)
        mean_prob = np.mean(seq_probs)

        hallucinations = [self.detect_hallucinations(r, epsilon) for r in results]
        hallucination_counts = [len(h) for h in hallucinations]

        return {
            "mean_logprob": mean_logprob,
            "var_logprob": var_logprob,
            "mean_seq_prob": mean_prob,
            "hallucination_rate": np.mean([c > 0 for c in hallucination_counts]),
            "hallucinations": hallucinations
        }


In [3]:
detector = HallucinationDetector()

prompt = "Who won the Nobel Prize in Physics in 2020?"
results = detector.monte_carlo(prompt)

analysis = detector.analyze_results(results, epsilon=1e-4)

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [4]:
print("Taxa de hallucinations:", analysis["hallucination_rate"])

Taxa de hallucinations: 0.0


In [5]:
print("Exemplos de tokens suspeitos:", analysis["hallucinations"][:3])

Exemplos de tokens suspeitos: [[], [], []]


In [6]:
import pandas as pd

class ExperimentPipeline:
    def __init__(self, detector):
        self.detector = detector
        # prompts de exemplo — tu podes expandir com base no paper
        self.prompts = {
            "science": [
                "What is quantum entanglement?",
                "Explain the theory of evolution in simple terms."
            ],
            "technology": [
                "Who invented the internet?",
                "What is blockchain and how does it work?"
            ],
            "history": [
                "Who was the first emperor of Rome?",
                "What triggered World War I?"
            ],
            "everyday": [
                "What are the health benefits of drinking coffee?",
                "How does public transport reduce pollution?"
            ]
        }

    def run(self, epsilon=1e-4):
        summary = []

        for domain, prompts in self.prompts.items():
            total_outputs = 0
            total_hallucinations = 0

            for prompt in prompts:
                results = self.detector.monte_carlo(prompt)
                analysis = self.detector.analyze_results(results, epsilon=epsilon)

                total_outputs += len(results)
                # conta quantas respostas tiveram pelo menos 1 hallucination
                total_hallucinations += sum([len(h) > 0 for h in analysis["hallucinations"]])

            summary.append({
                "domain": domain,
                "total_prompts": len(prompts),
                "total_outputs": total_outputs,
                "hallucinations_detected": total_hallucinations,
                "percentage": (total_hallucinations / total_outputs) * 100
            })

        return pd.DataFrame(summary)


In [7]:
detector = HallucinationDetector()
pipeline = ExperimentPipeline(detector)

df_results = pipeline.run(epsilon=1e-4)

df_results

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,domain,total_prompts,total_outputs,hallucinations_detected,percentage
0,science,2,100,50,50.0
1,technology,2,100,50,50.0
2,history,2,100,50,50.0
3,everyday,2,100,0,0.0
