In [1]:
import spacy
import torch
import numpy as np
from sentence_transformers import CrossEncoder, SentenceTransformer, util


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1) NLI cross-encoder
nli_model = CrossEncoder("cross-encoder/nli-deberta-v3-base")

# 2) SBERT for topic embeddings
topic_model = SentenceTransformer("all-MiniLM-L6-v2")

# 3) spaCy for sentence splitting
nlp = spacy.load("en_core_web_sm")

In [5]:
def compute_semantic_entailment(source: str, summary: str) -> float:
    """
    Returns ScoreNLI = max_{s∈src_sents, h∈sum_sents}( P_entailment - P_contradiction )
    using cross-encoder/nli-deberta-v3-base.
    Scores range in [-1,1]: +1 = strong entailment, -1 = strong contradiction.
    """
    # 1. split into sentences
    src_sents  = [sent.text for sent in nlp(source).sents]
    sum_sents  = [sent.text for sent in nlp(summary).sents]

    best_score = -1.0  # worst-case
    for prem in src_sents:
        for hyp in sum_sents:
            # get logits for [contradiction, neutral, entailment]
            logits = nli_model.predict([(prem, hyp)])[0]
            probs  = torch.softmax(torch.tensor(logits), dim=0).numpy()
            p_contradict, p_neutral, p_entail = probs
            score = p_entail - p_contradict
            if score > best_score:
                best_score = score

    return best_score


def compute_topic_drift(source: str, summary: str) -> float:
    """
    Returns TopicDrift = 1 - cosine( embed(source), embed(summary) )
    where embeddings come from all-MiniLM-L6-v2.
    Drift in [0,2], but practically in [0,1].
    """
    emb_src = topic_model.encode(source,  convert_to_tensor=True)
    emb_sum = topic_model.encode(summary, convert_to_tensor=True)
    cos_sim = util.cos_sim(emb_src, emb_sum).item()
    drift   = 1.0 - cos_sim
    return drift


In [6]:
if __name__ == "__main__":
    example_src = (
        "The visitors led briefly through Vasil Lobzhanidze's early try, "
        "but Scotland raced ahead ... and got their reward."
    )
    example_sum = "Scotland dominated after an early Georgian try and ran out convincing winners."

    nli_score  = compute_semantic_entailment(example_src, example_sum)
    drift_score = compute_topic_drift(example_src, example_sum)

    print(f"Semantic Entailment Score: {nli_score:.3f}  (+1=entail, -1=contra)")
    print(f"Topic Drift Score:         {drift_score:.3f}  (0=on-topic, 1=off-topic)")

Semantic Entailment Score: 0.978  (+1=entail, -1=contra)
Topic Drift Score:         0.452  (0=on-topic, 1=off-topic)


In [9]:
import spacy
nlp = spacy.load("en_core_web_sm")
def extract_docs():
    from datasets import load_dataset
    xsum = load_dataset("EdinburghNLP/xsum")  # :contentReference[oaicite:1]{index=1}
    docs = {ex["id"]: ex["document"] for split in xsum.values()  # :contentReference[oaicite:2]{index=2}
            for ex in split}
    return docs
import pandas as pd
factual_df = pd.read_csv("../Data/factuality_annotations_xsum_summaries.csv")

df = factual_df.sample(
    frac=1, random_state=42
).reset_index(drop=True)
train_df = df.iloc[:500]

docs = extract_docs()

In [10]:
import pandas as pd
from tqdm.auto import tqdm

# Assume you already have:
# - `train_df` with columns ["bbcid", "summary", "system", "is_factual"]
# - `docs` dict mapping BBC IDs to source texts
# - compute_semantic_entailment and compute_topic_drift defined

# Prepare storage
records = []

for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Scoring"):
    bbcid   = str(row["bbcid"])
    src     = docs[bbcid]
    summ    = row["summary"]

    se_score = compute_semantic_entailment(src, summ)
    td_score = compute_topic_drift(src, summ)

    records.append({
        "bbcid":              bbcid,
        "system":             row["system"],
        "gold_is_factual":    row["is_factual"],
        "semantic_entailment": se_score,
        "topic_drift":         td_score,
    })

# Build a DataFrame
scores_df = pd.DataFrame(records)

# Quick look
print(scores_df[["semantic_entailment","topic_drift"]].describe())

# Save to CSV if you like
scores_df.to_csv("xsum_nli_topic_scores.csv", index=False)


Scoring: 100%|██████████| 500/500 [07:38<00:00,  1.09it/s]


       semantic_entailment  topic_drift
count           500.000000   500.000000
mean              0.940750     0.485253
std               0.305933     0.135311
min              -0.999059     0.137819
25%               0.997383     0.383937
50%               0.998993     0.470322
75%               0.999499     0.578487
max               0.999781     0.990211
