In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")
def extract_docs():
    from datasets import load_dataset
    xsum = load_dataset("EdinburghNLP/xsum")  # :contentReference[oaicite:1]{index=1}
    docs = {ex["id"]: ex["document"] for split in xsum.values()  # :contentReference[oaicite:2]{index=2}
            for ex in split}
    return docs
import pandas as pd
factual_df = pd.read_csv("../Data/factuality_annotations_xsum_summaries.csv")

docs = extract_docs()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_svo_triples(text: str):
    """
    - Sentence‐level SVO
    - Verbs: ROOT, ccomp, conj (VERB or AUX)
    - Subjects: nsubj/nsubjpass (drop pronouns)
    - Objects: dobj/attr/dative + first pobj under prep + ccomp handled separately
    - Expand every object via its subtree
    - Drop objects whose text starts with "their "
    - Dedupe via a set
    """
    doc = nlp(text)
    triples = set()

    for sent in doc.sents:
        # 1) Collect candidate verbs
        verbs = [t for t in sent
                 if t.pos_ in ("VERB","AUX") and t.dep_ in ("ROOT","ccomp","conj")]
        # plus conjuncts of those
        for v in list(verbs):
            verbs.extend([c for c in v.conjuncts if c.pos_ in ("VERB","AUX")])

        for verb in verbs:
            # 2) Subjects
            subs = [c for c in verb.children
                    if c.dep_ in ("nsubj","nsubjpass") and c.pos_!="PRON"]
            # inherit for conj w/o own subs
            if not subs and verb.dep_=="conj":
                subs = [c for c in verb.head.children
                        if c.dep_ in ("nsubj","nsubjpass") and c.pos_!="PRON"]
            if not subs:
                continue

            # 3) Objects
            objs = []
            for c in verb.children:
                if c.dep_ in ("dobj","attr","dative"):
                    objs.append(c)
                if c.dep_ == "prep":
                    # first pobj under this prep
                    for pobj in c.children:
                        if pobj.dep_ == "pobj":
                            objs.append(pobj)
                            break
                # handle clausal complements as separate SVOs
                if c.dep_ == "ccomp":
                    nested = extract_svo_triples(c.text)
                    for (ns,nv,no) in nested:
                        for sub in subs:
                            triples.add((sub.text, verb.lemma_, f"{nv} {no}"))

            if not objs:
                continue

            # 4) Build and filter triples
            for sub in subs:
                subj_text = sub.text
                for obj in objs:
                    if obj.pos_ in ("PRON","NUM"):
                        continue
                    # full subtree span
                    toks = list(obj.subtree)
                    span = sent[toks[0].i - sent.start : toks[-1].i - sent.start + 1]
                    obj_text = span.text
                    if obj_text.lower().startswith("their "):
                        continue
                    triples.add((subj_text, verb.lemma_, obj_text))

    return list(triples)

# Sanity‐check on your example:
src = docs[str(factual_df.loc[10, "bbcid"])]
for sent in nlp(src).sents:
    print("SENTENCE:", sent.text)
    print("→", extract_svo_triples(sent.text))
    print()



SENTENCE: Former Dons midfielder Sheerin, 39, has been player-manager at the Red Lichties since 2010 and replaces Neil Cooper at Pittodrie.

→ [('Sheerin', 'replace', 'Neil Cooper'), ('Sheerin', 'be', 'player-manager at the Red Lichties'), ('Sheerin', 'replace', 'Pittodrie')]

SENTENCE: Arbroath were relegated to Scottish League Two after finishing season 2013-14 bottom of League One.

→ [('Arbroath', 'relegate', 'Scottish League')]

SENTENCE: Aberdeen chief executive Duncan Fraser said boss Derek McInnes chose Sheerin after "a comprehensive process".

→ [('McInnes', 'choose', 'a comprehensive process'), ('McInnes', 'choose', 'Sheerin')]

SENTENCE: Speaking on the club's website, Fraser added: "Despite wishing to retain Paul's services, Arbroath chairman John Christison was good to deal with and completely appreciated Paul's desire to come back to Aberdeen."
Sheerin played for several Scottish clubs and also featured for Östersunds in Sweden before moving into coaching.
→ [('Sheerin', 

In [3]:
def extract_entities(text: str):
    """
    Returns the list of entity strings found in text.
    """
    doc = nlp(text)
    return [ent.text for ent in doc.ents]

In [4]:
from sentence_transformers import SentenceTransformer, util
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [5]:
def match_ent(e_summ: str, e_src: str, thresh=0.8) -> bool:
    if e_summ == e_src:
        return True
    emb1 = embedder.encode(e_summ, convert_to_tensor=True)
    emb2 = embedder.encode(e_src, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item() >= thresh

def match_verb(v_summ: str, v_src: str, thresh=0.8) -> bool:
    if v_summ == v_src:
        return True
    emb1 = embedder.encode(v_summ, convert_to_tensor=True)
    emb2 = embedder.encode(v_src, convert_to_tensor=True)
    return util.cos_sim(emb1, emb2).item() >= thresh

In [None]:
import json
import pandas as pd

def compute_fact_score(summary: str, source: str, alpha: float = 0.5, 
                       ent_thresh: float = 0.8, verb_thresh: float = 0.8):
    # 1. Extract
    summ_ents   = extract_entities(summary)
    summ_svos   = extract_svo_triples(summary)
    source_svos = extract_svo_triples(source)

    # 2. Entity precision
    unsupported_entities = []
    sup_ents = 0
    for ent in summ_ents:
        # does this entity match any subject or object in source_svos?
        found = any(
            match_ent(ent, subj, ent_thresh) or match_ent(ent, obj, ent_thresh)
            for subj, _, obj in source_svos
        )
        if found:
            sup_ents += 1
        else:
            unsupported_entities.append(ent)
    ent_prec = sup_ents / len(summ_ents) if summ_ents else 1.0

    # 3. Triple precision
    unsupported_triples = []
    sup_trips = 0
    for s_s, v_s, o_s in summ_svos:
        # look for ANY source triple that matches all three components
        match_found = any(
            match_ent(s_s, s_src, ent_thresh) and
            match_ent(o_s, o_src, ent_thresh) and
            match_verb(v_s, v_src, verb_thresh)
            for s_src, v_src, o_src in source_svos
        )
        if match_found:
            sup_trips += 1
        else:
            unsupported_triples.append((s_s, v_s, o_s))
    tri_prec = sup_trips / len(summ_svos) if summ_svos else 0

    # 4. Combined FactScore (if you need it)
    fact_score = alpha * ent_prec + (1 - alpha) * tri_prec

    return {
        "fact_score":           fact_score,
        "entity_precision":     ent_prec,
        "triple_precision":     tri_prec,
        "unsupported_entities": unsupported_entities,
        "unsupported_triples":  unsupported_triples
    }
# def compute_fact_report(summary: str, source: str,
#                         ent_thresh: float = 0.8,
#                         verb_thresh: float = 0.8):
#     summ_ents   = extract_entities(summary)
#     summ_svos   = extract_svo_triples(summary)
#     source_svos = extract_svo_triples(source)

#     # 1) Find unsupported entities
#     unsupported_entities = [
#         ent for ent in summ_ents
#         if not any(match_ent(ent, subj, ent_thresh) or
#                    match_ent(ent, obj, ent_thresh)
#                    for subj,_,obj in source_svos)
#     ]

#     # 2) Find unsupported triples
#     unsupported_triples = []
#     for s_s, v_s, o_s in summ_svos:
#         if not any(
#             match_ent(s_s, s_src, ent_thresh)
#             and match_ent(o_s, o_src, ent_thresh)
#             and match_verb(v_s, v_src, verb_thresh)
#             for s_src, v_src, o_src in source_svos
#         ):
#             unsupported_triples.append({
#                 "triple": f"({s_s}, {v_s}, {o_s})",
#                 "status": "unsupported"
#             })

#     return {
#         "entity_hallucinations": unsupported_entities,
#         "unsupported_triples":  unsupported_triples
#     }


# Example: run on one row
row = factual_df.iloc[5552]

src = docs[str(row["bbcid"])]
summary = row["summary"]

summ_ents   = extract_entities(summary)
summ_svos   = extract_svo_triples(summary)
source_svos = extract_svo_triples(src)


print("SOURCE ", src)
print("*****************")
print("SUMMARY ", summary)
print("*****************")

print("SUMMARY_ents ", summ_ents)
print("*****************")
print("SUMMARY_svos ", summ_svos)
print("*****************")
print("SOURCE_svos ", source_svos)



res = compute_fact_score(summary, src, alpha=0.6)
print(json.dumps(res, indent=2))
print("***************************************************")

SOURCE  Media playback is not supported on this device
The visitors led briefly through Vasil Lobzhanidze's early try, but the Scots raced ahead through Tommy Seymour, a penalty try, Sean Maitland and Stuart Hogg before the break.
Hamish Watson extended the lead, battling over soon after half-time.
Lobzhanidze bagged a second from a huge Georgian scrum, before Hogg finished off a searing counter-attack.
Before the Scottish points deluge, it was the Georgians who opened the scoring. It was a soft one, with Lobzhanidze, the visitors' scrum-half, darting down the blind-side of a scrum where Seymour should have halted him, but didn't.
The wing made amends soon after when Hogg dinked a kick behind the Georgian defence for Seymour to run on to. In the foot-race, Seymour beat Lobzhanidze to the touch-down. It was questionable that Seymour got downward pressure on the ball, but the try was given and Scotland went ahead when Laidlaw swept over the conversion from the touchline.
The floodgates b

In [18]:
from tqdm.auto import tqdm

results = []



df = factual_df.sample(
    frac=1, random_state=42
).reset_index(drop=True)
train_df = df.iloc[:500]

for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Fact‐Validation"):
    src = docs[str(row["bbcid"])]
    out = compute_fact_score(row["summary"], src, alpha=0.6)
    out["bbcid"] = row["bbcid"]
    out["system"] = row["system"]
    out["summary"] = row["summary"]
    out["source"] = src
    out["gold_is_factual"] = row["is_factual"]
    results.append(out)

factval_df = pd.DataFrame(results)
# Save for later analysis
factval_df.to_csv("xsum_fact_validation.csv", index=False)

Fact‐Validation: 100%|██████████| 500/500 [06:34<00:00,  1.27it/s]


In [40]:
factval_df

Unnamed: 0,fact_score,entity_precision,triple_precision,unsupported_entities,unsupported_triples,bbcid,system,summary,source,gold_is_factual
0,0.00,0.000000,0.0,"[scotland women's, 2017, friday, anna evans]","[(qualifier, be, a "" humiliating "" chance to r...",36418721,BERTS2S,scotland women's euro 2017 qualifier against i...,The Scots have won all five qualifiers to date...,no
1,0.60,0.333333,1.0,"[georgia, first]",[],38097669,PtGen,scotland's hopes of reaching the cyprus cup qu...,Media playback is not supported on this device...,no
2,0.45,0.750000,0.0,[jenson],"[(alonso, win, a thrilling hungarian grand prix)]",18944949,BERTS2S,ferrari's fernando alonso won a thrilling hung...,Media playback is not supported on this device...,no
3,0.60,1.000000,0.0,[],"[(right, scrap, a bid to boost the housing mar...",19311364,TranS2S,the right to be built in social housing should...,Downing Street backed a report by think tank P...,no
4,1.00,1.000000,1.0,[],[],35428466,TranS2S,writer william mcilvanney is to be given the f...,"The Â£45m facility, to be built at Sutherland ...",no
...,...,...,...,...,...,...,...,...,...,...
495,0.60,1.000000,0.0,[],"[(negotiator, urge, the government)]",38345826,PtGen,the eu's brexit negotiator has urged the gover...,It also warns that Britain may have to allow E...,no
496,0.00,0.000000,0.0,"[11-year-old, canada]","[(boy, die, a house fire in canada)]",40634994,TranS2S,an 11-year-old boy has died in a house fire in...,Marcy Smith was woken up by her son David to f...,no
497,0.00,0.000000,0.0,"[16-year-old, first, british]","[(boy, become, the first teenager to play taek...",39186557,TranS2S,a 16-year-old boy has become the first teenage...,"Luke, 18, is one of 14 successful candidates f...",no
498,0.00,0.000000,0.0,"[19-year-old, indian, indian]","[(student, shoot, a school in the southern ind...",32143053,TranS2S,a 19-year-old indian student has been shot dea...,The shooting occurred at a hostel attached to ...,no


In [31]:
factval_df[factval_df["fact_score"]> 0.5]["bbcid"].iloc[0]

np.int64(38097669)

In [27]:
factval_df[factval_df["fact_score"]> 0.5]["source"].iloc[0]

'Media playback is not supported on this device\nThe visitors led briefly through Vasil Lobzhanidze\'s early try, but the Scots raced ahead through Tommy Seymour, a penalty try, Sean Maitland and Stuart Hogg before the break.\nHamish Watson extended the lead, battling over soon after half-time.\nLobzhanidze bagged a second from a huge Georgian scrum, before Hogg finished off a searing counter-attack.\nBefore the Scottish points deluge, it was the Georgians who opened the scoring. It was a soft one, with Lobzhanidze, the visitors\' scrum-half, darting down the blind-side of a scrum where Seymour should have halted him, but didn\'t.\nThe wing made amends soon after when Hogg dinked a kick behind the Georgian defence for Seymour to run on to. In the foot-race, Seymour beat Lobzhanidze to the touch-down. It was questionable that Seymour got downward pressure on the ball, but the try was given and Scotland went ahead when Laidlaw swept over the conversion from the touchline.\nThe floodgates

In [34]:
factval_df[factval_df["fact_score"]> 0.5]["system"].iloc[0]

'PtGen'

In [37]:
factual_df[(factual_df['bbcid']==38097669) ][ (factual_df['system']=='PtGen') ]

  factual_df[(factual_df['bbcid']==38097669) ][ (factual_df['system']=='PtGen') ]


Unnamed: 0,bbcid,system,summary,is_factual,worker_id
5552,38097669,PtGen,scotland's hopes of reaching the cyprus cup qu...,no,wid_0
5553,38097669,PtGen,scotland's hopes of reaching the cyprus cup qu...,no,wid_1
5554,38097669,PtGen,scotland's hopes of reaching the cyprus cup qu...,no,wid_2
