In [52]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [53]:
!pip install fastcoref
# or for training:
!pip install tiktoken



In [54]:
import tiktoken
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from sentence_transformers import CrossEncoder
import spacy
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [55]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification

In [56]:
text = """The European Central Bank (ECB) is the central component of the Eurosystem and the European System of Central Banks (ESCB) as well as one of seven institutions of the European Union.[2] It is one of the world's most important central banks with a balance sheet total of around 7 trillion.[3]

The ECB Governing Council makes monetary policy for the Eurozone and the European Union, administers the foreign exchange reserves of EU member states, engages in foreign exchange operations, and defines the intermediate monetary objectives and key interest rate of the EU. The ECB Executive Board enforces the policies and decisions of the Governing Council, and may direct the national central banks when doing so.[4] The ECB has the exclusive right to authorise the issuance of euro banknotes. Member states can issue euro coins, but the volume must be approved by the ECB beforehand. The bank also operates the T2 (RTGS) payments system.

The ECB was established by the Treaty of Amsterdam in May 1999 with the purpose of guaranteeing and maintaining price stability. On 1 December 2009, the Treaty of Lisbon became effective and the bank gained the official status of an EU institution. When the ECB was created, it covered a Eurozone of eleven members. Since then, Greece joined in January 2001, Slovenia in January 2007, Cyprus and Malta in January 2008, Slovakia in January 2009, Estonia in January 2011, Latvia in January 2014, Lithuania in January 2015 and Croatia in January 2023.[5] The current president of the ECB is Christine Lagarde. Seated in Frankfurt, Germany, the bank formerly occupied the Eurotower prior to the construction of its new seat.

The ECB is directly governed by European Union law. Its capital stock, worth €11 billion, is owned by all 27 central banks of the EU member states as shareholders.[6] The initial capital allocation key was determined in 1998 on the basis of the states' population and GDP, but the capital key has been readjusted since.[6] Shares in the ECB are not transferable and cannot be used as collateral"""

In [57]:
from fastcoref import spacy_component
import spacy


nlp = spacy.load("en_core_web_trf")
nlp.add_pipe("fastcoref")


  model.load_state_dict(torch.load(filelike, map_location=device))


<fastcoref.spacy_component.spacy_component.FastCorefResolver at 0x7cdcb63dfe10>

In [58]:
import numpy as np
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

def compute_coherence(text, window_size=2):
    sentences = text.split(". ")
    if len(sentences) < 2:
        return 0.0

    embeddings = model.encode(sentences, convert_to_numpy=True)

    similarities = []
    for i in range(len(sentences) - window_size + 1):
        window_embs = embeddings[i : i + window_size]
        for j in range(len(window_embs) - 1):
            sim = 1 - cosine(window_embs[j], window_embs[j + 1])
            similarities.append(sim)

    avg_similarity = np.mean(similarities) if similarities else 0.0
    return avg_similarity

In [59]:
import spacy

def analyze_text_complexity(text):
    doc = nlp(text)

    total_words = len([token.text for token in doc if token.is_alpha])
    total_sentences = len(list(doc.sents))
    total_chars = sum(len(token.text) for token in doc)
    unique_words = len(set(token.text.lower() for token in doc if token.is_alpha))

    content_words_noun = [token for token in doc if token.pos_ in {"NOUN",}]
    content_words_verb = [token for token in doc if token.pos_ in {"VERB"}]
    content_words_adv = [token for token in doc if token.pos_ in {"ADV"}]
    content_words_adj = [token for token in doc if token.pos_ in {"ADJ"}]

    lexical_density_noun = round(len(content_words_noun) / total_words,4) if total_words > 0 else 0
    lexical_density_verb = round(len(content_words_verb) / total_words,4) if total_words > 0 else 0
    lexical_density_adv = round(len(content_words_adv) / total_words,4) if total_words > 0 else 0
    lexical_density_adj = round(len(content_words_adj) / total_words,4) if total_words > 0 else 0

    ttr = unique_words / total_words if total_words > 0 else 0
    avg_word_length = total_chars / total_words if total_words > 0 else 0
    avg_sentence_length = total_words / total_sentences if total_sentences > 0 else 0

    max_depth = max([token.i - token.head.i for token in doc if token.head != token]) if doc else 0

    clause_count = sum(1 for token in doc if token.dep_ in {"conj", "ccomp", "advcl"})

    named_entities = len(doc.ents)

    coherence = compute_coherence(text,window_size=2)

    return {
        "Lexical Density Noun": lexical_density_noun,
        "Lexical Density Verb": lexical_density_verb,
        "Lexical Density Adverb": lexical_density_adv,
        "Lexical Density Adjective": lexical_density_adj,
        "Type-Token Ratio": ttr,
        "Average Word Length": avg_word_length,
        "Average Sentence Length": avg_sentence_length,
        "Max Syntactic Depth": max_depth,
        "Clauses per Sentence": clause_count / total_sentences if total_sentences > 0 else 0,
        "Named Entity Count": named_entities,
        'Coherence':coherence}

metrics = analyze_text_complexity(text)
print(metrics)


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

{'Lexical Density Noun': 0.1881, 'Lexical Density Verb': 0.0846, 'Lexical Density Adverb': 0.0345, 'Lexical Density Adjective': 0.0627, 'Type-Token Ratio': 0.4952978056426332, 'Average Word Length': 5.382445141065831, 'Average Sentence Length': 16.789473684210527, 'Max Syntactic Depth': 40, 'Clauses per Sentence': 1.368421052631579, 'Named Entity Count': 56, 'Coherence': 0.5485425545302572}


In [60]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def extract_triplets(text):
    triplets = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    for token in text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        triplets.append({'head': subject.strip(), 'type': relation.strip(),'tail': object_.strip()})
    return triplets

# Load model and tokenizer


# Text to extract triplets from



In [61]:
def extract_facts(text):
  nlp = spacy.load("en_core_web_trf")
  nlp.add_pipe("fastcoref")

  prs = text.split('\n\n')
  new_p = []
  for p in prs:
    doc = nlp(p,component_cfg={"fastcoref": {'resolve_text': True}})
    new_p.append(doc._.resolved_text)
  all_snts=[]
  for p in new_p:
    doc = nlp(p)
    for s in doc.sents:
      all_snts.append(s.text)


  facts_all=[]
  i=0
  tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
  model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")
  for t in all_snts:
    i=i+1
    model_inputs = tokenizer(t, max_length=256, padding=True, truncation=True, return_tensors = 'pt')

    # Generate
    generated_tokens = model.generate(
        model_inputs["input_ids"].to(model.device),
        attention_mask=model_inputs["attention_mask"].to(model.device),
    )

    # Extract text
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)
    facts_sent = []
    # Extract triplets
    for idx, sentence in enumerate(decoded_preds):
        trip = extract_triplets(sentence)
        for tr in trip:
          facts_sent.append({'head':tr['head'],'type':tr['type'],'tail':tr['tail']})
    facts_all.append({f'sentence {i}':facts_sent})

  return {'facts':facts_all}


In [62]:
text

"The European Central Bank (ECB) is the central component of the Eurosystem and the European System of Central Banks (ESCB) as well as one of seven institutions of the European Union.[2] It is one of the world's most important central banks with a balance sheet total of around 7 trillion.[3]\n\nThe ECB Governing Council makes monetary policy for the Eurozone and the European Union, administers the foreign exchange reserves of EU member states, engages in foreign exchange operations, and defines the intermediate monetary objectives and key interest rate of the EU. The ECB Executive Board enforces the policies and decisions of the Governing Council, and may direct the national central banks when doing so.[4] The ECB has the exclusive right to authorise the issuance of euro banknotes. Member states can issue euro coins, but the volume must be approved by the ECB beforehand. The bank also operates the T2 (RTGS) payments system.\n\nThe ECB was established by the Treaty of Amsterdam in May 1

In [63]:
extract_facts(text)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

{'facts': [{'sentence 1': [{'head': 'European Central Bank',
     'type': 'part of',
     'tail': 'Eurosystem'},
    {'head': 'European Central Bank',
     'type': 'part of',
     'tail': 'European System of Central Banks'},
    {'head': 'Eurosystem',
     'type': 'has part',
     'tail': 'European Central Bank'},
    {'head': 'European System of Central Banks',
     'type': 'has part',
     'tail': 'European Central Bank'}]},
  {'sentence 2': [{'head': 'European Central Bank',
     'type': 'instance of',
     'tail': 'central bank'}]},
  {'sentence 3': [{'head': 'Eurozone',
     'type': 'shares border with',
     'tail': 'European Union'},
    {'head': 'European Union',
     'type': 'shares border with',
     'tail': 'Eurozone'}]},
  {'sentence 4': [{'head': 'ECB Executive Board',
     'type': 'part of',
     'tail': 'The ECB Governing Council'},
    {'head': 'The ECB Governing Council',
     'type': 'has part',
     'tail': 'ECB Executive Board'}]},
  {'sentence 5': [{'head': 'euro b

In [64]:
!pip install tiktoken



In [65]:
from datasets import load_dataset
rag_dataset = load_dataset("neural-bridge/rag-dataset-1200")


README.md:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

(…)-00000-of-00001-f0c158413defd454.parquet:   0%|          | 0.00/2.32M [00:00<?, ?B/s]

(…)-00000-of-00001-06d83c58a8ea10e8.parquet:   0%|          | 0.00/604k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/960 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/240 [00:00<?, ? examples/s]

In [66]:
import torch

In [67]:
def tokenize_text(text):
    doc = nlp(text,component_cfg={"fastcoref": {'resolve_text': True}})

    return [token.lemma_ for token in doc if token.is_alpha and not token.is_stop and token.pos_ in ['NOUN','VERB','ADJ','ADV']]
def extract_entities(text):
    doc = nlp(text)
    return set(ent.text for ent in doc.ents)

def compute_token_overlap(context, answer):
    context_tokens = set(tokenize_text(context))
    answer_tokens = set(tokenize_text(answer))

    context_answer_overlap = len(context_tokens.intersection(answer_tokens)) / len(answer_tokens) if len(answer_tokens) > 0 else 0

    return context_answer_overlap

def compute_nli_score(context, answer):
    model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    input = tokenizer(context, answer, truncation=True, return_tensors="pt")
    output = model(input["input_ids"].to('cpu'))  # device = "cuda:0" or "cpu"
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    label_names = ["entailment", "neutral", "contradiction"]
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}

    return prediction

def improved_factual_matching(context, answer):

    token_overlap_score = compute_token_overlap(context, answer)


    context_entities = extract_entities(context)

    answer_entities = extract_entities(answer)
    entity_overlap = len(context_entities.intersection(answer_entities)) / len(answer_entities) if len(answer_entities) > 0 else 0

    label = compute_nli_score(context, answer)

    return {
        'token_overlap_score': token_overlap_score,
        'entity_overlap': entity_overlap,
        'context_nli_label': label,
    }

# Example usage
context = rag_dataset['train'][68]['context']
question = rag_dataset['train'][68]['question']
answer = rag_dataset['train'][68]['answer']

matching_results = improved_factual_matching(context, answer)

# Display the results
print(f"Token Overlap Score: {matching_results['token_overlap_score']}")
print(f"Entity Overlap: {matching_results['entity_overlap']:.4f}")
print(f"NLI Label (Entailment/Contradiction): {matching_results['context_nli_label']}")


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Token Overlap Score: 1.0
Entity Overlap: 1.0000
NLI Label (Entailment/Contradiction): {'entailment': 89.1, 'neutral': 9.6, 'contradiction': 1.2}


In [68]:
rag_dataset['train'][68]['context'],rag_dataset['train'][68]['question'],rag_dataset['train'][68]['answer']

('@article{Griebel.Schweitzer:2002*3, key = {2001:yyy}, author = {M.~Griebel and M.~A. Schweitzer}, title = {A Particle-Partition of Unity Method---{P}art {III}: {A} Multilevel Solver}, institution = {Sonderforschungsbereich 256, Institut f\\"ur Angewandte Mathematik, Universit\\"at Bonn}, journal = {SIAM J. Sci. Comp.}, year = {2002}, volume = {24}, number = {2}, pages = {377--409}, note = {}, annote = {refereed article,256D}, ps = { 1}, pdf = { 1}, abstract = {In this paper we focus on the efficient solution of the linear block-systems arising from a Galerkin discretization of an elliptic partial differential equation of second order with the partition of unity method (PUM). We present a cheap multilevel solver for partition of unity discretizations of any order. The shape functions of a PUM are products of piecewise rational partition of unity (PU) functions and higher order local approximation functions (usually a local polynomial. Furthermore, they are non-interpolatory. In a mult