In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_trf

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModelForSeq2SeqLM

from sentence_transformers import CrossEncoder
import spacy
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [3]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification

In [4]:
import spacy


nlp = spacy.load("en_core_web_trf")

In [5]:
import numpy as np
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

def compute_coherence(text, window_size=2):
    sentences = text.split(". ")
    if len(sentences) < 2:
        return 1

    embeddings = model.encode(sentences, convert_to_numpy=True)

    similarities = []
    for i in range(len(sentences) - window_size + 1):
        window_embs = embeddings[i : i + window_size]
        for j in range(len(window_embs) - 1):
            sim = 1 - cosine(window_embs[j], window_embs[j + 1])
            similarities.append(sim)

    avg_similarity = np.mean(similarities) if similarities else 0.0
    return float(avg_similarity)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
import spacy

def analyze_text_complexity(text):
    doc = nlp(text)

    total_words = len([token.text for token in doc if token.is_alpha])
    total_sentences = len(list(doc.sents))
    total_chars = sum(len(token.text) for token in doc)
    unique_words = len(set(token.text.lower() for token in doc if token.is_alpha))

    content_words_noun = [token for token in doc if token.pos_ in ["NOUN"]]
    content_words_verb = [token for token in doc if token.pos_ in ["VERB"]]
    content_words_adv = [token for token in doc if token.pos_ in ["ADV"]]
    content_words_adj = [token for token in doc if token.pos_ in ["ADJ"]]

    lexical_density_noun = round(len(content_words_noun) / total_words,4) if total_words > 0 else 0
    lexical_density_verb = round(len(content_words_verb) / total_words,4) if total_words > 0 else 0
    lexical_density_adv = round(len(content_words_adv) / total_words,4) if total_words > 0 else 0
    lexical_density_adj = round(len(content_words_adj) / total_words,4) if total_words > 0 else 0

    ttr = unique_words / total_words if total_words > 0 else 0
    avg_word_length = total_chars / total_words if total_words > 0 else 0
    avg_sentence_length = total_words / total_sentences if total_sentences > 0 else 0

    max_depth = max([token.i - token.head.i for token in doc if token.head != token]) if doc else 0

    clause_count = sum(1 for token in doc if token.dep_ in {"conj", "ccomp", "advcl"})

    named_entities = len(doc.ents)

    coherence = compute_coherence(text,window_size=2)

    return {
        "Lexical Density Noun": round(lexical_density_noun,4),
        "Lexical Density Verb": round(lexical_density_verb,4),
        "Lexical Density Adverb": round(lexical_density_adv,4),
        "Lexical Density Adjective": round(lexical_density_adj,4),
        "Type-Token Ratio": round(ttr,4),
        "Average Word Length": round(avg_word_length,4),
        "Average Sentence Length": round(avg_sentence_length,4),
        "Max Syntactic Depth": max_depth,
        "Clauses per Sentence": round(clause_count / total_sentences,4) if total_sentences > 0 else 0,
        "Named Entity Count": named_entities,
        'Coherence':round(coherence,4)}


In [7]:
import pandas as pd

# Execution

In [8]:
from datasets import load_dataset
rag_dataset = load_dataset("neural-bridge/rag-dataset-1200")


In [9]:
import torch

In [11]:
def tokenize_text(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc if token.is_alpha and not token.is_stop and token.pos_ in ['NOUN','VERB','ADJ','ADV']]
def extract_entities(text):
    doc = nlp(text)
    return set(ent.text for ent in doc.ents)

def compute_token_overlap(context, answer):
    context_tokens = set(tokenize_text(context))
    answer_tokens = set(tokenize_text(answer))

    context_answer_overlap = len(context_tokens.intersection(answer_tokens)) / len(answer_tokens) if len(answer_tokens) > 0 else 0

    return context_answer_overlap

def improved_factual_matching(context, answer):

    token_overlap_score = compute_token_overlap(context, answer)


    context_entities = extract_entities(context)

    answer_entities = extract_entities(answer)
    entity_overlap = len(context_entities.intersection(answer_entities)) / len(answer_entities) if len(answer_entities) > 0 else 0

    return {
        'token_overlap_score': token_overlap_score,
        'entity_overlap': entity_overlap,
    }

# Example usage
context = rag_dataset['train'][125]['context']
question = rag_dataset['train'][125]['question']
answer = rag_dataset['train'][125]['answer']

matching_results = improved_factual_matching(context, answer)



In [12]:
matching_results

{'token_overlap_score': 1.0, 'entity_overlap': 1.0}

In [13]:
analyze_text_complexity(context)

{'Lexical Density Noun': 0.2371,
 'Lexical Density Verb': 0.1267,
 'Lexical Density Adverb': 0.0436,
 'Lexical Density Adjective': 0.0599,
 'Type-Token Ratio': 0.4823,
 'Average Word Length': 4.9605,
 'Average Sentence Length': 17.0698,
 'Max Syntactic Depth': 31,
 'Clauses per Sentence': 1.093,
 'Named Entity Count': 60,
 'Coherence': 0.3815}

In [14]:
analyze_text_complexity(answer)

{'Lexical Density Noun': 0.2,
 'Lexical Density Verb': 0.08,
 'Lexical Density Adverb': 0.04,
 'Lexical Density Adjective': 0.0,
 'Type-Token Ratio': 0.88,
 'Average Word Length': 5.04,
 'Average Sentence Length': 25.0,
 'Max Syntactic Depth': 20,
 'Clauses per Sentence': 1.0,
 'Named Entity Count': 4,
 'Coherence': 1}

In [15]:
context



In [16]:
answer

'"Total Eclipse of the Heart" was originally written as a vampire love song and was featured in Jim Steinman’s Broadway musical, Dance of the Vampires.'