In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_trf
!pip install lexical_diversity
!pip install textstat
!pip install rapidfuzz

In [13]:
from sentence_transformers import CrossEncoder
import spacy
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [14]:
import spacy


nlp = spacy.load("en_core_web_trf")

In [15]:
import numpy as np
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

def compute_coherence(text, window_size=2):
    sentences = text.split(". ")
    if len(sentences) < 2:
        return 1

    embeddings = model.encode(sentences, convert_to_numpy=True)

    similarities = []
    for i in range(len(sentences) - window_size + 1):
        window_embs = embeddings[i : i + window_size]
        for j in range(len(window_embs) - 1):
            sim = 1 - cosine(window_embs[j], window_embs[j + 1])
            similarities.append(sim)

    avg_similarity = np.mean(similarities) if similarities else 0.0
    return float(avg_similarity)

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

In [16]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
import spacy
import numpy as np
from collections import Counter
from scipy.stats import entropy

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.text.lower() for token in doc if token.is_alpha]  # Only words
    return tokens

def calculate_ttr(tokens):
    """Calculates the Type-Token Ratio (TTR)."""
    return len(set(tokens)) / len(tokens) if tokens else 0

def calculate_mattr(tokens, window_size=50):
    """Calculates Moving-Average Type-Token Ratio (MATTR)."""
    if len(tokens) < window_size:
        return calculate_ttr(tokens)  # Default to TTR if text is too short

    mattr_scores = [
        len(set(tokens[i : i + window_size])) / window_size
        for i in range(len(tokens) - window_size + 1)
    ]
    return np.mean(mattr_scores)

def calculate_herdans_c(tokens):
    """Measure of lexical richness)."""
    V = len(set(tokens))  # Number of unique words
    N = len(tokens)  # Total words
    return np.log(V) / np.log(N) if N > 1 else 0

def calculate_shannon_entropy(tokens):
    """Word distribution uniformity."""
    word_counts = Counter(tokens)
    probabilities = np.array(list(word_counts.values())) / len(tokens)
    return entropy(probabilities, base=2)  # Use base-2 for bit-based entropy

import spacy
import numpy as np
from nltk.corpus import wordnet
from collections import defaultdict

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if token.is_alpha]  # Lemmatized words
    return tokens

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().lower())
    return synonyms

def calculate_semantic_diversity(tokens):
    synonym_groups = defaultdict(set)

    for word in set(tokens):  # Process unique words
        synonyms = get_synonyms(word)
        if synonyms:
            synonym_groups[frozenset(synonyms)].add(word)

    # Compute semantic diversity as the ratio of unique words to unique synonym groups
    unique_words = len(set(tokens))
    unique_synonym_groups = len(synonym_groups)

    return unique_synonym_groups / unique_words if unique_words else 0


In [18]:
import spacy
import numpy as np
import math
from collections import Counter
from lexical_diversity import lex_div as ld
from scipy.spatial.distance import cosine
from textstat import syllable_count

# Load spaCy model
nlp = spacy.load("en_core_web_trf")

import re
import nltk
from nltk.corpus import words
from string import punctuation

# Download necessary resources
nltk.download("words")

# Load English words
english_words = set(words.words())

def is_repetitive(word):
    """Detect words with excessive character repetition (e.g., 'aaaaaaa', 'hahahaha')"""
    return bool(re.match(r"^(.)\1{3,}$", word))

def has_uncommon_letter_combinations(word):
    """Check for unlikely letter combinations using bigram frequency"""
    common_bigrams = {"th", "he", "in", "er", "an", "re", "on", "at", "en", "nd"}  # Most frequent English bigrams
    bigrams = [word[i:i+2] for i in range(len(word)-1)]
    uncommon_count = sum(1 for bigram in bigrams if bigram not in common_bigrams)
    return uncommon_count / max(len(bigrams), 1) > 0.6  # More than 60% uncommon bigrams

def lacks_vowels(word):
    return not re.search(r"[aeiouy]", word, re.IGNORECASE)

def has_excessive_symbols(word):
    return sum(1 for char in word if char in punctuation or char.isdigit()) > len(word) * 0.4  # More than 40% non-alpha

def has_random_capitalization(word):
    return sum(1 for char in word if char.isupper()) not in {0, len(word)}  # Not all caps or all lowercase

def is_gibberish(word):
    if word.lower() in english_words:
        return False  # Valid dictionary word
    if is_repetitive(word) or has_uncommon_letter_combinations(word):
        return True
    if lacks_vowels(word) or has_excessive_symbols(word) or has_random_capitalization(word):
        return True
    return False

def gibberish_score(token):
    if is_gibberish(token):
        return 1  # This is likely gibberish
    return 0  # This is likely a valid word


def hdd(tokens):
    words = tokens
    return ld.hdd(words)

def entity_diversity(doc):
    entities = [ent.text.lower() for ent in doc.ents]
    return len(set(entities)) / len(entities) if entities else 0

def entity_type_diversity(doc):
    entity_types = [ent.label_ for ent in doc.ents]
    return len(set(entity_types)) / len(entity_types) if entity_types else 0

def pos_diversity(doc):
    pos_tags = [token.pos_ for token in doc]
    return len(set(pos_tags)) / len(pos_tags) if pos_tags else 0

def dependency_diversity(doc):
    dependencies = [token.dep_ for token in doc]
    return len(set(dependencies)) / len(dependencies) if dependencies else 0

def syntactic_pattern_diversity(doc):
    patterns = [" ".join([token.pos_ for token in sent]) for sent in doc.sents]
    return len(set(patterns)) / len(patterns) if patterns else 0

def lemmatization_diversity(doc):
    lemmas = [token.lemma_ for token in doc if token.is_alpha]
    return len(set(lemmas)) / len(lemmas) if lemmas else 0

def sentiment_diversity(doc):
    sentiments = [token.sentiment for token in doc]
    return np.std(sentiments).item() if len(sentiments) > 1 else 0

def analyze_text_complexity(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]

    total_words = len([token.text for token in doc if token.is_alpha])
    total_sentences = len(list(doc.sents))
    total_chars = sum(len(token.text) for token in doc)
    unique_words = len(set(token.text.lower() for token in doc if token.is_alpha))

    content_words_noun = [token for token in doc if token.pos_ in ["NOUN"]]
    content_words_verb = [token for token in doc if token.pos_ in ["VERB"]]
    content_words_adv = [token for token in doc if token.pos_ in ["ADV"]]
    content_words_adj = [token for token in doc if token.pos_ in ["ADJ"]]

    lexical_density_noun = round(len(content_words_noun) / total_words,4) if total_words > 0 else 0
    lexical_density_verb = round(len(content_words_verb) / total_words,4) if total_words > 0 else 0
    lexical_density_adv = round(len(content_words_adv) / total_words,4) if total_words > 0 else 0
    lexical_density_adj = round(len(content_words_adj) / total_words,4) if total_words > 0 else 0


    max_depth = max([token.i - token.head.i for token in doc if token.head != token]) if doc else 0

    clause_count = sum(1 for token in doc if token.dep_ in {"conj", "ccomp", "advcl"})


    coherence = compute_coherence(text,window_size=2)


    results = {'Giberish Score':np.mean([gibberish_score(token) for token in tokens]).item(),
        "Lexical Density Noun": round(lexical_density_noun,4),
        "Lexical Density Verb": round(lexical_density_verb,4),
        "Lexical Density Adverb": round(lexical_density_adv,4),
        "Lexical Density Adjective": round(lexical_density_adj,4),
        "Max Syntactic Depth": max_depth,
        "Clauses per Sentence": round(clause_count / total_sentences,4) if total_sentences > 0 else 0,
        'Coherence':round(coherence,4),
        "Lexical Diversity - Type-Token Ratio (TTR)": calculate_ttr(text),
        "Lexical Diversity - HD-D": hdd(text),
        "Entity Diversity": entity_diversity(doc),
        "Entity Type Diversity": entity_type_diversity(doc),
        "POS Diversity": pos_diversity(doc),
        "Dependency Structure Diversity": dependency_diversity(doc),
        "Syntactic Pattern Diversity": syntactic_pattern_diversity(doc),
        "Morphological Diversity - Lemmatization": lemmatization_diversity(doc),
        "Sentiment Diversity": sentiment_diversity(doc),
        'Semantic Diversity':calculate_semantic_diversity(tokens),
        'Shannon Entropy':calculate_shannon_entropy(tokens),
        'MATTRR':calculate_mattr(tokens),
        "Herdan's C":calculate_herdans_c(tokens)
    }

    return results

# Example usage
text = """Artificial intelligence is evolving rapidly. AI models, such as ChatGPT, are transforming the way we interact with technology.
However, ethical concerns about AI decision-making continue to grow. AI-powered systems are now being used in healthcare, finance, and law.
This raises questions about bias, fairness, and accountability."""
results = analyze_text_complexity(text)

# Print results
for key, value in results.items():
    print(f"{key}: {value:.4f}" if value is not None else f"{key}: Insufficient Data")


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Giberish Score: 0.3750
Lexical Density Noun: 0.3333
Lexical Density Verb: 0.1667
Lexical Density Adverb: 0.0625
Lexical Density Adjective: 0.0625
Max Syntactic Depth: 9.0000
Clauses per Sentence: 0.6667
Coherence: 0.3902
Lexical Diversity - Type-Token Ratio (TTR): 0.1054
Lexical Diversity - HD-D: 0.4545
Entity Diversity: 1.0000
Entity Type Diversity: 1.0000
POS Diversity: 0.2031
Dependency Structure Diversity: 0.2969
Syntactic Pattern Diversity: 1.0000
Morphological Diversity - Lemmatization: 0.8750
Sentiment Diversity: 0.0000
Semantic Diversity: 0.7660
Shannon Entropy: 5.2810
MATTRR: 0.7720
Herdan's C: 0.9258


**Use Cases for AI Text Evaluation**

1. Detecting Repetitive Language
AI text often has a low Type-Token Ratio (TTR), meaning it reuses words instead of using a rich vocabulary.
Low Syntactic Pattern Diversity means it follows the same sentence structures repeatedly.
2. Checking Bias & Overuse of Entities
Low Entity Type Diversity means the AI is mentioning too few types of things (e.g., always discussing "companies" but not "individuals" or "products").
AI models sometimes repeat the same named entities (e.g., ChatGPT often repeats phrases like "AI is transforming industries").
3. Evaluating Readability & Linguistic Complexity
AI-generated text often lacks morphological variety—it reuses words in similar forms instead of showing rich lemmatization diversity.
Phonetic & Syllabic Diversity can check if AI-generated text is unnaturally uniform (e.g., using only short, simple words). Coherence talks about how similar different sentences within one paragraph.


In [19]:
import pandas as pd

# Execution

In [None]:
!pip install datasets

In [21]:
from datasets import load_dataset
rag_dataset = load_dataset("neural-bridge/rag-dataset-1200")


README.md:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

(…)-00000-of-00001-f0c158413defd454.parquet:   0%|          | 0.00/2.32M [00:00<?, ?B/s]

(…)-00000-of-00001-06d83c58a8ea10e8.parquet:   0%|          | 0.00/604k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/960 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/240 [00:00<?, ? examples/s]

In [22]:
import torch

In [23]:
from rapidfuzz import process, fuzz

def fuzzy_intersection(set1, set2, threshold=85):
    matched = set()

    for word in set1:
        match, score, _ = process.extractOne(word, set2, scorer=fuzz.ratio)
        if score >= threshold:
            matched.add(match)  # Store the closest match from set2

    return matched


In [24]:
def tokenize_text(text):
    doc = nlp(text)
    return [token.lemma_.lower() for token in doc if not token.is_stop]
def extract_entities(text):
    doc = nlp(text)
    return set(ent.text.lower() for ent in doc.ents)

def compute_token_overlap(context, answer):
    context_tokens = set(tokenize_text(context))
    answer_tokens = set(tokenize_text(answer))

    context_answer_overlap = len(fuzzy_intersection(context_tokens,answer_tokens)) / len(answer_tokens) if len(answer_tokens) > 0 else 0

    return context_answer_overlap

def improved_factual_matching(context, answer):

    token_overlap_score = compute_token_overlap(context, answer)


    context_entities = extract_entities(context)

    answer_entities = extract_entities(answer)
    entity_overlap = len(fuzzy_intersection(context_entities,answer_entities)) / len(answer_entities) if len(answer_entities) > 0 else 0

    return {
        'token_overlap_score': token_overlap_score,
        'entity_overlap': entity_overlap,
    }

# Example usage
context = rag_dataset['train'][900]['context']
question = rag_dataset['train'][900]['question']
answer = rag_dataset['train'][900]['answer']

matching_results = improved_factual_matching(context, answer)



In [29]:
final_df = pd.DataFrame([analyze_text_complexity(context),analyze_text_complexity(answer)],index=['Context','Answer']).T

In [42]:
final_df['Change'] = ((final_df['Answer']-final_df['Context'])/(final_df['Context']+0.000000000001))*100

In [43]:
pd.DataFrame([matching_results]).T

Unnamed: 0,0
token_overlap_score,1.0
entity_overlap,1.0


In [44]:
import pandas as pd

In [45]:
final_df

Unnamed: 0,Context,Answer,Change
Giberish Score,0.451316,0.363636,-19.427511
Lexical Density Noun,0.2182,0.1,-54.170486
Lexical Density Verb,0.1091,0.1,-8.340972
Lexical Density Adverb,0.0394,0.0,-100.0
Lexical Density Adjective,0.0644,0.0,-100.0
Max Syntactic Depth,21.0,8.0,-61.904762
Clauses per Sentence,0.3158,0.0,-100.0
Coherence,0.1852,1.0,439.956803
Lexical Diversity - Type-Token Ratio (TTR),0.023557,0.403509,1612.894737
Lexical Diversity - HD-D,0.510633,0.465333,-8.871354


In [46]:
context

"HD Springer's Recent Activity\nHD Springer replied to the thread The Nomad From Morwood.\nWow you really do make some of the most beautiful vaporizers. To be honest everything I’ve seen you do is beautiful. Every last detail....Jan 16, 2018 at 2:52 AM\nHD Springer liked Dan Morrison's post in the thread The Nomad From Morwood.\nScrew polishing jig. [IMG] Screw colouring. These are stainless steel screws, but the oxide colouring gives em' a more bronzey look....Jan 16, 2018 at 2:46 AM\nHD Springer liked Saltysusej69's post in the thread DIY Milaana.\nHey guys, I have some updates on the 3d-printing/CAD stuff. I've started using OnShape.com which offers a pretty complete CAD solution...Jan 16, 2018 at 2:40 AM\nHD Springer liked bellas's post in the thread DIY Milaana.\nThis and @Pipes Skeletor have got my artsy fartsy juices salavating like crazy! Vape on.Jan 15, 2018 at 6:35 PM\nHD Springer replied to the thread NewVape FlowerPot Twax Vaporizer.\nI don’t doubt that at all. I like to re

In [47]:
question

'What activity did HD Springer do on Jan 16, 2018 at 2:52 AM?'

In [48]:
answer

'HD Springer replied to the thread The Nomad From Morwood.'