In [None]:
# Core libraries
!pip install pandas numpy nltk spacy

# Install spaCy model
!python -m spacy download en_core_web_sm

# Libraries for semantic metrics
!pip install bert-score
!pip install bleurt
!pip install moverscore

# Download BLEURT checkpoint
!wget https://storage.googleapis.com/bleurt-data/BLEURT-20.zip
!unzip BLEURT-20.zip

In [None]:
import pandas as pd
import numpy as np
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import string
from collections import defaultdict

# import semantic libraries after pip installs
try:
    from bert_score import score as bert_score_calc
except ImportError:
    print("bert_score not found. Please run 'pip install bert_score'")

try:
    from bleurt.score import BleurtScorer
except ImportError:
    print("bleurt not found. Please run 'pip install bleurt'")

try:
    from moverscore import get_idf_dict, word_mover_score
except ImportError:
    print("moverscore not found. Please run 'pip install moverscore'")


# One-time NLTK resource downloads.
# 'punkt' is what older NLTK releases load for word_tokenize; NLTK >= 3.9
# loads 'punkt_tab' instead, so fetch both to work with whichever version
# pip installed. 'stopwords' backs the lexical preprocessing helper.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Load NLP models.
# 'en_core_web_sm' is fast and light; 'en_core_web_trf' is more accurate but much slower.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print("Spacy model 'en_core_web_sm' not found.")
    print("Please run: python -m spacy download en_core_web_sm")
    # Sentinel checked by get_syntactic_complexity before it parses anything.
    nlp = None

# Load the BLEURT scorer.
# IMPORTANT: point this at the directory where "BLEURT-20" was unzipped.
BLEURT_CHECKPOINT_PATH = "./BLEURT-20"
try:
    bleurt_scorer = BleurtScorer(BLEURT_CHECKPOINT_PATH)
except (IOError, NameError, AssertionError):
    # NameError: the guarded `bleurt` import failed, so BleurtScorer is undefined.
    # AssertionError: BLEURT's checkpoint reader checks the path with `assert`
    # (verify against the installed BLEURT version); without catching it a
    # missing checkpoint aborts the cell instead of falling back to None.
    print(f"BLEURT checkpoint not found at {BLEURT_CHECKPOINT_PATH}")
    print("Please download and unzip it as per the setup instructions.")
    # Sentinel checked by get_bleurt before scoring.
    bleurt_scorer = None


# helper function for lexical preprocessing
# Lazily-filled cache of the English stopword set; see _english_stop_words().
_ENGLISH_STOP_WORDS = None


def _english_stop_words() -> frozenset:
    """Return the NLTK English stopword set, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text_for_lexical(text: str) -> list:
    """
    Lowercase, remove punctuation, tokenize, and drop stopwords.

    Args:
        text: Raw text to normalise. Non-string values (e.g. None or the NaN
            that pandas uses for missing cells) are treated as having no text.

    Returns:
        A list of lowercase, purely alphabetic tokens with English stopwords
        removed. Empty for empty or non-string input.
    """
    # Missing cells reach this function as None/NaN when it is applied to a
    # DataFrame column; they have no tokens rather than being an error.
    if not isinstance(text, str):
        return []

    # The stopword set is loaded once and reused: this function runs per row.
    stop_words = _english_stop_words()
    text = text.lower()
    # Punctuation is stripped before tokenizing, so "don't" becomes "dont".
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words and word.isalpha()]

# METHOD 1: Syntactic Complexity (spaCy-based proxies)
def get_syntactic_complexity(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Calculates syntactic complexity features using spaCy.

    This provides proxies for MLC (Mean Length of Clause) and
    CN/C (Complex Nominals per Clause).

    Metrics:
    - mean_sentence_length: Avg. # of non-punctuation tokens per sentence.
    - mean_noun_chunk_length: Avg. # of tokens per noun chunk (proxy for nominal elaboration).
    - sub_clauses_per_sentence: Avg. # of tokens with the 'mark' dependency
      per sentence (proxy for clausal complexity).

    Args:
        df: Frame holding the texts to analyse.
        col: Name of the column in ``df`` that contains the text.

    Returns:
        A DataFrame with one row per input text, in input order, on a fresh
        default integer index. The three metric columns are always present,
        even when ``df`` has no rows.

    Raises:
        EnvironmentError: If the module-level spaCy pipeline failed to load.
    """
    if nlp is None:
        raise EnvironmentError("spaCy model is not loaded. Please check setup.")

    columns = [
        'mean_sentence_length',
        'mean_noun_chunk_length',
        'sub_clauses_per_sentence',
    ]

    results = []
    # nlp.pipe streams the texts through the pipeline in batches, which is
    # the recommended way to process many documents.
    for doc in nlp.pipe(df[col]):
        # Count with generators instead of building throwaway lists.
        num_sentences = sum(1 for _ in doc.sents)
        num_tokens = sum(1 for t in doc if not t.is_punct)
        num_sub_clauses = sum(1 for t in doc if t.dep_ == 'mark')
        chunk_lengths = [len(nc) for nc in doc.noun_chunks]

        # A text with no sentences (e.g. an empty string) scores 0 throughout.
        if num_sentences:
            mean_sentence_length = num_tokens / num_sentences
            sub_clauses_per_sentence = num_sub_clauses / num_sentences
        else:
            mean_sentence_length = 0.0
            sub_clauses_per_sentence = 0.0

        if chunk_lengths:
            mean_noun_chunk_length = sum(chunk_lengths) / len(chunk_lengths)
        else:
            mean_noun_chunk_length = 0.0

        results.append({
            'mean_sentence_length': mean_sentence_length,
            'mean_noun_chunk_length': mean_noun_chunk_length,
            'sub_clauses_per_sentence': sub_clauses_per_sentence,
        })

    # Passing columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(results, columns=columns)

# METHOD 2: Lexical Richness (TTR, Hapax Rate)
def get_lexical_richness(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Calculates lexical richness features: TTR and Hapax Rate.

    Token counts are taken after lexical preprocessing, i.e. on the tokens
    returned by ``preprocess_text_for_lexical``.

    Metrics:
    - ttr (Type-Token Ratio): Unique tokens / Total tokens.
    - hapax_rate (Hapax Legomena Rate): Tokens appearing only once / Total tokens.

    Args:
        df: Frame holding the texts to analyse.
        col: Name of the column in ``df`` that contains the text.

    Returns:
        A DataFrame with one row per input text, in input order, on a fresh
        default integer index. The ``ttr`` and ``hapax_rate`` columns are
        always present, even when ``df`` has no rows. Texts that yield no
        tokens score 0.0 for both metrics.
    """
    columns = ['ttr', 'hapax_rate']

    results = []
    for text in df[col]:
        tokens = preprocess_text_for_lexical(text)
        total_tokens = len(tokens)

        # Guard the divisions below: nothing survived preprocessing.
        if not total_tokens:
            results.append({'ttr': 0.0, 'hapax_rate': 0.0})
            continue

        # One frequency table gives both counts: its size is the number of
        # distinct tokens, and hapaxes() lists those seen exactly once.
        freq_dist = FreqDist(tokens)
        results.append({
            'ttr': len(freq_dist) / total_tokens,
            'hapax_rate': len(freq_dist.hapaxes()) / total_tokens,
        })

    # Passing columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(results, columns=columns)

# METHOD 3: BERTScore
def get_bertscore(df: pd.DataFrame, col: str, reference_text: str) -> pd.DataFrame:
    """
    Score every text in ``df[col]`` against one shared reference with BERTScore.

    Returns a DataFrame with the columns ``bertscore_precision``,
    ``bertscore_recall`` and ``bertscore_f1``, one row per candidate.
    Raises EnvironmentError when the guarded ``bert_score`` import did not
    bind ``bert_score_calc`` at module level.
    """
    library_available = 'bert_score_calc' in globals()
    if not library_available:
        raise EnvironmentError("bert_score library not loaded.")

    hypotheses = df[col].tolist()
    # The scorer is called with paired lists, so repeat the reference once per hypothesis.
    repeated_reference = [reference_text for _ in hypotheses]

    precision, recall, f1 = bert_score_calc(
        hypotheses,
        repeated_reference,
        lang='en',
        model_type='bert-base-uncased',
    )

    return pd.DataFrame({
        'bertscore_precision': precision.numpy(),
        'bertscore_recall': recall.numpy(),
        'bertscore_f1': f1.numpy(),
    })

# METHOD 4: BLEURT
def get_bleurt(df: pd.DataFrame, col: str, reference_text: str) -> pd.DataFrame:
    """
    Score every text in ``df[col]`` against one shared reference with BLEURT.

    Relies on the module-level ``bleurt_scorer``; raises EnvironmentError
    when that scorer is None. Returns a single-column DataFrame
    (``bleurt_score``) with one row per candidate.
    """
    scorer = bleurt_scorer
    if scorer is None:
        raise EnvironmentError("BLEURT scorer not loaded. Please check checkpoint path.")

    hypotheses = df[col].tolist()
    # The scorer is called with paired lists, so repeat the reference once per hypothesis.
    repeated_reference = [reference_text for _ in hypotheses]

    bleurt_values = scorer.score(
        references=repeated_reference,
        candidates=hypotheses,
    )
    return pd.DataFrame({'bleurt_score': bleurt_values})

# METHOD 5: MoverScore
def get_moverscore(
    df: pd.DataFrame,
    col: str,
    reference_text: str,
    *,
    batch_size: int = 48,
) -> pd.DataFrame:
    """
    Calculates MoverScore against a reference text.

    Args:
        df: Frame holding the candidate texts.
        col: Name of the column in ``df`` that contains the candidates.
        reference_text: Single reference every candidate is compared with.
        batch_size: Batch size forwarded to ``word_mover_score``. Lower it if
            the scorer runs out of GPU/CPU memory. Defaults to 48.

    Returns:
        A single-column DataFrame (``moverscore``) with one row per candidate.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        EnvironmentError: If the moverscore library could not be imported.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if 'word_mover_score' not in globals():
        raise EnvironmentError("moverscore library not loaded.")

    candidates = df[col].tolist()
    references = [reference_text] * len(candidates)

    # MoverScore requires IDF dictionaries.
    # Create them from the corpus itself (candidates + references).
    # NOTE(review): the reference appears once per candidate in this corpus,
    # which inflates the document frequency of its words - confirm intended.
    all_texts = candidates + references
    idf_dict = get_idf_dict(all_texts)

    # Set stop_words=[] because MoverScore's default list is large
    # and may remove important words for semantic comparison.
    scores = word_mover_score(
        references,
        candidates,
        idf_dict,
        idf_dict,
        stop_words=[],
        n_gram=1,
        remove_subwords=True,
        batch_size=batch_size,
    )

    return pd.DataFrame({'moverscore': scores})