In [1]:
# Standard library imports
import json
from collections import Counter
from functools import lru_cache
from pprint import pprint
from typing import Dict, Set, Iterable, Optional, Any, Tuple
import importlib.resources as pkg_resources
from pathlib import Path

# Third-party imports
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
import spacy
import stanza
import textcomplexity  # only used to access en.json
from tqdm.auto import tqdm  

# Enable tqdm's pandas integration.
tqdm.pandas()

# Download required resources
stanza.download('en')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Make sure WordNet is available; if not, download it.
# NOTE(review): the unconditional nltk.download calls just above already request
# both corpora, so this fallback looks redundant — confirm before removing either.
try:
    _ = wn.synsets("dog")
except LookupError:
    nltk.download("wordnet")
    nltk.download("omw-1.4")

# Load spaCy model
nlp = spacy.load("en_core_web_md")
spacy_nlp = nlp  # second name for the same pipeline object
# Last expression of the cell: its return value is what the cell displays
# (the Sentencizer repr in the saved output).
spacy_nlp.add_pipe("sentencizer")

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 8.24MB/s]                    
2025-12-21 23:34:17 INFO: Downloaded file to C:\Users\rroll\stanza_resources\resources.json
2025-12-21 23:34:17 INFO: Downloading default packages for language: en (English) ...
2025-12-21 23:34:18 INFO: File exists: C:\Users\rroll\stanza_resources\en\default.zip
2025-12-21 23:34:21 INFO: Finished downloading models and saved to C:\Users\rroll\stanza_resources
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rroll\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rroll\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


<spacy.pipeline.sentencizer.Sentencizer at 0x23d0148c290>

In [2]:

# Expected column names
COL_SIMPLE = "Simple"
COL_COMPLEX = "Complex"
ID_COL = "orig_id"

# Outputs (the directory is created on the spot if it does not exist yet)
OUT_DIR = Path("data_prepared")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Language / models
LANG = "en"
SPACY_MODEL = "en_core_web_md"


# Official list of metrics (test.ipynb)
MEASURES = ["MTLD", "LD", "LS", "MDD", "CS", "LC", "CoH"]


In [3]:
# Load spaCy (the model must contain word vectors, which CoH needs)
nlp = spacy.load(SPACY_MODEL)
spacy_nlp = nlp

# Important: sentence segmentation (added only if not already in the pipeline)
if "sentencizer" not in spacy_nlp.pipe_names:
    spacy_nlp.add_pipe("sentencizer")


In [4]:
# Dataset registry: short name -> path of the sampled file
datasets = {
    'ose_adv_ele': 'data_sampled/OSE_adv_ele.csv',
    'ose_adv_int': 'data_sampled/OSE_adv_int.csv',
    'swipe': 'data_sampled/swipe.csv',
    'vikidia': 'data_sampled/vikidia.csv',
}

def load_data(path):
    """Read the tab-separated file at `path` and return it as a DataFrame."""
    frame = pd.read_csv(path, sep="\t")
    return frame
    

def load_dataset(name):
    """
    Load the dataset registered under `name` in `datasets`.

    Raises:
        ValueError: if `name` is not a key of `datasets`.
    """
    if name in datasets:
        return load_data(datasets[name])
    raise ValueError(f"Dataset {name} not found")

In [5]:
# Load the vikidia dataset and preview its first three rows.
df = load_dataset('vikidia')
df.head(3)


Unnamed: 0,Simple,Complex
0,North America is a large continent in the Nort...,North America is a continent entirely within t...
1,Current could mean:,Currents or The Current may refer to:
2,"The Moon is Earth's largest natural satellite,...",The Moon is an astronomical body orbiting Eart...


In [6]:
# Print one Simple/Complex pair in full. The fixed random_state makes the
# sampled row (and therefore this cell's output) identical on every run.
row = df.sample(1, random_state=42)

print('SIMPLE TEXT')
print(row['Simple'].iloc[0])
print('-'*100)
print('COMPLEX TEXT')
print(row['Complex'].iloc[0])


SIMPLE TEXT
Carnival is a public festival which takes place in many cities and towns in many countries around the world. It is in February or March each year. Carnival can sometimes last for several weeks. In some places there is only one day of celebration. There are often street parades, bands, costumes and many people wear masks. Carnival is linked to religious traditions in the Catholic and Eastern Orthodox Churches, and it is also linked to local customs.

Many Christian churches have a 40-day "season" of fasting called Lent, in which Christians prepare for Easter which is one of the two most important feasts in the Christian year (the other being Christmas). Easter is in late March or April. Lent always begins on a Wednesday, which is called Ash Wednesday in February or March. On that day, many people go to church and have some ash smeared on their forehead as a sign of sorrow for their sins. Then for 40 days, they try to work hard on improving themselves and thinking about the t

In [7]:
# Row count per dataset plus the grand total. The loop keeps its own local
# value instead of rebinding `df`, so the working frame loaded earlier is not
# silently replaced by whichever dataset happens to be iterated last.
cnt = 0
for name in datasets:
    n_rows = load_dataset(name).shape[0]
    print(f"{name}: {n_rows} rows")
    cnt += n_rows
print(f"Total: {cnt} rows")

ose_adv_ele: 189 rows
ose_adv_int: 189 rows
swipe: 1233 rows
vikidia: 1233 rows
Total: 2844 rows


In [8]:
# Load the vikidia dataset into `df`.
df = load_dataset('vikidia')

## Complexity measures



In [9]:
# Cache stanza pipelines to avoid re-loading models
# (filled lazily by get_stanza_pipeline).
_STANZA_PIPELINES: Dict[str, stanza.Pipeline] = {}

# UPOS tags considered content words (C); used by lexical_measures_from_doc
# to decide which words count towards LD and LS.
CONTENT_UPOS = {"NOUN", "PROPN", "VERB", "ADJ", "ADV"}


@lru_cache()
def load_cow_top5000_en() -> Set[str]:
    """
    Return the lowercased word forms of the COW-based list of the 5,000 most
    frequent English content words shipped with textcomplexity (en.json).

    The list is stored under the "most_common" key as [word, xpos] pairs; the
    tag is discarded. Because of lru_cache the file is read only once and
    every caller receives the same set object, so it should be treated as
    read-only.
    """
    resource = pkg_resources.files(textcomplexity).joinpath("en.json")
    with resource.open("r", encoding="utf-8") as handle:
        definition = json.load(handle)

    forms = set()
    for word, _tag in definition["most_common"]:
        forms.add(word.lower())
    return forms


def get_stanza_pipeline(lang: str = "en", use_gpu: bool = False) -> stanza.Pipeline:
    """
    Get (or create) a cached stanza Pipeline for a given language and device.

    Args:
        lang: stanza language code.
        use_gpu: forwarded to stanza.Pipeline when the pipeline is created.

    Returns:
        The cached Pipeline, built with the processors
        tokenize, pos, lemma, depparse and constituency.

    NOTE: You must have downloaded the models beforehand, e.g.:
        import stanza
        stanza.download('en')
    """
    # CPU pipelines keep the bare language code as cache key (as before);
    # GPU pipelines get their own slot. Previously the key ignored use_gpu,
    # so whichever variant was built first was returned for every later call
    # regardless of the requested device.
    cache_key = f"{lang}:gpu" if use_gpu else lang

    if cache_key not in _STANZA_PIPELINES:
        _STANZA_PIPELINES[cache_key] = stanza.Pipeline(
            lang=lang,
            processors="tokenize,pos,lemma,depparse,constituency",
            use_gpu=use_gpu,
            tokenize_no_ssplit=False,
        )
    return _STANZA_PIPELINES[cache_key]


### Lexical complexity

In [10]:
def _compute_mtld(tokens: Iterable[str], ttr_threshold: float = 0.72) -> Optional[float]:
    """
    Compute MTLD (Measure of Textual Lexical Diversity) for a list of tokens.

    MTLD = total_number_of_tokens / number_of_factors

    A factor is a contiguous segment where the running TTR stays >= threshold.
    When the TTR drops below the threshold, we close a factor (at the previous
    token) and start a new one. At the end, the remaining partial segment is
    counted as a fractional factor, with weight proportional to how close the
    final TTR is to the threshold.
    """
    tokens = [tok for tok in tokens if tok]
    if not tokens:
        return None

    types = set()
    factor_count = 0.0
    token_count_in_factor = 0

    for tok in tokens:
        token_count_in_factor += 1
        types.add(tok)
        ttr = len(types) / token_count_in_factor

        if ttr < ttr_threshold:
            factor_count += 1.0
            types = set()
            token_count_in_factor = 0

    # final partial factor
    if token_count_in_factor > 0:
        final_ttr = len(types) / token_count_in_factor
        if final_ttr < 1.0:
            fractional = (1.0 - final_ttr) / (1.0 - ttr_threshold)
            fractional = max(0.0, min(1.0, fractional))
            factor_count += fractional

    if factor_count == 0:
        return None

    return len(tokens) / factor_count



def _compute_lexical_density(total_tokens: int, content_tokens: int) -> Optional[float]:
    """
    LD = |C| / |T|
    where:
        |C| = number of content-word tokens
        |T| = total number of non-punctuation tokens
    """
    if total_tokens == 0:
        return None
    return content_tokens / total_tokens


def _compute_lexical_sophistication_cow(
    content_forms: Iterable[str],
    cow_top5000: set,
) -> Optional[float]:
    """
    LS = |{ w in C : w not in R }| / |C|
    where:
        C = content-word tokens (surface forms, lowercased)
        R = COW top-5000 content word forms (lowercased)
    """
    forms = [f for f in content_forms if f]
    if not forms:
        return None

    off_list = sum(1 for f in forms if f not in cow_top5000)
    return off_list / len(forms)



def lexical_measures_from_doc(doc) -> Dict[str, Optional[float]]:
    """
    Derive the three lexical measures (MTLD, LD, LS) from a stanza Document.

    Punctuation (UPOS "PUNCT") and words without any lemma/text are skipped.
    MTLD runs over lowercased lemmas, LD is the share of words whose UPOS is
    in CONTENT_UPOS, and LS is computed on the lowercased surface forms of
    those content words against the COW top-5000 list.

    Returns:
        {"MTLD": ..., "LD": ..., "LS": ...}; each value may be None.
    """
    cow_top5000 = load_cow_top5000_en()

    lemmas = []          # one entry per counted (non-punctuation) word
    surface_forms = []   # one entry per counted content word

    every_word = (word for sent in doc.sentences for word in sent.words)
    for word in every_word:
        if word.upos == "PUNCT":
            continue

        # Fall back to the surface text when stanza produced no lemma.
        lemma = (word.lemma or word.text or "").lower()
        if not lemma:
            continue
        lemmas.append(lemma)

        if word.upos in CONTENT_UPOS:
            surface_forms.append((word.text or "").lower())

    return {
        "MTLD": _compute_mtld(lemmas) if lemmas else None,
        "LD": _compute_lexical_density(len(lemmas), len(surface_forms)),
        "LS": _compute_lexical_sophistication_cow(surface_forms, cow_top5000),
    }


def lexical_measures_from_text(text: str, lang: str = "en") -> Dict[str, Optional[float]]:
    """
    Convenience wrapper: parse a single text with stanza and compute the
    lexical measures.

    Args:
        text: raw text; None, a float NaN (missing pandas cell) and blank
            strings are all treated as "no text".
        lang: stanza language code.

    Returns:
        {"MTLD": ..., "LD": ..., "LS": ...}; all None when there is no text.
    """
    # A missing cell in a pandas text column is a float NaN. str(NaN) is the
    # word "nan", which would otherwise be parsed and scored like real text.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    text = str(text)

    if not text.strip():
        return {"MTLD": None, "LD": None, "LS": None}

    nlp = get_stanza_pipeline(lang)
    doc = nlp(text)
    return lexical_measures_from_doc(doc)



def compute_lexical_measures_df(
    df: pd.DataFrame,
    column: str = "text",
    lang: str = "en",
) -> Dict[str, Dict[Any, Optional[float]]]:
    """
    Run lexical_measures_from_text on every cell of df[column].

    Returns:
        One mapping per measure, keyed by the DataFrame index label:
        {
            "MTLD": {index: value},
            "LD":   {index: value},
            "LS":   {index: value},
        }
    """
    results: Dict[str, Dict[Any, Optional[float]]] = {"MTLD": {}, "LD": {}, "LS": {}}

    for idx, text in df[column].items():
        metrics = lexical_measures_from_text(text, lang=lang)
        for measure, per_row in results.items():
            per_row[idx] = metrics[measure]

    return results


### Syntactic complexity

In [11]:
def mdd_from_doc(doc) -> Optional[float]:
    """
    Mean Dependency Distance (MDD) of a stanza Document.

    Per sentence s_i with dependency set D_i:
        MDD_i = (1 / |D_i|) * sum_{(h,d) in D_i} |h - d|
    Document level:
        MDD = (1 / k) * sum_i MDD_i
    where only sentences with at least one dependency take part. Words whose
    head is None or 0 (the root) contribute no distance.

    Returns:
        The averaged MDD, or None if no sentence has any dependency.
    """
    per_sentence = []

    for sent in doc.sentences:
        spans = [
            abs(word.id - word.head)
            for word in sent.words
            if not (word.head is None or word.head == 0)
        ]
        if spans:
            per_sentence.append(sum(spans) / len(spans))

    return sum(per_sentence) / len(per_sentence) if per_sentence else None



def _count_clauses_in_tree(tree) -> int:
    """
    Count clause nodes in a constituency tree.

    A simple and standard heuristic (PTB-style) is:
        count all nodes whose label starts with 'S'
        (S, SBAR, SBARQ, SINV, SQ, etc.).

    This aligns with the idea of counting finite and subordinate clauses
    as in Hunt (1965) and later complexity work.
    """
    if tree is None:
        return 0

    # Stanza's constituency tree: tree.label, tree.children
    count = 1 if getattr(tree, "label", "").startswith("S") else 0

    for child in getattr(tree, "children", []):
        # leaves can be strings or terminals without 'label'
        if hasattr(child, "label"):
            count += _count_clauses_in_tree(child)

    return count


def cs_from_doc(doc) -> Optional[float]:
    """
    Clauses per sentence (CS) of a stanza Document.

        CS = (1 / k) * sum_i L_i

    L_i is the clause count that _count_clauses_in_tree reports for the
    constituency tree of sentence s_i. Sentences without a constituency tree
    are left out of both the sum and k.

    Returns:
        The average, or None when no sentence carries a tree.
    """
    trees = (getattr(sent, "constituency", None) for sent in doc.sentences)
    per_sentence = [_count_clauses_in_tree(tree) for tree in trees if tree is not None]

    if len(per_sentence) == 0:
        return None

    return sum(per_sentence) / len(per_sentence)



def syntactic_measures_from_doc(doc) -> Dict[str, Optional[float]]:
    """
    Bundle the two syntactic measures of a stanza Document.

    Returns:
        {"MDD": mean dependency distance, "CS": clauses per sentence};
        either value may be None.
    """
    return {"MDD": mdd_from_doc(doc), "CS": cs_from_doc(doc)}


def syntactic_measures_from_text(text: str, lang: str = "en") -> Dict[str, Optional[float]]:
    """
    Convenience wrapper: parse a single text with stanza and compute the
    syntactic measures.

    Args:
        text: raw text; None, a float NaN (missing pandas cell) and blank
            strings are all treated as "no text".
        lang: stanza language code.

    Returns:
        {"MDD": ..., "CS": ...}; both None when there is no text.
    """
    # A missing cell in a pandas text column is a float NaN. str(NaN) is the
    # word "nan", which would otherwise be parsed and scored like real text.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    text = str(text)

    if not text.strip():
        return {"MDD": None, "CS": None}

    nlp = get_stanza_pipeline(lang)
    doc = nlp(text)
    return syntactic_measures_from_doc(doc)


def compute_syntactic_measures_df(
    df: pd.DataFrame,
    column: str = "text",
    lang: str = "en",
) -> Dict[str, Dict[Any, Optional[float]]]:
    """
    Run syntactic_measures_from_text on every cell of df[column].

    Returns:
        One mapping per measure, keyed by the DataFrame index label:
        {
            "MDD": {index: value},
            "CS":  {index: value},
        }
    """
    results: Dict[str, Dict[Any, Optional[float]]] = {"MDD": {}, "CS": {}}

    for idx, text in df[column].items():
        metrics = syntactic_measures_from_text(text, lang=lang)
        for measure, per_row in results.items():
            per_row[idx] = metrics[measure]

    return results


### Discourse complexity

In [12]:
# Approximate set of content POS tags (spaCy universal POS)
# NOTE(review): unlike CONTENT_UPOS (used for the stanza-based lexical
# measures), this set does not contain "PROPN" — confirm this is intentional.
CONTENT_POS =  {"NOUN", "VERB", "ADJ", "ADV"}


def is_content_token(tok):
    """
    Tell whether a spaCy token counts as a content word.

    A token qualifies when it is alphabetic, is not a stopword, and its
    coarse POS tag is one of CONTENT_POS.
    """
    if not tok.is_alpha or tok.is_stop:
        return False
    return tok.pos_ in CONTENT_POS


@lru_cache(maxsize=100000)
def get_related_lemmas(lemma):
    """
    Collect the WordNet lemmas semantically related to `lemma`.

    For every synset of the (lowercased) lemma the result gathers:
      - synonyms (the synset's own lemmas) and their antonyms
      - lemmas of hypernyms and hyponyms
      - lemmas of part / member / substance meronyms
      - lemmas of coordinate terms (other hyponyms of the same hypernym)

    Names are lowercased and underscores become spaces. The query lemma
    itself is removed from the result.

    NOTE: NLTK's Synset has no 'troponyms()' method, so troponyms are not
    used. Results are memoised; callers share the returned set and should
    not mutate it.
    """
    lemma = lemma.lower()

    def names_of(synset):
        # Normalised lemma names of one synset.
        return {entry.name().lower().replace("_", " ") for entry in synset.lemmas()}

    related = set()

    for syn in wn.synsets(lemma):
        # Synonyms and their antonyms
        for entry in syn.lemmas():
            related.add(entry.name().lower().replace("_", " "))
            related.update(
                opposite.name().lower().replace("_", " ")
                for opposite in entry.antonyms()
            )

        parents = syn.hypernyms()

        # Hypernyms, hyponyms and the three kinds of meronyms
        neighbours = list(parents)
        neighbours += syn.hyponyms()
        neighbours += syn.part_meronyms() + syn.member_meronyms() + syn.substance_meronyms()

        # Coordinate terms: the other children of each hypernym
        for parent in parents:
            neighbours.extend(sib for sib in parent.hyponyms() if sib != syn)

        for neighbour in neighbours:
            related |= names_of(neighbour)

    # Remove the lemma itself if present
    related.discard(lemma)
    return related


def lexical_cohesion_single(text, nlp):
    """
    Lexical Cohesion (LC) of one document:

        LC = |C| / m

    where
      - |C| counts cohesive ties between every ordered pair of sentences
        (i before j): content lemmas shared by both sentences, plus lemmas of
        sentence j that WordNet relates to a lemma of sentence i;
      - m is the number of alphabetic tokens in the whole document.

    Only sentences containing at least one content lemma take part.

    Returns 0.0 when `text` is not a non-blank string, when the document has
    no alphabetic token, or when fewer than two sentences qualify.
    """
    if not (isinstance(text, str) and text.strip()):
        return 0.0

    doc = nlp(text)

    # Denominator m: every alphabetic token
    m = sum(1 for tok in doc if tok.is_alpha)
    if m == 0:
        return 0.0

    sentences = list(doc.sents)
    if len(sentences) < 2:
        # A single sentence has no cross-sentence cohesion
        return 0.0

    # One set of content lemmas per sentence; empty sets are dropped
    sent_lemmas = []
    for sent in sentences:
        content = {tok.lemma_.lower() for tok in sent if is_content_token(tok)}
        if content:
            sent_lemmas.append(content)

    if len(sent_lemmas) < 2:
        return 0.0

    ties = 0
    for position, earlier in enumerate(sent_lemmas[:-1]):
        for later in sent_lemmas[position + 1:]:
            # 1) Lexical repetition
            ties += len(earlier & later)
            # 2) WordNet relations from the earlier sentence into the later one
            ties += sum(len(get_related_lemmas(lemma) & later) for lemma in earlier)

    return float(ties) / float(m)


def sentence_vector(sent, vector_size):
    """
    Average the vectors of the usable tokens of a sentence.

    A token is usable when it has a vector and is neither punctuation nor
    whitespace. With no usable token, a float32 zero vector of length
    `vector_size` is returned instead.
    """
    usable = []
    for tok in sent:
        if not tok.has_vector or tok.is_punct or tok.is_space:
            continue
        usable.append(tok.vector)

    if len(usable) == 0:
        return np.zeros(vector_size, dtype="float32")
    return np.mean(usable, axis=0)


def coherence_single(text, nlp):
    """
    Coherence (CoH) of one document: the mean cosine similarity between the
    vectors of consecutive sentences.

        CoH = (1 / (k-1)) * sum_{i=1}^{k-1} cos(h_i, h_{i+1})

    h_i is the averaged token vector of sentence i (see sentence_vector).
    Pairs in which either vector has zero norm are left out of the mean.

    Returns 0.0 when `text` is not a non-blank string, when the document has
    fewer than two sentences, or when no pair could be scored.

    Raises:
        ValueError: if the spaCy pipeline ships no word vectors.
    """
    if not (isinstance(text, str) and text.strip()):
        return 0.0

    vector_size = nlp.vocab.vectors_length
    if vector_size == 0:
        raise ValueError(
            "The loaded spaCy model does not contain word vectors "
            "(nlp.vocab.vectors_length == 0). "
            "Use a model like 'en_core_web_md' or similar."
        )

    sentences = list(nlp(text).sents)
    if len(sentences) < 2:
        # No adjacent pair to compare
        return 0.0

    vectors = [sentence_vector(sent, vector_size) for sent in sentences]

    sims = []
    for left, right in zip(vectors, vectors[1:]):
        scale = np.linalg.norm(left) * np.linalg.norm(right)
        if scale == 0.0:
            # At least one of the two sentence vectors is all zeros
            continue
        sims.append(float(np.dot(left, right) / scale))

    return float(np.mean(sims)) if sims else 0.0



def compute_lexical_cohesion_vector(df, nlp, column="text"):
    """
    Score every row of a DataFrame with lexical_cohesion_single.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts.
    nlp : spaCy Language object
        Pre-loaded spaCy pipeline with lemmatizer, POS tagger, etc.
    column : str, default "text"
        Column that holds the text. Missing values are replaced by "" and
        everything is cast to str before scoring.

    Returns
    -------
    np.ndarray
        float32 array of LC scores in row order, length == len(df).
    """
    cleaned = df[column].fillna("").astype(str)

    scores = []
    for text in cleaned:
        scores.append(lexical_cohesion_single(text, nlp))

    return np.asarray(scores, dtype="float32")


def compute_coherence_vector(df, nlp, column="text"):
    """
    Score every row of a DataFrame with coherence_single.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts.
    nlp : spaCy Language object
        Pre-loaded spaCy pipeline with word vectors.
    column : str, default "text"
        Column that holds the text. Missing values are replaced by "" and
        everything is cast to str before scoring.

    Returns
    -------
    np.ndarray
        float32 array of CoH scores in row order, length == len(df).
    """
    cleaned = df[column].fillna("").astype(str)

    scores = []
    for text in cleaned:
        scores.append(coherence_single(text, nlp))

    return np.asarray(scores, dtype="float32")


def compute_discourse_measures(df, nlp, column="text"):
    """
    Compute the two discourse measures for every row of a DataFrame.

    LC is computed first, then CoH; each pass runs the spaCy pipeline over
    the texts on its own.

    Returns
    -------
    dict
        {
            "LC":  np.ndarray of lexical cohesion scores,
            "CoH": np.ndarray of coherence scores
        }
    """
    return {
        "LC": compute_lexical_cohesion_vector(df, nlp, column=column),
        "CoH": compute_coherence_vector(df, nlp, column=column),
    }




### Text complexity

In [13]:
def _analyze_text_all(text: str, lang: str = "en") -> Dict[str, Optional[float]]:
    """
    Parse a text with stanza and compute all measures (lexical + syntactic)
    in a single pass.

    Returns a dict with keys:
        "MTLD", "LD", "LS", "MDD", "CS"
    (Discourse measures LC/CoH are added later at DataFrame level, via spaCy.)
    """
    if text is None:
        text = ""
    text = str(text)

    if not text.strip():
        return {"MTLD": None, "LD": None, "LS": None, "MDD": None, "CS": None}

    nlp = get_stanza_pipeline(lang)
    doc = nlp(text)

    lex = lexical_measures_from_doc(doc)
    syn = syntactic_measures_from_doc(doc)

    out: Dict[str, Optional[float]] = {}
    out.update(lex)
    out.update(syn)
    return out


def compute_all_complexity_measures_df(
    df: pd.DataFrame,
    column: str = "text",
    lang: str = "en",
    spacy_nlp=None,
) -> Dict[str, Dict[Any, Optional[float]]]:
    """
    Compute all complexity measures for each row in df[column].

    Rows are processed sequentially: first the stanza-based measures
    (MTLD, LD, LS, MDD, CS), then the spaCy-based discourse measures
    (LC, CoH).

    Args
    ----
    df : pandas.DataFrame
        DataFrame with a text column.
    column : str, default "text"
        Name of the text column.
    lang : str, default "en"
        Language code for stanza.
    spacy_nlp : spaCy Language, required for LC / CoH
        Pre-loaded spaCy pipeline with:
            - POS / lemmatizer for LC
            - word vectors for CoH (e.g. 'en_core_web_md').

    Returns
    -------
    dict
        {
            "MTLD": {index: value},
            "LD":   {index: value},
            "LS":   {index: value},
            "MDD":  {index: value},
            "CS":   {index: value},
            "LC":   {index: value},
            "CoH":  {index: value},
        }

    Raises
    ------
    ValueError
        If spacy_nlp is None. The check runs before any parsing so a missing
        pipeline is reported immediately instead of after the whole (slow)
        stanza pass.
    """
    # Fail fast: without spaCy the discourse measures cannot be computed.
    if spacy_nlp is None:
        raise ValueError(
            "spacy_nlp must be provided to compute LC and CoH. "
            "Load a spaCy model with vectors, e.g. 'en_core_web_md', and "
            "pass it as spacy_nlp=..."
        )

    mtld_res: Dict[Any, Optional[float]] = {}
    ld_res: Dict[Any, Optional[float]] = {}
    ls_res: Dict[Any, Optional[float]] = {}
    mdd_res: Dict[Any, Optional[float]] = {}
    cs_res: Dict[Any, Optional[float]] = {}

    items = list(df[column].items())  # list[(index, text)]

    # ---- Lexical + syntactic (stanza) ----
    for idx, text in tqdm(
        items,
        total=len(items),
        desc="Computing lexical & syntactic complexity (sequential)",
    ):
        metrics = _analyze_text_all(text, lang=lang)
        mtld_res[idx] = metrics["MTLD"]
        ld_res[idx] = metrics["LD"]
        ls_res[idx] = metrics["LS"]
        mdd_res[idx] = metrics["MDD"]
        cs_res[idx] = metrics["CS"]

    # ---- Discourse measures (spaCy: LC & CoH) ----
    discourse = compute_discourse_measures(df, spacy_nlp, column=column)

    # The arrays are in row order; map them back to DataFrame index labels.
    lc_res: Dict[Any, float] = {
        idx: float(value) for idx, value in zip(df.index, discourse["LC"])
    }
    coh_res: Dict[Any, float] = {
        idx: float(value) for idx, value in zip(df.index, discourse["CoH"])
    }

    return {
        "MTLD": mtld_res,
        "LD": ld_res,
        "LS": ls_res,
        "MDD": mdd_res,
        "CS": cs_res,
        "LC": lc_res,
        "CoH": coh_res,
    }


## Calcul métriques Simple/Complex

In [14]:
def compute_both_sides_metrics(
    df: pd.DataFrame,
    col_simple: str = COL_SIMPLE,
    col_complex: str = COL_COMPLEX,
    lang: str = LANG,
    spacy_nlp=None,
) -> pd.DataFrame:
    """
    Compute the 7 complexity measures of every row for both the Simple and
    the Complex text, delegating to compute_all_complexity_measures_df.

    Returns a copy of the input frame with 14 extra numeric columns:
      - simple_MTLD ... simple_CoH
      - complex_MTLD ... complex_CoH

    Raises:
        ValueError: if one of the two text columns is missing, or if
            spacy_nlp is None.
    """
    absent = {col_simple, col_complex} - set(df.columns)
    if absent:
        raise ValueError(f"Colonnes manquantes: {absent}. Colonnes présentes: {list(df.columns)}")

    if spacy_nlp is None:
        raise ValueError("spacy_nlp doit être fourni (modèle avec vecteurs)")

    # Simple side first, then Complex side
    sides = (("simple", col_simple), ("complex", col_complex))
    measured = {
        prefix: compute_all_complexity_measures_df(
            df, column=text_col, lang=lang, spacy_nlp=spacy_nlp
        )
        for prefix, text_col in sides
    }

    out = df.copy()

    # Explicitly align every measure on the frame's index
    for m in MEASURES:
        for prefix, _ in sides:
            out[f"{prefix}_{m}"] = pd.Series(measured[prefix][m]).reindex(out.index)

    # Numeric conversion (safety net: anything unparsable becomes NaN)
    for prefix, _ in sides:
        for m in MEASURES:
            name = f"{prefix}_{m}"
            out[name] = pd.to_numeric(out[name], errors="coerce")

    return out


## Ajout orig_id

In [15]:
def add_orig_id(df: pd.DataFrame, id_col: str = ID_COL) -> pd.DataFrame:
    """
    Add a stable identifier column holding the original 0-based row number.

    The index is reset first, and the new column is inserted in first
    position of a fresh frame. If `id_col` already exists, the input frame is
    returned as is (same object, not a copy).
    """
    if id_col not in df.columns:
        numbered = df.reset_index(drop=True).copy()
        numbered.insert(0, id_col, numbered.index.astype(int))
        return numbered
    return df


## Filtrage NaN 

In [16]:
def drop_rows_with_any_nan_metrics(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split the frame on completeness of the 14 metric columns (Simple/Complex).

    Returns:
      - rows where every metric is present (kept)
      - rows where at least one metric is NaN (removed)
    Both parts are copies.
    """
    metric_cols = [f"{side}_{m}" for side in ("simple", "complex") for m in MEASURES]
    complete = df[metric_cols].notna().all(axis=1)
    kept = df[complete].copy()
    removed = df[~complete].copy()
    return kept, removed


## Dominance

In [17]:
def dominance_flags(df: pd.DataFrame) -> pd.Series:
    """
    Flag the rows where the Simple text "dominates" the Complex one.

    A row is in dominance when:
      1) Simple reaches or exceeds Complex on ALL metrics (>=),
      2) AND there is at least one strict improvement (>) in each of the
         three dimensions:
           - lexical   : MTLD, LD, LS
           - syntactic : MDD, CS
           - discourse : LC, CoH

    Returns:
      Boolean Series aligned on df.index: True where dominance holds.
    """
    dimensions = {
        "lex": ["MTLD", "LD", "LS"],
        "synt": ["MDD", "CS"],
        "disc": ["LC", "CoH"],
    }

    def compare(measures, strict):
        # One boolean column per measure: simple > complex (strict) or >=.
        columns = []
        for m in measures:
            simple_side, complex_side = df[f"simple_{m}"], df[f"complex_{m}"]
            columns.append(simple_side > complex_side if strict else simple_side >= complex_side)
        return pd.concat(columns, axis=1)

    # 1) "reaches or exceeds" on every metric
    every_measure = [m for group in dimensions.values() for m in group]
    flags = compare(every_measure, strict=False).all(axis=1)

    # 2) at least one strict improvement inside each dimension
    for group in dimensions.values():
        flags = flags & compare(group, strict=True).any(axis=1)

    return flags


## Export CSV (All / Kept / Removed / Removed IDs)

In [18]:
def export_csv_bundle(dataset_name: str, df_all: pd.DataFrame, df_kept: pd.DataFrame, df_removed: pd.DataFrame) -> Dict[str, str]:
    """
    Write the result frames of one dataset as UTF-8 CSV files in OUT_DIR.

    Files written (all without the index):
      - <dataset_name>_with_metrics_all.csv
      - <dataset_name>_with_metrics_kept.csv
      - <dataset_name>_with_metrics_removed.csv
      - <dataset_name>_removed_ids.csv  (ids of the removed rows, ascending)

    Returns:
        Mapping "all" / "kept" / "removed" / "removed_ids" -> written path (str).
    """
    targets = {
        "all": (OUT_DIR / f"{dataset_name}_with_metrics_all.csv", df_all),
        "kept": (OUT_DIR / f"{dataset_name}_with_metrics_kept.csv", df_kept),
        "removed": (OUT_DIR / f"{dataset_name}_with_metrics_removed.csv", df_removed),
    }
    for path, frame in targets.values():
        frame.to_csv(path, index=False, encoding="utf-8")

    # Separate single-column file listing the ids of the removed rows
    removed_ids_path = OUT_DIR / f"{dataset_name}_removed_ids.csv"
    removed_ids = df_removed[ID_COL].astype(int).sort_values()
    pd.DataFrame({ID_COL: removed_ids}).to_csv(
        removed_ids_path, index=False, encoding="utf-8"
    )

    written = {key: str(path) for key, (path, _frame) in targets.items()}
    written["removed_ids"] = str(removed_ids_path)
    return written


## Pipeline dataset

In [19]:
def prepare_dataset_offline(dataset_name: str) -> Dict[str, Any]:
    """
    Prepare one dataset end to end and export the result as CSV files.

    Steps:
      1) load the data with load_dataset
      2) add the ID_COL column with add_orig_id (described by the original
         author as the original row number)
      3) compute the offline metrics for both the Simple and Complex texts
      4) set aside the rows returned as "removed" by
         drop_rows_with_any_nan_metrics (reason "nan_metrics")
      5) set aside the rows flagged by dominance_flags (reason "dominance")
      6) export the all / kept / removed frames with export_csv_bundle

    Exported columns: ID_COL, COL_SIMPLE, COL_COMPLEX, then one simple_<m>
    and one complex_<m> column per entry of MEASURES, i.e.
    3 + 2 * len(MEASURES) columns (17 with the seven measures visible in the
    notebook output). The removed frame additionally carries removed_reason.

    Parameters:
      - dataset_name : name forwarded to load_dataset and used as the prefix
        of the exported file names.

    Returns:
      - dict with keys "dataset", "rows_all", "rows_kept", "rows_removed",
        "paths" (the mapping returned by export_csv_bundle) and the three
        exported frames under "df_all", "df_kept", "df_removed".
    """
    df_raw = load_dataset(dataset_name)
    df_raw = add_orig_id(df_raw, id_col=ID_COL)

    # Keep only the id and the two text columns.
    df_base = df_raw[[ID_COL, COL_SIMPLE, COL_COMPLEX]].copy()

    # Metrics for both sides.
    df_all = compute_both_sides_metrics(df_base, spacy_nlp=spacy_nlp)

    # Split off the rows with NaN metrics.
    df_no_nan, df_removed_nan = drop_rows_with_any_nan_metrics(df_all)

    # .assign builds new frames, so the objects returned by the helper (which
    # may be slices of df_all) are never mutated in place.
    df_no_nan = df_no_nan.assign(removed_reason="")
    df_no_nan = df_no_nan.assign(simple_dominates_complex=dominance_flags(df_no_nan))

    # Dominance split.
    is_dominated = df_no_nan["simple_dominates_complex"]
    df_removed_dom = df_no_nan[is_dominated].assign(removed_reason="dominance")
    df_kept = df_no_nan[~is_dominated].copy()

    # Tag the NaN rows unconditionally so the removed frame has the same
    # schema whether or not any row was dropped.
    df_removed_nan = df_removed_nan.assign(
        simple_dominates_complex=False,
        removed_reason="nan_metrics",
    )

    # Removed bundle = NaN rows + dominated rows, in id order.
    df_removed = pd.concat([df_removed_nan, df_removed_dom], axis=0).sort_values(ID_COL)

    # Export layout: id + texts + per-side metric columns.
    export_cols = (
        [ID_COL, COL_SIMPLE, COL_COMPLEX]
        + [f"simple_{m}" for m in MEASURES]
        + [f"complex_{m}" for m in MEASURES]
    )

    df_all_export = df_all[export_cols].copy()
    df_kept_export = df_kept[export_cols].copy()
    # removed_reason is always present: both halves of df_removed carry it.
    df_removed_export = df_removed[export_cols + ["removed_reason"]].copy()

    paths = export_csv_bundle(dataset_name, df_all_export, df_kept_export, df_removed_export)

    return {
        "dataset": dataset_name,
        "rows_all": int(df_all_export.shape[0]),
        "rows_kept": int(df_kept_export.shape[0]),
        "rows_removed": int(df_removed_export.shape[0]),
        "paths": paths,
        "df_all": df_all_export,
        "df_kept": df_kept_export,
        "df_removed": df_removed_export,
    }


## Exécution sur tous les datasets + aperçu DataFrame

In [20]:

# Per-dataset summaries returned by prepare_dataset_offline, keyed by dataset name.
results = {}

DATA_DIR = Path("data_sampled")

# CSV files to process (uncomment an entry to include it in the run).
CSV_FILES_TO_RUN = [
    # "ose_adv_ele.csv",
    # "ose_adv_int.csv",
    # "swipe.csv",
    "vikidia.csv"
]

# Fail fast: check every input before starting the slow metric computation,
# so a missing file late in the list does not waste the work done on earlier ones.
missing_csv_paths = [
    DATA_DIR / name for name in CSV_FILES_TO_RUN if not (DATA_DIR / name).exists()
]
if missing_csv_paths:
    raise FileNotFoundError(
        "Fichier introuvable : " + ", ".join(str(p.resolve()) for p in missing_csv_paths)
    )

for csv_name in CSV_FILES_TO_RUN:
    csv_path = DATA_DIR / csv_name

    # The dataset name is the file name without its extension.
    dataset_name = csv_path.stem
    print(f"\n=== Preparing: {csv_path.name} ===")

    info = prepare_dataset_offline(dataset_name)
    results[dataset_name] = info

    print(f"All: {info['rows_all']} | Kept: {info['rows_kept']} | Removed: {info['rows_removed']}")
    print("Saved to:", info["paths"])

# Preview: first rows of the kept frame of the first processed dataset.
example_name = next(iter(results))
df_prepared = results[example_name]["df_kept"]
df_prepared.head(10)



=== Preparing: vikidia.csv ===


Computing lexical & syntactic complexity (sequential):   0%|          | 0/1233 [00:00<?, ?it/s]2025-12-21 23:36:15 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 8.45MB/s]                    
2025-12-21 23:36:16 INFO: Downloaded file to C:\Users\rroll\stanza_resources\resources.json
2025-12-21 23:36:16 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| mwt          | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |

2025-12-21 23:36:16 INFO: Using device: cpu
2025-12-21 23:36:16 

All: 1233 | Kept: 1186 | Removed: 47
Saved to: {'all': 'data_prepared\\vikidia_with_metrics_all.csv', 'kept': 'data_prepared\\vikidia_with_metrics_kept.csv', 'removed': 'data_prepared\\vikidia_with_metrics_removed.csv', 'removed_ids': 'data_prepared\\vikidia_removed_ids.csv'}


Unnamed: 0,orig_id,Simple,Complex,simple_MTLD,simple_LD,simple_LS,simple_MDD,simple_CS,simple_LC,simple_CoH,complex_MTLD,complex_LD,complex_LS,complex_MDD,complex_CS,complex_LC,complex_CoH
0,0,North America is a large continent in the Nort...,North America is a continent entirely within t...,28.446809,0.617801,0.355932,2.832143,0.904762,0.206349,0.849175,62.438376,0.573404,0.411493,3.562466,2.675862,9.207063,0.859215
2,2,"The Moon is Earth's largest natural satellite,...",The Moon is an astronomical body orbiting Eart...,39.767651,0.527962,0.238779,2.963011,3.03125,3.740887,0.853167,65.879232,0.530912,0.370931,3.721229,3.35732,18.403658,0.879644
3,3,Skiing is either sportive or recreational acti...,Skiing is a means of transport using skis to g...,43.0,0.56686,0.379487,2.851802,2.625,2.553892,0.833354,50.933333,0.591623,0.511062,2.990334,2.369565,3.432285,0.837366
4,4,The United States of America is a federal repu...,Coordinates: 40°N 100°W﻿ / ﻿40°N 100°W﻿ / 40; ...,57.784401,0.546474,0.32668,3.153669,2.747059,14.112093,0.849551,75.512494,0.559053,0.336475,3.603042,2.772727,23.276802,0.874714
5,5,The beluga whale (Delphinapterus leucas) is a ...,The beluga whale (/bɪˈluːɡə/) (Delphinapterus ...,56.252177,0.54047,0.434783,3.049386,2.153846,1.235127,0.78493,73.340963,0.53209,0.419057,3.538253,3.419223,24.713951,0.863741
6,6,Feathers are the things which cover birds. The...,Feathers are epidermal growths that form dist...,40.92766,0.615721,0.340426,2.270137,1.88,2.578947,0.794947,60.039474,0.5595,0.447709,3.244358,2.836449,13.591144,0.85886
7,7,A religion is a set of beliefs that is passion...,Antiquity\r\n\r\nMedieval\r\n\r\nEarly modern\...,43.815789,0.525526,0.202286,2.780534,3.169811,6.507589,0.85229,67.598214,0.566636,0.328438,3.60704,3.175719,17.838646,0.852283
8,8,The Solar System is the Sun and all the object...,The Solar System[b] is the gravitationally bou...,51.380952,0.541242,0.330479,2.886124,3.027778,2.832381,0.798105,58.904914,0.545826,0.387038,3.556041,3.794562,16.532469,0.858444
9,9,French Guiana (French: Guyane) is an overseas ...,French Guiana (/ɡiˈɑːnə/ or /ɡiˈænə/; French: ...,41.243863,0.53,0.495283,2.956662,1.844444,1.21137,0.748275,66.944252,0.568671,0.420211,3.33475,2.527919,5.69645,0.8161
10,10,Africa is the second largest continent in the ...,Africa is the world's second-largest and secon...,60.335113,0.592243,0.322124,2.974312,1.684932,2.162403,0.836269,71.347756,0.576051,0.410155,3.506575,2.744986,14.434546,0.849769
