In [None]:
import spacy
import pandas as pd

# Load the large ("lg") spaCy pipelines, one per language under study.
nlp_fr = spacy.load(name="fr_core_news_lg")
nlp_en = spacy.load(name="en_core_web_lg")


In [None]:
# Sample vocabulary: 50 French words followed by 50 English words.
french_words = [
    "mangeaient", "rapidement", "adorablement", "interrompue", "voitures",
    "invisible", "prolongation", "déménagement", "précautionneux", "recommencer",
    "anticonstitutionnellement", "abandonner", "déchirer", "prévisible", "réapparition",
    "couramment", "connaissance", "entraîner", "sous-marin", "inacceptable",
    "fonctionnel", "rapprochement", "découverte", "collaboration", "agriculteur",
    "développement", "protection", "encouragement", "transformation", "renforcement",
    "supermarché", "chaleureusement", "encadrement", "prononciation", "révolution",
    "anticipation", "approfondissement", "désintéressement", "réinitialiser", "insupportable",
    "impraticable", "dévouement", "incompréhension", "admirablement", "préjudiciable",
    "refondateur", "cohabitation", "opposition", "mécontentement", "espérance",
]

english_words = [
    "running", "beautifully", "unexpected", "cars", "invisible",
    "extension", "moving", "cautious", "restart", "abandon",
    "tear", "predictable", "reappearance", "fluently", "knowledge",
    "train", "submarine", "unacceptable", "functional", "reunion",
    "discovery", "collaboration", "farmer", "development", "protection",
    "encouragement", "transformation", "strengthening", "supermarket", "warmly",
    "supervision", "pronunciation", "revolution", "anticipation", "deepening",
    "disinterest", "reset", "unbearable", "unworkable", "devotion",
    "misunderstanding", "admirably", "harmful", "foundational", "cohabitation",
    "opposition", "discontent", "hope", "achievement", "adjustment",
]

# Flatten both lists into (word, language-code) pairs, French entries first.
all_words = [
    (entry, code)
    for code, vocabulary in (("fr", french_words), ("en", english_words))
    for entry in vocabulary
]


In [None]:
def analyze(word, lang, prefix_len=2, suffix_len=3):
    """Analyse one word with the spaCy pipeline matching its language.

    Only the first token produced by the pipeline is inspected, so a string
    that the tokenizer splits into several tokens is described by its first
    token alone.

    Args:
        word: Surface form to analyse.
        lang: ``"fr"`` selects ``nlp_fr``; any other value falls back to
            ``nlp_en``.
        prefix_len: Number of leading characters reported under ``"prefix"``
            (a plain string slice, not a linguistic segmentation).
        suffix_len: Number of trailing characters reported under ``"suffix"``
            (likewise a plain string slice).

    Returns:
        A dict with the keys ``word``, ``lang``, ``lemma``, ``pos``, ``morph``,
        ``prefix``, ``root`` and ``suffix``. ``root`` repeats the lemma and
        ``morph`` is the string form of the token's morphological analysis.
        When the pipeline yields no token at all (e.g. an empty string),
        ``lemma``, ``pos``, ``morph`` and ``root`` are empty strings.
    """
    nlp = nlp_fr if lang == "fr" else nlp_en
    doc = nlp(word)

    if len(doc) > 0:
        token = doc[0]
        lemma = token.lemma_
        pos = token.pos_
        # Keep the string form rather than the pipeline-bound analysis object
        # so each row holds only plain Python values.
        morph = str(token.morph)
    else:
        # No token to inspect: indexing doc[0] would raise IndexError.
        lemma = pos = morph = ""

    return {
        "word": word,
        "lang": lang,
        "lemma": lemma,
        "pos": pos,
        "morph": morph,
        "prefix": word[:prefix_len],
        "root": lemma,
        # word[-0:] would return the whole word, so a zero length is special-cased.
        "suffix": word[-suffix_len:] if suffix_len > 0 else "",
    }

# Run the analysis on every (word, language) pair and tabulate one row per word.
results = [analyze(*pair) for pair in all_words]
df = pd.DataFrame(data=results)
df.head()


In [None]:
# Flag each word whose lemma differs (case-insensitively) from its surface
# form, then report the share of flagged words per language as a percentage.
# NOTE(review): this counts every word the lemmatizer changed; without gold
# lemmas it is not a true error rate -- confirm the intended metric.
df["lemma_error"] = df["word"].str.lower().ne(df["lemma"].str.lower())
errors = df.groupby("lang")["lemma_error"].mean().mul(100)
print("Taux d'erreur de lemmatisation (%) :\n", errors)


In [None]:
# Optional step: persist the full table as CSV (row index omitted), then
# preview the first ten rows.
df.to_csv(path_or_buf="analyse_morpholexicale.csv", index=False)
df.head(n=10)
