<a href="https://colab.research.google.com/github/EdgarID/foteFatal.py/blob/toyVersion/automatic_pos_tagging/brill_tagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install nltk & conllu
! pip install nltk conllu

In [None]:
# imports
import nltk
from nltk.tag import brill, brill_trainer
from nltk.tag import RegexpTagger
from conllu import parse
import requests

In [None]:
# ---------------------------------------------------------------------
# 1. Load and parse UD French corpus
# ---------------------------------------------------------------------
url = "https://raw.githubusercontent.com/UniversalDependencies/UD_French-GSD/master/fr_gsd-ud-train.conllu"

# Fail loudly on network problems: without the status check an HTTP error
# page would be handed to the CoNLL-U parser as if it were the corpus, and
# without a timeout a stalled connection would hang the cell indefinitely.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Decode explicitly as UTF-8 instead of relying on the charset guessed from
# the HTTP headers, so accented French characters cannot be mangled.
response.encoding = "utf-8"
data = response.text
sentences = parse(data)

# Extract (word, pos) tuples: one list of (form, UPOS) pairs per sentence.
# NOTE(review): if the file contains multiword-token range lines (e.g. for
# contractions) or empty nodes, they are iterated here as well and probably
# carry a placeholder UPOS -- confirm whether they should be filtered out.
tagged_sents = [[(t["form"], t["upos"]) for t in s] for s in sentences]

# Split train/test: first 8000 sentences for training, the rest for testing.
train_sents = tagged_sents[:8000]
test_sents = tagged_sents[8000:]

In [None]:
from nltk.tag import UnigramTagger, RegexpTagger, DefaultTagger

# ---------------------------------------------------------------------
# 2. Create baseline tagger — Unigram with backoff
# ---------------------------------------------------------------------
# Backoff chain: UnigramTagger -> RegexpTagger -> DefaultTagger.
# Each tagger hands a token to its backoff only when it has no tag for it.

# Step 1: Default tagger (fallback for completely unknown words)
default_tagger = DefaultTagger('NOUN')

# Step 2: Regex tagger for simple morphological patterns.
# There is deliberately no catch-all r'.*' rule here: such a rule would
# match every token and make the default tagger above unreachable. Tokens
# matching none of the suffixes fall through to default_tagger, which
# assigns the same 'NOUN' tag.
regexp_tagger = RegexpTagger([
    (r'.*ment$', 'ADV'),    # typical adverb suffix
    (r'.*tion$', 'NOUN'),    # typical noun suffix
    (r'.*able$', 'ADJ'),     # typical adjective suffix
], backoff=default_tagger)

# Step 3: Unigram tagger trained on data, backed off by regex/default
unigram_tagger = UnigramTagger(train_sents, backoff=regexp_tagger)

# ---------------------------------------------------------------------
# Evaluate baseline accuracy
# ---------------------------------------------------------------------
# Recent NLTK releases deprecate evaluate() in favour of accuracy(); prefer
# the new name when it exists and fall back for older installations.
if hasattr(unigram_tagger, "accuracy"):
    accuracy = unigram_tagger.accuracy(test_sents)
else:
    accuracy = unigram_tagger.evaluate(test_sents)
print("Unigram baseline accuracy with backoff:", accuracy)

In [None]:
# ---------------------------------------------------------------------
# 3. Train Brill tagger on top of the Unigram baseline
# ---------------------------------------------------------------------
# The trainer learns up to 200 transformation rules that correct the tags
# produced by the baseline (unigram_tagger) on the training sentences.
# NOTE(review): the baseline was fitted on these same train_sents, so it
# presumably makes few errors here for the trainer to learn from -- consider
# training the Brill rules on sentences the baseline has not seen.
templates = brill.fntbl37()  # 37 default templates from NLTK
trainer = brill_trainer.BrillTaggerTrainer(unigram_tagger, templates)
brill_tagger = trainer.train(train_sents, max_rules=200)

print("✅ Brill tagger trained.")

# Recent NLTK releases deprecate evaluate() in favour of accuracy(); prefer
# the new name when it exists and fall back for older installations.
if hasattr(brill_tagger, "accuracy"):
    brill_accuracy = brill_tagger.accuracy(test_sents)
else:
    brill_accuracy = brill_tagger.evaluate(test_sents)
print("Accuracy:", brill_accuracy)

In [None]:
import pickle

# ---------------------------------------------------------------------
# 4. Save the model
# ---------------------------------------------------------------------
# Serialise the trained Brill tagger to disk so that it can be reloaded
# later without retraining.
with open("brill_french_tagger.pkl", "wb") as model_file:
    pickle.Pickler(model_file).dump(brill_tagger)

print("✅ Saved model to brill_french_tagger.pkl")

In [None]:
# Download the NLTK "punkt_tab" resource
# (presumably needed by the tokenization step that follows -- verify).
import nltk

nltk.download("punkt_tab")

In [None]:
import nltk
import pickle
import csv

# ---------------------------------------------------------------------
# 1. Load the trained Brill tagger
# ---------------------------------------------------------------------
# SECURITY: unpickling can execute arbitrary code. Only load a model file
# that was produced by this notebook or comes from a trusted source.
with open("brill_french_tagger.pkl", "rb") as f:
    brill_tagger = pickle.load(f)

print("✅ Brill tagger loaded.")

# ---------------------------------------------------------------------
# 2. Read the new French text file
# ---------------------------------------------------------------------
with open("./Apprenants polonophones du FLE_Licence 2.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Tokenize the text sentence by sentence. word_tokenize() performs the same
# sentence split internally, so with preserve_line=True on each sentence the
# flattened token list is the same as tokenizing the whole text at once.
text_sents = nltk.sent_tokenize(text, language="french")
token_sents = [
    nltk.word_tokenize(sent, language="french", preserve_line=True)
    for sent in text_sents
]
tokens = [token for sent in token_sents for token in sent]

# ---------------------------------------------------------------------
# 3. Apply the Brill tagger
# ---------------------------------------------------------------------
# Tag one sentence at a time: the tagger was trained on individual
# sentences, so tagging the whole document as a single sequence would let
# contextual rules look across sentence boundaries.
tagged_by_sent = brill_tagger.tag_sents(token_sents)
tagged = [pair for sent in tagged_by_sent for pair in sent]

# Print first 20 tagged tokens
for word, tag in tagged[:20]:
    print(f"{word:<20} {tag}")

# ---------------------------------------------------------------------
# 4. Save results to CSV
# ---------------------------------------------------------------------
# newline="" lets the csv module control line endings itself.
with open("brill_pos_output.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["TOKEN", "POS"])
    # Each element of `tagged` is already a (token, tag) pair, i.e. one row.
    writer.writerows(tagged)

print("✅ POS tagging results saved to brill_pos_output.csv")
