In [1]:
import os
import treetaggerwrapper

# Locations of the TreeTagger install, the English parameter file and the input text.
tagger_dir = os.path.expanduser("~/treetagger")  # TreeTagger directory
param_file = os.path.expanduser("~/treetagger/english.par")  # English model
input_file = os.path.expanduser("~/treetagger/data/that_adv.txt")  # Input file

# Fail early, with a readable message, if a required file is missing.
assert os.path.exists(param_file), "Le fichier de paramètres n'existe pas."
assert os.path.exists(input_file), "Le fichier d'entrée n'existe pas."

# Start the TreeTagger wrapper on the English model.
tagger = treetaggerwrapper.TreeTagger(TAGDIR=tagger_dir, TAGPARFILE=param_file)

# Load `that_adv.txt` as a list of lines (line terminators removed).
with open(input_file, "r", encoding="utf-8") as f:
    text = f.read().splitlines()

# NOTE(review): sentences containing a typographic apostrophe (’) appear to come
# back with the contraction as a single unsplit token (e.g. "didn’t"); consider
# normalizing ’ to ' before tagging — confirm against the tokenizer's behavior.

# Tag each non-empty line on its own and echo the annotation under the sentence.
for line in text:
    if not line.strip():
        continue  # nothing to tag on a blank line
    tags = tagger.tag_text(line)
    print("\nTexte :", line)
    print("Annotation :")
    for tag in tags:
        # Each item is already a tab-separated string, so it is printed as is
        # (splitting on "\t" and re-joining with "\t" was a no-op).
        print(tag)

  punct2find_re = re.compile("([^ ])([[" + ALONEMARKS + "])",
  DnsHostMatch_re = re.compile("(" + DnsHost_expression + ")",
  UrlMatch_re = re.compile(UrlMatch_expression, re.VERBOSE | re.IGNORECASE)
  EmailMatch_re = re.compile(EmailMatch_expression, re.VERBOSE | re.IGNORECASE)



Texte : I didn’t think it would be that hard.
Annotation :
I	PP	I
didn’t	VVD	didn’t
think	VV	think
it	PP	it
would	MD	would
be	VB	be
that	RB	that
hard	JJ	hard
.	SENT	.

Texte : It wasn’t that expensive after all.
Annotation :
It	PP	it
wasn’t	VVD	wasn’t
that	IN/that	that
expensive	JJ	expensive
after	IN	after
all	DT	all
.	SENT	.

Texte : She’s never been that interested in sports.
Annotation :
She’s	NP	She’s
never	RB	never
been	VBN	be
that	IN/that	that
interested	JJ	interested
in	IN	in
sports	NNS	sport
.	SENT	.

Texte : He didn’t run that fast.
Annotation :
He	PP	he
didn’t	VVD	didn’t
run	VV	run
that	IN/that	that
fast	RB	fast
.	SENT	.

Texte : I didn’t know it was that far.
Annotation :
I	PP	I
didn’t	VVD	didn’t
know	VV	know
it	PP	it
was	VBD	be
that	IN/that	that
far	RB	far
.	SENT	.

Texte : It doesn’t hurt that much.
Annotation :
It	PP	it
doesn’t	VVD	doesn’t
hurt	VV	hurt
that	IN/that	that
much	RB	much
.	SENT	.

Texte : I wasn’t that tired last night.
Annotation :
I	PP	I
wasn’t	VVD	wasn’t
t

In [3]:
import os
import treetaggerwrapper

# Paths: TreeTagger install, English model, text to analyse, annotation dump.
tagger_dir = os.path.expanduser("~/treetagger")  # TreeTagger directory
param_file = os.path.expanduser("~/treetagger/english.par")  # English model
input_file = os.path.expanduser("~/treetagger/data/that_adv.txt")  # Text to analyse
output_file = os.path.expanduser("~/treetagger/data/that-annotation.txt")  # Where the results are stored

# Both input files must be present before the tagger is started.
assert os.path.exists(param_file), "Le fichier de paramètres n'existe pas."
assert os.path.exists(input_file), "Le fichier d'entrée n'existe pas."

# Start the TreeTagger wrapper.
tagger = treetaggerwrapper.TreeTagger(TAGDIR=tagger_dir, TAGPARFILE=param_file)

# Read `that_adv.txt` into a list of lines.
with open(input_file, "r", encoding="utf-8") as f:
    text = f.read().splitlines()

# Tag every line, dump each well-formed token to the output file, and count
# how often the token "that" receives the expected tag.
with open(output_file, "w", encoding="utf-8") as out_f:
    total_that = 0
    correct_that = 0
    expected_tag = "RB"  # Adverb; change if a different tag is expected

    for line in text:
        words = line.strip().split()
        if not words:
            continue  # blank line: nothing to tag

        tags = tagger.tag_text(line)
        for tagged_word in tags:
            parts = tagged_word.split("\t")
            if len(parts) != 3:
                continue  # only word/tag/lemma triples are kept

            word, predicted_tag, lemma = parts
            # One token per line in the output file, tab-separated.
            out_f.write("\t".join((word, predicted_tag, lemma)) + "\n")

            # Only occurrences of "that" (any casing) take part in the score.
            if word.lower() != "that":
                continue
            total_that += 1
            if predicted_tag == expected_tag:
                correct_that += 1

# Share of "that" tokens tagged as expected; 0 when none was found.
if total_that > 0:
    accuracy = correct_that / total_that
else:
    accuracy = 0
print(f"Précision de TreeTagger sur 'that' : {accuracy:.2%}")
print(f"Total de 'that' trouvés : {total_that}, Correctement annotés : {correct_that}")

# Tell the reader where the annotations were written.
print(f"\nLes annotations sont sauvegardées dans : {output_file}")

Précision de TreeTagger sur 'that' : 5.00%
Total de 'that' trouvés : 100, Correctement annotés : 5

Les annotations sont sauvegardées dans : /Users/diamouserignetoubandiaye/treetagger/data/that-annotation.txt


In [22]:
import os
import subprocess

# Resolve every path here instead of relying on the shell to expand "~":
# the command below is run without a shell, and the expanded model path is
# what gets reported at the end of the cell.
train_binary = os.path.expanduser("~/treetagger/bin/train-tree-tagger")
lexicon_file = os.path.expanduser("~/treetagger/data/that_lexicon_fixed.txt")
open_class_file = os.path.expanduser("~/treetagger/data/open_class.txt")
train_file = os.path.expanduser("~/treetagger/data/that_train_fixed.txt")
output_model = os.path.expanduser("~/treetagger/data/that_model.par")

# Fail early if the trainer or one of its inputs is missing.
for required_path in (train_binary, lexicon_file, open_class_file, train_file):
    assert os.path.exists(required_path), f"Fichier introuvable : {required_path}"

# Argument order used by the training command:
# <lexicon> <open class file> <training corpus> <output parameter file>
train_command = [
    train_binary,
    lexicon_file,
    open_class_file,
    train_file,
    output_model,
]

# Run the training. Passing a list (no shell=True) keeps each path a single
# argument even when it contains spaces or shell metacharacters.
process = subprocess.run(train_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Show what the trainer reported on both streams before any error is raised,
# so the log is visible when the run fails.
print(process.stdout.decode(errors="replace"))
print(process.stderr.decode(errors="replace"))

# Raise subprocess.CalledProcessError when the trainer exits with a non-zero status.
process.check_returncode()

# Only announce the model once it is confirmed to be on disk.
if not os.path.isfile(output_model):
    raise FileNotFoundError(f"Le modèle n'a pas été généré : {output_model}")
print(f"Modèle entraîné disponible ici : {output_model}")



train-tree-tagger -cl 2 -dtg 1.00 -sw 1.00 -ecw 1.00 -stg 1.00 -ptg -1.00 /Users/diamouserignetoubandiaye/treetagger/data/that_lexicon_fixed.txt /Users/diamouserignetoubandiaye/treetagger/data/open_class.txt /Users/diamouserignetoubandiaye/treetagger/data/that_train_fixed.txt /Users/diamouserignetoubandiaye/treetagger/data/that_model.par

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
49000	making affix tree ...
prefix lexicon: 7894 nodes
suffix lexicon: 1162 nodes
	reading classes ...
	making ngram table ...
51271	6234
finished.
	making decision tree ...
36	saving parameters ...

Number of nodes: 37
Max. path length: 15

done.

Modèle entraîné disponible ici : ~/treetagger/data/that_model.par
