In [None]:
import os
import treetaggerwrapper

# D√©finir les chemins
tagger_dir = os.path.expanduser("~/treetagger")  # Dossier TreeTagger
param_file = os.path.expanduser("~/treetagger/english.par")  # Mod√®le anglais
input_file = os.path.expanduser("~/treetagger/data/that_adv.txt")  # Fichier d'entr√©e

# V√©rifier que les fichiers existent
assert os.path.exists(param_file), "Le fichier de param√®tres n'existe pas."
assert os.path.exists(input_file), "Le fichier d'entr√©e n'existe pas."

# Initialiser TreeTagger
tagger = treetaggerwrapper.TreeTagger(TAGDIR=tagger_dir, TAGPARFILE=param_file)

# Lire le fichier `that_adv.txt`
with open(input_file, "r", encoding="utf-8") as f:
    text = f.read().splitlines()

# Appliquer TreeTagger ligne par ligne
for line in text:
    if line.strip():  # V√©rifier que la ligne n'est pas vide
        tags = tagger.tag_text(line)
        print("\nTexte :", line)
        print("Annotation :")
        for tag in tags:
            print("\t".join(tag.split("\t")))  # Affiche proprement


Texte : I didn‚Äôt think it would be that hard.
Annotation :
I	PP	I
didn‚Äôt	VVD	didn‚Äôt
think	VV	think
it	PP	it
would	MD	would
be	VB	be
that	RB	that
hard	JJ	hard
.	SENT	.

Texte : It wasn‚Äôt that expensive after all.
Annotation :
It	PP	it
wasn‚Äôt	VVD	wasn‚Äôt
that	IN/that	that
expensive	JJ	expensive
after	IN	after
all	DT	all
.	SENT	.

Texte : She‚Äôs never been that interested in sports.
Annotation :
She‚Äôs	NP	She‚Äôs
never	RB	never
been	VBN	be
that	IN/that	that
interested	JJ	interested
in	IN	in
sports	NNS	sport
.	SENT	.

Texte : He didn‚Äôt run that fast.
Annotation :
He	PP	he
didn‚Äôt	VVD	didn‚Äôt
run	VV	run
that	IN/that	that
fast	RB	fast
.	SENT	.

Texte : I didn‚Äôt know it was that far.
Annotation :
I	PP	I
didn‚Äôt	VVD	didn‚Äôt
know	VV	know
it	PP	it
was	VBD	be
that	IN/that	that
far	RB	far
.	SENT	.

Texte : It doesn‚Äôt hurt that much.
Annotation :
It	PP	it
doesn‚Äôt	VVD	doesn‚Äôt
hurt	VV	hurt
that	IN/that	that
much	RB	much
.	SENT	.

Texte : I wasn‚Äôt that tired last night.
A

In [6]:
import os
import treetaggerwrapper

# D√©finir les chemins
tagger_dir = os.path.expanduser("~/treetagger")  # Dossier TreeTagger
param_file = os.path.expanduser("~/treetagger/english.par")  # Mod√®le anglais
input_file = os.path.expanduser("~/treetagger/data/that_adv.txt")  # Texte √† analyser
output_file = os.path.expanduser("~/treetagger/data/that-annotation.txt")  # Fichier o√π stocker les r√©sultats

# V√©rifier l'existence des fichiers
assert os.path.exists(param_file), "Le fichier de param√®tres n'existe pas."
assert os.path.exists(input_file), "Le fichier d'entr√©e n'existe pas."

# Initialiser TreeTagger
tagger = treetaggerwrapper.TreeTagger(TAGDIR=tagger_dir, TAGPARFILE=param_file)

# Lire le fichier `that_adv.txt`
with open(input_file, "r", encoding="utf-8") as f:
    text = f.read().splitlines()

# Ouvrir le fichier de sortie
with open(output_file, "w", encoding="utf-8") as out_f:
    total_that = 0
    correct_that = 0
    expected_tag = "RB"  # Modifier si n√©cessaire (adverbe)

    # Appliquer TreeTagger et enregistrer les annotations
    for line in text:
        words = line.strip().split()
        if words:
            tags = tagger.tag_text(line)
            for tagged_word in tags:
                parts = tagged_word.split("\t")
                if len(parts) == 3:
                    word, predicted_tag, lemma = parts
                    # Sauvegarder dans le fichier
                    out_f.write(f"{word}\t{predicted_tag}\t{lemma}\n")
                    
                    # V√©rifier si "that" est bien annot√©
                    if word.lower() == "that":
                        total_that += 1
                        if predicted_tag == expected_tag:
                            correct_that += 1

# Calculer la pr√©cision sur "that"
accuracy = correct_that / total_that if total_that > 0 else 0
print(f"Pr√©cision de TreeTagger sur 'that' : {accuracy:.2%}")
print(f"Total de 'that' trouv√©s : {total_that}, Correctement annot√©s : {correct_that}")

# V√©rifier le fichier enregistr√©
print(f"\nLes annotations sont sauvegard√©es dans : {output_file}")

Pr√©cision de TreeTagger sur 'that' : 5.00%
Total de 'that' trouv√©s : 100, Correctement annot√©s : 5

Les annotations sont sauvegard√©es dans : /Users/diamouserignetoubandiaye/treetagger/data/that-annotation.txt


In [38]:
import subprocess
import os

# D√©finir les chemins
tagger_bin = os.path.expanduser("~/treetagger/bin/train-tree-tagger")  # Adapter selon l'OS
lexicon_file = "lexicon_brown.txt"
open_class_file = "tags_brown.txt"
train_file = "corpus_brown_that_custom.txt"
output_model = "english_brown_model.par"

# V√©rifier que les fichiers existent
for file in [tagger_bin, lexicon_file, open_class_file, train_file]:
    if not os.path.exists(file):
        print(f"Erreur : Fichier introuvable -> {file}")
        exit(1)

# Commande correcte pour entra√Æner TreeTagger
train_command = [
    tagger_bin, "-utf8",
    "-st", ".",
    lexicon_file, open_class_file, train_file, output_model
]

# Ex√©cuter l'entra√Ænement
process = subprocess.run(train_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Afficher la sortie
print(process.stdout.decode())
print(process.stderr.decode())

# V√©rifier si le fichier mod√®le a √©t√© g√©n√©r√©
if os.path.exists(output_model):
    print(f"‚úÖ Mod√®le entra√Æn√© avec succ√®s : {output_model}")
else:
    print(f"‚ùå Erreur : le mod√®le n'a pas √©t√© g√©n√©r√©.")



train-tree-tagger -cl 2 -dtg 1.00 -sw 1.00 -ecw 1.00 -stg 1.00 -ptg -1.00 lexicon_brown.txt tags_brown.txt corpus_brown_that_custom.txt english_brown_model.par

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...
247000	making affix tree ...
prefix lexicon: 15945 nodes
suffix lexicon: 3120 nodes
	reading classes ...
	making ngram table ...
279449	29083
finished.
	making decision tree ...
58	saving parameters ...

Number of nodes: 59
Max. path length: 24

done.

‚úÖ Mod√®le entra√Æn√© avec succ√®s : english_brown_model.par


In [10]:
import subprocess

# D√©finir les chemins
model_file = os.path.expanduser("~/treetagger/data/that_model.par")
test_file = os.path.expanduser("~/treetagger/data/that_adv.txt")  # Fichier √† tester
output_file = os.path.expanduser("~/treetagger/data/that_test_output.txt")

# Commande pour ex√©cuter TreeTagger avec le nouveau mod√®le
tagger_command = [
    "~/treetagger/bin/tree-tagger",
    model_file
]

# Lire le fichier de test et l'envoyer √† TreeTagger
with open(test_file, "r", encoding="utf-8") as f:
    test_text = f.read()

process = subprocess.run(tagger_command, input=test_text.encode(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

# Sauvegarder la sortie dans un fichier
with open(output_file, "w", encoding="utf-8") as f:
    f.write(process.stdout.decode())

# Afficher la sortie
print("R√©sultats de l'annotation :")
print(process.stdout.decode())

R√©sultats de l'annotation :



In [11]:
import subprocess

# D√©finir le chemin du mod√®le entra√Æn√©
model_file = os.path.expanduser("~/treetagger/data/that_model.par")
test_file = os.path.expanduser("~/treetagger/data/that_adv.txt")  # Fichier √† tester

# Commande pour ex√©cuter TreeTagger avec le nouveau mod√®le
tagger_command = [
    "~/treetagger/bin/tree-tagger",
    model_file
]

# Lire le fichier de test
with open(test_file, "r", encoding="utf-8") as f:
    test_text = f.read()

# Ex√©cuter TreeTagger et capturer la sortie
process = subprocess.run(tagger_command, input=test_text.encode(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

# Afficher les erreurs s'il y en a
if process.stderr:
    print("‚ö† Erreur TreeTagger :")
    print(process.stderr.decode())

# Afficher les r√©sultats
print("R√©sultats de l'annotation :")
print(process.stdout.decode())

‚ö† Erreur TreeTagger :

USAGE:  tree-tagger {-options-} <parameter file> {<input file> {<output file>}}

OPTIONS:
	-token:	Print the token
	-lemma:	Print the lemma
	-sgml:	Don't tag SGML annotations
	-threshold <p>: Print all tags of a word with a probability higher than <p> times the largest probability
	-prob: Print tag probabilities
	-ignore-prefix: Ignore prefix when guessing pos for unknown words.
	-no-unknown: Print the token rather than <unknown> for unknown lemmas
	-hyphen-heuristics:	Turn on the heuristics fur guessing the parts of speech of unknown hyphenated words
	-quiet:	Don't print status messages
	-pt-with-lemma:	pretagging with lemmata
	-pt-with-prob:	pretagging with probabilities
	-files <f>:	Read names of input and output files pairwise from <f>
	-lex <f>:	Read auxiliary lexicon entries from file <f>
	-wc <f>:	Read a word-class automaton from file <f>
	-eos-tag <tag>:	The SGML tag <tag> signals the end of a sentence.
			This option implies the option -sgml

Some more

In [12]:
import subprocess
import os

# D√©finir les chemins des fichiers
model_file = os.path.expanduser("~/treetagger/data/that_model.par")
test_file = os.path.expanduser("~/treetagger/data/that_adv.txt")  # Fichier de test
output_file = os.path.expanduser("~/treetagger/data/that_test_output.txt")  # R√©sultat

# V√©rifier si le fichier mod√®le et le fichier test existent
assert os.path.exists(model_file), "‚ö† Le mod√®le that_model.par n'existe pas !"
assert os.path.exists(test_file), "‚ö† Le fichier de test that_adv.txt n'existe pas !"

# Commande correcte pour ex√©cuter TreeTagger
tagger_command = [
    "~/treetagger/bin/tree-tagger",
    model_file,
    test_file
]

# Ex√©cuter TreeTagger et r√©cup√©rer la sortie
process = subprocess.run(" ".join(tagger_command), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Sauvegarder la sortie dans un fichier
with open(output_file, "w", encoding="utf-8") as f:
    f.write(process.stdout.decode())

# V√©rifier s'il y a une erreur
if process.stderr:
    print("‚ö† Erreur TreeTagger :")
    print(process.stderr.decode())
else:
    print("‚úÖ Annotation termin√©e ! R√©sultats enregistr√©s dans that_test_output.txt")

# Afficher les r√©sultats directement
print("\nR√©sultats de l'annotation :")
print(process.stdout.decode())

‚ö† Erreur TreeTagger :
	reading parameters ...
	tagging ...
	 finished.


R√©sultats de l'annotation :
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT
SENT



In [13]:
import subprocess
import os

# D√©finir le chemin du mod√®le
model_file = os.path.expanduser("~/treetagger/data/that_model.par")

# Tester une phrase directement en entr√©e
test_sentence = "I didn‚Äôt think it would be that hard.\n"

tagger_command = [
    "~/treetagger/bin/tree-tagger",
    model_file
]

process = subprocess.run(" ".join(tagger_command), input=test_sentence.encode(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

print("üìå R√©sultat de l'annotation :")
print(process.stdout.decode())

if process.stderr:
    print("‚ö† Erreur TreeTagger :")
    print(process.stderr.decode())

üìå R√©sultat de l'annotation :
SENT

‚ö† Erreur TreeTagger :
	reading parameters ...
	tagging ...
	 finished.



In [14]:
import os

model_file = os.path.expanduser("~/treetagger/data/that_model.par")

# V√©rifier si le mod√®le existe et sa taille
if os.path.exists(model_file) and os.path.getsize(model_file) > 1000:  # Minimum 1 Ko
    print(f"‚úÖ Le mod√®le existe et fait {os.path.getsize(model_file)} octets.")
else:
    print("‚ö† Le mod√®le `that_model.par` est absent ou trop petit, l'entra√Ænement a peut-√™tre √©chou√©.")

‚ö† Le mod√®le `that_model.par` est absent ou trop petit, l'entra√Ænement a peut-√™tre √©chou√©.


In [15]:
import subprocess

model_file = os.path.expanduser("~/treetagger/data/that_model.par")
test_file = os.path.expanduser("~/treetagger/data/that_adv.txt")

tagger_command = [
    "~/treetagger/bin/tree-tagger",
    "-token",  # Forcer l'affichage des tokens
    model_file,
    test_file
]

process = subprocess.run(" ".join(tagger_command), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

print("üìå R√©sultat de l'annotation (avec -token) :")
print(process.stdout.decode())

if process.stderr:
    print("‚ö† Erreur TreeTagger :")
    print(process.stderr.decode())

üìå R√©sultat de l'annotation (avec -token) :
I didn‚Äôt think it would be that hard.	SENT
It wasn‚Äôt that expensive after all.	SENT
She‚Äôs never been that interested in sports.	SENT
He didn‚Äôt run that fast.	SENT
I didn‚Äôt know it was that far.	SENT
It doesn‚Äôt hurt that much.	SENT
I wasn‚Äôt that tired last night.	SENT
The movie wasn‚Äôt that good.	SENT
She didn‚Äôt seem that upset about it.	SENT
It wasn‚Äôt that big of a deal.	SENT
The weather wasn‚Äôt that bad.	SENT
I didn‚Äôt realize the book was that old.	SENT
He‚Äôs not that tall compared to his brother.	SENT
I don‚Äôt think it‚Äôs that important.	SENT
The cake didn‚Äôt taste that sweet.	SENT
She wasn‚Äôt that impressed with the presentation.	SENT
The car wasn‚Äôt that expensive to fix.	SENT
He didn‚Äôt speak that loudly during the meeting.	SENT
I wasn‚Äôt that worried about the exam.	SENT
It wasn‚Äôt that late when I left.	SENT
The hotel wasn‚Äôt that luxurious, but it was nice.	SENT
He didn‚Äôt look that tired after the 

In [7]:
import os
def generate_lexicon(training_corpus_path, lexicon_output_path):
    """
    G√©n√®re un fichier lexique √† partir d'un corpus d‚Äôentra√Ænement annot√©.
    
    Le corpus doit contenir des lignes au format :
      token<TAB>tag<TAB>lemme
    Les lignes qui ne contiennent pas exactement 3 colonnes (par exemple, les en-t√™tes)
    seront ignor√©es.
    
    Pour chaque token, on s√©lectionne l'analyse (tag et lemme) la plus fr√©quente.
    
    :param training_corpus_path: Chemin vers le corpus d‚Äôentra√Ænement
    :param lexicon_output_path: Chemin de sortie pour le fichier lexique g√©n√©r√©
    """
    lexicon = {}  # Dictionnaire : token -> {(tag, lemme): fr√©quence}

    with open(training_corpus_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # Ignore les lignes vides
            # Ignore les lignes qui ne contiennent pas de tabulations (en-t√™tes, etc.)
            if "\t" not in line:
                continue
            parts = line.split("\t")
            if len(parts) != 3:
                # Affiche un avertissement et passe √† la ligne suivante
                print(f"Avertissement : ligne ignor√©e (format inattendu) : {line}")
                continue
            token, tag, lemma = parts[0], parts[1], parts[2]
            if token not in lexicon:
                lexicon[token] = {}
            key = (tag, lemma)
            lexicon[token][key] = lexicon[token].get(key, 0) + 1

    # Pour chaque token, on s√©lectionne l'analyse (tag, lemme) la plus fr√©quente
    with open(lexicon_output_path, "w", encoding="utf-8") as out:
        for token, analyses in lexicon.items():
            best_analysis = max(analyses.items(), key=lambda item: item[1])[0]
            best_tag, best_lemma = best_analysis
            out.write(f"{token}\t{best_tag}\t{best_lemma}\n")

    print(f"Fichier lexique g√©n√©r√© : {lexicon_output_path}")

# Exemple d'utilisation :
if __name__ == "__main__":
    # Remplacez ces chemins par ceux de vos fichiers
    training_corpus = os.path.expanduser("~/treetagger/data/that-annotation.txt")
    lexicon_file = os.path.expanduser("~/treetagger/data/lexicon2.txt")
    generate_lexicon(training_corpus, lexicon_file)

Fichier lexique g√©n√©r√© : /Users/diamouserignetoubandiaye/treetagger/data/lexicon2.txt


In [8]:
def generate_tagset(corpus_file_path, tagset_output_path):
    """
    G√©n√®re un fichier listant tous les tags uniques pr√©sents dans le corpus d'entra√Ænement.
    
    On suppose que chaque ligne du corpus est au format :
      token<TAB>tag<TAB>lemme
    Les lignes non conformes (par exemple, les en-t√™tes) sont ignor√©es.
    
    :param corpus_file_path: Chemin vers le corpus d'entra√Ænement annot√©
    :param tagset_output_path: Chemin o√π √©crire le fichier du tagset (liste des tags)
    """
    tags = set()
    
    with open(corpus_file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # ignorer les lignes vides
            # V√©rifier que la ligne contient bien une tabulation
            if "\t" not in line:
                continue
            parts = line.split("\t")
            if len(parts) != 3:
                # Ligne au format inattendu, on l'ignore
                print(f"Ligne ignor√©e (format inattendu) : {line}")
                continue
            # La deuxi√®me colonne correspond au tag
            tag = parts[1]
            tags.add(tag)
    
    # √âcriture du fichier de tagset (un tag par ligne)
    with open(tagset_output_path, "w", encoding="utf-8") as out:
        for tag in sorted(tags):
            out.write(tag + "\n")
    
    print(f"Fichier tagset g√©n√©r√© : {tagset_output_path}")


# Exemple d'utilisation
if __name__ == "__main__":
    corpus_path = os.path.expanduser("~/treetagger/data/that-annotation.txt")  # Remplacez par le chemin de votre corpus annot√©
    tagset_path = os.path.expanduser("~/treetagger/data/tagset.txt")          # Chemin de sortie souhait√© pour la liste des tags
    generate_tagset(corpus_path, tagset_path)

Fichier tagset g√©n√©r√© : /Users/diamouserignetoubandiaye/treetagger/data/tagset.txt


In [None]:

# D√©finir les chemins
tagger_dir = os.path.expanduser("~/treetagger")  # Dossier TreeTagger
param_file = os.path.expanduser("~/treetagger/english.par")  # Mod√®le anglais
input_file = os.path.expanduser("~/treetagger/data/that_adv.txt")  # Fichier d'entr√©e

In [28]:
import os
import subprocess

def train_treetagger(corpus_path, lexicon_path, open_class_path, output_model_path):
    """
    Entra√Æne un mod√®le TreeTagger √† partir d'un corpus annot√©.
    
    Usage (d'apr√®s l'aide de train-tree-tagger) :
      train-tree-tagger [options] <lexicon> <open class file> <input file> <output file>
    
    Param√®tres :
      - corpus_path : chemin vers le corpus d'entra√Ænement annot√© (format : token<TAB>tag<TAB>lemme par ligne)
      - lexicon_path : chemin vers le fichier lexique g√©n√©r√© √† partir du corpus
      - open_class_path : chemin vers le fichier listant les mots open class (ou fichier vide)
      - output_model_path : chemin o√π sera sauvegard√© le mod√®le entra√Æn√©
    """
    # Construction de la commande avec les arguments dans l'ordre attendu.
    cmd = [
        os.path.expanduser("~/treetagger/bin/train-tree-tagger"),
        lexicon_path,
        open_class_path,
        corpus_path,
        output_model_path
    ]
    
    print("Commande d'entra√Ænement ex√©cut√©e :")
    print(" ".join(cmd))
    
    # Ex√©cution de la commande
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    
    # V√©rification du retour
    if result.returncode != 0:
        print("Erreur lors de l'entra√Ænement :")
        print(result.stderr)
    else:
        print("Entra√Ænement termin√© avec succ√®s.")
        print(result.stdout)

if __name__ == "__main__":
    # D√©finissez les chemins adapt√©s √† votre configuration
    corpus_path      = os.path.expanduser("~/treetagger/data/that_train.txt")      # Votre corpus annot√©
    lexicon_path     = os.path.expanduser("~/treetagger/data/lexicon2.txt")        # Fichier lexique g√©n√©r√©
    open_class_path  = os.path.expanduser("~/treetagger/data/tagset.txt")      # Fichier open class
    
    # Si vous n'avez pas encore de fichier open class, vous pouvez en cr√©er un vide
    if not os.path.exists(open_class_path):
        with open(open_class_path, "w", encoding="utf-8") as f:
            # Vous pouvez √©galement y ajouter manuellement la liste des mots open class si n√©cessaire.
            f.write("")  # Fichier vide pour l'instant
    
    output_model_path = os.path.expanduser("~/treetagger/myEnglish.par")   # Chemin de sortie pour le mod√®le entra√Æn√©
    
    train_treetagger(corpus_path, lexicon_path, open_class_path, output_model_path)

Commande d'entra√Ænement ex√©cut√©e :
/Users/diamouserignetoubandiaye/treetagger/bin/train-tree-tagger /Users/diamouserignetoubandiaye/treetagger/data/lexicon2.txt /Users/diamouserignetoubandiaye/treetagger/data/tagset.txt /Users/diamouserignetoubandiaye/treetagger/data/that_train.txt /Users/diamouserignetoubandiaye/treetagger/myEnglish.par
Erreur lors de l'entra√Ænement :

train-tree-tagger -cl 2 -dtg 1.00 -sw 1.00 -ecw 1.00 -stg 1.00 -ptg -1.00 /Users/diamouserignetoubandiaye/treetagger/data/lexicon2.txt /Users/diamouserignetoubandiaye/treetagger/data/tagset.txt /Users/diamouserignetoubandiaye/treetagger/data/that_train.txt /Users/diamouserignetoubandiaye/treetagger/myEnglish.par

	reading the lexicon ...
		reading the tagset ...
		reading the lemmas ...
		reading the entries ...
		sorting the lexicon ...
		reading the open class tags ...
	calculating tag frequencies ...

ERROR: unknown tag <NNP> !




In [26]:
import os
import treetaggerwrapper

# D√©finir les chemins
tagger_dir = os.path.expanduser("~/treetagger")  # Dossier TreeTagger
param_file = os.path.expanduser("~/treetagger/myenglish.par")  # Mod√®le anglais
input_file = os.path.expanduser("~/treetagger/data/that_adv.txt")  # Fichier d'entr√©e

# V√©rifier que les fichiers existent
assert os.path.exists(param_file), "Le fichier de param√®tres n'existe pas."
assert os.path.exists(input_file), "Le fichier d'entr√©e n'existe pas."

# Initialiser TreeTagger
tagger = treetaggerwrapper.TreeTagger(TAGDIR=tagger_dir, TAGPARFILE=param_file)

# Lire le fichier `that_adv.txt`
with open(input_file, "r", encoding="utf-8") as f:
    text = f.read().splitlines()

# Appliquer TreeTagger ligne par ligne
for line in text:
    if line.strip():  # V√©rifier que la ligne n'est pas vide
        tags = tagger.tag_text(line)
        print("\nTexte :", line)
        print("Annotation :")
        for tag in tags:
            print("\t".join(tag.split("\t")))  # Affiche proprement


Texte : I didn‚Äôt think it would be that hard.
Annotation :
I	PP	I
didn‚Äôt	VVD	didn‚Äôt
think	VV	think
it	PP	it
would	MD	would
be	VB	be
that	IN/that	that
hard	JJ	hard
.	SENT	.

Texte : It wasn‚Äôt that expensive after all.
Annotation :
It	PP	it
wasn‚Äôt	NN	wasn‚Äôt
that	IN/that	that
expensive	JJ	expensive
after	IN	after
all	DT	all
.	SENT	.

Texte : She‚Äôs never been that interested in sports.
Annotation :
She‚Äôs	NP	She‚Äôs
never	RB	never
been	VBN	be
that	IN/that	that
interested	JJ	interested
in	IN	in
sports	NNS	sport
.	SENT	.

Texte : He didn‚Äôt run that fast.
Annotation :
He	PP	he
didn‚Äôt	VVD	didn‚Äôt
run	VV	run
that	IN/that	that
fast	RB	fast
.	SENT	.

Texte : I didn‚Äôt know it was that far.
Annotation :
I	PP	I
didn‚Äôt	VVD	didn‚Äôt
know	VV	know
it	PP	it
was	VBD	be
that	IN/that	that
far	RB	far
.	SENT	.

Texte : It doesn‚Äôt hurt that much.
Annotation :
It	PP	it
doesn‚Äôt	VVD	doesn‚Äôt
hurt	VV	hurt
that	IN/that	that
much	RB	much
.	SENT	.

Texte : I wasn‚Äôt that tired last nigh

In [None]:
import os
import treetaggerwrapper

# D√©finir les chemins
tagger_dir = os.path.expanduser("~/treetagger")  # Dossier TreeTagger
param_file = os.path.expanduser("~/treetagger/english.par")  # Mod√®le anglais
input_file = os.path.expanduser("~/treetagger/data/that_adv.txt")  # Texte √† analyser
output_file = os.path.expanduser("~/treetagger/data/that-annotation.txt")  # Fichier o√π stocker les r√©sultats

# V√©rifier l'existence des fichiers
assert os.path.exists(param_file), "Le fichier de param√®tres n'existe pas."
assert os.path.exists(input_file), "Le fichier d'entr√©e n'existe pas."

# Initialiser TreeTagger
tagger = treetaggerwrapper.TreeTagger(TAGDIR=tagger_dir, TAGPARFILE=param_file)

# Lire le fichier `that_adv.txt`
with open(input_file, "r", encoding="utf-8") as f:
    text = f.read().splitlines()

# Ouvrir le fichier de sortie
with open(output_file, "w", encoding="utf-8") as out_f:
    total_that = 0
    correct_that = 0
    expected_tag = "RB"  # Modifier si n√©cessaire (adverbe)

    # Appliquer TreeTagger et enregistrer les annotations
    for line in text:
        words = line.strip().split()
        if words:
            tags = tagger.tag_text(line)
            for tagged_word in tags:
                parts = tagged_word.split("\t")
                if len(parts) == 3:
                    word, predicted_tag, lemma = parts
                    # Sauvegarder dans le fichier
                    out_f.write(f"{word}\t{predicted_tag}\t{lemma}\n")
                    
                    # V√©rifier si "that" est bien annot√©
                    if word.lower() == "that":
                        total_that += 1
                        if predicted_tag == expected_tag:
                            correct_that += 1

# Calculer la pr√©cision sur "that"
accuracy = correct_that / total_that if total_that > 0 else 0
print(f"Pr√©cision de TreeTagger sur 'that' : {accuracy:.2%}")
print(f"Total de 'that' trouv√©s : {total_that}, Correctement annot√©s : {correct_that}")

# V√©rifier le fichier enregistr√©
print(f"\nLes annotations sont sauvegard√©es dans : {output_file}")

In [27]:
import os
import treetaggerwrapper

# D√©finir les chemins
tagger_dir = os.path.expanduser("~/treetagger")  # Dossier TreeTagger
param_file = os.path.expanduser("~/treetagger/myenglish.par")  # Mod√®le anglais
input_file = os.path.expanduser("~/treetagger/data/that_adv.txt")  # Texte √† analyser
output_file = os.path.expanduser("~/treetagger/data/mythat-annotation.txt")  # Fichier o√π stocker les r√©sultats

# V√©rifier l'existence des fichiers
assert os.path.exists(param_file), "Le fichier de param√®tres n'existe pas."
assert os.path.exists(input_file), "Le fichier d'entr√©e n'existe pas."

# Initialiser TreeTagger
tagger = treetaggerwrapper.TreeTagger(TAGDIR=tagger_dir, TAGPARFILE=param_file)

# Lire le fichier `that_adv.txt`
with open(input_file, "r", encoding="utf-8") as f:
    text = f.read().splitlines()

# Ouvrir le fichier de sortie
with open(output_file, "w", encoding="utf-8") as out_f:
    total_that = 0
    correct_that = 0
    expected_tag = "RB"  # Modifier si n√©cessaire (adverbe)

    # Appliquer TreeTagger et enregistrer les annotations
    for line in text:
        words = line.strip().split()
        if words:
            tags = tagger.tag_text(line)
            for tagged_word in tags:
                parts = tagged_word.split("\t")
                if len(parts) == 3:
                    word, predicted_tag, lemma = parts
                    # Sauvegarder dans le fichier
                    out_f.write(f"{word}\t{predicted_tag}\t{lemma}\n")
                    
                    # V√©rifier si "that" est bien annot√©
                    if word.lower() == "that":
                        total_that += 1
                        if predicted_tag == expected_tag:
                            correct_that += 1

# Calculer la pr√©cision sur "that"
accuracy = correct_that / total_that if total_that > 0 else 0
print(f"Pr√©cision de TreeTagger sur 'that' : {accuracy:.2%}")
print(f"Total de 'that' trouv√©s : {total_that}, Correctement annot√©s : {correct_that}")

# V√©rifier le fichier enregistr√©
print(f"\nLes annotations sont sauvegard√©es dans : {output_file}")

Pr√©cision de TreeTagger sur 'that' : 0.00%
Total de 'that' trouv√©s : 100, Correctement annot√©s : 0

Les annotations sont sauvegard√©es dans : /Users/diamouserignetoubandiaye/treetagger/data/mythat-annotation.txt
