In [2]:
%cd source
%load_ext autoreload
%autoreload 2
import pandas as pd
import matplotlib.pyplot as plt
import time
import spacy
import os
import pipeline
from spacy.language import Language
import re

/home/labicquette/M1/TER/source
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
base_model = "en_core_web_sm"
nlp = spacy.load(base_model)

if "sentencizer" not in nlp.pipe_names:
    sentencizer = nlp.add_pipe("sentencizer")
    
# Préparation des données d'entraînement
def process_text_files(directory):
    data = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), "r") as file:
                text = file.read()
                # Supprimer les sauts de ligne supplémentaires
                text = text.strip()
                # Séparer le texte en phrases
                sentences = text.split("\n")
                start_end_pos = []
                start_pos = 0
                for sentence in sentences:
                    end_pos = start_pos + len(sentence)
                    start_end_pos.append((start_pos, end_pos))
                    # Ajouter la longueur d'un saut de ligne
                    start_pos = end_pos + 1
                data.append((text, {"words": sentences}))
    return data

# Définir le répertoire contenant vos fichiers .txt
train_directory = "../documents/train/"

# Charger les données d'entraînement
train_data = process_text_files(train_directory)

nlp.disable_pipes("tagger", "parser")  # Désactiver le Tagger et le Parser par exemple

# Entraînement du modèle
# Utiliser vos données d'entraînement pour affiner le modèle
for text, annotations in train_data:
    # Créer un objet Example à partir du texte et des annotations
    example = spacy.training.Example.from_dict(nlp.make_doc(text), annotations)
    # Mettre à jour le modèle avec l'exemple
    nlp.update([example], losses={})

# Sauvegarder le modèle affiné
nlp.to_disk("../models/fine_tuned_spacy_model")

In [4]:
nlp = spacy.load("../models/fine_tuned_spacy_model")

In [5]:

doc = nlp("This is a sentence. [1] This is another sentence.")
sentences = []
for sentence in doc.sents:
    print(sentence.text)
    sentences += [sentence.text]
    
    

This is a sentence. [
1] This is another sentence.




In [6]:
# Définir une fonction pour vérifier si un token est un chiffre romain
def is_roman_numeral(token):
    return bool(re.match(r'^(?:i[vx]|v[li]*|x[vli]*)$', token.text.lower()))

# Ajouter une règle de segmentation personnalisée
def custom_segmentation(doc):
    to_end_after_bracket = False
    for i, token in enumerate(doc[:-1]):
        if (token.text == "]" or token.text == ")") and to_end_after_bracket:
            doc[i + 1].is_sent_start = True
            to_end_after_bracket = False
            continue
            
        if to_end_after_bracket:
            doc[i + 1].is_sent_start = False
            continue
            
        if token.text == ".":
            # Le point est suivi d'une suite de 4 chiffres max et d'un point ou parenthèse
            if (re.match(r'^\d{1,4}$', doc[i + 1].text) or is_roman_numeral(doc[i + 1])) and (doc[i+2].text == "." or doc[i+2].text == ")"):
                print(doc[i + 1].text, " ", doc[i + 2].text)
                doc[i + 1].is_sent_start = False
            
            # Les parenthèses / crochets font partie de la phrase précédente
            elif doc[token.i + 1].text == "[" or doc[token.i + 1].text == "(" or doc[token.i + 2].text == "[" or doc[token.i + 2].text == "(":
                doc[token.i + 1].is_sent_start = False
                to_end_after_bracket = True
             
            # Le point est suivi d'une minuscule   
            elif re.match(r'\b[a-z]\w*\b', doc[i + 1].text):
                doc[i + 1].is_sent_start = False
                
            
            continue
    return doc

# Ajouter la fonction de segmentation personnalisée au pipeline spaCy
@Language.component("custom_segmentation")
def custom_segmentation_component(doc):
    return custom_segmentation(doc)

# Charger le modèle spaCy de base
nlp = spacy.load("en_core_web_sm")

# Insérer la segmentation personnalisée au début du pipeline
nlp.add_pipe("custom_segmentation", before="tagger")

# Exemple de texte
text = "This is a short sentence. 123. This is another sentence. IV. This is yet another sentence."

# Traiter le texte avec le modèle
doc = nlp(text)

# Afficher les phrases
for sent in doc.sents:
    print(sent.text)


123   .
IV   .
This is a short sentence. 123.
This is another sentence. IV.
This is yet another sentence.


In [7]:
res_eval = []
dir_path = "../documents/train/*"
models = ['nltk', 'spacy', 'custom_spacy', 'naive']
tokenizers = ['nltk-punkt', 'spacy', 'spacy', 'nltk-word']
for i in range(len(models)):
    start = time.time()
    r, p, f = pipeline.evaluation(dir_path, tokenizers[i], models[i])
    res_eval += [[time.time()-start, r, p, f]]
print(res_eval)


model nltk
Segmentation ['*29 JUSTICE BREYER delivered the opinion of the Court.', 'Before seeking a federal writ of habeas corpus, a state prisoner must exhaust available state remedies, 28 U. S. C. § 2254(b)(1), thereby giving the State the "`"opportunity to pass upon and correct" alleged violations of its prisoners\' federal rights.\'"', 'Duncan v. Henry, 513 U. S. 364 , 365 (1995) (per curiam) (quoting Picard v. Connor, 404 U. S. 270 , 275 (1971)).', 'To provide the State with the necessary "opportunity," the prisoner must "fairly present" his claim in each appropriate state court (including a state supreme court with powers of discretionary review), thereby alerting that court to the federal nature of the claim.', "Duncan, supra, at 365-366; O'Sullivan v. Boerckel, 526 U. S. 838 , 845 (1999).", 'This case focuses upon the requirement of "fair presentation."', "Michael Reese, the respondent, appealed his state-court kidnaping and attempted sodomy convictions and sentences through O

In [8]:
little_df = pd.DataFrame(res_eval,columns=["execution_time","precision", "recall", "F1_score"], index=["nltk", 'spacy', "custom_spacy", "naive"])
little_df.style.highlight_max(color = 'green', axis = 0).highlight_max(color = 'red', axis = 0, subset = ["execution_time"]).highlight_min(color = 'red', axis = 0).highlight_min(color = 'green', axis = 0, subset = ["execution_time"])

Unnamed: 0,execution_time,precision,recall,F1_score
nltk,0.379237,0.819645,0.668145,0.733752
spacy,3.166476,0.830381,0.70524,0.761185
custom_spacy,15.61907,0.896021,0.825888,0.857936
naive,0.52626,0.271217,0.138127,0.180767
