In [1]:
import re
import pandas as pd
import spacy
from transformers import FlaubertModel, FlaubertTokenizer
import torch
import os

In [2]:
# Load FlauBERT for tokenisation and embeddings. The tokenizer and the encoder
# are both loaded from the same pretrained checkpoint name.
FLAUBERT_CHECKPOINT = 'flaubert/flaubert_base_cased'
tokenizer = FlaubertTokenizer.from_pretrained(FLAUBERT_CHECKPOINT)
model = FlaubertModel.from_pretrained(FLAUBERT_CHECKPOINT)

def extract_articles_from_file(file_path):
    """Split a text corpus into articles keyed by their 10-character codes.

    The file is read as UTF-8 and scanned for article codes: runs of exactly
    10 characters taken from ``A-Z`` and ``0-9``, delimited by word boundaries,
    that contain at least one letter and at least one digit. Each code opens an
    article whose text runs up to the next code, or to the end of the file for
    the last code. Anything located before the first code is ignored.

    Args:
        file_path: Path of the UTF-8 text file to parse.

    Returns:
        A pandas DataFrame with columns ``'Code'`` and ``'Text'``, one row per
        code occurrence, in file order. ``'Text'`` is stripped of surrounding
        whitespace and may be an empty string. The frame has no rows when the
        file contains no code.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # The first lookahead pins the length to exactly 10 characters; the next
    # two reject candidates made only of letters or only of digits.
    mixed_code_pattern = re.compile(r'\b(?=[A-Z0-9]{10}\b)(?=[A-Z]*[0-9][A-Z]*)(?=[0-9]*[A-Z][0-9]*)[A-Z0-9]{10}')

    # Every code occurrence marks the start of an article.
    boundaries = list(mixed_code_pattern.finditer(content))

    articles = []
    for position, match in enumerate(boundaries):
        # The text stops where the next code starts; slicing with None runs to
        # the end of the file for the last article.
        if position + 1 < len(boundaries):
            text_end = boundaries[position + 1].start()
        else:
            text_end = None
        articles.append((match.group(), content[match.end():text_end].strip()))

    return pd.DataFrame(articles, columns=['Code', 'Text'])

def get_embeddings(text, max_length=512):
    """Embed a text of arbitrary length with FlauBERT as a single vector.

    The text is tokenised without special tokens, the token ids are cut into
    consecutive segments of at most ``max_length`` ids, each segment is run
    through the module-level ``model``, and its hidden states are averaged over
    the token axis. The returned vector is the plain (unweighted) mean of the
    per-segment vectors, so a short trailing segment weighs as much as a full
    one.

    Args:
        text: The text to embed.
        max_length: Maximum number of token ids per segment sent to the model.
            Defaults to 512, the limit reported by the tokenizer warning for
            this checkpoint.

    Returns:
        A 1-D ``torch.Tensor`` of size ``model.config.emb_dim``. A text that
        yields no token (empty or whitespace-only) returns a zero vector
        instead of raising, so one empty article cannot abort a whole batch.
    """
    # Encoding the full text at once makes the tokenizer log a "sequence length
    # is longer than the specified maximum" warning on long texts. It is
    # harmless here: the ids are re-split below before they reach the model.
    tokens = tokenizer.encode(text, add_special_tokens=False)

    if not tokens:
        # torch.stack() rejects an empty list, so there is nothing to average.
        return torch.zeros(model.config.emb_dim)

    segment_vectors = []
    with torch.no_grad():
        for start in range(0, len(tokens), max_length):
            # Batch of one segment: shape (1, segment_length).
            segment_tensor = torch.tensor([tokens[start:start + max_length]])
            hidden_states = model(segment_tensor)[0]
            # Average over the token axis, then drop the batch axis.
            segment_vectors.append(hidden_states.mean(dim=1).squeeze(0))

    # Mean of the per-segment vectors.
    return torch.stack(segment_vectors).mean(dim=0)


def vectorize_articles_with_embeddings(articles_df):
    """Turn the ``'Text'`` column of ``articles_df`` into a frame of embeddings.

    Each text is passed to ``get_embeddings`` and the resulting tensor is
    converted to a NumPy array. The arrays become the rows of the returned
    DataFrame, in the same order as the input rows, with a fresh default index
    and integer column labels.
    """
    vectors = [get_embeddings(text).numpy() for text in articles_df['Text']]
    return pd.DataFrame(vectors)

Some weights of the model checkpoint at flaubert/flaubert_base_cased were not used when initializing FlaubertModel: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing FlaubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Corpus file processed by this run. The cells below reuse `file_path` to
# derive the name of the output pickle.
file_path = r"C:\Users\33672\OneDrive\Bureau\TER Docs\corpus_FRESA - Texte - Pertinence\Entrainement - txt\Non Pertinent.txt"
# Parse the corpus into one row per article (columns 'Code' and 'Text').
articles_df = extract_articles_from_file(file_path)

In [4]:

## TRAINING DATASET ONLY

# Label every article of this file with its relevance (1 or 0), stored as the
# first column of the DataFrame. This file is the non-relevant corpus: label 0.
pertinence_label = 0

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run. Overwrite the column in that case instead.
if 'Pertinence' in articles_df.columns:
    articles_df['Pertinence'] = pertinence_label
else:
    articles_df.insert(0, 'Pertinence', pertinence_label)

In [5]:
# Preview the first parsed articles to check the label/code/text columns.
articles_df.head()  

Unnamed: 0,Pertinence,Code,Text
0,0,IWUSP7B1UT,",Bénin : Le gouvernement recrute 100 auditeurs..."
1,0,HFJB1RBOBG,",Tirs de missiles : La Corée du Nord s'amélior..."
2,0,1HI8MCH2VW,",""Poutine : Biden """"n'a aucune intention de s'..."
3,0,J6YEAEEZXN,",Coupe du monde féminine 2023 : Les représenta..."
4,0,3IT7YAZNRY,",Bénin: la réhabilitation des voies démarre fi..."


In [6]:
# Vectorise the articles with FlauBERT: one embedding row per article.
embeddings_df = vectorize_articles_with_embeddings(articles_df)

Token indices sequence length is longer than the specified maximum sequence length for this model (1664 > 512). Running this sequence through the model will result in indexing errors


In [7]:
# Concatenate the relevance label column with the embeddings, side by side.
# concat(axis=1) aligns rows on the index, so this relies on both frames
# carrying the same index (both are built from plain lists above).
final_df = pd.concat([articles_df['Pertinence'], embeddings_df], axis=1)

In [8]:
# Split the source path into its directory and its file name.
source_dir, base_name = os.path.split(file_path)

# Drop the extension and append the '_traité.pkl' suffix.
stem = os.path.splitext(base_name)[0]
new_file_name = stem + '_traité.pkl'

# The pickle is written next to the source file.
new_file_path = os.path.join(source_dir, new_file_name)

# Save the DataFrame in pickle format.
final_df.to_pickle(new_file_path)

print(f"DataFrame sauvegardé au chemin : {new_file_path}")

DataFrame sauvegardé au chemin : C:\Users\33672\OneDrive\Bureau\TER Docs\corpus_FRESA - Texte - Pertinence\Entrainement - txt\Non Pertinent_traité.pkl


In [10]:

## TRAINING DATASET ONLY

# Paths to the pickle files of the two processed corpora.
pertinent_file_path = r"C:\Users\33672\OneDrive\Bureau\TER Docs\corpus_FRESA - Texte - Pertinence\Entrainement - txt\Pertinent_traité.pkl"
non_pertinent_file_path = r"C:\Users\33672\OneDrive\Bureau\TER Docs\corpus_FRESA - Texte - Pertinence\Entrainement - txt\Non Pertinent_traité.pkl"

# Load the DataFrames from the pickle files.
# NOTE: unpickling can execute arbitrary code; only load files written by this
# notebook, never pickles from an untrusted source.
pertinent_df = pd.read_pickle(pertinent_file_path)
non_pertinent_df = pd.read_pickle(non_pertinent_file_path)

# Stack the two DataFrames. ignore_index=True already gives the result a fresh
# 0..n-1 index, so no separate reset_index() call is needed.
combined_df = pd.concat([pertinent_df, non_pertinent_df], ignore_index=True)

# Show the first rows as a sanity check.
print(combined_df.head())

# Destination of the combined training DataFrame.
save_path = r"C:/Users/33672/OneDrive/Bureau/TER Docs/corpus_FRESA - Texte - Pertinence/Entrainement - txt/train-traité.pkl"

# Save the combined DataFrame.
combined_df.to_pickle(save_path)

print("DataFrame sauvegardé avec succès à l'emplacement : " + save_path)

   Pertinence         0         1         2         3         4         5  \
0           1 -0.129774  0.448794  0.069120  0.804145 -0.059709 -0.303203   
1           1 -0.628227  0.290821  0.605764  0.407965 -0.393719 -0.599755   
2           1 -0.496422  0.360855  0.579142 -0.378369 -0.553538 -0.472761   
3           1 -0.546441  0.814615  0.384502  0.502827 -0.206296 -0.550668   
4           1 -0.333643  0.460296  0.445026 -0.007603 -0.297541  0.055417   

          6         7         8  ...       758       759       760       761  \
0  0.070819 -0.688821 -0.257140  ... -0.298196  0.017583 -0.561512 -0.028051   
1  0.487397 -0.738708 -0.271538  ... -0.419414  0.100172 -0.552149 -0.145568   
2 -0.035669 -0.806150 -0.144847  ...  0.009380  0.239444 -0.026951 -0.035498   
3 -0.083413 -0.470302 -0.522546  ... -0.491874 -0.167929 -0.415632 -0.331937   
4  0.070353 -0.910181  0.218543  ... -0.201357 -0.300444 -0.477405 -0.381431   

        762       763       764       765       766     