Ici, on va prétraiter les données pour les faire entrer dans un modèle (GRU ou LSTM).

In [12]:
import re
import pickle
import random
from collections import Counter
import pandas as pd
import torch
from tqdm import tqdm


In [13]:
import os

# Display the kernel's working directory: the relative "../data/..." paths
# used in the following cells are resolved against it.
working_dir = os.getcwd()
print(working_dir)

c:\Users\idirs\Desktop\M2IA\projet_rosette\processing_data


In [14]:
# Load both corpora as lists of aligned sentences (one sentence per line).
def _read_lines(path):
    """Return the lines of the UTF-8 text file at `path`.

    Leading and trailing whitespace of the whole file is removed before
    splitting on newlines, so a final newline does not produce an empty entry.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read().strip().split("\n")


fr_sentences = _read_lines("../data/raw_data/small_vocab_fr.txt")
en_sentences = _read_lines("../data/raw_data/small_vocab_en.txt")

print(len(fr_sentences), "lines for fr txt and", len(en_sentences), "lines for en txt")

# The two lists are paired by position afterwards; zip() would silently drop
# the surplus lines of the longer file, so fail loudly on a mismatch instead.
if len(fr_sentences) != len(en_sentences):
    raise ValueError(
        f"Corpora are not aligned: {len(fr_sentences)} fr lines "
        f"vs {len(en_sentences)} en lines"
    )


137860 lines for fr txt and 137860 lines for en txt


In [15]:
# Pair each English sentence with the French sentence at the same position.
# Every element is an (english, french) tuple.
pairs = [
    (en_sentence, fr_sentence)
    for en_sentence, fr_sentence in zip(en_sentences, fr_sentences)
]
print(pairs[0])  # (en, fr)

('new jersey is sometimes quiet during autumn , and it is snowy in april .', "new jersey est parfois calme pendant l' automne , et il est neigeux en avril .")


Nettoyer les phrases

* Minuscule

* Retirer la ponctuation ou la garder (au choix ; si le dataset est petit, je conseille de la garder, mais séparée en tokens).

* Normaliser les espaces.

In [16]:
def clean_sentence(sentence: str) -> str:
    """Normalise a raw sentence before whitespace tokenisation.

    Steps:
      1. lowercase the text;
      2. surround each of ``? . ! , ¿`` with spaces so that punctuation
         becomes a separate token;
      3. collapse every run of spaces and double quotes into one space
         (double quotes are therefore removed from the text);
      4. strip leading and trailing whitespace.

    Args:
        sentence: the raw sentence.

    Returns:
        The normalised sentence, without leading or trailing whitespace.
    """
    sentence = sentence.lower()
    sentence = re.sub(r"([?.!,¿])", r" \1 ", sentence)
    # The character class matches a space or a double quote.
    sentence = re.sub(r'[" ]+', " ", sentence)
    # Strip last: padding the final punctuation mark adds a trailing space.
    return sentence.strip()


# Apply the same normalisation to both sides of every pair.
# Note: `pairs` is rebound here, so the raw sentences are only reachable
# through `en_sentences` / `fr_sentences` after this cell.
pairs = [
    (clean_sentence(en_text), clean_sentence(fr_text))
    for en_text, fr_text in pairs
]

In [17]:
# Tokenisation: split each cleaned sentence on whitespace.
# en_tokens[i] and fr_tokens[i] come from the same pair.
en_tokens = [en_text.split() for en_text, _ in pairs]
fr_tokens = [fr_text.split() for _, fr_text in pairs]

In [18]:
# Build the vocabularies (word -> id, id -> word)
def build_vocab(token_lists, min_freq=1):
    """Build the two lookup tables of a vocabulary.

    Ids 0-3 are reserved, in this order, for ``<pad>``, ``<sos>``, ``<eos>``
    and ``<unk>``. The remaining ids are assigned to the corpus words in
    sorted order.

    Args:
        token_lists: iterable of tokenised sentences (each a list of str).
        min_freq: minimum number of occurrences a word needs to be kept.

    Returns:
        (word2id, id2word): a dict mapping each word to its id and the
        inverse dict mapping each id to its word.
    """
    special_tokens = ["<pad>", "<sos>", "<eos>", "<unk>"]
    counter = Counter(tok for sent in token_lists for tok in sent)
    # A corpus token spelled like a reserved token must not be added a second
    # time: it would shift the reserved id and make the two tables disagree.
    corpus_words = sorted(
        word
        for word, freq in counter.items()
        if freq >= min_freq and word not in special_tokens
    )
    vocab = special_tokens + corpus_words
    word2id = {w: i for i, w in enumerate(vocab)}
    id2word = {i: w for w, i in word2id.items()}
    return word2id, id2word

# One vocabulary per language, built independently from its own tokens.
# min_freq=1 (the default) keeps every token that occurs in the corpus.
en_word2id, en_id2word = build_vocab(en_tokens, min_freq=1)
fr_word2id, fr_id2word = build_vocab(fr_tokens, min_freq=1)

In [19]:
# Convert sentences -> sequences of ids

def encode_sentence(tokens, word2id, add_sos=False, add_eos=False):
    """Map a tokenised sentence to a list of vocabulary ids.

    Tokens missing from `word2id` are encoded with the id of ``<unk>``.

    Args:
        tokens: iterable of str tokens.
        word2id: dict mapping tokens to ids.
        add_sos: when true, put the ``<sos>`` id at the front.
        add_eos: when true, put the ``<eos>`` id at the end.

    Returns:
        A new list of ids.
    """
    start = [word2id["<sos>"]] if add_sos else []
    middle = [word2id.get(tok, word2id["<unk>"]) for tok in tokens]
    end = [word2id["<eos>"]] if add_eos else []
    return start + middle + end

# Encode every pair. The source side gets no markers; the target side is
# wrapped in <sos> ... <eos>.
pairs_ids = [
    (
        encode_sentence(en_sent, en_word2id),
        encode_sentence(fr_sent, fr_word2id, add_sos=True, add_eos=True),
    )
    for en_sent, fr_sent in zip(en_tokens, fr_tokens)
]

print(pairs_ids[0])

([128, 98, 93, 163, 146, 57, 19, 4, 9, 95, 93, 162, 91, 14, 5], [1, 218, 163, 117, 238, 61, 241, 166, 40, 4, 119, 154, 117, 217, 109, 45, 10, 2])


In [21]:
# Persist the encoded pairs so that later runs can reuse them.
with open("../data/preprocessed_data/pairs_ids.pkl", "wb") as pickle_out:
    pickle.dump(pairs_ids, pickle_out)

In [23]:
# Read the pickle back from disk. The file is produced by this notebook;
# pickle.load must not be used on files from an untrusted source.
with open("../data/preprocessed_data/pairs_ids.pkl", "rb") as pickle_in:
    pairs_ids_loaded = pickle.load(pickle_in)

Split Train / Val / Test

In [24]:
# Fixed seed so that Restart & Run All always produces the same split.
SPLIT_SEED = 42

k = 0.7  # fraction of the data used for training
q = (1 - k) / 2  # fraction used for validation, and again for test
n = len(pairs_ids)

# Shuffle a copy with a dedicated, seeded generator: `pairs_ids` keeps its
# original order, and re-running this cell gives the same result.
shuffled_pairs = list(pairs_ids)
random.Random(SPLIT_SEED).shuffle(shuffled_pairs)

train_end = int(k * n)
val_end = int((k + q) * n)

train_pairs = shuffled_pairs[:train_end]
val_pairs = shuffled_pairs[train_end:val_end]
test_pairs = shuffled_pairs[val_end:]

# NOTE(review): pairs are not de-duplicated before splitting, so an identical
# pair may presumably end up in more than one split — confirm whether that
# is acceptable for evaluation.
assert len(train_pairs) + len(val_pairs) + len(test_pairs) == n

In [26]:
def count_unique_words(pairs):
    """
    Count the distinct words used by the target sentences of a set of pairs.

    Args:
        pairs (list): list of (src, tgt) tuples; each tgt is a sequence of
            hashable items.

    Returns:
        unique_words_count (int): number of distinct words
        duplicates (int): number of target sentences that repeat an earlier one
        unique_words_set (set): the distinct words encountered
    """
    known_targets = set()
    words_found = set()
    repeated = 0

    for _, tgt in tqdm(pairs):
        key = tuple(tgt)  # lists are not hashable, tuples are
        if key in known_targets:
            repeated += 1
            continue
        known_targets.add(key)
        words_found.update(tgt)

    return len(words_found), repeated, words_found


# ==============================
# Counting
# ==============================
# For each split: number of distinct target ids, number of repeated target
# sentences, and the set of ids itself.
word_train, dtr, used_train = count_unique_words(train_pairs)
word_val, dva, used_val = count_unique_words(val_pairs)
word_test, dte, used_test = count_unique_words(test_pairs)

# Size of the French vocabulary. This includes the four special tokens added
# by build_vocab, while the counts above include the <sos>/<eos> ids that were
# added when the target sentences were encoded.
total_word = len(fr_id2word)

# ==============================
# Display
# ==============================
# Share of the French vocabulary that appears in each split.
print(f"Train: {word_train} mots uniques ({100 * word_train / total_word:.2f}%)")
print(f"Val:   {word_val} mots uniques ({100 * word_val / total_word:.2f}%)")
print(f"Test:  {word_test} mots uniques ({100 * word_test / total_word:.2f}%)\n")

# Number of target sentences that repeat an earlier one within the same split.
print(f"Doublons train: {dtr}")
print(f"Doublons val:   {dva}")
print(f"Doublons test:  {dte}")


100%|██████████| 96502/96502 [00:00<00:00, 746396.21it/s]
100%|██████████| 20679/20679 [00:00<00:00, 816673.69it/s]
100%|██████████| 20679/20679 [00:00<00:00, 738570.38it/s]

Train: 352 mots uniques (98.88%)
Val:   336 mots uniques (94.38%)
Test:  334 mots uniques (93.82%)

Doublons train: 9176
Doublons val:   512
Doublons test:  444





In [None]:
# Bundle the three splits and both vocabularies into a single file.
# The path starts with "../" like every other data path in this notebook:
# the kernel runs from the processing_data/ folder, and data/ is its sibling.
torch.save({
    "train": train_pairs,
    "val": val_pairs,
    "test": test_pairs,
    "en_word2id": en_word2id,
    "fr_word2id": fr_word2id,
    "en_id2word": en_id2word,
    "fr_id2word": fr_id2word
}, "../data/preprocessed_data/processed_data.pt")