In [2]:
from datasets import load_dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Retrieve the dataset from the local 'wmt14_fr_en' copy on disk.
# (Original note: "only done once" — presumably about creating that copy; confirm.)
dataset = load_from_disk('wmt14_fr_en')


In [6]:
# Save the dataset locally.
# This only needs to be done once, so the call is left commented out.
#dataset.save_to_disk("wmt14_fr_en")

Downloading data files:  29%|██▊       | 2/7 [39:28:42<98:41:47, 71061.41s/it]
Saving the dataset (30/30 shards): 100%|██████████| 40836715/40836715 [07:29<00:00, 90909.62 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3000/3000 [00:00<00:00, 180087.76 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3003/3003 [00:00<00:00, 132028.25 examples/s]


In [4]:
# Print the dataset's splits with their features and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 40836715
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})


# Multi-process tokenization using map

In [17]:
from sacremoses import MosesTokenizer
from typing import Dict, Any



def tokenize_data(example: Dict[str, Any], mt_en: MosesTokenizer, mt_fr: MosesTokenizer) -> Dict[str, Any]:
    """
    Lower-case and tokenize both sides of one translation example, in place.

    The example is left untouched unless it has a 'translation' entry that
    contains both an 'en' and a 'fr' subkey.

    :param example: A dictionary containing 'translation' key with 'en' and 'fr' subkeys.
    :param mt_en: Tokenizer whose ``tokenize`` method is applied to the English text.
    :param mt_fr: Tokenizer whose ``tokenize`` method is applied to the French text.
    :return: The same example object, with both texts replaced by their tokenized form.
    """
    has_both_sides = (
        'translation' in example
        and 'en' in example['translation']
        and 'fr' in example['translation']
    )
    if not has_both_sides:
        # Nothing to tokenize: hand the example back unchanged.
        return example

    pair = example['translation']
    # Text is lower-cased before tokenization.
    pair['en'] = mt_en.tokenize(pair['en'].lower())
    pair['fr'] = mt_fr.tokenize(pair['fr'].lower())
    return example



In [18]:
# Initialize tokenizers once, outside the function
mt_en = MosesTokenizer(lang='en')
mt_fr = MosesTokenizer(lang='fr')
# Smoke test: tokenize a single training example and print the result.
test_tokenization = tokenize_data(dataset['train'][10], mt_en, mt_fr)
print(test_tokenization)

{'translation': {'en': ['would', 'it', 'be', 'appropriate', 'for', 'you', ',', 'madam', 'president', ',', 'to', 'write', 'a', 'letter', 'to', 'the', 'sri', 'lankan', 'president', 'expressing', 'parliament', '&apos;s', 'regret', 'at', 'his', 'and', 'the', 'other', 'violent', 'deaths', 'in', 'sri', 'lanka', 'and', 'urging', 'her', 'to', 'do', 'everything', 'she', 'possibly', 'can', 'to', 'seek', 'a', 'peaceful', 'reconciliation', 'to', 'a', 'very', 'difficult', 'situation', '?'], 'fr': ['ne', 'pensez-vous', 'pas', ',', 'madame', 'la', 'présidente', ',', 'qu&apos;', 'il', 'conviendrait', 'd&apos;', 'écrire', 'une', 'lettre', 'au', 'président', 'du', 'sri', 'lanka', 'pour', 'lui', 'communiquer', 'que', 'le', 'parlement', 'déplore', 'les', 'morts', 'violentes', ',', 'dont', 'celle', 'de', 'm.', 'ponnambalam', ',', 'et', 'pour', 'l&apos;', 'inviter', 'instamment', 'à', 'faire', 'tout', 'ce', 'qui', 'est', 'en', 'son', 'pouvoir', 'pour', 'chercher', 'une', 'réconciliation', 'pacifique', 'et',

In [9]:
import multiprocessing as mp

# Number of CPUs reported by multiprocessing; the map calls in this notebook
# pass num_proc=12, the value printed here.
cpu_count = mp.cpu_count()
print(cpu_count)

12


In [19]:
# Tokenize the dataset with 12 worker processes; the two tokenizers are
# forwarded to tokenize_data as keyword arguments.
tokenized_dataset = dataset.map(
    tokenize_data,
    num_proc=12,
    fn_kwargs={"mt_en": mt_en, "mt_fr": mt_fr},
)

In [22]:
# Inspect one tokenized training example, then compare the container types
# of the tokenized and the original datasets.
print(tokenized_dataset['train'][100])
print(type(tokenized_dataset))
print(type(dataset))

{'translation': {'en': ['that', 'is', 'why', 'my', 'group', 'moves', 'that', 'this', 'item', 'be', 'taken', 'off', 'the', 'agenda', '.'], 'fr': ['c&apos;', 'est', 'pourquoi', 'mon', 'groupe', 'demande', 'que', 'ce', 'point', 'soit', 'retiré', 'de', 'l&apos;', 'ordre', 'du', 'jour', '.']}}
<class 'datasets.dataset_dict.DatasetDict'>
<class 'datasets.dataset_dict.DatasetDict'>


In [63]:
# Persist the tokenized dataset under "tokenized_dataset" so it can be reloaded
# with load_from_disk instead of being re-tokenized.
tokenized_dataset.save_to_disk("tokenized_dataset")

Saving the dataset (47/47 shards): 100%|██████████| 40836715/40836715 [13:34<00:00, 50130.90 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 3000/3000 [00:00<00:00, 59020.67 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3003/3003 [00:00<00:00, 69604.91 examples/s]


# Keeping the 30,000 most common words

In [3]:
# Reload the tokenized dataset that was saved to disk earlier.
tokenized_dataset=load_from_disk("tokenized_dataset")

In [4]:
from collections import Counter
from typing import Dict, Any

def create_vocab(example: Dict[str, Any], eng_counter: Counter, fr_counter: Counter) -> None:
    """
    Accumulate English and French token counts from one dataset example.

    Both counters are updated in place and nothing is returned, so the
    function is meant to be called for its side effect only.

    :param example: A dictionary that may contain a 'translation' key with 'en'
        and/or 'fr' subkeys holding the tokens to count.
    :param eng_counter: Counter object updated in place with the English tokens.
    :param fr_counter: Counter object updated in place with the French tokens.
    :return: None.
    """
    if 'translation' not in example:
        return None

    pair = example['translation']
    # Each side is optional; count whichever languages are present.
    for lang, counter in (('en', eng_counter), ('fr', fr_counter)):
        if lang in pair:
            counter.update(pair[lang])

    return None


In [None]:
# Initialize counters
en_counter = Counter()  
fr_counter = Counter()

# Create vocabularies from train data
tokenized_dataset['train'].map(lambda example: create_vocab(example, en_counter, fr_counter))

In [62]:
# Spot-check the same row index in the train and test splits.
# The dataset variable defined in this notebook is `tokenized_dataset`;
# `tokenized_data` is never assigned and fails on a fresh kernel.
print(tokenized_dataset['train'][10])
print(tokenized_dataset['test'][10])

{'translation': {'en': ['would', 'it', 'be', 'appropriate', 'for', 'you', ',', 'madam', 'president', ',', 'to', 'write', 'a', 'letter', 'to', 'the', 'sri', 'lankan', 'president', 'expressing', 'parliament', '&apos;s', 'regret', 'at', 'his', 'and', 'the', 'other', 'violent', 'deaths', 'in', 'sri', 'lanka', 'and', 'urging', 'her', 'to', 'do', 'everything', 'she', 'possibly', 'can', 'to', 'seek', 'a', 'peaceful', 'reconciliation', 'to', 'a', 'very', 'difficult', 'situation', '?'], 'fr': ['ne', 'pensez-vous', 'pas', ',', 'madame', 'la', 'présidente', ',', 'qu&apos;', 'il', 'conviendrait', 'd&apos;', 'écrire', 'une', 'lettre', 'au', 'président', 'du', 'sri', 'lanka', 'pour', 'lui', 'communiquer', 'que', 'le', 'parlement', 'déplore', 'les', 'morts', 'violentes', ',', 'dont', 'celle', 'de', 'm.', 'ponnambalam', ',', 'et', 'pour', 'l&apos;', 'inviter', 'instamment', 'à', 'faire', 'tout', 'ce', 'qui', 'est', 'en', 'son', 'pouvoir', 'pour', 'chercher', 'une', 'réconciliation', 'pacifique', 'et',

In [30]:
def most_commun(example,eng_vocab,fr_vocab):
    """
    Replace out-of-vocabulary tokens with 'UNK' on both sides of an example.

    The example is modified in place, and only when its 'translation' entry
    has both an 'en' and a 'fr' key.

    :param example: A dictionary with a 'translation' key holding 'en' / 'fr' token sequences.
    :param eng_vocab: Container of allowed English tokens (tested with ``in``).
    :param fr_vocab: Container of allowed French tokens (tested with ``in``).
    :return: The same example object.
    """
    pair = example['translation']
    if 'en' not in pair or 'fr' not in pair:
        return example

    # Keep known tokens, substitute the placeholder for everything else.
    pair['en'] = [tok if tok in eng_vocab else 'UNK' for tok in pair['en']]
    pair['fr'] = [tok if tok in fr_vocab else 'UNK' for tok in pair['fr']]
    return example
    

In [66]:
# Define the file path
eng_vocab_path= '../../30k_eng.txt'  # Replace with your file path
fr_vocab_path = '../../30k_fr.txt'
# Open the file and read lines into a list
with open(eng_vocab_path, 'r') as file:
    eng_vocab = [line.strip() for line in file]

with open(fr_vocab_path, 'r') as file:
    fr_vocab = [line.strip() for line in file]


In [67]:
# Number of entries read from each vocabulary file.
print(len(eng_vocab))
print(len(fr_vocab))

30000
30001


In [71]:
# Compare one training example at three stages: raw text, tokenized, and
# after most_commun has replaced out-of-vocabulary tokens with 'UNK'.
test_vocab = most_commun(tokenized_dataset['train'][10],eng_vocab,fr_vocab)
print(dataset['train'][10])
print(tokenized_dataset['train'][10])
print(test_vocab)

{'translation': {'en': "Would it be appropriate for you, Madam President, to write a letter to the Sri Lankan President expressing Parliament's regret at his and the other violent deaths in Sri Lanka and urging her to do everything she possibly can to seek a peaceful reconciliation to a very difficult situation?", 'fr': "Ne pensez-vous pas, Madame la Présidente, qu'il conviendrait d'écrire une lettre au président du Sri Lanka pour lui communiquer que le Parlement déplore les morts violentes, dont celle de M. Ponnambalam, et pour l'inviter instamment à faire tout ce qui est en son pouvoir pour chercher une réconciliation pacifique et mettre un terme à cette situation particulièrement difficile."}}
{'translation': {'en': ['would', 'it', 'be', 'appropriate', 'for', 'you', ',', 'madam', 'president', ',', 'to', 'write', 'a', 'letter', 'to', 'the', 'sri', 'lankan', 'president', 'expressing', 'parliament', '&apos;s', 'regret', 'at', 'his', 'and', 'the', 'other', 'violent', 'deaths', 'in', 'sr

In [None]:
# Apply most_commun to the tokenized dataset with 12 worker processes; the two
# vocabularies are forwarded as keyword arguments.
updated_dataset = tokenized_dataset.map(
    most_commun,
    num_proc=12,
    fn_kwargs={"eng_vocab": eng_vocab, "fr_vocab": fr_vocab},
)

# One-hot encoding

In [None]:
import numpy as np

In [None]:
def one_hot(index, vocab_size):
    """
    Build a single one-hot vector.

    :param index: Position to set to 1.
    :param vocab_size: Length of the returned vector.
    :return: A NumPy array of zeros of length ``vocab_size`` with ``index`` set to 1.
    """
    one_hot_vector = np.zeros(vocab_size)
    one_hot_vector[index] = 1

    return one_hot_vector

def one_hot_encode(example,eng_vocab,fr_vocab):
    """
    Replace every token of an example by its one-hot vector, in place.

    The example is only modified when its 'translation' entry has both an
    'en' and a 'fr' key.

    :param example: A dictionary with a 'translation' key holding 'en' / 'fr' token lists.
    :param eng_vocab: List of English vocabulary words; a word's list position is its index.
    :param fr_vocab: List of French vocabulary words; a word's list position is its index.
    :return: The same example object, with each token replaced by a one-hot vector.
    :raises ValueError: If a token is not present in the corresponding vocabulary.
    """
    eng_vocab_size = len(eng_vocab)
    fr_vocab_size = len(fr_vocab)

    if 'en' in example['translation'] and 'fr' in example['translation']:
        # Encode each token with the one_hot helper. (The previous code called
        # one_hot_encode itself here, with the wrong number of arguments.)
        example['translation']['en'] = [one_hot(eng_vocab.index(word), eng_vocab_size) for word in example['translation']['en']]
        example['translation']['fr'] = [one_hot(fr_vocab.index(word), fr_vocab_size) for word in example['translation']['fr']]

    return example

In [None]:
# Smoke test: one-hot encode a single example from updated_dataset and print it.
test_one_hot = one_hot_encode(updated_dataset['train'][10],eng_vocab,fr_vocab)
print(test_one_hot)

# Padding

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
def pad_sequences_and_convert_to_integers(X, Y, max_length):
    """
    Turn two collections of sentences into padded integer sequences.

    A separate Keras Tokenizer is fitted on each collection and used to map
    that collection to integer sequences. The tokenizers are local to this
    function and are not returned.

    :param X: First collection of sentences.
    :param Y: Second collection of sentences.
    :param max_length: Value passed as ``maxlen`` to ``pad_sequences`` for both sides.
    :return: Tuple ``(padded_X, padded_Y)``, padded with ``padding='post'``.
    """
    def _to_integer_sequences(sentences):
        # Fit a fresh tokenizer on these sentences only, then encode them with it.
        indexer = Tokenizer()
        indexer.fit_on_texts(sentences)
        return indexer.texts_to_sequences(sentences)

    ids_X = _to_integer_sequences(X)
    ids_Y = _to_integer_sequences(Y)

    # Pad sequences to a maximum length
    return (
        pad_sequences(ids_X, maxlen=max_length, padding='post'),
        pad_sequences(ids_Y, maxlen=max_length, padding='post'),
    )


In [None]:
#testing padding
# Take a handful of already-tokenized sentence pairs from the training split.
# (tokenize_data takes three arguments and returns a dict, so it cannot be
# called with a single argument and unpacked into two values.)
sample_data = updated_dataset['train'].select(range(5))
tokenized_X = [row['translation']['en'] for row in sample_data]
tokenized_Y = [row['translation']['fr'] for row in sample_data]

# Define maximum sequence length
max_seq_length = 50

# Pad the tokenized data and convert to integer sequences
padded_X, padded_Y = pad_sequences_and_convert_to_integers(tokenized_X, tokenized_Y, max_seq_length)

print(padded_X)
print(padded_Y)