In [11]:
from datasets import load_dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the dataset stored in the 'tokenized_dataset' directory.
tokenized_dataset_dir = 'tokenized_dataset'
tokenized_data = load_from_disk(tokenized_dataset_dir)

In [5]:
# Keep only the first 40000 rows of the 'train' split.
subset_indices = range(40000)
mini_dataset = tokenized_data['train'].select(subset_indices)

# Keeping the 30,000 most common words

In [6]:
def most_commun(example, eng_vocab, fr_vocab):
    """
    Replaces every out-of-vocabulary token of a translation pair with 'UNK'.

    Args:
    - example (dict): A row shaped like {'translation': {'en': [...], 'fr': [...]}},
      where both values are lists of tokens.
    - eng_vocab (collection): Tokens to keep on the English side.
    - fr_vocab (collection): Tokens to keep on the French side.

    Returns:
    - dict: The same `example` object, with both token lists replaced in place.
    """
    translation = example['translation']

    for lang, vocab in (('en', eng_vocab), ('fr', fr_vocab)):
        # A list/tuple vocabulary would be scanned linearly for every token;
        # build a set once per call so each membership test is a hash lookup.
        # Callers that already pass a set (or dict) skip this conversion.
        known = set(vocab) if isinstance(vocab, (list, tuple)) else vocab
        translation[lang] = [word if word in known else 'UNK' for word in translation[lang]]

    return example
    

In [7]:
# Locations of the two vocabulary files (one entry per line)
eng_vocab_path = '../../30k_eng.txt'  # Replace with your file path
fr_vocab_path = '../../30k_fr.txt'

# Read each file line by line, dropping the surrounding whitespace of every entry.
# NOTE(review): no encoding is passed to open(), so the platform default is used —
# confirm it matches the encoding the vocabulary files were written with.
with open(eng_vocab_path, 'r') as file:
    eng_vocab = list(map(str.strip, file))

with open(fr_vocab_path, 'r') as file:
    fr_vocab = list(map(str.strip, file))



In [8]:
# Sanity check on the first row: print the first vocabulary entry, then the
# English tokens as stored and as returned by most_commun.
first_vocab_entry = eng_vocab[0]
print(first_vocab_entry)
test_commun = most_commun(mini_dataset[0], eng_vocab, fr_vocab)
stored_en_tokens = mini_dataset[0]['translation']['en']
print(stored_en_tokens)
filtered_en_tokens = test_commun['translation']['en']
print(filtered_en_tokens)

PAD
['resumption', 'of', 'the', 'session']
['UNK', 'of', 'the', 'session']


In [9]:
# Apply most_commun to every row, using 12 worker processes.
vocab_kwargs = {"eng_vocab": eng_vocab, "fr_vocab": fr_vocab}
mini_dataset = mini_dataset.map(most_commun, fn_kwargs=vocab_kwargs, num_proc=12)

In [10]:
# Print the first row of mini_dataset.
first_row = mini_dataset[0]
print(first_row)

{'translation': {'en': ['UNK', 'of', 'the', 'session'], 'fr': ['reprise', 'de', 'la', 'session']}}


# Padding

In [11]:
def pad_sentence(example, max_length, padding_token='PAD'):
    """
    Pads or truncates both sides of a translation pair to a specific length.

    Args:
    - example (dict): A row shaped like {'translation': {'en': [...], 'fr': [...]}},
      where both values are lists of tokens.
    - max_length (int): The length every token list has after the call.
    - padding_token (int or str, optional): The token appended to shorter
      sentences. Defaults to 'PAD'.

    Returns:
    - dict: The same `example` object, with both token lists replaced in place.
    """
    translation = example['translation']

    for lang in ('en', 'fr'):
        # Slicing truncates a long sentence and leaves a short one untouched;
        # the multiplier is zero or negative once the sentence is long enough,
        # so nothing is appended in that case.
        tokens = translation[lang][:max_length]
        translation[lang] = tokens + [padding_token] * (max_length - len(tokens))

    return example


In [10]:

# Pad one row to length 10 and print the stored row next to the padded result.
sample_row = mini_dataset[0]
test_padding = pad_sentence(sample_row, 10)
print(mini_dataset[0])
print(test_padding)

{'translation': {'en': ['UNK', 'of', 'the', 'session'], 'fr': ['reprise', 'de', 'la', 'session']}}
{'translation': {'en': ['UNK', 'of', 'the', 'session', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'], 'fr': ['reprise', 'de', 'la', 'session', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']}}


In [12]:
# Pad or truncate every row to 50 tokens, using 12 worker processes.
padding_kwargs = {"max_length": 50}
padded_dataset = mini_dataset.map(pad_sentence, fn_kwargs=padding_kwargs, num_proc=12)

In [12]:
# Print the number of English tokens in row 10 of the padded dataset.
en_tokens_row_10 = padded_dataset[10]['translation']['en']
print(len(en_tokens_row_10))

50


# Creating dictionaries 

In [13]:

# French dictionary: position in fr_vocab -> whitespace-stripped word
word_dict_fr = dict(enumerate(map(str.strip, fr_vocab)))

# First 10 entries of the French dictionary, kept as an example
example_dict = dict(list(word_dict_fr.items())[:10])
example_dict
print(len(word_dict_fr))

# English dictionary, built the same way from eng_vocab
word_dict_eng = dict(enumerate(map(str.strip, eng_vocab)))
print(len(word_dict_eng))

30000
30000


# Train test split

In [22]:
# Split the dataset into training and test sets
train_test_split_ratio = 0.2  # 20% for testing
split_seed = 42  # fixed seed so the same rows land in each split on every run

splits = padded_dataset.train_test_split(test_size=train_test_split_ratio, seed=split_seed)
# Index the result by key instead of unpacking .values(), so the assignment
# does not depend on the order of the returned mapping.
train_dataset = splits["train"]
test_dataset = splits["test"]

print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Training set size: 32000
Test set size: 8000


In [21]:
# Print the summary representation of the padded dataset.
dataset_summary = str(padded_dataset)
print(dataset_summary)

Dataset({
    features: ['translation'],
    num_rows: 40000
})


In [23]:
# Write the padded dataset to the "mini_dataset" directory.
output_dir = "mini_dataset"
padded_dataset.save_to_disk(output_dir)

Saving the dataset (1/1 shards): 100%|██████████| 40000/40000 [00:00<00:00, 523137.10 examples/s]


# One hot encoding

In [13]:
# Load from the relative 'mini_dataset' directory (the name this notebook passes
# to save_to_disk) rather than an absolute path that only exists on one machine.
# NOTE(review): assumes the working directory is the one the dataset was saved
# from — confirm before running.
data = load_from_disk('mini_dataset')

In [14]:
# Print the first row of the reloaded dataset.
first_data_row = data[0]
print(first_data_row)

{'translation': {'en': ['UNK', 'of', 'the', 'session', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'], 'fr': ['reprise', 'de', 'la', 'session', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']}}


In [15]:
# Build one (English tokens, French tokens) tuple per row of data
pairs = [
    (translation['en'], translation['fr'])
    for translation in (item['translation'] for item in data)
]

In [15]:

# Imported in this cell so that shuffling does not depend on a later cell
# having been executed first.
import random

# Shuffle the pairs in place.
random.shuffle(pairs)

NameError: name 'pairs' is not defined

In [None]:
# Split the pairs into consecutive batches of batch_size items
# (the last batch is shorter when the total is not a multiple of batch_size)
batch_size = 80
batch_starts = range(0, len(pairs), batch_size)
batches = [pairs[start:start + batch_size] for start in batch_starts]

In [8]:
# Locations of the two vocabulary files (one entry per line)
eng_vocab_path = '../../30k_eng.txt'  # Replace with your file path
fr_vocab_path = '../../30k_fr.txt'

# Read each file line by line, dropping the surrounding whitespace of every entry.
# NOTE(review): no encoding is passed to open(), so the platform default is used —
# confirm it matches the encoding the vocabulary files were written with.
with open(eng_vocab_path, 'r') as file:
    eng_vocab = list(map(str.strip, file))

with open(fr_vocab_path, 'r') as file:
    fr_vocab = list(map(str.strip, file))


# French dictionary: position in fr_vocab -> whitespace-stripped word
word_dict_fr = dict(enumerate(map(str.strip, fr_vocab)))


print(len(word_dict_fr))
# English dictionary, built the same way from eng_vocab
word_dict_eng = dict(enumerate(map(str.strip, eng_vocab)))
print(len(word_dict_eng))

30000
30000


In [9]:
# NOTE(review): word_dict_eng is built with enumerate(eng_vocab), so its keys are
# integer positions and its values are words. The string key 'as' is therefore
# not one of its keys; the recorded output ("the") presumably came from an
# integer lookup — confirm which key was intended.
print(word_dict_eng['as'])

the


In [7]:
# French lookup: whitespace-stripped word -> position in fr_vocab.
# When a word occurs more than once, the last position is the one kept.
word_to_id_fr = {
    token: position
    for position, token in enumerate(map(str.strip, fr_vocab))
}

# NOTE(review): the recorded output (26684) is smaller than the 30000 vocabulary
# entries, which suggests fr_vocab contains repeated words — confirm.
print(len(word_to_id_fr))

# English lookup, built the same way from eng_vocab
word_to_id_eng = {
    token: position
    for position, token in enumerate(map(str.strip, eng_vocab))
}



26684


In [None]:
# Build one (English tokens, French tokens) tuple per row of data
pairs = [
    (translation['en'], translation['fr'])
    for translation in (item['translation'] for item in data)
]
# Cut the pairs into consecutive batches; batch_size must already be defined
batch_starts = range(0, len(pairs), batch_size)
batches = [pairs[start:start + batch_size] for start in batch_starts]





In [16]:
import random
def make_batch(data, batch_size, seed=None):
    """
    Shuffles the (English, French) token pairs of `data` and groups them into batches.

    Args:
    - data (iterable): Rows shaped like {'translation': {'en': [...], 'fr': [...]}}.
    - batch_size (int): Number of pairs per batch; must be at least 1.
    - seed (optional): When given, the shuffle uses its own random.Random(seed)
      generator, so the same seed always yields the same batches. When None,
      the module-level random generator is used.

    Returns:
    - list: Batches (lists) of (en_tokens, fr_tokens) tuples. The last batch is
      shorter when the number of pairs is not a multiple of batch_size.

    Raises:
    - ValueError: If batch_size is smaller than 1.
    """
    # A negative step would make range() empty and silently return no batches.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # creating pairs
    pairs = [(item['translation']['en'], item['translation']['fr']) for item in data]

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(pairs)

    return [pairs[i:i + batch_size] for i in range(0, len(pairs), batch_size)]


In [17]:
# Shuffle the dataset rows and group them into batches of 80 pairs.
batches = make_batch(data, batch_size=80)

In [20]:
import numpy as np
# Print the shape numpy reports for the nested list of batches.
batches_shape = np.shape(batches)
print(batches_shape)

(500, 80, 2, 50)


In [None]:
vocab_size = 30000


def _one_hot(token_ids, depth):
    """Return a (len(token_ids), depth) float64 matrix with a single 1.0 per row."""
    # Equivalent to np.eye(depth)[token_ids], without building the full
    # depth x depth identity matrix for every sentence.
    encoded = np.zeros((len(token_ids), depth))
    encoded[np.arange(len(token_ids)), token_ids] = 1.0
    return encoded


input_batch = []
output_batch = []

# NOTE(review): every encoded sentence is a dense (sentence length, vocab_size)
# float64 matrix and all of them are kept in memory; with many pairs this grows
# very large — consider encoding one batch at a time where it is consumed.
for batch in batches:
    for en_tokens, fr_tokens in batch:
        en_ids = [word_to_id_eng[token] for token in en_tokens]
        fr_ids = [word_to_id_fr[token] for token in fr_tokens]
        # Each entry stays wrapped in a one-element list, as before.
        input_batch.append([_one_hot(en_ids, vocab_size)])
        output_batch.append([_one_hot(fr_ids, vocab_size)])