In [3]:
from datasets import load_dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the dataset stored on disk in the 'tokenized_dataset' directory
# (the path is relative to the notebook's working directory).
tokenized_data=load_from_disk('tokenized_dataset')

In [5]:
# Work on a subset only: the first 40000 examples of the 'train' split.
mini_dataset=tokenized_data['train'].select(range(40000))

# Keeping the 30,000 most common words

In [6]:
def most_commun(example,eng_vocab,fr_vocab):
    """Replace out-of-vocabulary tokens of a translation pair with 'UNK'.

    Args:
    - example (dict): Holds example['translation']['en'] and
      example['translation']['fr'], each an iterable of tokens.
    - eng_vocab: Container of allowed English tokens (tested with `in`).
    - fr_vocab: Container of allowed French tokens (tested with `in`).

    Returns:
    - dict: The same `example` object; both token lists are replaced in place.
    """

    def keep_or_unk(tokens, vocab):
        # Preserve token order; anything the vocabulary lacks becomes 'UNK'.
        masked = []
        for token in tokens:
            masked.append(token if token in vocab else 'UNK')
        return masked

    pair = example['translation']
    pair['en'] = keep_or_unk(pair['en'], eng_vocab)
    pair['fr'] = keep_or_unk(pair['fr'], fr_vocab)

    return example
    

In [7]:
# Paths to the vocabulary files (one word per line)
eng_vocab_path= '../../30k_eng.txt'  # Replace with your file path
fr_vocab_path = '../../30k_fr.txt'
# Read each file into a list of whitespace-stripped lines.
# The encoding is given explicitly: without it, open() uses the platform
# default (e.g. cp1252 on Windows), which mis-decodes accented French words.
# NOTE(review): assumes both files are UTF-8 encoded - confirm.
with open(eng_vocab_path, 'r', encoding='utf-8') as file:
    eng_vocab = [line.strip() for line in file]

with open(fr_vocab_path, 'r', encoding='utf-8') as file:
    fr_vocab = [line.strip() for line in file]



In [8]:
# Sanity check: show the first vocabulary entry, then run most_commun on the
# first example and print the English tokens before and after replacement.
print(eng_vocab[0] )
test_commun=most_commun(mini_dataset[0],eng_vocab,fr_vocab)
print(mini_dataset[0]['translation']['en'])
print(test_commun['translation']['en'])

PAD
['resumption', 'of', 'the', 'session']
['UNK', 'of', 'the', 'session']


In [9]:
# Apply most_commun to every example using 12 worker processes.
# Note: this re-binds mini_dataset to the mapped result.
mini_dataset=mini_dataset.map(most_commun, fn_kwargs={"eng_vocab": eng_vocab, "fr_vocab": fr_vocab}, num_proc=12)   

In [10]:
# Inspect the first example of mini_dataset.
print(mini_dataset[0])

{'translation': {'en': ['UNK', 'of', 'the', 'session'], 'fr': ['reprise', 'de', 'la', 'session']}}


# Padding

In [11]:
def pad_sentence(example, max_length, padding_token='PAD'):
    """
    Pads or truncates both sentences of a translation example to a fixed length.

    Args:
    - example (dict): Holds example['translation']['en'] and
      example['translation']['fr'], each a list of tokens/words.
    - max_length (int): The exact length of each sentence after the call.
      Must be non-negative.
    - padding_token (int or str, optional): The token appended to shorter
      sentences. Defaults to 'PAD'.

    Returns:
    - dict: The same `example` object, with both sentences replaced by their
      padded or truncated versions.

    Raises:
    - ValueError: If max_length is negative (a negative slice bound would
      silently drop tokens from the end instead of truncating).
    """
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")

    translation = example['translation']
    for lang in ('en', 'fr'):
        # Slicing truncates long sentences and leaves short ones untouched.
        tokens = list(translation[lang][:max_length])
        # The multiplier is 0 when the sentence already has max_length tokens.
        tokens.extend([padding_token] * (max_length - len(tokens)))
        translation[lang] = tokens

    return example


In [10]:

# Sanity check: pad the first example to 10 tokens and print it next to a
# freshly fetched mini_dataset[0] for comparison.
test_padding=pad_sentence(mini_dataset[0], 10)
print(mini_dataset[0])
print(test_padding)

{'translation': {'en': ['UNK', 'of', 'the', 'session'], 'fr': ['reprise', 'de', 'la', 'session']}}
{'translation': {'en': ['UNK', 'of', 'the', 'session', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD'], 'fr': ['reprise', 'de', 'la', 'session', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']}}


In [12]:
# Pad or truncate every sentence to 50 tokens, using 12 worker processes.
padded_dataset=mini_dataset.map(pad_sentence, fn_kwargs={"max_length": 50}, num_proc=12)

In [12]:
# Check the English sentence length of one padded example (index 10).
print(len(padded_dataset[10]['translation']['en']))

50


# Creating dictionaries 

In [13]:

# French lookup table: position in the vocabulary list -> stripped word
word_dict_fr = dict(enumerate(word.strip() for word in fr_vocab))

# Small preview made of the first 10 entries of the French table
preview_keys = list(word_dict_fr)[:10]
example_dict = {key: word_dict_fr[key] for key in preview_keys}
example_dict
print(len(word_dict_fr))
# English lookup table: position in the vocabulary list -> stripped word
word_dict_eng = dict(enumerate(word.strip() for word in eng_vocab))
print(len(word_dict_eng))

30000
30000


# Train test split

In [22]:
# Split the dataset into training and test sets
train_test_split_ratio = 0.2  # 20% for testing
split_seed = 42  # fixed seed so the shuffled split is the same on every run
dataset_splits = padded_dataset.train_test_split(test_size=train_test_split_ratio, seed=split_seed)
# Index the splits by name rather than relying on the order of .values()
train_dataset = dataset_splits["train"]
test_dataset = dataset_splits["test"]

print(f"Training set size: {len(train_dataset)}")
print(f"Test set size: {len(test_dataset)}")

Training set size: 32000
Test set size: 8000


In [21]:
# Show the summary (features and row count) of the padded dataset.
print(padded_dataset)

Dataset({
    features: ['translation'],
    num_rows: 40000
})


In [23]:
# Persist the full padded dataset to the local "mini_dataset" directory.
# NOTE(review): this saves padded_dataset, not the train/test split objects -
# confirm that is intended.
padded_dataset.save_to_disk("mini_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 40000/40000 [00:00<00:00, 523137.10 examples/s]


In [16]:
# %pip installs into the environment of the running kernel (a bare `!pip`
# may resolve to a different interpreter). The version is pinned for
# reproducibility.
%pip install scikit-learn==1.3.2

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-cp311-cp311-win_amd64.whl (9.2 MB)
   ---------------------------------------- 0.0/9.2 MB ? eta -:--:--
   ---------------------------------------- 0.1/9.2 MB 2.3 MB/s eta 0:00:04
    --------------------------------------- 0.2/9.2 MB 2.5 MB/s eta 0:00:04
   - -------------------------------------- 0.2/9.2 MB 2.5 MB/s eta 0:00:04
   - -------------------------------------- 0.4/9.2 MB 2.1 MB/s eta 0:00:05
   -- ------------------------------------- 0.6/9.2 MB 2.7 MB/s eta 0:00:04
   --- ------------------------------------ 0.9/9.2 MB 3.2 MB/s eta 0:00:03
   ---- ----------------------------------- 1.0/9.2 MB 3.2 MB/s eta 0:00:03
   ----- ---------------------------------- 1.2/9.2 MB 3.4 MB/s eta 0:00:03
   ------ -------------------------

# One hot encoding

In [10]:
import numpy as np

def _one_hot_rows(sentence, word_to_idx, vocab_size):
    """Return a (len(sentence), vocab_size) array with one 1 per known token."""
    encoded = np.zeros((len(sentence), vocab_size))
    # Keep each row paired with its column so that a token missing from the
    # mapping cannot shift or shorten the index arrays.
    known = [(row, word_to_idx[word])
             for row, word in enumerate(sentence)
             if word in word_to_idx]
    if known:
        rows, cols = zip(*known)
        encoded[list(rows), list(cols)] = 1
    return encoded


def one_hot_encode(example, eng_word_to_idx, fr_word_to_idx, vocab_size=30000):
    """
    One-hot encodes the English and French sentences of a translation example.

    Args:
    - example (dict): Holds example['translation']['en'] and
      example['translation']['fr'], each a list of tokens/words.
    - eng_word_to_idx (dict): Maps an English word to its integer index,
      which must lie in [0, vocab_size).
    - fr_word_to_idx (dict): Maps a French word to its integer index,
      which must lie in [0, vocab_size).
    - vocab_size (int, optional): Width of each one-hot row. Defaults to 30000.

    Returns:
    - dict: The same `example` object with two added keys,
      'one_hot_encoded_en' and 'one_hot_encoded_fr', each a list of
      per-token rows of floats. A token that is absent from its mapping
      gets an all-zero row.
    """
    # Extracting English and French sentences from the example
    eng_sentence = example['translation']['en']
    fr_sentence = example['translation']['fr']

    # Update the example with the one-hot encoded sentences
    example['one_hot_encoded_en'] = _one_hot_rows(eng_sentence, eng_word_to_idx, vocab_size).tolist()
    example['one_hot_encoded_fr'] = _one_hot_rows(fr_sentence, fr_word_to_idx, vocab_size).tolist()

    return example
