# Imports

In [1]:
# IMPORTS
import torch
import torch.nn as nn
import json # for loading the json file
from torch.utils.data import Dataset # for creating the dataset
from torch.utils.data import DataLoader # for creating the dataloader
from torch.utils.data import random_split # for splitting the dataset
from torch.nn.utils.rnn import pad_sequence # for padding the sequences

# for tokenizing the sentences
import nltk 
from nltk.tokenize import word_tokenize 
nltk.download('punkt') 


# For splitting the dataset into train, validation and testing
import random
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to /Users/enzobenoit-
[nltk_data]     jeannin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Preprocessing

We first load the header JSON file, where we store paths, parameters, etc.

In [2]:
# Load the JSON header file
def load_json_header(json_file):
    """
    Parse a JSON file and return its content as Python objects.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (a dict for a JSON object).
    """
    with open(json_file) as handle:
        parsed = json.load(handle)
    return parsed

# Parse 'config.json' (a relative path, so it is resolved against the current working
# directory) into `config`; later cells read dataset paths, split sizes, seq_len and
# batch_size from it.
config = load_json_header('config.json')

### Functions

We now define the set of functions we will use to preprocess the data.



We define a function to load the data from the files. The file paths are defined in the config file.

In [3]:
def load_data(source_file_path, target_file_path):
    """
    Load data from two separate files where each line in one file corresponds to the line in the other file.

    Both files are read as UTF-8 and split on '\\n' only. ``str.splitlines`` is deliberately
    not used: it also breaks on characters such as U+2028 or U+0085, which may occur inside
    a sentence and would shift the line alignment between the two files.

    Args:
        source_file_path: Path of the file holding the source-language sentences.
        target_file_path: Path of the file holding the target-language sentences.

    Returns:
        A ``(source, target)`` pair of lists of strings, one entry per line. Blank lines
        inside a file are kept (they carry alignment); only the empty string produced by
        a newline at the very end of the file is dropped.
    """
    def _read_lines(path):
        # Read one file and return its lines without the end-of-file artefact.
        with open(path, 'r', encoding='utf-8') as file:
            lines = file.read().split('\n')
        # "a\nb\n".split('\n') gives ['a', 'b', '']: the last element is not a sentence,
        # it only reflects the terminating newline, so it must not become a sample.
        if lines and lines[-1] == '':
            lines.pop()
        return lines

    source = _read_lines(source_file_path)
    target = _read_lines(target_file_path)

    return source, target

In [4]:
def build_vocab(sentences, min_frequency=2, special_tokens=("[UNK]", "[SOS]", "[EOS]", "[PAD]")):
    """
    Build a word -> integer id vocabulary from an iterable of sentences.

    Special tokens receive the first ids, in the order given. Every word produced by
    ``word_tokenize`` that occurs at least ``min_frequency`` times over all sentences
    then receives the next free id, in order of first appearance.

    Args:
        sentences: Iterable of sentence strings.
        min_frequency: Minimum number of occurrences a word needs to enter the vocabulary.
        special_tokens: Tokens that are always present and occupy the lowest ids.

    Returns:
        A dict mapping each token to a unique integer id in ``range(len(vocab))``.
    """
    vocab = {}
    word_counts = {}

    # Initialize vocab with special tokens.
    # setdefault keeps the first id when a special token is listed twice; re-assigning
    # it would leave a gap that the next token fills with an already-used id.
    for token in special_tokens:
        vocab.setdefault(token, len(vocab))

    # Count word frequencies
    for sentence in sentences:
        for word in word_tokenize(sentence):
            word_counts[word] = word_counts.get(word, 0) + 1

    # Add words above min frequency to vocab.
    # A corpus word identical to a special token must keep the reserved id: overwriting
    # it would move the special token and hand the same id to the following word.
    for word, count in word_counts.items():
        if count >= min_frequency and word not in vocab:
            vocab[word] = len(vocab)

    return vocab

def tokenize_and_numericalize(sentence, vocab):
    """
    Convert a sentence into the list of vocabulary ids of its tokens.

    The sentence is split with ``word_tokenize``; tokens missing from ``vocab`` are
    replaced by the id stored under "[UNK]".
    """
    token_ids = []
    for token in word_tokenize(sentence):
        token_ids.append(vocab.get(token, vocab["[UNK]"]))
    return token_ids


In [5]:
def shuffle_and_split(source_sentences, target_sentences, test_size, val_size, seed=42):
    """
    Split aligned source/target sentences into train, validation and test sets.

    Shuffling is delegated to ``train_test_split``, which shuffles by default and is
    driven by ``seed``. No separate unseeded shuffle is applied beforehand, so the same
    inputs and seed yield the same split.

    Args:
        source_sentences: Iterable of source sentences.
        target_sentences: Iterable of target sentences, aligned with ``source_sentences``.
        test_size: Fraction of the whole dataset assigned to the test set (0 < value < 1).
        val_size: Fraction of the whole dataset assigned to the validation set (0 < value < 1).
        seed: Value passed as ``random_state`` to both ``train_test_split`` calls.

    Returns:
        ``(src_train, src_val, src_test, trg_train, trg_val, trg_test)`` as lists.

    Raises:
        ValueError: If the two inputs differ in length, or if the fractions are not
            strictly between 0 and 1 or leave nothing for training.
    """
    # Materialize first so generators are accepted and the lengths can be compared.
    source_sentences = list(source_sentences)
    target_sentences = list(target_sentences)

    # zip() would silently truncate to the shorter input and hide a misaligned corpus.
    if len(source_sentences) != len(target_sentences):
        raise ValueError(
            f"Source and target must have the same number of sentences "
            f"({len(source_sentences)} != {len(target_sentences)})")

    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            f"test_size and val_size must be fractions in (0, 1) with test_size + val_size < 1 "
            f"(got test_size={test_size}, val_size={val_size})")

    # Splitting into train+val and test
    src_train_val, src_test, trg_train_val, trg_test = train_test_split(
        source_sentences, target_sentences, test_size=test_size, random_state=seed)

    # Splitting train_val into train and val.
    # val_size is a fraction of the full dataset, so it is rescaled to a fraction of
    # what remains after the test set has been removed.
    src_train, src_val, trg_train, trg_val = train_test_split(
        src_train_val, trg_train_val, test_size=val_size / (1 - test_size), random_state=seed)

    return src_train, src_val, src_test, trg_train, trg_val, trg_test

In [6]:
def causal_mask(seq_len):
    """
    Build the causal (look-behind only) attention mask used by the decoder.

    Position i may attend to positions 0..i and never to later ones, which prevents
    the decoder from looking at future words.

    Args:
        seq_len: Length of the sequence.

    Returns:
        An int64 tensor of shape (1, seq_len, seq_len) holding 1 on and below the main
        diagonal and 0 above it.
    """
    all_ones = torch.ones(1, seq_len, seq_len, dtype=torch.int64)
    # tril() zeroes everything strictly above the diagonal of the last two dimensions
    return all_ones.tril()

### Dataset Class

In [7]:
class TranslationDataset(Dataset):
    """
    Dataset of aligned (source, target) sentence pairs encoded as fixed-length id tensors.

    Each item is tokenized with ``tokenize_and_numericalize``, framed with special tokens
    and padded to ``seq_len`` so that items can be stacked into batches.

    Args:
        source_ds: Indexable collection of source sentences.
        target_ds: Indexable collection of target sentences, aligned with ``source_ds``.
        src_vocab: Source vocabulary (token -> id); must contain "[SOS]", "[EOS]", "[PAD]".
        trg_vocab: Target vocabulary (token -> id); must contain "[SOS]", "[EOS]", "[PAD]".
        seq_len: Length every encoded sequence is padded to.
    """

    def __init__(self, source_ds, target_ds, src_vocab, trg_vocab, seq_len):
        super().__init__()
        self.source_ds = source_ds
        self.target_ds = target_ds
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.seq_len = seq_len

        # Special tokens for the encoder side, looked up in the source vocabulary
        self.sos_token = torch.tensor([src_vocab["[SOS]"]], dtype=torch.int64)
        self.eos_token = torch.tensor([src_vocab["[EOS]"]], dtype=torch.int64)
        self.pad_token = torch.tensor([src_vocab["[PAD]"]], dtype=torch.int64)

        # Decoder input and label contain target-vocabulary ids, so their special tokens
        # must come from the target vocabulary. Using the source ids is only correct when
        # both vocabularies happen to assign identical ids to the special tokens.
        self.trg_sos_token = torch.tensor([trg_vocab["[SOS]"]], dtype=torch.int64)
        self.trg_eos_token = torch.tensor([trg_vocab["[EOS]"]], dtype=torch.int64)
        self.trg_pad_token = torch.tensor([trg_vocab["[PAD]"]], dtype=torch.int64)

    def __len__(self):
        """Return the number of sentence pairs (length of ``source_ds``)."""
        return len(self.source_ds)

    def __getitem__(self, idx):
        """
        Encode the sentence pair at position ``idx``.

        Returns:
            A dict with
            - "encoder_input": [SOS] + source ids + [EOS] + padding, shape (seq_len,)
            - "decoder_input": [SOS] + target ids + padding, shape (seq_len,)
            - "label": target ids + [EOS] + padding, shape (seq_len,)
            - "encoder_mask": 1 for non-padding encoder positions, shape (1, 1, seq_len)
            - "decoder_mask": non-padding mask combined with the causal mask,
              shape (1, seq_len, seq_len)
            - "src" / "trg": the raw sentences

        Raises:
            ValueError: If a sentence plus its special tokens does not fit in ``seq_len``.
        """
        # Get source and target sentences
        src = self.source_ds[idx]
        trg = self.target_ds[idx]

        # Convert text to tokens, then map each token to its vocabulary id
        enc_input_tokens = tokenize_and_numericalize(src, self.src_vocab)
        dec_input_tokens = tokenize_and_numericalize(trg, self.trg_vocab)

        # Pad every sequence to seq_len so that the model can be trained in batches.
        # minus 2: the encoder input also holds the SOS and EOS tokens
        enc_num_padding = self.seq_len - len(enc_input_tokens) - 2

        # minus 1: the decoder input holds only SOS and the label holds only EOS
        dec_num_padding = self.seq_len - len(dec_input_tokens) - 1

        if enc_num_padding < 0 or dec_num_padding < 0:
            # ValueError is an Exception subclass, so existing handlers keep working
            raise ValueError("Sequence length is too long")

        # Add SOS and EOS tokens to the encoder input sequence
        encoder_input = torch.cat([
            self.sos_token,
            torch.tensor(enc_input_tokens, dtype=torch.int64),
            self.eos_token,
            self.pad_token.repeat(enc_num_padding)])

        # Add SOS token to the decoder input sequence
        decoder_input = torch.cat([
            self.trg_sos_token,
            torch.tensor(dec_input_tokens, dtype=torch.int64),
            self.trg_pad_token.repeat(dec_num_padding)])

        # Add EOS token to the decoder label sequence (decoder input shifted by one)
        label = torch.cat([
            torch.tensor(dec_input_tokens, dtype=torch.int64),
            self.trg_eos_token,
            self.trg_pad_token.repeat(dec_num_padding)])

        assert encoder_input.shape[0] == self.seq_len
        assert decoder_input.shape[0] == self.seq_len
        assert label.shape[0] == self.seq_len

        return {
            "encoder_input": encoder_input,  # size seq_len
            "decoder_input": decoder_input,  # size seq_len
            # Padding tokens must not participate in the self-attention mechanism
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),  # (1, 1, seq_len)

            # Each position may only look at previous, non-padding positions:
            # (1, 1, seq_len) & (1, seq_len, seq_len) broadcasts to (1, seq_len, seq_len)
            "decoder_mask": (decoder_input != self.trg_pad_token).unsqueeze(0).unsqueeze(0).int() & causal_mask(decoder_input.size(0)),
            "label": label,
            "src": src,
            "trg": trg
        }

### Apply preprocessing

In [8]:
# Look up the paths of the English-Italian parallel corpus in the config:
# one file per language (the data itself is read by load_data in the next cell).
en_file_path = config['en-it-dataset-english'] 
it_file_path = config['en-it-dataset-italian']

In [9]:
# Preprocessing pipeline: load the corpus, split it, then build the vocabularies.
source_file_path = en_file_path
target_file_path = it_file_path
# Load data from source and target files
source, target = load_data(source_file_path, target_file_path)

# Check if source and target files have the same number of lines
if len(source) != len(target):
    raise Exception("Source and target files do not have the same number of lines")

# We first get the validation and testing set sizes located in the config file
# The training set size follows from them and the total dataset size (see the shuffle and split function above)
val_size = config["val_size"]
test_size = config["test_size"]

# Split into train, validation and test sets, making sure that the source and target sentences are aligned
src_train, src_val, src_test, trg_train, trg_val, trg_test = shuffle_and_split(source, target, test_size=test_size, val_size=val_size)

# Check that the source and target datasets have the same size after split, otherwise raise an exception
if len(src_train) != len(trg_train) or len(src_val) != len(trg_val) or len(src_test) != len(trg_test):
    raise Exception("Source and target datasets do not have the same size")

# Build vocab dictionaries for source and target languages from the TRAINING split only.
# Building them from the full corpus leaks validation/test data into preprocessing and
# gives words seen only in the held-out sets their own ids, whose embeddings are never
# trained; with a train-only vocabulary those words map to [UNK] instead.
source_vocab = build_vocab(src_train)
target_vocab = build_vocab(trg_train)

In [10]:
# Create one TranslationDataset (class defined above) per split.
# Each receives its aligned source/target sentences, the shared source and target
# vocabularies, and the sequence length taken from the config file.
train_ds, val_ds, test_ds = (
    TranslationDataset(split_src, split_trg, source_vocab, target_vocab, config["seq_len"])
    for split_src, split_trg in (
        (src_train, trg_train),
        (src_val, trg_val),
        (src_test, trg_test),
    )
)

# Print the number of sentence pairs in each split
print("Train dataset size:", len(train_ds))
print("Validation dataset size:", len(val_ds))
print("Test dataset size:", len(test_ds))

Train dataset size: 1527292
Validation dataset size: 190912
Test dataset size: 190912


In [11]:
# Create dataloaders
# - training: batches of config["batch_size"], reshuffled by the loader
# - validation: one sentence pair per batch, in dataset order
# - test: batches of config["batch_size"], in dataset order
train_dl = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True)
val_dl = DataLoader(val_ds, batch_size=1, shuffle=False)
test_dl = DataLoader(test_ds, batch_size=config["batch_size"], shuffle=False)

# Print the first training batch as a sanity check.
# The loop stops after one iteration, so only a single batch is fetched.
print("Some examples from the dataloader:")
for batch in train_dl:
    print(batch)
    break

Some examples from the dataloader:
{'encoder_input': tensor([[    1, 34564,   214,  ...,     3,     3,     3],
        [    1,     8,   223,  ...,     3,     3,     3],
        [    1,  2055,    34,  ...,     3,     3,     3],
        ...,
        [    1,     8,   943,  ...,     3,     3,     3],
        [    1,    96,   444,  ...,     3,     3,     3],
        [    1,   299,   214,  ...,     3,     3,     3]]), 'decoder_input': tensor([[    1, 55649, 53128,  ...,     3,     3,     3],
        [    1,   248,    90,  ...,     3,     3,     3],
        [    1, 12596,  1355,  ...,     3,     3,     3],
        ...,
        [    1,  1050,  1367,  ...,     3,     3,     3],
        [    1,   199,  1217,  ...,     3,     3,     3],
        [    1,   716,   391,  ...,     3,     3,     3]]), 'encoder_mask': tensor([[[[1, 1, 1,  ..., 0, 0, 0]]],


        [[[1, 1, 1,  ..., 0, 0, 0]]],


        [[[1, 1, 1,  ..., 0, 0, 0]]],


        ...,


        [[[1, 1, 1,  ..., 0, 0, 0]]],


        [[[1,

In [12]:
# # Load the English-Italian dataset
# en_file_path = config['en-it-dataset-english'] 
# it_file_path = config['en-it-dataset-italian']

# preprocess(en_file_path, it_file_path)