In [1]:
import re
import torch
import pandas as pd
from tqdm import tqdm
import torch.utils.data as data
from unicodedata import normalize

In [2]:
# Read the tab-separated sentence pairs. Three columns are named, but only the
# 'en' and 'fr' text columns are loaded; rows are then shuffled with a fixed
# seed and re-indexed from 0.
df = (
    pd.read_csv(
        './en-fr.txt',
        sep='\t',
        names=['en', 'fr', 'attr'],
        usecols=['en', 'fr'],
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,en,fr
0,You're very clever.,Vous êtes fort ingénieuse.
1,Are there kids?,Y a-t-il des enfants ?
2,Come in.,Entrez !
3,Where's Boston?,Où est Boston ?
4,You see what I mean?,Vous voyez ce que je veux dire ?


In [3]:
# Text cleaning
# For English (the same cleaning is also applied to the French text):
# 1. Remove punctuation symbols and numbers.
# 2. Convert characters to lowercase.
# 3. Replace Unicode characters with their ASCII equivalents.

# For French
# 1. Add [start] and [end] tokens at the beginning and end of each phrase.

def clean_text(text):
    """Lowercase ``text`` and reduce it to ASCII letters and spaces.

    Steps:
      1. Lowercase the input.
      2. Transliterate the ligatures ``œ`` and ``æ`` to ``oe`` / ``ae``.
         NFD normalisation does not decompose them, so without this step the
         ASCII filter below would delete them outright ("cœur" -> "cur").
      3. NFD-normalise so accented letters split into a base letter plus
         combining marks.
      4. Drop every character that is not an ASCII letter or a space, which
         removes the combining marks, digits and punctuation.

    Args:
        text: The raw sentence (``str``).

    Returns:
        The cleaned sentence. Spaces are kept as-is, so removing a trailing
        punctuation mark can leave a trailing space.
    """
    # '\u0153' is 'œ' and '\u00e6' is 'æ'; lower() has already folded Œ/Æ.
    text = text.lower().replace('\u0153', 'oe').replace('\u00e6', 'ae')
    text = normalize('NFD', text)
    text = re.sub('[^A-Za-z ]+', '', text)
    return text

def clean_prepare_text(text):
    """Clean ``text`` with ``clean_text`` and wrap it in sequence markers.

    Returns the cleaned sentence prefixed with ``[start]`` and suffixed with
    ``[end]``, each separated from the sentence by a single space.
    """
    cleaned = clean_text(text)
    return f'[start] {cleaned} [end]'

# Overwrite both columns with their cleaned form: plain cleaning for the
# English side, cleaning plus [start]/[end] markers for the French side.
# NOTE: this rewrites df in place, so re-running the cell cleans the already
# cleaned text again.
df['en'] = df['en'].apply(clean_text)
df['fr'] = df['fr'].apply(clean_prepare_text)
df.head()

Unnamed: 0,en,fr
0,youre very clever,[start] vous etes fort ingenieuse [end]
1,are there kids,[start] y atil des enfants [end]
2,come in,[start] entrez [end]
3,wheres boston,[start] ou est boston [end]
4,you see what i mean,[start] vous voyez ce que je veux dire [end]


In [4]:
# create train-val-test split
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# The test split receives whatever is left after train and val, so the three
# ratios are only meaningful if they sum to 1 (tolerance for float rounding).
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, \
    "train/val/test ratios must sum to 1"

num_sentences = len(df)
num_train = int(train_ratio * num_sentences)
num_val = int(val_ratio * num_sentences)
# Remainder goes to test so that the three sizes always add up to num_sentences.
num_test = num_sentences - num_train - num_val

# Shuffle the dataset. The seed is fixed so that the train/val/test membership
# is identical on every run; an unseeded shuffle here would make the split
# (and every downstream result) change between runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


In [5]:
# Carve the shuffled frame into three contiguous, non-overlapping row ranges:
# the first num_train rows, the next num_val rows, and everything after that.
train_df = df.iloc[:num_train]
val_df = df.iloc[num_train:num_train + num_val]
test_df = df.iloc[num_train + num_val:]

In [6]:
# Preview the first five training pairs.
train_df.head(5)

Unnamed: 0,en,fr
0,this isnt easy,[start] ce nest pas facile [end]
1,this coat fits you,[start] ce manteau est a ta taille [end]
2,dont change a thing,[start] ne changez rien [end]
3,im happy youre here,[start] je suis heureux que tu sois la [end]
4,where did you go,[start] ou vous etesvous rendue [end]


In [7]:
# tokenization
# One list of token-id sequences per split and per language.
train_en_tokens, train_fr_tokens = [], []
val_en_tokens, val_fr_tokens = [], []
test_en_tokens, test_fr_tokens = [], []

# Vocabularies map token -> integer id; id 0 is reserved for the <pad> token.
en_vocab = {'<pad>': 0}  # English vocabulary
fr_vocab = {'<pad>': 0}  # French vocabulary

def tokenize_sentence(sentence, vocab):
    """Convert a whitespace-separated sentence into a list of token ids.

    Args:
        sentence: The sentence to split on whitespace.
        vocab: Mapping of token -> id. It is mutated: every token not yet
            present is added with the next free id (the current size of
            the mapping).

    Returns:
        The ids of the sentence's tokens, in order.
    """
    # setdefault returns the existing id, or stores and returns len(vocab)
    # (evaluated before the insertion) for a token seen for the first time.
    return [vocab.setdefault(word, len(vocab)) for word in sentence.split()]

# Tokenize the three splits in order: training, validation, testing.
# Each entry is (progress banner, source frame, English output, French output).
# NOTE(review): the vocabularies also grow while the validation and test
# splits are tokenized, so they are not built from training data alone.
split_jobs = (
    ("Tokenizing training data:", train_df, train_en_tokens, train_fr_tokens),
    ("Tokenizing validation data:", val_df, val_en_tokens, val_fr_tokens),
    ("Tokenizing testing data:", test_df, test_en_tokens, test_fr_tokens),
)

for banner, frame, en_out, fr_out in split_jobs:
    print(banner)
    sentence_pairs = zip(frame['en'], frame['fr'])
    for en_sent, fr_sent in tqdm(sentence_pairs, total=len(frame)):
        en_out.append(tokenize_sentence(en_sent, en_vocab))
        fr_out.append(tokenize_sentence(fr_sent, fr_vocab))

# Vocabulary sizes (including the <pad> entry) once all splits are processed.
src_vocab_size = len(en_vocab)
tgt_vocab_size = len(fr_vocab)

Tokenizing training data:


100%|██████████| 40000/40000 [00:00<00:00, 375202.47it/s]


Tokenizing validation data:


100%|██████████| 5000/5000 [00:00<00:00, 825747.92it/s]


Tokenizing testing data:


100%|██████████| 5000/5000 [00:00<00:00, 241498.87it/s]


In [8]:
class TranslationDataset(data.Dataset):
    """Paired English/French token-id sequences, zero-padded to a common length.

    Args:
        en_tokens: List of English sentences, each a list of int token ids.
        fr_tokens: List of French sentences, each a list of int token ids.
            Must have the same number of sentences as ``en_tokens``.
        max_len: Optional length every sequence is padded to. When ``None``
            (the default) the longest sequence found in either language is
            used. Pass an explicit value to give several datasets (e.g. the
            train/val/test splits) the same padded length.

    Raises:
        ValueError: If the two lists differ in length, or if ``max_len`` is
            shorter than the longest sequence (padding never truncates).
    """

    def __init__(self, en_tokens, fr_tokens, max_len=None):
        super().__init__()
        # zip() below would silently drop unpaired sentences, while __len__
        # counts only the English side, so reject a mismatch up front.
        if len(en_tokens) != len(fr_tokens):
            raise ValueError(
                f"en_tokens and fr_tokens must have the same number of sentences, "
                f"got {len(en_tokens)} and {len(fr_tokens)}"
            )
        self.en_tokens = en_tokens
        self.fr_tokens = fr_tokens

        # Longest sequence across both languages; default=0 keeps an empty
        # dataset constructible instead of failing inside max().
        longest = max(
            (max(len(en_ids), len(fr_ids)) for en_ids, fr_ids in zip(en_tokens, fr_tokens)),
            default=0,
        )
        if max_len is None:
            max_len = longest
        elif max_len < longest:
            raise ValueError(
                f"max_len={max_len} is shorter than the longest sequence ({longest})"
            )
        self.max_len = max_len  # common padded length of every returned tensor

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.en_tokens)

    def _pad(self, token_ids):
        """Return ``token_ids`` followed by zeros up to ``self.max_len``."""
        return token_ids + [0] * (self.max_len - len(token_ids))

    def __getitem__(self, index):
        """Return the ``index``-th pair as two int64 tensors of length ``max_len``."""
        en_data = self._pad(self.en_tokens[index])  # post zero padding
        fr_data = self._pad(self.fr_tokens[index])  # post zero padding

        # The explicit dtype keeps ids integral even for an all-empty sequence.
        return (
            torch.tensor(en_data, dtype=torch.long),
            torch.tensor(fr_data, dtype=torch.long),
        )

In [9]:
# Wrap each split's English/French token-id lists in a TranslationDataset.
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(en_ids, fr_ids)
    for en_ids, fr_ids in (
        (train_en_tokens, train_fr_tokens),
        (val_en_tokens, val_fr_tokens),
        (test_en_tokens, test_fr_tokens),
    )
)

In [10]:
batch_size = 64

# Training batches are shuffled; validation keeps dataset order with the same
# batch size; the test loader yields one sentence pair per batch.
train_loader = data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = data.DataLoader(dataset=val_dataset, batch_size=batch_size)
test_loader = data.DataLoader(dataset=test_dataset, batch_size=1)