In [None]:
!pip install nltk
!pip install spacy
!python -m spacy download fr_dep_news_trf
!python -m spacy download en_core_web_trf

In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

## Load dataset

In [None]:
from datasets import load_dataset

# Each split is loaded straight from one parquet shard of the fr-en configuration.
# Plain parquet files need no repository loading script, so `trust_remote_code`
# (which allows arbitrary repository code to run) is intentionally not passed.
# Only shard 17 of the 30 training shards is loaded.
WMT14_REPO = 'wmt/wmt14'

train_data_files = {'train': 'fr-en/train-00017-of-00030.parquet'}
dataset_train = load_dataset(path=WMT14_REPO, data_files=train_data_files)

test_data_files = {'test': 'fr-en/test-00000-of-00001.parquet'}
dataset_test = load_dataset(path=WMT14_REPO, data_files=test_data_files)

validation_data_files = {'validation': 'fr-en/validation-00000-of-00001.parquet'}
dataset_validation = load_dataset(path=WMT14_REPO, data_files=validation_data_files)

In [None]:
# from datasets import load_dataset, load_from_disk

# dataset_train = load_from_disk('/kaggle/working/train_data')
# dataset_test = load_from_disk('/kaggle/working/test_data')
# dataset_validation = load_from_disk('/kaggle/working/validation_data')

## Sort by sentence length

In [None]:
def get_sentence_length(example):
    """Annotate one example with the word count of its first translation.

    The first value of ``example['translation']`` is treated as the source
    text. Its number of whitespace-separated words is stored under
    ``example['length']``. The example is modified in place and returned.
    """
    translations = tuple(example['translation'].values())
    word_count = len(translations[0].split())
    example['length'] = word_count
    return example

def sort_sentence_length(dataset):
    """Return ``dataset`` sorted by the length of each example's source sentence.

    A temporary ``length`` column is computed with ``get_sentence_length``,
    used as the sort key, and removed again before the result is returned.
    """
    return (
        dataset
        .map(get_sentence_length)
        .sort('length')
        .remove_columns('length')
    )

# Sort every split by source-sentence length so neighbouring examples have
# similar lengths. Each name is rebound to its sorted version.
dataset_train = sort_sentence_length(dataset_train)
dataset_test = sort_sentence_length(dataset_test)
dataset_validation = sort_sentence_length(dataset_validation)

## Tokenization

In [None]:
import spacy
# Load the English and French spaCy pipelines downloaded in the install cell.
# The cells below only call the `.tokenizer` attribute of these pipelines.
en_nlp = spacy.load('en_core_web_trf')
fr_nlp = spacy.load('fr_dep_news_trf')

  model.load_state_dict(torch.load(filelike, map_location=device))


In [None]:
def batch_tokenize(example, en_nlp, fr_nlp, max_length, sos_token, eos_token):
    """Tokenize one translation pair into lower-cased, length-capped token lists.

    Despite its name, this processes a single example: it reads the strings
    ``example['translation']['en']`` and ``example['translation']['fr']``.

    Args:
        example: mapping holding the translation pair as described above.
        en_nlp: object exposing ``.tokenizer(text)`` for English; the tokenizer
            must yield items with a ``.text`` attribute.
        fr_nlp: same as ``en_nlp`` but for French.
        max_length: maximum number of word tokens kept per sentence, counted
            before the start/end markers are added. ``None`` disables the cap.
        sos_token: marker placed at the start of every token list.
        eos_token: marker placed at the end of every token list.

    Returns:
        dict with the keys ``'en_tokens'`` and ``'fr_tokens'``.
    """
    def _tokenize(nlp, text):
        # Lower-case every token, then enforce the length cap. ``max_length``
        # used to be accepted but silently ignored.
        words = [token.text.lower() for token in nlp.tokenizer(text)]
        return [sos_token] + words[:max_length] + [eos_token]

    pair = example['translation']
    return {
        'en_tokens': _tokenize(en_nlp, pair['en']),
        'fr_tokens': _tokenize(fr_nlp, pair['fr']),
    }


# Tokenization settings shared by every split.
max_length = 1000
sos_token = '<sos>'
eos_token = '<eos>'
pad_token = '<pad>'

# Extra keyword arguments handed to batch_tokenize through `map`.
fn_kwargs = dict(
    en_nlp=en_nlp,
    fr_nlp=fr_nlp,
    max_length=max_length,
    sos_token=sos_token,
    eos_token=eos_token,
)

# Pick the single split out of each loaded object and tokenize it.
# NOTE(review): each name is rebound to the result of `[...].map(...)`, so this
# cell presumably cannot be re-run without reloading the data first — confirm.
dataset_train = dataset_train['train'].map(batch_tokenize, fn_kwargs=fn_kwargs)
dataset_test = dataset_test['test'].map(batch_tokenize, fn_kwargs=fn_kwargs)
dataset_validation = dataset_validation['validation'].map(batch_tokenize, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/1361224 [00:00<?, ? examples/s]

Map:   0%|          | 0/3003 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# from itertools import chain
# en_unique_words = set(chain(*dataset_train['en_tokens']))
# fr_unique_words = set(chain(*dataset_train['fr_tokens']))

In [None]:
from collections import Counter

def process_translations(df):
    """Collect both sides of every translation pair into a DataFrame.

    ``df['translation']`` must be iterable and yield mappings that have the
    keys ``'en'`` and ``'fr'``.

    Returns:
        A pandas DataFrame with the columns ``'en'`` and ``'fr'`` (in that
        order), one row per translation pair.
    """
    pairs = list(df['translation'])
    return pd.DataFrame({
        lang: [pair[lang] for pair in pairs]
        for lang in ('en', 'fr')
    })

# Raw (untokenized) training sentences of each language as plain Python lists.
df_words = process_translations(dataset_train)
en = list(df_words['en'])
fr = list(df_words['fr'])


def lang_str_int(lang, nlp, min_freq=2):
    """Build the token-to-index and index-to-token vocabularies of one language.

    Args:
        lang: iterable of raw sentences (strings).
        nlp: object exposing ``.tokenizer(sentence)``; the tokenizer must yield
            items with a ``.text`` attribute.
        min_freq: minimum number of occurrences a lower-cased token needs to
            enter the vocabulary. Defaults to 2, the previously hard-coded value.

    Returns:
        ``(str2int, int2str)``. Indices 0-3 are always ``'<unk>'``, ``'<pad>'``,
        ``'<sos>'`` and ``'<eos>'``. The remaining tokens follow in order of
        first appearance in ``lang``.
    """
    special_vocab = ['<unk>', '<pad>', '<sos>', '<eos>']
    vocab_counter = Counter(
        token.text.lower()
        for sentence in lang
        for token in nlp.tokenizer(sentence)
    )
    # A corpus token that spells a special token must not be listed a second
    # time. Doing so would overwrite the reserved index in str2int and leave
    # str2int and int2str with different sizes.
    reserved = set(special_vocab)
    lang_vocab = special_vocab + [
        word
        for word, freq in vocab_counter.items()
        if freq >= min_freq and word not in reserved
    ]
    str2int = {word: index for index, word in enumerate(lang_vocab)}
    int2str = dict(enumerate(lang_vocab))
    return str2int, int2str


# Build the French and English vocabularies from the raw training sentences.
fr_str2int, fr_int2str = lang_str_int(fr, fr_nlp)
en_str2int, en_int2str = lang_str_int(en, en_nlp)
# Display both vocabulary sizes (the four special tokens are included).
len(fr_str2int), len(en_str2int)

(150345, 126272)

## Convert tokens to ids

In [None]:
import torch
import numpy as np
import torch.nn as nn

def token_to_int(example, str2int):
    """Translate a sequence of tokens into their integer ids.

    ``example`` is the token sequence itself. Tokens that are missing from
    ``str2int`` receive the id stored under ``'<unk>'``.
    """
    ids = []
    for token in example:
        ids.append(str2int.get(token, str2int['<unk>']))
    return ids

def tokens_to_ids(example):
    """Add the ``'en_ids'`` and ``'fr_ids'`` entries to one example.

    The token lists under ``'en_tokens'`` and ``'fr_tokens'`` are converted
    with the module-level vocabularies ``en_str2int`` and ``fr_str2int``.
    The example is modified in place and returned.
    """
    lookups = (
        ('en_tokens', 'en_ids', en_str2int),
        ('fr_tokens', 'fr_ids', fr_str2int),
    )
    for source_key, target_key, vocab in lookups:
        example[target_key] = token_to_int(example[source_key], vocab)
    return example

# Add the integer id columns ('en_ids', 'fr_ids') to every split.
dataset_train = dataset_train.map(tokens_to_ids)
dataset_test = dataset_test.map(tokens_to_ids)
dataset_validation = dataset_validation.map(tokens_to_ids)

Map:   0%|          | 0/1361224 [00:00<?, ? examples/s]

Map:   0%|          | 0/3003 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
'''
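Reverse the order of the source-language tokens, e.g. ['how', 'are', 'you'] -> ['you', 'are', 'how']
(the tokens themselves are unchanged; only their order is reversed).
This is said to give a significant boost in model accuracy.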
example:
    source_lang: a, b, c
    target_lang: α, β, γ

    reversing: c, b, a -> α, β, γ
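    Why? Because the first source tokens end up next to the first target tokens,
    which shortens the distance between corresponding words and makes the
    link between source and target easier to learn.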
'''

def reverse_source_lang(example):
    """Flip the order of the ids stored under ``example['en_ids']``.

    The example is modified in place and returned. The ids include the
    start/end markers added during tokenization, so those swap ends as well.
    """
    flipped = list(example['en_ids'])
    flipped.reverse()
    example['en_ids'] = flipped
    return example

# Reverse the English (source) ids of every split.
dataset_train = dataset_train.map(reverse_source_lang)
dataset_test = dataset_test.map(reverse_source_lang)
dataset_validation = dataset_validation.map(reverse_source_lang)

# Expose only the two id columns, as torch tensors, for every split.
for _split in (dataset_train, dataset_test, dataset_validation):
    _split.set_format(
        type='torch',
        columns=['en_ids', 'fr_ids'],
        output_all_columns=False,
    )

Map:   0%|          | 0/1361224 [00:00<?, ? examples/s]

Map:   0%|          | 0/3003 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

### Saving data

In [None]:
from pathlib import Path
# Save every processed split into the directory that is created for it.
# The directories created here (`*_data`) used to differ from the ones
# `save_to_disk` wrote to (`dataset_*`). The created directories stayed empty,
# and the commented-out `load_from_disk` cell, which reads the `*_data` paths,
# could not find the saved data.
WORKING_DIR = Path('/kaggle/working')
save_targets = {
    'train_data': dataset_train,
    'test_data': dataset_test,
    'validation_data': dataset_validation,
}
for dir_name, split in save_targets.items():
    target_dir = WORKING_DIR / dir_name
    target_dir.mkdir(parents=True, exist_ok=True)
    split.save_to_disk(str(target_dir))

Saving the dataset (0/4 shards):   0%|          | 0/1361224 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3003 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
import json

# Write the four vocabulary mappings as JSON files into the working directory.
# JSON object keys are always strings, so the integer keys of the *_int2str
# mappings are stored as strings in the files.
vocab_files = (
    ('fr_str2int.json', fr_str2int),
    ('fr_int2str.json', fr_int2str),
    ('en_str2int.json', en_str2int),
    ('en_int2str.json', en_int2str),
)
for file_name, mapping in vocab_files:
    with open(file_name, 'w') as f:
        json.dump(mapping, f)

In [None]:
# data batching
def get_collate_fn(pad_index):
    """Create a collate function that pads the id sequences of a batch.

    Args:
        pad_index: value used to fill the sequences up to the longest one.

    Returns:
        A function taking a list of examples (each with the tensors
        ``'en_ids'`` and ``'fr_ids'``) and returning a dict with the same two
        keys. Each value is the padded batch produced by ``pad_sequence`` with
        its default layout (sequence dimension first).
    """
    def collate_fn(batch):
        padded = {}
        for key in ('en_ids', 'fr_ids'):
            sequences = [example[key] for example in batch]
            padded[key] = nn.utils.rnn.pad_sequence(sequences, padding_value=pad_index)
        return padded

    return collate_fn


def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a DataLoader that pads every batch with ``pad_index``.

    Args:
        dataset: dataset whose examples provide ``'en_ids'`` and ``'fr_ids'``.
        batch_size: number of examples per batch.
        pad_index: padding value handed to the collate function.
        shuffle: whether the DataLoader shuffles the examples.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=get_collate_fn(pad_index),
        shuffle=shuffle,
    )


# data loader
batch_size = 128
PAD_INDEX = en_str2int[pad_token]
# The collate function pads both languages with the same index, so the two
# vocabularies must agree on it before any batch is built.
assert fr_str2int[pad_token] == PAD_INDEX, (
    'English and French vocabularies disagree on the padding index'
)

# shuffle=False keeps the length-sorted order of the splits.
train_data_loader = get_data_loader(dataset_train, batch_size, PAD_INDEX, shuffle=False)
test_data_loader = get_data_loader(dataset_test, batch_size, PAD_INDEX, shuffle=False)
validation_data_loader = get_data_loader(dataset_validation, batch_size, PAD_INDEX, shuffle=False)

# Inspect the shapes of the first training batch.
result = next(iter(train_data_loader))
result['en_ids'].shape, result['fr_ids'].shape

(torch.Size([9, 128]), torch.Size([24, 128]))

In [None]:
# Sanity check: PAD_INDEX comes from the English vocabulary but pads both
# languages, so both vocabularies must map the padding token to the same index.
en_str2int[pad_token] == fr_str2int[pad_token]

True