In [8]:
from google.colab import drive
import os
from sklearn.model_selection import train_test_split
# Mount Google Drive into the Colab filesystem at /content/drive; the data paths below live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Folder on the mounted Drive that holds the translation data used by this notebook.
base_path = '/content/drive/My Drive/NLP Data/seq2seq/manythings'
# Sanity check: evaluates to True when the folder is reachable.
os.path.exists(base_path)

True

We are going to split the data into 3 sets:

> train, test and valid

In [6]:
# Location of the raw German-English sentence-pair file, relative to base_path.
de_path = 'German - English/deu.txt'

In [9]:
# Read the raw corpus explicitly as UTF-8 (the German side contains non-ASCII
# characters such as "ü" and "ß") and close the file as soon as it has been read.
with open(os.path.join(base_path, de_path), encoding='utf-8') as corpus_file:
  lines = corpus_file.read().split('\n')

eng, deu = [], []

for line in lines:
  # A usable row has exactly three tab-separated fields: the English sentence,
  # the German sentence, and a third field that is not used here.
  fields = line.split('\t')
  if len(fields) != 3:
    # Skip rows with any other shape (for example an empty line) instead of
    # hiding every possible error behind a bare `except`.
    continue
  en, de, _ = fields
  eng.append(en)
  deu.append(de)


In [12]:
# Preview the first three English sentences next to the first three German sentences.
eng[:3], deu[:3]

(['Go.', 'Hi.', 'Hi.'], ['Geh.', 'Hallo!', 'Grüß Gott!'])

Split eng and deu into three sets

In [18]:
# First cut: hold out 0.5% of all sentence pairs as the test set. Passing both
# lists to one call keeps the English/German pairs aligned.
train_eng, test_eng, train_deu, test_deu = train_test_split(
    eng, deu, test_size=.005, random_state=42
)

# Second cut: hold out 0.5% of the remaining training pairs as the validation set.
train_eng, val_eng, train_deu, val_deu = train_test_split(
    train_eng, train_deu, test_size=.005, random_state=42
)

# Display the resulting split sizes (train, test, validation).
len(train_eng), len(test_eng), len(val_eng)

(240166, 1213, 1207)

Now we are going to create 6 files from these lists of data:

```
1. train.de
2. train.en
3. test.de
4. test.en
5. valid.de
6. valid.en
```


In [23]:
def write_lines(path, sentences):
  """Write every sentence in `sentences` to `path`, one per line, as UTF-8 text.

  Arguments:
      path: Destination file path; an existing file is overwritten.
      sentences: Iterable of strings; a newline is appended to each one.
  """
  # The context manager closes the file even if a write fails, and the explicit
  # encoding keeps the output readable by the UTF-8 reader used when loading.
  with open(path, "w", encoding="utf-8") as writer:
    writer.writelines(sentence + '\n' for sentence in sentences)


# Output file name -> sentences it receives. The order matches the original
# cell: the three English files first, then the three German files.
split_files = {
  "train.en": train_eng,
  "test.en": test_eng,
  "valid.en": val_eng,
  "train.de": train_deu,
  "test.de": test_deu,
  "valid.de": val_deu,
}

for file_name, sentences in split_files.items():
  write_lines(os.path.join(base_path, "German - English/" + file_name), sentences)
  print("done")



done
done
done
done
done
done


Loading the dataset...

In [24]:
import torch
from torch import nn
from torch.nn  import functional as F
import spacy, math, random
import numpy as np
from torchtext.legacy import datasets, data

In [25]:
import spacy
import spacy.cli
# Download the small German spaCy model; this call runs every time the cell is executed.
spacy.cli.download('de_core_news_sm')

# NOTE(review): these two module objects are not referenced anywhere else in the
# visible notebook; presumably the import only confirms that both model packages
# are installed — confirm before removing.
import de_core_news_sm, en_core_web_sm

# Only the German model is downloaded above; the English model is assumed to be
# already installed in the runtime — TODO confirm on a fresh environment.
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [26]:
def tokenize_de(sent):
  """Tokenize a German sentence with the `spacy_de` tokenizer and return the token texts as a list of strings."""
  token_texts = []
  for token in spacy_de.tokenizer(sent):
    token_texts.append(token.text)
  return token_texts

def tokenize_en(sent):
  """Tokenize an English sentence with the `spacy_en` tokenizer and return the token texts as a list of strings."""
  token_texts = []
  for token in spacy_en.tokenizer(sent):
    token_texts.append(token.text)
  return token_texts

In [27]:

# Options shared by both fields: lowercasing plus explicit start-of-sentence
# and end-of-sentence tokens.
common_field_options = dict(lower=True, init_token="<sos>", eos_token="<eos>")

# Source side (German). Only this field is created with include_lengths=True.
SRC = data.Field(tokenize=tokenize_de, include_lengths=True, **common_field_options)

# Target side (English).
TRG = data.Field(tokenize=tokenize_en, **common_field_options)

We are making use of the following class to create a dataset from our custom files. The code below was taken from [this](https://pytorch.org/text/_modules/torchtext/datasets/translation.html) site.

In [None]:
class TranslationDataset(data.Dataset):
    """Defines a dataset for machine translation.

    Each example pairs one line of a source-language file with the line at the
    same position in a target-language file.
    """

    @staticmethod
    def sort_key(ex):
        """Return a sort key built from the source and target lengths of `ex`."""
        return data.interleave_keys(len(ex.src), len(ex.trg))

    def __init__(self, path, exts, fields, **kwargs):
        """Create a TranslationDataset given paths and fields.

        Arguments:
            path: Common prefix of paths to the data files for both languages.
            exts: A tuple containing the extension to path for each language.
            fields: A tuple containing the fields that will be used for data
                in each language. Bare fields are named 'src' and 'trg';
                (name, field) pairs are used as given.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        if not isinstance(fields[0], (tuple, list)):
            fields = [('src', fields[0]), ('trg', fields[1])]

        src_path, trg_path = tuple(os.path.expanduser(path + x) for x in exts)

        examples = []
        # The built-in open() is used here: the `io` module is not imported in
        # this notebook, so io.open would raise NameError.
        with open(src_path, mode='r', encoding='utf-8') as src_file, \
                open(trg_path, mode='r', encoding='utf-8') as trg_file:
            # zip() stops at the shorter file, so both files are expected to
            # contain the same number of lines.
            for src_line, trg_line in zip(src_file, trg_file):
                src_line, trg_line = src_line.strip(), trg_line.strip()
                # Drop the pair when either side is empty after stripping.
                if src_line != '' and trg_line != '':
                    examples.append(data.Example.fromlist(
                        [src_line, trg_line], fields))

        super(TranslationDataset, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, exts, fields, path=None, root='.data',
               train='train', validation='val', test='test', **kwargs):
        """Create dataset objects for splits of a TranslationDataset.

        Arguments:
            exts: A tuple containing the extension to path for each language.
            fields: A tuple containing the fields that will be used for data
                in each language.
            path (str): Common prefix of the splits' file paths, or None to use
                the result of cls.download(root).
            root: Root dataset storage directory. Default is '.data'.
            train: The prefix of the train data. Default: 'train'.
            validation: The prefix of the validation data. Default: 'val'.
            test: The prefix of the test data. Default: 'test'.
            Remaining keyword arguments: Passed to the constructor of each
                split's dataset.

        Returns:
            A tuple of the datasets that were requested, in the order
            (train, validation, test); a split whose prefix is None is omitted.
        """
        if path is None:
            path = cls.download(root)

        train_data = None if train is None else cls(
            os.path.join(path, train), exts, fields, **kwargs)
        val_data = None if validation is None else cls(
            os.path.join(path, validation), exts, fields, **kwargs)
        test_data = None if test is None else cls(
            os.path.join(path, test), exts, fields, **kwargs)
        return tuple(d for d in (train_data, val_data, test_data)
                     if d is not None)

In [33]:
# Directory that contains the raw corpus and the train/test/valid split files.
root_path = os.path.join(base_path, "German - English")
# List the directory contents to confirm that the split files are present.
os.listdir(root_path)

['deu.txt',
 'train.en',
 'test.en',
 'valid.en',
 'train.de',
 'test.de',
 'valid.de',
 'multi30k']

In [35]:
# Load the three splits from root_path using the file prefixes train/valid/test
# and the extensions .de (source, SRC field) and .en (target, TRG field).
train_data, valid_data, test_data = datasets.TranslationDataset.splits(
    path=root_path,
    exts=('.de', '.en'),
    fields=(SRC, TRG),
    train='train',
    validation='valid',
    test='test',
)

In [36]:
# Show the attributes (tokenized 'src' and 'trg') of the first training example.
print(vars(train_data[0]))

{'src': ['du', 'solltest', 'es', 'tun', '.'], 'trg': ['you', 'should', 'do', 'it', '.']}
