### 1. Import Dependencies and Connect to Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import numpy as np
import tensorflow_datasets as tfds
from tensorflow.keras.layers import TextVectorization

### 2. Get Dataset

In [3]:
config = tfds.translate.opus.OpusConfig(
    version=tfds.core.Version('0.1.0'),
    language_pair=("de","en"),
    subsets=["GNOME", "EMEA"]
)
builder = tfds.builder("opus", config=config)



In [4]:
builder.download_and_prepare()

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to ~/tensorflow_datasets/opus/de-en for GNOME, EMEA/0.1.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling ~/tensorflow_datasets/opus/de-en for GNOME, EMEA/0.1.0.incomplete8TXOGB/opus-train.tfrecord*...:   0…

[1mDataset opus downloaded and prepared to ~/tensorflow_datasets/opus/de-en for GNOME, EMEA/0.1.0. Subsequent calls will reuse this data.[0m


In [5]:
ds = builder.as_dataset(split='train')

In [7]:
ds.as_numpy_iterator().next()

{'de': b'Bei tr\xc3\xa4chtigen Ratten war die AUC f\xc3\xbcr die berechnete ungebundene Substanz bei dieser Dosis etwa 18-mal h\xc3\xb6her als die AUC beim Menschen bei einer 20 mg Dosis.\n',
 'en': b'In the pregnant rat the AUC for calculated free drug at this dose was approximately 18 times the human AUC at a 20 mg dose.\n'}

In [8]:
source = ds.map(lambda x: x['en'])
target = ds.map(lambda x: 'start' + x['de'] + 'end')

In [11]:
source.as_numpy_iterator().next()

b'In the pregnant rat the AUC for calculated free drug at this dose was approximately 18 times the human AUC at a 20 mg dose.\n'

In [12]:
target.as_numpy_iterator().next()

b'startBei tr\xc3\xa4chtigen Ratten war die AUC f\xc3\xbcr die berechnete ungebundene Substanz bei dieser Dosis etwa 18-mal h\xc3\xb6her als die AUC beim Menschen bei einer 20 mg Dosis.\nend'

In [9]:
en_vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200, pad_to_max_tokens=True)
de_vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200, pad_to_max_tokens=True)

In [13]:
len(source)

1137191

In [14]:
len(target)

1137191

In [15]:
en_vectorizer.adapt(source.take(100))
de_vectorizer.adapt(target.take(100))

In [18]:
english_vocab = en_vectorizer.get_vocabulary()

In [23]:
english_vocab[87]

'name'

In [19]:
en_vectorizer('[UNK]')

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])>

In [20]:
en_vectorizer(source.skip(100).as_numpy_iterator().next())

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([97, 87,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])>

In [25]:
sample_text = source.skip(50000).as_numpy_iterator().next()
sample_text

b'Medicinal product subject to medical prescription.\n'

In [26]:
en_vectorizer(sample_text)

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([  1, 255,   1,   5,   1,   1,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,  

In [None]:
from tensorflow.python.training.input import batch
ds = builder.as_dataset(split='train')
ds = ds.map(lambda x: ((en_vectorizer('start' + x['en']), de_vectorizer('start' + x['de'])), (de_vectorizer(x['de']) + 'end'))
ds = ds.cache()
ds = ds.shuffle(100000)
ds = ds.batch(64)
ds = ds.prefetch(32)