In [1]:
import gc

import numpy as np
import tensorflow as tf

In [2]:
# Load the arrays saved earlier: token ids, attention masks and labels.
# allow_pickle is disabled on purpose: these are plain numeric arrays (int32 ids/masks,
# float64 labels), which never need pickle, and unpickling object arrays from a file can
# execute arbitrary code.
Xids = np.load('movie-xids.npy', allow_pickle=False)
Xmask = np.load('movie-xmask.npy', allow_pickle=False)
labels = np.load('movie-labels.npy', allow_pickle=False)

In [4]:
# Sanity-check the loaded token-id array: one row per sample, one column per token position
Xids.shape

(156060, 512)

In [5]:
# Wrap the three arrays in a tf.data.Dataset; slicing happens along the first axis,
# so every element is one (ids, mask, label) triple
dataset = tf.data.Dataset.from_tensor_slices(
    (Xids, Xmask, labels)
)

In [6]:
# take(1) returns a new one-element dataset; the cell output is its repr (shapes and dtypes), not the data itself
dataset.take(1)

<TakeDataset shapes: ((512,), (512,), (5,)), types: (tf.int32, tf.int32, tf.float64)>

In [7]:
# Show the per-element shapes and dtypes of the full dataset
dataset

<TensorSliceDataset shapes: ((512,), (512,), (5,)), types: (tf.int32, tf.int32, tf.float64)>

Each sample in our dataset is a tuple containing a single `Xids`, `Xmask`, and `labels` tensor. However, when feeding data into our model we need a two-item tuple in the format **(\<inputs\>, \<outputs\>)**. Because we have two input tensors, we pass **\<inputs\>** as a dictionary that maps each input name to its tensor:

```
{
    'input_ids': <input_id_tensor>,
    'attention_mask': <mask_tensor>
}
```

To rearrange the dataset format we can `map` a function that modifies the format like so:

In [8]:
def map_func(input_ids, masks, labels):
    """Repack an (ids, mask, label) triple as an (inputs, outputs) pair.

    Args:
        input_ids: value stored under the 'input_ids' key of the inputs dict.
        masks: value stored under the 'attention_mask' key of the inputs dict.
        labels: value returned unchanged as the second item of the pair.

    Returns:
        A two-item tuple ``(inputs, labels)`` where ``inputs`` is a dict with
        the keys 'input_ids' and 'attention_mask'.
    """
    # the two model inputs travel together in one dict keyed by input name
    inputs = dict(input_ids=input_ids, attention_mask=masks)
    return inputs, labels

In [9]:
# use the dataset map method to apply this transformation to every element
# NOTE: this rebinds `dataset`, so the cell is not idempotent — running it a second time would
# pass the already-mapped two-item elements to map_func, which expects three arguments
dataset = dataset.map(map_func)

In [10]:
# Confirm the new element structure: ({input_ids, attention_mask}, labels)
dataset.take(1)

<TakeDataset shapes: ({input_ids: (512,), attention_mask: (512,)}, (5,)), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.float64)>

In [11]:
# Same check on the full mapped dataset
dataset

<MapDataset shapes: ({input_ids: (512,), attention_mask: (512,)}, (5,)), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.float64)>

In [12]:
# Number of samples grouped into each batch; also used below to work out the train/validation split size
batch_size = 16

In [13]:
# Shuffle the samples, then group them into fixed-size batches
# (drop_remainder=True discards the final partial batch).
#
# reshuffle_each_iteration=False together with a fixed seed pins the shuffle order.
# By default shuffle() produces a new order every time the dataset is iterated, so the
# take()/skip() split performed below would draw from a different ordering on each pass
# and samples would leak between the training and validation sets.
dataset = dataset.shuffle(
    10000, seed=42, reshuffle_each_iteration=False
).batch(batch_size, drop_remainder=True)

In [14]:
# Each element is now a whole batch: the leading dimension equals batch_size
dataset

<BatchDataset shapes: ({input_ids: (16, 512), attention_mask: (16, 512)}, (16, 5)), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.float64)>

In [15]:
# Train/validation split (90-10): fraction of the batches assigned to the training set
split = 0.9

In [16]:
# Number of batches that make up the training share of the data:
# (samples / batch size) gives the batch count, scaled by the split fraction and truncated to an int
size = int(
    (Xids.shape[0] / batch_size) * split
)

In [17]:
# Number of batches that will go to the training set
size

8778

In [18]:
# take(size) keeps the first `size` batches for training; skip(size) yields the remaining batches for validation
# NOTE(review): the two subsets are only disjoint if the upstream shuffle produces the same order
# every time the dataset is iterated — confirm the shuffle configuration
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

In [19]:
# free up memory: drop the intermediate name and force a garbage-collection pass
# (gc is imported with the other imports at the top of the notebook)
# NOTE(review): train_ds and val_ds are derived from `dataset`, so they presumably still
# reference its pipeline — confirm how much memory this actually releases
del dataset
gc.collect()

222

In [21]:
# Saving both splits to disk under the relative paths 'train' and 'val'
# NOTE(review): tf.data.experimental.save is an experimental API; newer TensorFlow releases
# also offer Dataset.save — check against the installed version before upgrading
tf.data.experimental.save(train_ds, 'train')
tf.data.experimental.save(val_ds, 'val')

In [22]:
# Inspect the element structure of the saved training set
# (may be needed when reloading the dataset — TODO confirm for the TF version in use)
train_ds.element_spec

({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None),
  'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int32, name=None)},
 TensorSpec(shape=(16, 5), dtype=tf.float64, name=None))

In [23]:
# Check that both splits share the same element structure, so one spec describes either
val_ds.element_spec == train_ds.element_spec

True