# Creating Datasets

In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.data import Dataset
from tensorflow.keras import layers, Sequential, utils, optimizers, losses
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [2]:
x, y = make_classification(n_samples=10000, n_features=100, n_informative=30, n_classes=2, random_state=12)

## Creating from tensor-slices

In [3]:
# one thing that not many know is this
dataset = tf.data.Dataset.from_tensor_slices((x,y))

# and that's not it. Say you need to give multiple labels then
dataset = tf.data.Dataset.from_tensor_slices((x, (y, y, y)))

# here x and y are both np arrays which you could get from df[column_name].values
# Also this is most useful in case of image data where x could be image paths.

In [4]:
for i in dataset:
    print(i)
    break

(<tf.Tensor: shape=(100,), dtype=float64, numpy=
array([ 1.44139631, -2.16397307, -1.0746983 ,  0.6167084 ,  0.81272494,
       -0.37931506,  1.83957514,  1.51234583,  0.44589029,  1.50567997,
       -1.25721123, -0.73325893,  2.53283655, -0.04788257,  2.45528537,
        1.15825488,  0.09460954, -1.30726863,  1.35940056,  5.40807878,
       -1.10293851,  0.50705595, -0.11272453, -0.04159435,  0.21263728,
        0.87173812, -2.48517499, -0.52114325, -2.02859787,  0.37754337,
       -3.47345709, -3.82066813, -0.03392974, -4.14599599, -0.06448292,
       -2.23823363,  5.05534458,  0.36371597,  2.93646623, -1.83519852,
       -0.7117152 , -0.61727136,  0.36552579, -3.3839417 , -0.48179345,
        0.70721297, -7.13466225, -1.78977258, -0.28585538, -0.99119017,
        0.15939806,  1.02433043, -4.5889883 , -1.28176521, -7.20402419,
       -0.98025049,  5.74923023,  2.14284388,  0.28325959,  1.53590254,
       -0.19572223, -0.96396027, -0.64818018, -0.07757217, -4.78795003,
        0.18210

### Operations you could perform on this dataset object

In [5]:
def aug_func(x, y):
    # lets just take first 10 values from x
    x = x[:10]
    return x, y

In [6]:
dataset = dataset.map(aug_func, num_parallel_calls=4) # here num parallel calls are used for better multiprocessing

In [7]:
# A better way to set num_parallel_call value
auto = tf.data.experimental.AUTOTUNE
num_parallel_calls = auto

In [8]:
for i in dataset:
    print(i)
    break

(<tf.Tensor: shape=(10,), dtype=float64, numpy=
array([ 1.44139631, -2.16397307, -1.0746983 ,  0.6167084 ,  0.81272494,
       -0.37931506,  1.83957514,  1.51234583,  0.44589029,  1.50567997])>, (<tf.Tensor: shape=(), dtype=int32, numpy=0>, <tf.Tensor: shape=(), dtype=int32, numpy=0>, <tf.Tensor: shape=(), dtype=int32, numpy=0>))


In [23]:
# Taking a batch
dataset = dataset.batch(5, num_parallel_calls=auto)

# you can also prefetch the data part so that you getter even better performance
dataset = dataset.prefetch(auto)

for i in dataset:
    print(i)
    break

(<tf.Tensor: shape=(5, 5, 10), dtype=float64, numpy=
array([[[ 1.44139631e+00, -2.16397307e+00, -1.07469830e+00,
          6.16708399e-01,  8.12724937e-01, -3.79315062e-01,
          1.83957514e+00,  1.51234583e+00,  4.45890289e-01,
          1.50567997e+00],
        [ 4.31892199e+00, -2.27792415e+00, -7.23961335e-01,
          1.76166922e+00,  2.76571512e-01,  6.04824936e-01,
          6.88614105e+00, -2.44003434e+00, -9.89677825e-01,
          6.12758188e+00],
        [-5.06952569e+00, -1.13780019e+00, -6.41313352e-01,
         -1.18308280e-01, -6.76752518e+00,  7.82262177e-01,
         -1.99826482e+00,  2.67734943e-01, -3.77383393e-02,
         -3.04714096e+00],
        [ 8.44038180e-02, -1.50634307e+00,  9.83509747e-02,
         -6.68776141e-01,  7.02835301e+00, -4.53985121e-01,
          1.73456955e+00,  1.01156258e+00, -1.54480066e+00,
          6.57667997e+00],
        [-6.87051804e+00, -2.14150573e-01, -1.16124397e+00,
          3.59379301e-01,  5.31002703e+00, -1.30167702e-01,

In [None]:
# now in the model you just need to pass x=dataset

## Creating the dataset class

In [10]:
class SampleDataset(keras.utils.Sequence):
    
    def __init__(self, x, y, batch_size=32, shuffle=True):
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.x = x
        self.y = y
    
    def __len__(self):
        return len(x)
    
    def __getitem__(self, index):
        x = self.x[index*self.batch_size : (index + 1)*self.batch_size]
        y = self.y[index*self.batch_size : (index + 1)*self.batch_size]
        return x, y

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [12]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((7000, 100), (3000, 100), (7000,), (3000,))

In [13]:
train_dataset = SampleDataset(x_train, y_train)
test_dataset = SampleDataset(x_test, y_test)

## Creating and training a simple model

In [14]:
model = Sequential(
                [layers.Input(shape=(100, )),
                layers.Dense(32, activation='relu'),
                layers.Dense(16, activation='relu'),
                layers.Dense(1)]
)



In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                3232      
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 3,777
Trainable params: 3,777
Non-trainable params: 0
_________________________________________________________________


In [16]:
optimizer = optimizers.Adam(learning_rate=1e-3)
loss = losses.BinaryCrossentropy()

model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

In [17]:
model.fit(train_dataset, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x29614a69f10>

In [18]:
model.evaluate(test_dataset)



[1.5577664375305176, 0.8756666779518127]

## Using predefined functions

In [19]:
def train_datagen():
    for i in range(7000):
        # img = Image.open(img_paths[idx])
        # label = img_labels[idx]
        img = x[i]
        label = y[i]
        yield img, label

In [20]:
train = Dataset.from_generator(train_datagen,
                               output_signature = (tf.TensorSpec(shape=(100,), dtype=tf.float64),
                               tf.TensorSpec(shape=(), dtype=tf.int32))).batch(32)

In [21]:
model.fit(train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2961eb0ffa0>

## One more Method

If we have a dataframe containing the a column with the paths of the images we could use **ImageDataGenerator** to create a generetor which returns a batch of x and y with x being images and y being corresponding label.

In case of multilabel classification you can pass a list of column names in the **y_col** parameter.
Tutorial - <a href="https://vijayabhaskar96.medium.com/multi-label-image-classification-tutorial-with-keras-imagedatagenerator-cd541f8eaf24">here</a>

In [22]:
ImageDataGenerator.flow_from_dataframe

<function tensorflow.python.keras.preprocessing.image.ImageDataGenerator.flow_from_dataframe(self, dataframe, directory=None, x_col='filename', y_col='class', weight_col=None, target_size=(256, 256), color_mode='rgb', classes=None, class_mode='categorical', batch_size=32, shuffle=True, seed=None, save_to_dir=None, save_prefix='', save_format='png', subset=None, interpolation='nearest', validate_filenames=True, **kwargs)>