<a href="https://colab.research.google.com/github/Daz-Riza-Seriog/Tensorflow_ML/blob/main/2-Customise%20your%20Models/2-%20Week%202/2-Dataset%20Generators/Dataset_Generators.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
# Show which TensorFlow release this runtime provides.
print(tf.__version__)

2.12.0


# Data Pipeline

 ## Coding tutorials
 #### [1. Keras datasets](#coding_tutorial_1)
 #### [2. Dataset generators](#coding_tutorial_2)
 #### [3. Keras image data augmentation](#coding_tutorial_3)
 #### [4. The Dataset class](#coding_tutorial_4)
 #### [5. Training with Datasets](#coding_tutorial_5)

***
<a id="coding_tutorial_2"></a>
## Dataset generators

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#### Load the UCI Fertility Dataset

We will be using a dataset available at https://archive.ics.uci.edu/ml/datasets/Fertility from UC Irvine.

#### Import the data

The dataset required for this tutorial can be downloaded from the following link:

https://drive.google.com/open?id=1OA0lwa5YLDs1njS377jbqPpMSlH5TzQV

You should store this file in Drive for use in this Colab notebook.

In [None]:
# Import the PyDrive wrapper and the Colab / OAuth helpers used to authenticate.
# (Nothing is installed by this cell; the packages must already be available.)
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then create the PyDrive client from the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Google Drive file IDs to fetch (despite the name, `urls` holds IDs, not URLs),
# paired position-by-position with the local filename each one is saved under.
urls = ["1OA0lwa5YLDs1njS377jbqPpMSlH5TzQV"]
output = ["fertility_diagnosis.txt"]

# Walk the two lists in lockstep instead of indexing them with a NumPy range.
for file_id, local_name in zip(urls, output):
  downloaded = drive.CreateFile({'id': file_id})
  downloaded.GetContentFile(local_name)


In [None]:
# Run this cell to connect to your Drive folder
# The Colab module is imported under an alias so that it does not overwrite the
# PyDrive client already bound to the name `drive`.

from google.colab import drive as colab_drive
colab_drive.mount('/content/gdrive')

In [None]:
# Load the fertility dataset
# Please use your own path once you have downloaded the dataset and mounted your Drive
# The file has no header row, so the column names are supplied explicitly.

headers = [
    'Season', 'Age', 'Diseases', 'Trauma', 'Surgery',
    'Fever', 'Alcohol', 'Smoking', 'Sitting', 'Output',
]
fertility = pd.read_csv(
    '/content/fertility_diagnosis.txt',
    names=headers,
    header=None,
    delimiter=',',
)

In [None]:
# Print the shape of the DataFrame as (rows, columns)

print(fertility.shape)

(100, 10)


In [None]:
# Show the first rows of the DataFrame as loaded ('Output' is still a letter code)

fertility.head()

Unnamed: 0,Season,Age,Diseases,Trauma,Surgery,Fever,Alcohol,Smoking,Sitting,Output
0,-0.33,0.69,0,1,1,0,0.8,0,0.88,N
1,-0.33,0.94,1,0,1,0,0.8,1,0.31,O
2,-0.33,0.5,1,0,0,0,1.0,-1,0.5,N
3,-0.33,0.75,0,1,1,0,1.0,-1,0.38,N
4,-0.33,0.67,1,1,0,0,0.8,-1,0.5,O


#### Process the data

In [None]:
# Map the 'Output' feature from 'N' to 0 and from 'O' to 1
# An explicit lookup is used so that any other value (including the floats left
# behind if this cell is run a second time) raises an error instead of silently
# being turned into 1.0.

label_codes = {'N': 0.0, 'O': 1.0}
unexpected = set(fertility['Output'].unique()) - set(label_codes)
if unexpected:
    raise ValueError(f"Unexpected 'Output' labels: {sorted(unexpected, key=str)}")
fertility['Output'] = fertility['Output'].map(label_codes)

In [None]:
# Show the first rows again to check that 'Output' now holds 0.0 / 1.0

fertility.head()

Unnamed: 0,Season,Age,Diseases,Trauma,Surgery,Fever,Alcohol,Smoking,Sitting,Output
0,-0.33,0.69,0,1,1,0,0.8,0,0.88,0.0
1,-0.33,0.94,1,0,1,0,0.8,1,0.31,1.0
2,-0.33,0.5,1,0,0,0,1.0,-1,0.5,0.0
3,-0.33,0.75,0,1,1,0,1.0,-1,0.38,0.0
4,-0.33,0.67,1,1,0,0,0.8,-1,0.5,1.0


In [None]:
# Cast every column of the DataFrame to float32.
# NOTE(review): the Season codes are cast too, which is presumably why the
# one-hot column names shown further down read e.g. Season_-0.33000001311302185.

fertility = fertility.astype('float32')

In [None]:
# Shuffle the DataFrame
# A fixed random_state makes the row order, and therefore the positional
# train/validation split taken from it, reproducible on every fresh run.

fertility = fertility.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Show the first rows of the shuffled, all-float DataFrame

fertility.head()

Unnamed: 0,Season,Age,Diseases,Trauma,Surgery,Fever,Alcohol,Smoking,Sitting,Output
0,1.0,0.67,1.0,0.0,0.0,0.0,0.8,1.0,0.38,1.0
1,-1.0,0.69,0.0,1.0,1.0,0.0,0.6,-1.0,0.19,0.0
2,-1.0,0.67,1.0,0.0,0.0,0.0,1.0,-1.0,0.5,0.0
3,1.0,0.75,1.0,1.0,1.0,0.0,0.8,1.0,0.25,0.0
4,1.0,0.56,1.0,0.0,0.0,0.0,1.0,-1.0,0.44,0.0


In [None]:
# Convert the field Season to a one-hot encoded vector
# The dummy dtype is pinned to float32: the pandas default differs between
# versions (uint8 before 2.0, bool from 2.0 on), and bool columns next to
# float32 ones would turn the later to_numpy() result into an object array.

fertility = pd.get_dummies(fertility, prefix='Season', columns=['Season'], dtype='float32')

In [None]:
# Show the first rows: 'Season' is replaced by one Season_<value> column per code

fertility.head()

Unnamed: 0,Age,Diseases,Trauma,Surgery,Fever,Alcohol,Smoking,Sitting,Output,Season_-1.0,Season_-0.33000001311302185,Season_0.33000001311302185,Season_1.0
0,0.67,1.0,0.0,0.0,0.0,0.8,1.0,0.38,1.0,0,0,0,1
1,0.69,0.0,1.0,1.0,0.0,0.6,-1.0,0.19,0.0,1,0,0,0
2,0.67,1.0,0.0,0.0,0.0,1.0,-1.0,0.5,0.0,1,0,0,0
3,0.75,1.0,1.0,1.0,0.0,0.8,1.0,0.25,0.0,0,0,0,1
4,0.56,1.0,0.0,0.0,0.0,1.0,-1.0,0.44,0.0,0,0,0,1


*N.B. The below cell has been updated since the coding tutorial.*

In [None]:
# Move the Output column such that it is the last column in the DataFrame
# (every other column keeps its current relative order).

fertility = fertility.reindex(
    columns=[*fertility.columns[fertility.columns != 'Output'], 'Output']
)

In [None]:
# Show the first rows to confirm that 'Output' is now the last column

fertility.head()

Unnamed: 0,Age,Diseases,Trauma,Surgery,Fever,Alcohol,Smoking,Sitting,Season_-1.0,Season_-0.33000001311302185,Season_0.33000001311302185,Season_1.0,Output
0,0.67,1.0,0.0,0.0,0.0,0.8,1.0,0.38,0,0,0,1,1.0
1,0.69,0.0,1.0,1.0,0.0,0.6,-1.0,0.19,1,0,0,0,0.0
2,0.67,1.0,0.0,0.0,0.0,1.0,-1.0,0.5,1,0,0,0,0.0
3,0.75,1.0,1.0,1.0,0.0,0.8,1.0,0.25,0,0,0,1,0.0
4,0.56,1.0,0.0,0.0,0.0,1.0,-1.0,0.44,0,0,0,1,0.0


In [None]:
# Convert the DataFrame to a numpy array.
# float32 is requested explicitly so the result is one numeric array even when
# the columns do not all share a dtype (e.g. boolean one-hot columns).

fertility = fertility.to_numpy(dtype='float32')

#### Split the Data

In [None]:
# Split the dataset into training and validation set
# Rows 0-69 are used for training and rows 70-99 for validation.

training, validation = fertility[:70], fertility[70:100]

In [None]:
# Verify the shape of the training data: (rows, columns), label column included

training.shape

(70, 13)

In [None]:
# Separate the features and labels for the validation and training data
# The label is the last column; every column before it is a feature.

training_features, training_labels = training[:, :-1], training[:, -1]
validation_features, validation_labels = validation[:, :-1], validation[:, -1]

#### Create the Generator

In [None]:
# Create a function that returns a generator producing inputs and labels

def get_generator(features, labels, batch_size=1, drop_remainder=True):
    """Yield consecutive (features, labels) batches in a single pass.

    Args:
        features: Sliceable sequence of samples (e.g. a NumPy array or list).
        labels: Sliceable sequence of labels, the same length as `features`.
        batch_size: Number of samples per batch; must be a positive integer.
        drop_remainder: If True (the default), a trailing partial batch is
            skipped so every batch has exactly `batch_size` samples. If False,
            the leftover samples are yielded as a final, shorter batch.

    Yields:
        Tuples `(features[a:b], labels[a:b])` covering the data in order.

    Raises:
        ValueError: If `batch_size` is smaller than 1, or if `features` and
            `labels` differ in length. As this is a generator function, the
            check runs when the first batch is requested.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    n_samples = len(features)
    if len(labels) != n_samples:
        raise ValueError(
            f"features and labels differ in length: {n_samples} vs {len(labels)}"
        )

    # Integer arithmetic throughout; the stop index excludes the partial batch
    # unless the caller asked to keep it.
    stop = n_samples - (n_samples % batch_size) if drop_remainder else n_samples
    for start in range(0, stop, batch_size):
        end = start + batch_size
        yield (features[start:end], labels[start:end])

In [None]:
# Apply the function to our training features and labels with a batch size of 10
# (the result is a single-pass generator; nothing is sliced until next() is called)

train_generator = get_generator(training_features, training_labels, batch_size=10)

In [None]:
# Test the generator using the next() function
# Each call consumes one batch and returns it as a (features, labels) tuple.

next(train_generator)

(array([[ 0.67,  1.  ,  0.  ,  0.  ,  0.  ,  0.8 ,  1.  ,  0.38,  0.  ,
          0.  ,  0.  ,  1.  ],
        [ 0.69,  0.  ,  1.  ,  1.  ,  0.  ,  0.6 , -1.  ,  0.19,  1.  ,
          0.  ,  0.  ,  0.  ],
        [ 0.67,  1.  ,  0.  ,  0.  ,  0.  ,  1.  , -1.  ,  0.5 ,  1.  ,
          0.  ,  0.  ,  0.  ],
        [ 0.75,  1.  ,  1.  ,  1.  ,  0.  ,  0.8 ,  1.  ,  0.25,  0.  ,
          0.  ,  0.  ,  1.  ],
        [ 0.56,  1.  ,  0.  ,  0.  ,  0.  ,  1.  , -1.  ,  0.44,  0.  ,
          0.  ,  0.  ,  1.  ],
        [ 0.64,  1.  ,  1.  ,  1.  ,  0.  ,  0.8 , -1.  ,  0.31,  0.  ,
          1.  ,  0.  ,  0.  ],
        [ 0.67,  1.  ,  0.  ,  1.  ,  0.  ,  0.6 , -1.  ,  0.38,  0.  ,
          0.  ,  0.  ,  1.  ],
        [ 0.67,  1.  ,  0.  ,  1.  ,  0.  ,  0.8 , -1.  ,  0.19,  0.  ,
          1.  ,  0.  ,  0.  ],
        [ 0.53,  1.  ,  1.  ,  0.  ,  1.  ,  0.8 , -1.  ,  0.38,  1.  ,
          0.  ,  0.  ,  0.  ],
        [ 0.81,  1.  ,  1.  ,  1.  ,  1.  ,  0.8 , -1.  ,  0.38,  0.  ,
 

#### Build the model

In [None]:
# Create a model using the Keras functional API:
# BatchNormalization -> Dense(100, relu) -> BatchNormalization -> Dense(1, sigmoid)

from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input, BatchNormalization

input_shape = (12,)
output_shape = (1,)

# Wire the layers together one call at a time, from the input to the output unit.
model_input = Input(shape=input_shape)
batch_1 = BatchNormalization(momentum=0.8)(model_input)
dense_1 = Dense(units=100, activation='relu')(batch_1)
batch_2 = BatchNormalization(momentum=0.8)(dense_1)
output = Dense(units=1, activation='sigmoid')(batch_2)

model = Model(inputs=[model_input], outputs=output)

In [None]:
# Display the model summary to show the resultant structure
# (layer names, output shapes and parameter counts)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 12)]              0         
                                                                 
 batch_normalization (BatchN  (None, 12)               48        
 ormalization)                                                   
                                                                 
 dense (Dense)               (None, 100)               1300      
                                                                 
 batch_normalization_1 (Batc  (None, 100)              400       
 hNormalization)                                                 
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 1,849
Trainable params: 1,625
Non-trainable par

#### Compile the model

In [None]:
# Create the optimizer object: Adam with a learning rate of 1e-2

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)

In [None]:
# Compile the model with loss function and metric
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# reported alongside the loss.

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

#### Train and evaluate the model using the generator

In [None]:
# Calculate the number of training steps per epoch for the given batch size.
# Floor division counts full batches only.

batch_size = 5
train_steps = len(training) // batch_size

In [None]:
# Set the epochs to 3 (number of iterations of the manual training loop)

epochs = 3

In [None]:
# Train the model
# get_generator is single-pass, so a fresh pair of generators is created for
# every epoch. Model.fit accepts Python generators directly; the older
# Model.fit_generator is deprecated.

for epoch in range(epochs):
  train_generator = get_generator(training_features, training_labels, batch_size=batch_size)
  validation_generator = get_generator(validation_features, validation_labels, batch_size=30)
  model.fit(train_generator, steps_per_epoch=train_steps,
            validation_data=validation_generator, validation_steps=1)


  model.fit_generator(train_generator, steps_per_epoch=train_steps, validation_data=validation_generator, validation_steps=1)




In [None]:
# Try to train once more with the generator left over from the loop above;
# observe what happens.
# The generator has already been consumed, so Keras hits StopIteration. The
# error is the point of the demonstration, so it is caught and reported rather
# than left to abort a full notebook run.

try:
    model.fit(train_generator, steps_per_epoch=train_steps)
except StopIteration:
    print("StopIteration: train_generator is exhausted and cannot be reused")

  model.fit_generator(train_generator, steps_per_epoch=train_steps)


StopIteration: ignored

#### Make an infinitely looping generator

In [None]:
# Create a function that returns an infinitely looping generator

def get_generator_cyclic(features, labels, batch_size=1):
    """Yield (features, labels) batches forever, reshuffling after each pass.

    The first pass walks the data in the order given. After every pass the
    samples are permuted with `np.random.permutation` (features and labels
    together, so pairs stay aligned) and the next pass begins. Only full
    batches are produced; a trailing partial batch is skipped in every pass.
    The caller's arrays are not modified.

    Args:
        features: NumPy array of samples, indexed along its first axis.
        labels: NumPy array of labels with the same length as `features`.
        batch_size: Number of samples per batch; a positive integer no larger
            than the number of samples.

    Yields:
        Tuples `(features[a:b], labels[a:b])`, without end.

    Raises:
        ValueError: If `batch_size` is smaller than 1, if the two inputs differ
            in length, or if `batch_size` exceeds the number of samples. As
            this is a generator function, the check runs when the first batch
            is requested.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    n_samples = len(features)
    if len(labels) != n_samples:
        raise ValueError(
            f"features and labels differ in length: {n_samples} vs {len(labels)}"
        )
    n_batches = n_samples // batch_size
    if n_batches == 0:
        # Without this guard the loop below would spin forever without ever
        # yielding, and the first next() call would hang.
        raise ValueError(
            f"batch_size ({batch_size}) exceeds the number of samples ({n_samples})"
        )

    while True:
        for n in range(n_batches):
            start, end = n * batch_size, (n + 1) * batch_size
            yield (features[start:end], labels[start:end])
        # Reshuffle for the next pass; fancy indexing builds new arrays, so the
        # originals passed in by the caller are left alone.
        permuted = np.random.permutation(n_samples)
        features = features[permuted]
        labels = labels[permuted]


In [None]:
# Create a generator using this function, with the same batch size as before.

train_generator_cyclic = get_generator_cyclic(training_features, training_labels, batch_size=batch_size)

In [None]:
# Assert that the new cyclic generator does not raise a StopIteration
# Two epochs' worth of batches are drawn, so the generator has to wrap around
# at least once.

for i in range(2*train_steps):
    next(train_generator_cyclic)

In [None]:
# Generate a cyclic validation generator (same batch size as the training one)

validation_generator_cyclic = get_generator_cyclic(validation_features, validation_labels, batch_size=batch_size)

In [None]:
# Train the model
# Model.fit accepts Python generators directly; the older Model.fit_generator
# is deprecated. Because both generators loop forever, one call can run
# several epochs.
# NOTE(review): validation_steps=1 scores a single validation batch per epoch;
# confirm that this is intended.

model.fit(train_generator_cyclic, steps_per_epoch=train_steps,
          validation_data=validation_generator_cyclic, validation_steps=1, epochs=3)

Epoch 1/3
 1/14 [=>............................] - ETA: 0s - loss: 0.1130 - accuracy: 1.0000

  model.fit_generator(train_generator_cyclic,steps_per_epoch=train_steps,


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fd888ba4e50>

#### Evaluate the model and get predictions

In [None]:
# Let's obtain a validation data generator.
# batch_size=30 puts the validation rows into a single batch; the generator
# is single-pass.

validation_generator = get_generator(validation_features, validation_labels, batch_size=30)

In [None]:
# Get predictions on the validation data
# get_generator is single-pass, so a fresh generator is built inside this cell.
# The cell then works however many times it is run, and regardless of whether
# an earlier cell already consumed a validation generator (which is consistent
# with the StopIteration recorded for the previous version of this cell).

predictions = model.predict(
    get_generator(validation_features, validation_labels,
                  batch_size=len(validation_features)),
    steps=1,
)
print(np.round(predictions.T[0]))

StopIteration: ignored

In [None]:
# Print the corresponding validation labels (the ground truth to compare the
# rounded predictions against)

print(validation_labels)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0.]


In [None]:
# Obtain a validation data generator
# A new one is needed for each use because the generator is single-pass.

validation_generator = get_generator(validation_features, validation_labels, batch_size=30)

In [None]:
# Evaluate the model
# The printed list is [loss, accuracy]: the compiled loss followed by the
# metric passed to compile().

print(model.evaluate(validation_generator))

[1.3056628704071045, 0.699999988079071]
