# USING TENSORFLOW RECORDS

## 1. Used to store data more efficiently
### -> One single file rather than many small files.
## 2. Can carry out pre-processing and store that data which can be loaded easily.
## 3. Can store output of a part of the model (Embeddings).
## 4. Encourage parallelizing of reading data.

In [1]:
import os
import cv2
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import import_ipynb
import DatasetPreparation
import Modeling
from tensorflow.keras.layers import Input, Normalization, Conv2D, MaxPooling2D, Dense, Flatten, BatchNormalization, Dropout
from tensorflow.keras.metrics import BinaryAccuracy, FalsePositives, FalseNegatives, TrueNegatives, TruePositives, Precision, Recall, F1Score, AUC
from tensorflow.keras.regularizers import L2
import sklearn
from tensorflow.train import BytesList, Int64List, FloatList
from tensorflow.train import Example, Feature, Features

Found 6799 files belonging to 3 classes.
Using 5440 files for training.
Found 6799 files belonging to 3 classes.
Using 1359 files for validation.
Found 2278 files belonging to 3 classes.


2025-01-28 16:08:50.171168: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2025-01-28 16:08:50.171196: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2025-01-28 16:08:50.171202: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2025-01-28 16:08:50.171216: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-01-28 16:08:50.171225: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [2]:
trainDataset, testDataset, valDataset = DatasetPreparation.trainDataset, DatasetPreparation.testDataset, DatasetPreparation.valDataset
model = Modeling.model

In [3]:
CONFIGURATION = Modeling.CONFIGURATION
CONFIGURATION

{'CLASS_NAMES': ['angry', 'happy', 'sad'],
 'BATCH_SIZE': 32,
 'IMAGE_SIZE': 256,
 'LEARNING_RATE': 0.001,
 'N_EPOCHS': 20,
 'DROPOUT_RATE': 0.0,
 'REGULARIZATION_RATE': 0.0,
 'N_FILTERS': 6,
 'KERNEL_SIZE': 3,
 'N_STRIDES': 1,
 'POOL_SIZE': 2,
 'N_DENSE_1': 100,
 'N_DENSE_2': 10,
 'NUM_CLASSES': 3}

In [4]:
# Augmentation Method Using Keras Sequential Layer
augmentLayers = tf.keras.Sequential([
    tf.keras.layers.RandomRotation(
        factor=(0.025, 0.025),
    ),
    tf.keras.layers.RandomFlip(
        mode = 'HORIZONTAL',
    ),
    tf.keras.layers.RandomContrast(
        factor=0.1,
    ),
])

def augment(image, label):
    return augmentLayers(image, training=True), label

trainDataset = trainDataset.map(augment)

## TF Records

In [5]:
trainDataset = trainDataset.unbatch()
valDataset = valDataset.unbatch()
testDataset = testDataset.unbatch()

In [6]:
print(trainDataset, testDataset, valDataset)

<_UnbatchDataset element_spec=(TensorSpec(shape=(256, 256, 3), dtype=tf.float32, name=None), TensorSpec(shape=(3,), dtype=tf.float32, name=None))> <_UnbatchDataset element_spec=(TensorSpec(shape=(256, 256, 3), dtype=tf.float32, name=None), TensorSpec(shape=(3,), dtype=tf.float32, name=None))> <_UnbatchDataset element_spec=(TensorSpec(shape=(256, 256, 3), dtype=tf.float32, name=None), TensorSpec(shape=(3,), dtype=tf.float32, name=None))>


### Encoding Dataset To Bytes

In [7]:
def encodeImage(image, label):
    image = tf.io.encode_jpeg(tf.image.convert_image_dtype(image, dtype=tf.uint8))
    label = tf.argmax(label)
    return image, label

trainDataset = trainDataset.map(encodeImage)

### Create Record Example

In [8]:
def createExample(image, label):
    images = tf.train.Feature(
        bytes_list = BytesList(value=[image])
    )
    labels = tf.train.Feature(
        int64_list = Int64List(value=[label])
    )
    example = Example(
        features = Features(feature={
            'images' : images,
            'labels' : labels,
        })
    )
    return example.SerializeToString()

### Write Records to a File

In [9]:
NUM_SHARDS = 10
path = 'TFRecords/shard_{:02d}.tfrecord'
os.makedirs("TFRecords", exist_ok=True)

In [10]:
for shardNumber in range(NUM_SHARDS):
    shardedDataset = trainDataset.shard(NUM_SHARDS, shardNumber).as_numpy_iterator()
    with tf.io.TFRecordWriter(path.format(shardNumber)) as fileWriter:
        for image, label in shardedDataset:
            fileWriter.write(createExample(image, label))

2025-01-28 16:08:50.694525: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2025-01-28 16:09:02.001264: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-01-28 16:09:13.641370: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-01-28 16:09:25.186012: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-01-28 16:09:37.019502: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-01-28 16:09:48.595520: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-01-28 16:10:00.498529: W tensorflow/core/framework/local_rendezvous.cc:404]

### Using records by converting back to TF Datasets

In [11]:
records = [path.format(shardNumber) for shardNumber in range(NUM_SHARDS)]
reconstructedDataset = tf.data.TFRecordDataset(filenames=records)

In [12]:
def parseRecords(example):
    featureDescription = {
        'images' : tf.io.FixedLenFeature([], tf.string),
        'labels' : tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(example, featureDescription)
    example['images'] = tf.cast(tf.io.decode_jpeg(example['images'], channels=3), dtype=tf.float32)
    return example['images'], example['labels']

In [13]:
parsedDataset = reconstructedDataset.map(parseRecords).batch(CONFIGURATION["BATCH_SIZE"]).prefetch(tf.data.AUTOTUNE)

In [14]:
for i in parsedDataset.take(1):
    print(i)

(<tf.Tensor: shape=(32, 256, 256, 3), dtype=float32, numpy=
array([[[[223., 223., 223.],
         [157., 157., 157.],
         [  0.,   0.,   0.],
         ...,
         [103., 103., 103.],
         [195., 195., 195.],
         [100., 100., 100.]],

        [[  2.,   2.,   2.],
         [160., 160., 160.],
         [174., 174., 174.],
         ...,
         [212., 212., 212.],
         [ 48.,  48.,  48.],
         [ 91.,  91.,  91.]],

        [[ 68.,  68.,  68.],
         [  0.,   0.,   0.],
         [ 79.,  79.,  79.],
         ...,
         [ 32.,  32.,  32.],
         [155., 155., 155.],
         [ 84.,  84.,  84.]],

        ...,

        [[113., 113., 113.],
         [232., 232., 232.],
         [145., 145., 145.],
         ...,
         [110., 110., 110.],
         [127., 127., 127.],
         [100., 100., 100.]],

        [[135., 135., 135.],
         [144., 144., 144.],
         [136., 136., 136.],
         ...,
         [213., 213., 213.],
         [  9.,   9.,   9.],
       

2025-01-28 16:10:47.747564: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
