# DATASET VERSIONING WITH WandB

In [1]:
# Import Dependencies
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
import datetime
import tensorflow_datasets as tfds
from tensorflow.keras.layers import Input, Normalization, Conv2D, MaxPooling2D, Dense, Flatten, BatchNormalization, Dropout
from tensorflow.keras.metrics import BinaryAccuracy, FalsePositives, FalseNegatives, TrueNegatives, TruePositives, Precision, Recall, F1Score, AUC
from tensorflow.keras.regularizers import L2
import wandb
from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint, WandbEvalCallback, WandbCallback

## Data Loader -> Original Dataset -> Train/Test/Val Datasets -> Preprocessed Datasets -> Augmented Datasets -> Final Dataset

### Loading Data as Numpy Arrays Into Directory

In [2]:
# Load the TFDS 'malaria' dataset as (image, label) pairs together with its metadata.
# `split` is passed as a list, so `dataset` is a list holding one dataset per
# requested split (here only 'train').
dataset, datasetInfo = tfds.load(
    'malaria',
    split=['train'],
    as_supervised=True,
    shuffle_files=True,
    with_info=True,
)
print(dataset)

[<_PrefetchDataset element_spec=(TensorSpec(shape=(None, None, 3), dtype=tf.uint8, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>]


2025-01-27 15:26:39.096226: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2025-01-27 15:26:39.096257: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2025-01-27 15:26:39.096260: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2025-01-27 15:26:39.096274: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-01-27 15:26:39.096287: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
# Write every (image, label) example of the train split to its own file,
# Dataset/MalariaDataset<k>.npz, with k counting up from 0.
# The directory is created first: open() does not create missing parent
# directories, so on a fresh checkout the first write would otherwise raise
# FileNotFoundError.
os.makedirs('Dataset', exist_ok=True)

k = 0
for image, label in dataset[0]:
    with open('Dataset/MalariaDataset' + str(k) + '.npz', mode='wb') as file:
        np.savez(file, image=image, label=label)
    k += 1
print(k)  # number of examples written

27558


2025-01-27 15:26:46.187786: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## Run 1 : Original Data Loader

In [4]:
def loadOriginalData():
    """Log the contents of ``Dataset/`` to W&B as the ``NewDataset`` artifact.

    A run named "Original Data Loader" is opened with ``wandb.init`` inside a
    ``with`` block; within it a ``rawData`` artifact is built from the
    ``Dataset/`` directory and logged to that run.
    """
    runSettings = dict(
        name="Original Data Loader",
        project="Malaria-Detection-Dataset-Model-Versioning",
        entity="amanjn2003-santa-clara-university",
    )
    with wandb.init(**runSettings) as run:
        rawArtifact = wandb.Artifact(
            name="NewDataset",
            type="rawData",
            description="Tensorflow Malaria Dataset",
        )
        # Register every file found under Dataset/ with the artifact.
        rawArtifact.add_dir('Dataset/')
        run.log_artifact(rawArtifact)

In [5]:
# Run 1: log the Dataset/ directory to W&B as the "NewDataset" raw-data artifact.
loadOriginalData()

[34m[1mwandb[0m: Currently logged in as: [33mamanjn2003[0m ([33mamanjn2003-santa-clara-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Adding directory to artifact (./Dataset)... Done. 123.3s


## Using the Artifact

In [6]:
# Fetch the logged raw dataset back from W&B.
# The run is opened in the same project/entity as the other pipeline steps and as
# a context manager, so it is finished as soon as the download completes instead
# of staying open until the next wandb.init() or kernel shutdown.
# `run`, `artifact` and `artifact_dir` remain bound after the block.
with wandb.init(project="Malaria-Detection-Dataset-Model-Versioning",
                entity="amanjn2003-santa-clara-university") as run:
    artifact = run.use_artifact('amanjn2003-santa-clara-university/Malaria-Detection-Dataset-Model-Versioning/NewDataset:v0', type='rawData')
    # download() returns the local directory the artifact files were written to.
    artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact NewDataset:v0, 1419.75MB. 27558 files... 
[34m[1mwandb[0m:   27558 of 27558 files downloaded.  
Done. 0:0:57.7


## Run 2 : Pre-Processing Data

In [2]:
IM_SIZE = 224  # side length (pixels) of the square images produced by resizeRescale


def resizeRescale(image):
    """Resize ``image`` to IM_SIZE x IM_SIZE and divide its values by 255.0."""
    targetSize = (IM_SIZE, IM_SIZE)
    resized = tf.image.resize(image, targetSize)
    return resized / 255.0

In [3]:
def dataPreprocessor(maxSamples=1000):
    """Resize/rescale raw examples and log them as the ``PreprocessedDataset`` artifact.

    Downloads ``NewDataset:v0``, loads the per-example ``.npz`` files, applies
    ``resizeRescale`` to each image and stores all images and labels in a single
    ``PreprocessedDataset.npz`` (keys ``image`` and ``label``) inside a new
    ``preprocessedData`` artifact, which is then logged to the run.

    Args:
        maxSamples: Maximum number of example files to process. Defaults to 1000
            to keep memory use bounded; pass ``None`` to process every file.
    """
    with wandb.init(name="Data Preprocessor", project="Malaria-Detection-Dataset-Model-Versioning", entity="amanjn2003-santa-clara-university") as run:

        artifact = run.use_artifact('amanjn2003-santa-clara-university/Malaria-Detection-Dataset-Model-Versioning/NewDataset:v0', type='rawData')
        # Read from the directory download() reports instead of a hard-coded
        # "artifacts/NewDataset:v0/" path, which is only where the files happen
        # to land with default settings.
        artifactDir = artifact.download()

        preProcessedData = wandb.Artifact(name="PreprocessedDataset",
                                        type="preprocessedData",
                                        description="Tensorflow Malaria Dataset - Preprocessed")

        # os.listdir returns entries in arbitrary order; sorting makes the chosen
        # subset the same on every run. Anything that is not an .npz file
        # (e.g. OS metadata files) is skipped.
        fileNames = sorted(f for f in os.listdir(artifactDir) if f.endswith('.npz'))
        if maxSamples is not None:
            fileNames = fileNames[:maxSamples]

        datasetX = []
        datasetY = []
        for f in fileNames:
            # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
            # objects; keep it only while the artifact is a trusted source.
            with np.load(os.path.join(artifactDir, f), allow_pickle=True) as npzArray:
                x, y = npzArray['image'], npzArray['label']
            datasetX.append(resizeRescale(x))
            datasetY.append(y)

        with preProcessedData.new_file("PreprocessedDataset.npz", mode='wb') as file:
            np.savez(file, image=datasetX, label=datasetY)

        run.log_artifact(preProcessedData)

In [4]:
# Run 2: build and log the "PreprocessedDataset" artifact from NewDataset:v0.
dataPreprocessor()

[34m[1mwandb[0m: Currently logged in as: [33mamanjn2003[0m ([33mamanjn2003-santa-clara-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Downloading large artifact NewDataset:v0, 1419.75MB. 27558 files... 
[34m[1mwandb[0m:   27558 of 27558 files downloaded.  
Done. 0:1:1.3
2025-01-27 16:15:24.613062: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2025-01-27 16:15:24.613083: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 18.00 GB
2025-01-27 16:15:24.613085: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 6.00 GB
2025-01-27 16:15:24.613305: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-01-27 16:15:24.613319: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Run 3 : Dataset Splitting

In [7]:
def dataSplitter():
    """Split the preprocessed dataset into train/test/validation artifacts.

    Downloads ``PreprocessedDataset:v0``, slices its ``image`` and ``label``
    arrays in stored order into the first 80% (train), the next 10% (test) and
    the remainder (validation), and logs each part as its own ``DatasetSplits``
    artifact (``TrainDataset``, ``TestDataset``, ``ValDataset``), each holding a
    single ``.npz`` file with keys ``image`` and ``label``.
    """
    with wandb.init(name="Data Splitting", project="Malaria-Detection-Dataset-Model-Versioning", entity="amanjn2003-santa-clara-university") as run:

        artifact = run.use_artifact('amanjn2003-santa-clara-university/Malaria-Detection-Dataset-Model-Versioning/PreprocessedDataset:v0', type='preprocessedData')
        # Build the file path from the directory download() reports rather than
        # assuming the default "artifacts/PreprocessedDataset:v0/" location.
        artifactDir = artifact.download()
        artifactFile = os.path.join(artifactDir, "PreprocessedDataset.npz")

        trainData = wandb.Artifact(name="TrainDataset",
                                        type="DatasetSplits",
                                        description="Training Dataset")
        testData = wandb.Artifact(name="TestDataset",
                                        type="DatasetSplits",
                                        description="Testing Dataset")
        valData = wandb.Artifact(name="ValDataset",
                                        type="DatasetSplits",
                                        description="Validation Dataset")

        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects; keep it only while the artifact is a trusted source.
        with np.load(artifactFile, allow_pickle=True) as npzArr:
            images, labels = npzArr['image'], npzArr['label']

        trainSplit = 0.8
        testSplit = 0.1
        # Whatever is left after train + test (about 10%) is the validation split.

        # Compute the two cut points once; the three slices below share them.
        dataLen = len(labels)
        trainEnd = int(trainSplit*dataLen)
        testEnd = int((trainSplit+testSplit)*dataLen)

        splits = [
            (trainData, "TrainDataset.npz", slice(0, trainEnd)),
            (testData, "TestDataset.npz", slice(trainEnd, testEnd)),
            (valData, "ValDataset.npz", slice(testEnd, None)),
        ]

        # Write all three files first, then log, so nothing is logged if a
        # write fails part-way through.
        for splitArtifact, fileName, rows in splits:
            with splitArtifact.new_file(fileName, 'wb') as file:
                np.savez(file, image=images[rows], label=labels[rows])

        for splitArtifact, _, _ in splits:
            run.log_artifact(splitArtifact)

In [8]:
# Run 3: log the Train/Test/Val dataset artifacts cut from PreprocessedDataset:v0.
dataSplitter()

[34m[1mwandb[0m: Downloading large artifact PreprocessedDataset:v0, 574.23MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:1.8


## Run 4 : Dataset Augmentation

In [11]:
import albumentations as A

# Augmentation pipeline used by augment(), applied in the order listed in A.Compose.
_resize = A.Resize(224, 224)
# Picks one of the two flips; the group as a whole is applied with p=0.3.
_flip = A.OneOf([A.HorizontalFlip(), A.VerticalFlip()], p=0.3)
_rotate = A.RandomRotate90()
_brightnessContrast = A.RandomBrightnessContrast(
    brightness_limit=0.2,
    contrast_limit=0.2,
    brightness_by_max=True,
    p=0.5,
)

transforms = A.Compose([_resize, _flip, _rotate, _brightnessContrast])

def augment(image):
    """Run ``image`` through the module-level ``transforms`` pipeline and return the result."""
    # The pipeline takes the image as the ``image`` keyword and returns a dict;
    # the augmented image is stored under the same key.
    augmented = transforms(image=image)
    return augmented["image"]

In [12]:
def dataAugmentation():
    """Augment the training split and log it as the ``AugmentedDataset`` artifact.

    Downloads ``TrainDataset:v0``, passes every image through ``augment`` and
    stores the augmented images together with the unchanged labels in
    ``AugmentedTrainDataset.npz`` (keys ``image`` and ``label``) inside a new
    ``augmentedData`` artifact, which is then logged to the run.
    """
    with wandb.init(name="Data Augmentation", project="Malaria-Detection-Dataset-Model-Versioning", entity="amanjn2003-santa-clara-university") as run:

        artifact = run.use_artifact('amanjn2003-santa-clara-university/Malaria-Detection-Dataset-Model-Versioning/TrainDataset:v0', type='DatasetSplits')
        # Build the file path from the directory download() reports rather than
        # assuming the default "artifacts/TrainDataset:v0/" location.
        artifactDir = artifact.download()
        artifactFile = os.path.join(artifactDir, "TrainDataset.npz")

        augmentedData = wandb.Artifact(name="AugmentedDataset",
                                        type="augmentedData",
                                        description="Augmented Train Dataset")

        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects; keep it only while the artifact is a trusted source.
        with np.load(artifactFile, allow_pickle=True) as npzArr:
            images, labels = npzArr['image'], npzArr['label']

        # One augmented image per input image, in the same order as `labels`.
        datasetX = [augment(img) for img in images]

        with augmentedData.new_file("AugmentedTrainDataset.npz", 'wb') as file:
            np.savez(file, image=datasetX, label=labels)

        run.log_artifact(augmentedData)

In [13]:
# Run 4: build and log the "AugmentedDataset" artifact from TrainDataset:v0.
dataAugmentation()

[34m[1mwandb[0m: Downloading large artifact TrainDataset:v0, 459.38MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:8.8
