In [1]:
import pandas as pd
import numpy as np
from PIL import Image
import pickle
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the resized images from ../../pickles/resized_images.pkl.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# files that were produced by a trusted step of this project.
# The context manager closes the file handle as soon as loading finishes.
with open("../../pickles/resized_images.pkl", "rb") as images_file:
    images = pickle.load(images_file)

# Load the matching labels from ../../pickles/labels.pkl
with open("../../pickles/labels.pkl", "rb") as labels_file:
    labels = pickle.load(labels_file)

# Show the first label, then display the first image as the cell output
print(f"Label: {labels.iloc[0]}")
images[0]

FileNotFoundError: [Errno 2] No such file or directory: '../../pickles/resized_images.pkl'

In [None]:
# Print the loaded labels object to inspect its contents
print(labels)

In [None]:
# Inspect a single sample before processing the whole set (no augmentation
# happens in this cell): take the first image and convert it to a pixel array
image = images[0]
image_pxl_array = np.array(image)

# Print the raw pixel values of that first image
print(image_pxl_array)


In [None]:
# Convert all images to one floating point numpy array for augmentation.
# NOTE(review): assumes every image has identical dimensions so that np.array
# can stack them into a single regular array — confirm against the resize step.
imgs_pxl_array = np.array(images).astype('float32')

# Pixel values are taken to range from 0-255, so normalize them by dividing by 255.
# A single vectorized division replaces the per-image Python loop; the result
# is an ndarray (still float32) that supports the same indexing as before.
normalized_images = imgs_pxl_array / 255

# Print out the first image values
print(normalized_images[0])

In [6]:
# Create an image augmentation pipeline.
# Each random layer is given a fixed seed so that a fresh-kernel run should
# reproduce the same augmented images; 38 mirrors the random_state that the
# train/test split in this notebook uses.
AUGMENTATION_SEED = 38
image_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomRotation(0.05, seed=AUGMENTATION_SEED),     # Random rotation (factor is a fraction of a full turn: 0.05 * 360 = 18 degrees)
    tf.keras.layers.RandomZoom(0.25, seed=AUGMENTATION_SEED),         # Random zoom
    tf.keras.layers.RandomFlip("horizontal", seed=AUGMENTATION_SEED)  # Random horizontal flip
])

In [7]:
# Assemble the feature and target arrays:
#   X - the normalized images stacked into a single array
#   y - the labels reshaped into a single column (one row per sample)
X = np.array(normalized_images)
y = np.array(labels).reshape((-1, 1))

In [None]:
# Show the shapes of the image array and the label column side by side
display(X.shape, y.shape)

In [9]:
# Hold out 20% of the samples for testing and keep 80% for training;
# the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=38,
)

In [None]:
# Collect one augmented copy of every training image, paired with its label
X_train_aug = []
y_train_aug = []

# Walk the training images and labels together, in their original order
for original_img, original_label in zip(X_train, y_train):
    # The augmentation pipeline is called on a batch, so prepend a batch axis of size 1
    single_image_batch = original_img[np.newaxis, ...]

    # training=True is passed so the random layers are applied; take the
    # single result back out of the batch and convert it to a numpy array
    augmented_img = image_augmentation(single_image_batch, training=True)[0].numpy()

    # Store the augmented image alongside the unchanged label of its source image
    X_train_aug.append(augmented_img)
    y_train_aug.append(original_label)

# Report how many augmented images and labels were produced
print(len(X_train_aug))
print(len(y_train_aug))

In [11]:
# Concatenate the original and augmented images and labels.
# This cell rebinds X_train / y_train to values derived from themselves, so
# running it a second time would append the augmented samples again. The
# augmentation loop produces exactly one augmented sample per training sample,
# so equal lengths mean the augmented samples have not been appended yet.
if len(X_train) == len(X_train_aug) and len(y_train) == len(y_train_aug):
    X_train = np.concatenate((X_train, np.array(X_train_aug)))
    y_train = np.concatenate((y_train, np.array(y_train_aug)))
else:
    # Lengths differ, which is what a previous run of this cell leaves behind
    print("X_train/y_train lengths do not match the augmented lists; "
          "assuming they were already appended and skipping concatenation.")

In [12]:
# One-hot encode the labels. The encoder learns its categories from the
# training labels only; with handle_unknown='ignore', a test label that is not
# among those categories is encoded as an all-zero row instead of raising.
y_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

train_label_column = np.array(y_train).reshape(-1, 1)
test_label_column = np.array(y_test).reshape(-1, 1)

# Fit on the training labels and encode them, then encode the test labels
# with the same fitted encoder
y_train_enc = y_encoder.fit_transform(train_label_column)
y_test_enc = y_encoder.transform(test_label_column)

In [None]:
# Show the final shapes: training images, one-hot training labels,
# test images, and one-hot test labels
display(X_train.shape, y_train_enc.shape, X_test.shape, y_test_enc.shape)

In [13]:
# Request an open-file-descriptor limit of 4096 ahead of the large data export
# (`ulimit -n` sets the number of open files, not a memory size such as 12G).
# NOTE(review): `!` runs the command in a separate shell process, so this limit
# presumably does not carry over to the notebook kernel — confirm it is needed.
!ulimit -n 4096

In [14]:
# Export the augmented training images and their one-hot labels as pickle files.
# Each file is opened in a context manager so it is flushed and closed as soon
# as its dump finishes, rather than relying on garbage collection.
with open("../../pickles/y_train_aug.pkl", "wb") as y_train_file:
    pickle.dump(y_train_enc, y_train_file, protocol=pickle.HIGHEST_PROTOCOL)

with open("../../pickles/X_train_aug.pkl", "wb") as X_train_file:
    pickle.dump(X_train, X_train_file, protocol=pickle.HIGHEST_PROTOCOL)



In [15]:
# Export the test images and their one-hot labels as pickle files.
# Each file is opened in a context manager so it is flushed and closed as soon
# as its dump finishes, rather than relying on garbage collection.
with open("../../pickles/y_test.pkl", "wb") as y_test_file:
    pickle.dump(y_test_enc, y_test_file, protocol=pickle.HIGHEST_PROTOCOL)

with open("../../pickles/X_test.pkl", "wb") as X_test_file:
    pickle.dump(X_test, X_test_file, protocol=pickle.HIGHEST_PROTOCOL)