# Data cleaning and augmentation

## Imports

In [1]:
import hashlib
import os

import numpy as np
import tensorflow as tf
from PIL import Image
from sklearn.model_selection import train_test_split
from tensorflow import keras as tfk

## Set up variables

In [2]:
# --- reproducibility / split settings ---
SEED = 42  # passed as random_state to train_test_split
TEST_SPLIT = 0.2  # passed as test_size to train_test_split

# --- input archive ---
DATASET_PATH = "../data/public_data.npz"

# --- output layout: FOLDER_PATH/<TRAIN_PATH or TEST_PATH>/<label>/<n>.png ---
FOLDER_PATH = "dataset"
TRAIN_PATH, TEST_PATH = "train", "test"

## Load data from npz file

The `.npz` archive stores two arrays, `data` and `labels`, which are loaded into two variables:

1. `images`
2. `labels`

In [3]:
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code - only ever point DATASET_PATH at a trusted file.
# The archive is opened in a `with` block so its file handle is closed as soon
# as both arrays have been read into memory.
with np.load(DATASET_PATH, allow_pickle=True) as npz:
    images = npz["data"]
    labels = npz["labels"]

## Remove Duplicates and Irrelevant Images

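The dataset was found to contain many duplicate images, as well as images that are not relevant to the task at hand. The following code keeps a single copy of every duplicated image and drops entirely any image that is repeated more than 5 times, since those heavily repeated images are the irrelevant ones (memes).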

In [4]:
# An image that occurs more than this many times is treated as irrelevant
# (memes) and dropped entirely instead of being de-duplicated.
MAX_REPEATS = 5

# Group the indices of byte-identical images together.
# A SHA-256 digest is used as the key rather than the built-in hash(): hash()
# on bytes is only 64 bits wide and salted per process, so a collision would
# silently merge two different images into one group.
indices_by_digest = {}
for index, image in enumerate(images):
    digest = hashlib.sha256(image.tobytes()).digest()
    indices_by_digest.setdefault(digest, []).append(index)

# every occurrence after the first one of a group counts as a duplicate
duplicates = len(images) - len(indices_by_digest)

# keep the first occurrence of each group, skipping over-represented groups;
# dict order is insertion order, so the original image order is preserved
kept_indices = [
    group[0]
    for group in indices_by_digest.values()
    if len(group) <= MAX_REPEATS
]

# create new data set
clean_data = [images[i] for i in kept_indices]
clean_labels = [labels[i] for i in kept_indices]

print(f"Removed {duplicates} duplicates")
print(f"New dataset size: {len(clean_data)}")

images = np.array(clean_data)
labels = np.array(clean_labels)

Removed 348 duplicates
New dataset size: 4850


## Save Images to Folders

In [5]:
# split the cleaned dataset into train and test sets; stratify keeps the
# class proportions of `labels` in both splits
images_train_val, images_test, labels_train_val, labels_test = train_test_split(
    images,
    labels,
    random_state=SEED,
    test_size=TEST_SPLIT,
    stratify=labels,
)


def _save_split(split_images, split_labels, split_dir):
    """Write one split to disk as ``<split_dir>/<label>/<index>.png``.

    Args:
        split_images: iterable of images, each convertible to a uint8 array.
        split_labels: label of each image, in the same order as split_images.
        split_dir: folder that already contains one sub-folder per label.
    """
    for index, (image, label) in enumerate(zip(split_images, split_labels)):
        # cast to 8-bit before handing the array to Pillow
        # (assumes pixel values already lie in 0-255 - TODO confirm)
        arr = np.array(image, dtype=np.uint8)
        Image.fromarray(arr).save(f"{split_dir}/{label}/{index}.png")


# create the folder tree FOLDER_PATH/<split>/<label>; makedirs with
# exist_ok=True creates missing parents and is a no-op on re-runs
for split in (TRAIN_PATH, TEST_PATH):
    os.makedirs(f"{FOLDER_PATH}/{split}", exist_ok=True)
    for c in set(labels):
        os.makedirs(f"{FOLDER_PATH}/{split}/{c}", exist_ok=True)

# save train images
_save_split(images_train_val, labels_train_val, f"{FOLDER_PATH}/{TRAIN_PATH}")

# save test images
_save_split(images_test, labels_test, f"{FOLDER_PATH}/{TEST_PATH}")