In [1]:
"""
!pip install -q --upgrade gupload

from pydrive.auth import GoogleAuth
from google.colab import auth

# Authenticate and create the PyDrive client.
auth.authenticate_user()
"""

[K     |████████████████████████████████| 81 kB 5.8 MB/s 
[K     |████████████████████████████████| 56 kB 4.8 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
typer 0.4.2 requires click<9.0.0,>=7.1.1, but you have click 7.0 which is incompatible.
earthengine-api 0.1.316 requires google-api-python-client<2,>=1.12.1, but you have google-api-python-client 1.7.10 which is incompatible.[0m
[?25h

In [1]:
import os
import pathlib
import json
import random
from types import SimpleNamespace
import numpy as np
from PIL import Image
import tensorflow as tf
from sklearn.model_selection import KFold

In [2]:
"""
# Mount the remote storage with the dataset
from google.colab import drive
drive.mount('/content/drive')
os.system("unzip -q drive/MyDrive/msa/CatsDogs.zip")
"""

'\n# Mount the remote storage with the dataset\nfrom google.colab import drive\ndrive.mount(\'/content/drive\')\nos.system("unzip -q drive/MyDrive/msa/CatsDogs.zip")\n'

In [3]:
# Define the directory of the dataset
data_dir = pathlib.Path('CatsDogs/')

# Remove corrupted files
os.system("rm CatsDogs/Cats/666.jpg CatsDogs/Dogs/11702.jpg CatsDogs/Dogs/11410.jpg")

# Collects the path of all the files within the dataset
data_paths = [str(path) for path in list(data_dir.glob("*/*.jpg"))]
print(f"Images in the dataset: {len(data_paths)}")

Images in the dataset: 24997


rm: CatsDogs/Cats/666.jpg: No such file or directory
rm: CatsDogs/Dogs/11702.jpg: No such file or directory
rm: CatsDogs/Dogs/11410.jpg: No such file or directory


In [4]:
# Convert non-jpeg images into jpeg files
formats = [(path, Image.open(path).format) for path in data_paths]
non_jpegs = list(filter(lambda x: x[1]!='JPEG', formats))
for path, _ in non_jpegs:
    img = Image.open(path)
    img.convert('RGB').save(path, format='JPEG')



In [5]:
# Functions
def get_label(file_path):
    # Convert the path to a list of path components
    parts = tf.strings.split(file_path, os.path.sep)
    # The second to last is the class-directory
    one_hot = parts[-2] == ['Cats','Dogs']
    # Integer encode the label
    return tf.argmax(one_hot)


def decode_img(img):
    # Convert the compressed string to a 3D uint8 tensor
    img = tf.io.decode_jpeg(img, channels=3)
    # Resize the image to the desired size
    return tf.image.resize(img, [150, 150])


def process_path(file_path):
    label = get_label(file_path)
    # Load the raw data from the file as a string
    img = tf.io.read_file(file_path)
    img = decode_img(img)
    return img, label


# Configure dataset for performance
def configure_for_performance(ds):
    ds = ds.cache()
    ds = ds.shuffle(buffer_size=1000)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)
    return ds


def prepare_data(data, train_index, test_index):

    # Get the paths to the data
    train_paths = np.asarray(data)[train_index]
    test_paths = np.asarray(data)[test_index]

    # Make it tf.data.Dataset
    train = tf.data.Dataset.from_tensor_slices(train_paths)
    test = tf.data.Dataset.from_tensor_slices(test_paths)

    # Get labels
    train = train.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
    test = test.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)

    # Configure for performance
    train = configure_for_performance(train)
    test = configure_for_performance(test)

    return train, test


def subsampling(dataset):
    random.shuffle(dataset)
    subsampled_dataset = random.sample(dataset, round(len(dataset)/2))
    return subsampled_dataset


def zero_one_loss(dataset, verbose=0):
    missmatches = 0
    for i, (data, labels) in enumerate(dataset.as_numpy_iterator()):

        y_pred_binary = np.around(model.predict(x=np.asarray(data), verbose=verbose)).flatten()
        missmatches += np.sum(np.logical_xor(y_pred_binary, labels))

    return round(missmatches, ndigits=0)


def save_results_colab(results):
    with open('results.json', 'at') as f:
        json.dump(results, f, indent=4)
    os.system("gupload --to '18f8mqMYFtymM-qu76xEo41iuAYdxTLqk' results.json")
    
def save_results(results):
    with open('results.json', 'w') as f:
        json.dump(results, f, indent=4)

In [6]:
# Hyper-paramters
batch_size = 64

# Hypter-parameters to by tuned
filters_coeffs = ['same', 'incremental']
list_n_filters = [16, 32, 64]
kernel_sizes = [3, 5, 7]
list_n_epochs = [10, 15, 20]

In [7]:
# Nested cross-val
subsampled_data_paths = subsampling(data_paths)
k_fold = KFold(n_splits=5)
k_splits = list(k_fold.split(subsampled_data_paths))

best_model = {}
external_results = []
with open('results.json') as f:
    internal_results = json.load(f)
internal_count=0

train_index, _ = k_splits[0]
train_paths = np.asarray(subsampled_data_paths)[train_index]  # Compute training part

# Internal cross validation
for filters_coeff in filters_coeffs:
    for n_filters in list_n_filters:
        for kernel_size in kernel_sizes:
            for n_epochs in list_n_epochs:
                internal_splits = k_fold.split(train_paths)
                tmp_results = []
                
                
                if internal_count<21: 
                    internal_count+=1
                    continue
                for internal_train_index, internal_test_index in internal_splits:  # Interal cross validation
                    train, test = prepare_data(train_paths, internal_train_index, internal_test_index)

                    model = tf.keras.Sequential([
                        tf.keras.layers.Rescaling(1./255),
                        tf.keras.layers.Conv2D(n_filters, kernel_size, activation=tf.nn.relu),
                        tf.keras.layers.MaxPooling2D(),
                        tf.keras.layers.Conv2D(n_filters * (1, 2)[filters_coeff=='incremental'], kernel_size, activation=tf.nn.relu),
                        tf.keras.layers.MaxPooling2D(),
                        tf.keras.layers.Conv2D(n_filters * (1, 4)[filters_coeff=='incremental'], kernel_size, activation=tf.nn.relu),
                        tf.keras.layers.MaxPooling2D(),
                        tf.keras.layers.Flatten(),
                        tf.keras.layers.Dense(256, activation=tf.nn.relu),
                        tf.keras.layers.Dense(1, activation='sigmoid')
                    ])

                    model.compile(
                        optimizer='adam',
                        loss='binary_crossentropy',
                        metrics=['accuracy']
                    )

                    history = model.fit(
                        train,
                        epochs=n_epochs,
                        verbose=0
                    )

                    internal_loss = zero_one_loss(test)
                    tmp_results.append(internal_loss)
                    
                    # Clear the model
                    del model
                    tf.keras.backend.clear_session()

                result = {'filters_coeff': filters_coeff, 
                          'n_filters': n_filters, 
                          'kernel_size': kernel_size,
                          'n_epochs': n_epochs,
                          'zero_one_loss': np.round(np.mean(tmp_results), decimals=0)}  # Compute the mean loss of the internal cv

                internal_results.append(result)
                save_results(internal_results)
                print(f"Finished internal iteration {internal_count}")
                internal_count+=1

best_model = min(internal_results, key=lambda x: x['zero_one_loss'])
print(f'Best hyper parameters: {best_model}')

hyper_parameters = SimpleNamespace(**best_model)
for train_index, test_index in k_splits:
    train, test = prepare_data(subsampled_data_paths, train_index, test_index)
    model = tf.keras.Sequential([
        tf.keras.layers.Rescaling(1./255),
        tf.keras.layers.Conv2D(hyper_parameters.n_filters, hyper_parameters.kernel_size, activation=tf.nn.relu),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(hyper_parameters.n_filters * (1, 2)[hyper_parameters.filters_coeff=='incremental'], hyper_parameters.kernel_size, activation=tf.nn.relu),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(hyper_parameters.n_filters * (1, 4)[hyper_parameters.filters_coeff=='incremental'], hyper_parameters.kernel_size, activation=tf.nn.relu),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256, activation=tf.nn.relu),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    history = model.fit(
        train,
        epochs=hyper_parameters.n_epochs,
        verbose=0
    )

    external_loss = zero_one_loss(test, verbose=1)
    external_results.append(external_loss)

    # Clear the model
    del model
    tf.keras.backend.clear_session()

mean_zero_one_loss = np.round(np.mean(external_results), decimals=0)
std_zero_one_loss = np.round(np.std(external_results), decimals=0)
print(f'Zero one loss of best model: {int(mean_zero_one_loss)}, with std of {int(std_zero_one_loss)} samples.')

2022-07-15 17:31:29.854891: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-15 17:31:29.855004: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1 Pro


2022-07-15 17:31:30.253755: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-07-15 17:31:30.253889: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
Corrupt JPEG data: 99 extraneous bytes before marker 0xd9
Corrupt JPEG data: 65 extraneous bytes before marker 0xd9
Corrupt JPEG data: 128 extraneous bytes before marker 0xd9
Corrupt JPEG data: 162 extraneous bytes before marker 0xd9
Corrupt JPEG data: 239 extraneous bytes before marker 0xd9
2022-07-15 17:35:09.332107: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
Corrupt JPEG data: 1403 extraneous bytes before marker 0xd9
2022-07-15 17:35:12.158683: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-15 17:35:12.867661: I tensorflow/core/grappler/optimi

Finished internal iteration 21


2022-07-15 17:49:57.866310: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
Corrupt JPEG data: 99 extraneous bytes before marker 0xd9
Corrupt JPEG data: 65 extraneous bytes before marker 0xd9
Corrupt JPEG data: 128 extraneous bytes before marker 0xd9
Corrupt JPEG data: 162 extraneous bytes before marker 0xd9
Corrupt JPEG data: 239 extraneous bytes before marker 0xd9
2022-07-15 17:55:24.542384: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
Corrupt JPEG data: 1403 extraneous bytes before marker 0xd9
2022-07-15 17:55:27.226540: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-15 17:55:27.735077: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
Corrupt JPEG data: 162 extr

Finished internal iteration 22


2022-07-15 18:17:26.952253: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
Corrupt JPEG data: 99 extraneous bytes before marker 0xd9
Corrupt JPEG data: 65 extraneous bytes before marker 0xd9
Corrupt JPEG data: 128 extraneous bytes before marker 0xd9
Corrupt JPEG data: 162 extraneous bytes before marker 0xd9
Corrupt JPEG data: 239 extraneous bytes before marker 0xd9
2022-07-15 18:24:42.131785: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
Corrupt JPEG data: 1403 extraneous bytes before marker 0xd9
2022-07-15 18:24:45.029441: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-15 18:24:45.498387: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
Corrupt JPEG data: 162 extr

KeyboardInterrupt: 

In [None]:
"""# Nested cross-val
subsampled_data_paths = subsampling(data_paths)
k_fold = KFold(n_splits=5)
k_splits = k_fold.split(subsampled_data_paths)

best_model = {}
external_results = []
internal_results = []
internal_count=0
external_count=0
for train_index, test_index in k_splits:  # External cross validation

    train_paths = np.asarray(subsampled_data_paths)[train_index]  # Compute training part
    for coeff in coeffs:
        for n_filters in list_n_filters:
            for kernel_size in kernel_sizes:
                internal_splits = k_fold.split(train_paths)
                tmp_results = []
                for internal_train_index, internal_test_index in internal_splits:  # Interal cross validation
                    train, test = prepare_data(train_paths, internal_train_index, internal_test_index)

                    model = tf.keras.Sequential([
                        tf.keras.layers.Rescaling(1./255),
                        tf.keras.layers.Conv2D(n_filters, kernel_size, activation=tf.nn.relu),
                        tf.keras.layers.MaxPooling2D(),
                        tf.keras.layers.Conv2D(n_filters * (1, 2)[coeff=='incremental'], kernel_size, activation=tf.nn.relu),
                        tf.keras.layers.MaxPooling2D(),
                        tf.keras.layers.Conv2D(n_filters * (1, 4)[coeff=='incremental'], kernel_size, activation=tf.nn.relu),
                        tf.keras.layers.MaxPooling2D(),
                        tf.keras.layers.Flatten(),
                        tf.keras.layers.Dense(256, activation=tf.nn.relu),
                        tf.keras.layers.Dense(1, activation='sigmoid')
                    ])

                    model.compile(
                        optimizer='adam',
                        loss='binary_crossentropy',
                        metrics=['accuracy']
                    )

                    history = model.fit(
                        train,
                        epochs=n_epochs,
                        verbose=0
                    )

                    internal_loss = zero_one_loss(test)
                    tmp_results.append(internal_loss)
                    
                    # Clear the model
                    del model
                    tf.keras.backend.clear_session()

                result = {'coeff': coeff,
                            'n_filters': n_filters,
                            'kernel_size': kernel_size,
                            'zero_one_loss': np.round(np.mean(tmp_results), decimals=0)}


                # Compute the mean loss of the cv
                internal_results.append(result)
                print(f"Finished internal iteration {internal_count} of external iteration {external_count}")
                internal_count+=1

    best_model = min(internal_results, key=lambda x: x['zero_one_loss'])
    hyper_parameters = SimpleNamespace(**best_model)
    
    train, test = prepare_data(subsampled_data_paths, train_index, test_index)
    model = tf.keras.Sequential([
        tf.keras.layers.Rescaling(1./255),
        tf.keras.layers.Conv2D(hyper_parameters.n_filters, hyper_parameters.kernel_size, activation=tf.nn.relu),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(hyper_parameters.n_filters * (1, 2)[hyper_parameters.coeff=='incremental'], hyper_parameters.kernel_size, activation=tf.nn.relu),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Conv2D(hyper_parameters.n_filters * (1, 4)[hyper_parameters.coeff=='incremental'], hyper_parameters.kernel_size, activation=tf.nn.relu),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(256, activation=tf.nn.relu),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    history = model.fit(
        train,
        epochs=hyper_parameters.n_epochs,
        verbose=0
    )

    external_loss = zero_one_loss(test)
    external_results.append(external_loss)

    # Clear the model
    del model
    tf.keras.backend.clear_session()
    print(f'Finished external iteration {external_count}')
    external_count+=1

mean_zero_one_loss = np.round(np.mean(external_results), decimals=0)
std_zero_one_loss = np.round(np.std(external_results), decimals=0)
best_model['zero_one_loss']=mean_zero_one_loss
save_results(best_model)
print(f'Best hyper parameters: {best_model}')
print(f'Zero one loss of best model: {int(mean_zero_one_loss)}, with std of {int(std_zero_one_loss)} samples.')"""