# MAME Loader

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as io

# Directory that holds the raw .mat dataset; it is joined with the file name
# wherever the raw data is loaded in this notebook.
RAW_DATA_PATH = 'input/data/raw'

# Presumably the per-split data directories; they are not referenced in the
# visible cells of this notebook — TODO confirm they are still needed.
MINITRAIN_PATH = 'input/data/train/'
MINIVAL_PATH = 'input/data/val/'
MINITEST_PATH = 'input/data/test/'

# Presumably the metadata directory; not referenced in the visible cells.
META_PATH = 'input/metadata/'
# Presumably the image side length in pixels (rows of the loaded data have
# 784 = 28 * 28 values) — TODO confirm.
IMG_SIZE = 28
# NOTE(review): the loaded rows have 784 values, which looks like a single
# channel; confirm that 3 channels is intended.
N_CHANNELS = 3

# Load Data

We start with a toy dataset that is much smaller than the real one. It is useful for setting up and debugging the whole load-train-predict pipeline before tackling the much bigger problem.

## Label Distribution Set

In [2]:
# Read the raw silhouettes archive and unpack its predefined splits.
raw_file = os.path.join(RAW_DATA_PATH, 'caltech101_silhouettes_28_split1.mat')
data = io.loadmat(raw_file)

X_train, y_train = data['train_data'], data['train_labels']
X_val, y_val = data['val_data'], data['val_labels']
X_test, y_test = data['test_data'], data['test_labels']

# Report the feature / label array shapes of every split.
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

# Share of the whole dataset that each predefined split represents.
total = sum(len(split) for split in (X_train, X_val, X_test))
print("Train: {:.2%}".format(len(X_train) / total),
      "Val: {:.2%}".format(len(X_val) / total),
      "Test: {:.2%}".format(len(X_test) / total))

# Class names stored in the archive, unwrapped into plain Python objects.
labels = [name[0] for name in data['classnames'][0]]

print(labels)

(4100, 784) (4100, 1)
(2264, 784) (2264, 1)
(2307, 784) (2307, 1)
Train: 47.28% Val: 26.11% Test: 26.61%
['Airplanes Side 2', 'Faces 2', 'Faces 3', 'Leopards', 'Motorbikes 16', 'accordion', 'anchor', 'ant', 'barrel', 'bass', 'beaver', 'binocular', 'bonsai', 'brain', 'brontosaurus', 'buddha', 'butterfly', 'camera', 'cannon', 'car side', 'ceiling fan', 'cellphone', 'chair', 'chandelier', 'cougar body', 'cougar face', 'crab', 'crayfish', 'crocodile', 'crocodile head', 'cup', 'dalmatian', 'dollar bill', 'dolphin', 'dragonfly', 'electric guitar', 'elephant', 'emu', 'euphonium', 'ewer', 'ferry', 'flamingo', 'flamingo head', 'garfield', 'gerenuk', 'gramophone', 'grand piano', 'hawksbill', 'headphone', 'hedgehog', 'helicopter', 'ibis', 'inline skate', 'joshua tree', 'kangaroo', 'ketch', 'lamp', 'laptop', 'llama', 'lobster', 'lotus', 'mandolin', 'mayfly', 'menorah', 'metronome', 'minaret', 'nautilus', 'octopus', 'okapi', 'pagoda', 'panda', 'pigeon', 'pizza', 'platypus', 'pyramid', 'revolver', '

In [9]:
# Inspect a single sample: its flattened shape and the span of its values.
sample = X_train[0]
print("Shape of each image:", sample.shape)
print("Range of values in the image:", sample.min(), "-", sample.max())

Shape of each image: (784,)
Range of values in the image: 0 - 1


In [13]:
# Number of samples an 80% training split would contain. `total` (the dataset
# size computed when the raw splits were loaded) is used instead of `X`,
# because `X` is only created in a later cell and referencing it here raises
# NameError on a fresh kernel (Restart & Run All).
total * 0.8

6936.8

In [15]:
from sklearn.model_selection import train_test_split

# Pool the predefined splits back into one dataset so it can be re-partitioned.
X = np.concatenate([X_train, X_val, X_test], axis=0)
y = np.concatenate([y_train, y_val, y_test], axis=0)

N = len(X)
train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

# Hold out the test set first (as an absolute sample count), stratified by label.
n_test = int(test_ratio * N)
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=n_test, random_state=42, stratify=y
)

# Then take the validation set out of the remainder, again stratified.
n_val = int(val_ratio * N)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=n_val, random_state=42, stratify=y_trainval
)

# Summarise the resulting partition sizes.
print("Original Size:", X.shape, y.shape)
for split_name, split_X, split_y in (("Train:", X_train, y_train),
                                     ("Val:", X_val, y_val),
                                     ("Test:", X_test, y_test)):
    print(split_name, split_X.shape, split_y.shape)
print("Total sum:", len(X_train) + len(X_val) + len(X_test))

Original Size: (8671, 784) (8671, 1)
Train: (6937, 784) (6937, 1)
Val: (867, 784) (867, 1)
Test: (867, 784) (867, 1)
Total sum: 8671


In [17]:
def create_partition(input_path, val_ratio, test_ratio, output_path):
    """Re-split a .mat dataset into stratified train/val/test sets and save them.

    The predefined train/val/test arrays stored in ``input_path`` are pooled
    and split again: the test set is held out first, then the validation set
    is taken from the remainder. Both steps are stratified by label and use a
    fixed ``random_state`` of 42, so the partition is reproducible.

    Split sizes are derived from the number of samples actually loaded from
    ``input_path``, not from notebook-level variables, so the function works
    for any input file and on a fresh kernel.

    Args:
        input_path: Path to a .mat file containing the keys ``train_data``,
            ``train_labels``, ``val_data``, ``val_labels``, ``test_data`` and
            ``test_labels``.
        val_ratio: Fraction of the pooled samples assigned to validation.
        test_ratio: Fraction of the pooled samples assigned to test.
        output_path: Directory under which the ``train``, ``val`` and ``test``
            sub-directories are created (if missing).

    Returns:
        None. Writes ``train/train_data.mat``, ``val/val_data.mat`` and
        ``test/test_data.mat`` under ``output_path``, each holding the arrays
        ``X`` and ``y``.

    Raises:
        AssertionError: If the three resulting splits do not add up to the
            number of pooled samples.
    """
    data = io.loadmat(input_path)
    X_train, y_train = data['train_data'], data['train_labels']
    X_val, y_val = data['val_data'], data['val_labels']
    X_test, y_test = data['test_data'], data['test_labels']

    # Pool every predefined split into a single dataset.
    X = np.concatenate((X_train, X_val, X_test))
    y = np.concatenate((y_train, y_val, y_test))

    # Size of the data loaded here; both split sizes are absolute counts
    # computed from it so that val/test end up with the requested fractions
    # of the whole dataset (not of the train+val remainder).
    n_samples = X.shape[0]

    # `train_test_split` is expected to be imported at notebook level.
    X_trainval, X_test, y_trainval, y_test = train_test_split(X, y,
                                                            test_size=int(test_ratio * n_samples),
                                                            random_state=42,
                                                            stratify=y)

    X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval,
                                                        test_size=int(val_ratio * n_samples),
                                                        random_state=42,
                                                        stratify=y_trainval)

    # No sample may be lost or duplicated by the re-partition.
    assert X_train.shape[0] + X_val.shape[0] + X_test.shape[0] == n_samples

    # Create directories for train, val and test
    train_path = os.path.join(output_path, 'train')
    val_path = os.path.join(output_path, 'val')
    test_path = os.path.join(output_path, 'test')

    os.makedirs(train_path, exist_ok=True)
    os.makedirs(val_path, exist_ok=True)
    os.makedirs(test_path, exist_ok=True)

    # Save each split (features under 'X', labels under 'y') as a .mat file
    io.savemat(os.path.join(train_path, 'train_data.mat'), {'X': X_train, 'y': y_train})
    io.savemat(os.path.join(val_path, 'val_data.mat'), {'X': X_val, 'y': y_val})
    io.savemat(os.path.join(test_path, 'test_data.mat'), {'X': X_test, 'y': y_test})


In [18]:
input_data = os.path.join(RAW_DATA_PATH, 'caltech101_silhouettes_28_split1.mat')

# Set the ratios BEFORE building the output directory name, so the name always
# reflects the values passed to create_partition rather than whatever values
# were left in the kernel by an earlier cell.
test_ratio = 0.1
val_ratio = 0.1
output_path = f'input/data/train_{(1-val_ratio-test_ratio) * 100:.0f}_{val_ratio * 100:.0f}_{test_ratio * 100:.0f}'

create_partition(input_data, val_ratio, test_ratio, output_path)