Let's start by loading the labels (the outputs), which are stored in a CSV file:

In [1]:
import pandas as pd

# Dataset root. Keep the trailing slash: paths below are built by plain string concatenation.
mypath = "/datasets/invasive/"
labels = pd.read_csv(mypath + "train_labels.csv")
labels.head()  # head() previews the first 5 rows by default

Unnamed: 0,name,invasive
0,1,0
1,2,0
2,3,1
3,4,0
4,5,1


Get the list of image file names and shuffle it:

In [2]:
from os import listdir
from os.path import isfile, join
import random

# Get the images list. listdir() returns entries in arbitrary order, so sort
# first to give the shuffle a stable starting point.
train_dir = mypath + "train/"
images = sorted(f for f in listdir(train_dir) if isfile(join(train_dir, f)))
# Shuffle with a dedicated, seeded generator: re-running the notebook yields the
# same order, and the global `random` state is left untouched.
random.Random(42).shuffle(images)
print(len(images))

1868


Define and apply a function to load images and downsize them (to make processing faster):

In [3]:
from tensorflow.keras.preprocessing import image
from tqdm import tqdm # to display progress bar when running loops

def prepare_training_data(smallimg_size):
    """Load every training image, resized to a square, together with its label.

    Relies on notebook-level names: ``images`` (file names found in ``train/``),
    ``labels`` (DataFrame with ``name`` and ``invasive`` columns) and ``mypath``
    (dataset root, with trailing slash).

    Args:
        smallimg_size: Side length in pixels; every image is loaded with
            ``target_size=(smallimg_size, smallimg_size)``.

    Returns:
        A tuple ``(X, y)``. ``X`` is a list of arrays produced by
        ``image.img_to_array`` and ``y`` is the list of integer ``invasive``
        labels, both in the same order as ``images``.

    Raises:
        ValueError: if a file name's stem (text before ``.jpg``) is not an
            integer, or if ``labels`` contains duplicate ``name`` values.
        KeyError: if an image number has no row in ``labels``.
    """
    # Build the name -> label lookup once. The previous per-image boolean filter
    # rescanned the whole frame for every file, and it relied on int() of a
    # one-element Series, which recent pandas versions deprecate.
    # verify_integrity=True makes duplicate names fail loudly instead of silently
    # picking one of the rows.
    label_by_name = labels.set_index('name', verify_integrity=True)['invasive'].to_dict()

    X = []
    y = []

    for f in tqdm(images, miniters=100):
        # File names are expected to look like "<number>.jpg"
        im_number = int(f.split('.jpg')[0])
        y.append(int(label_by_name[im_number]))
        img = image.load_img(mypath + 'train/' + f, target_size=(smallimg_size, smallimg_size))
        X.append(image.img_to_array(img))

    return X, y

In [None]:
# Load all training images resized to 224x224 along with their labels.
# NOTE(review): 224 presumably matches the input size of the downstream model — confirm.
X_raw, y = prepare_training_data(224)

  0%|          | 0/1868 [00:00<?, ?it/s]

In [None]:
import numpy as np
# Stack the per-image arrays into a single float16 array
# (half precision keeps the memory footprint of the full image set down).
X_raw = np.array(X_raw, dtype=np.float16)
# Labels become a plain NumPy vector, with the dtype inferred from the Python ints.
y = np.array(y)

Normalize data:

In [None]:
# Divide every pixel value by 255; assumes the raw values lie in 0-255 so the
# result falls in [0, 1] — TODO confirm. X_raw itself is left unmodified.
X = X_raw / 255

Dump the inputs and outputs to pickle files in the dataset directory:

In [None]:
import pickle
# Write each array inside a context manager so the file is flushed and closed
# as soon as the dump finishes (a bare open() leaves closing to the garbage
# collector and can leave a partially written file behind).
with open(mypath + 'X.pkl', 'wb') as x_file:
    pickle.dump(X, x_file)
with open(mypath + 'y.pkl', 'wb') as y_file:
    pickle.dump(y, y_file)