# Extract the dataset

Download ed estrazione del dataset

In [13]:
import zipfile
import os
import requests
from tqdm import tqdm

def extract_images(path):
    """Tell whether the dataset still has to be downloaded and extracted.

    Recursively walks ``path`` looking for any file whose name contains
    '.jpg'.

    Args:
        path: Directory to scan. A directory that does not exist is treated
            as empty, because os.walk ignores listing errors by default.

    Returns:
        bool: False as soon as one matching file is found (nothing to do),
        True when no image is present under ``path``.
    """
    # Walk the directory that was passed in, not a module-level variable, so
    # the function gives the right answer for any caller.
    for _root, _subdirs, files in os.walk(path):
        if any('.jpg' in name for name in files):
            return False
    return True

base_path = os.getcwd()
img_path = os.path.join(base_path, 'tmp')

# Only download/extract when no .jpg file is present under tmp/ yet.
if extract_images(img_path):
    # On a fresh checkout tmp/ does not exist and open(..., "wb") would fail.
    os.makedirs(img_path, exist_ok=True)

    url = 'https://md-datasets-cache-zipfiles-prod.s3.eu-west-1.amazonaws.com/4drtyfjtfy-1.zip'
    filename = url.split("/")[-1]
    file_path = os.path.join(img_path, filename)
    # The code below expects the outer archive to unpack a nested
    # 'dataset2.zip' into tmp/.
    inner_zip_path = os.path.join(img_path, 'dataset2.zip')

    if not os.path.isfile(inner_zip_path):
        if not os.path.isfile(file_path):
            buffer_size = 1024
            # Download to a temporary name and rename when complete, so an
            # interrupted run never leaves a truncated file that looks like
            # a finished download.
            partial_path = file_path + '.part'
            with requests.get(url, stream=True) as response:
                # Fail loudly instead of saving an HTTP error page as a zip.
                response.raise_for_status()
                file_size = int(response.headers.get("Content-Length", 0))
                # Drive the bar manually with the byte count only; wrapping
                # the iterator as well would count every chunk twice.
                with open(partial_path, "wb") as f, tqdm(
                    total=file_size or None,
                    desc=f"Downloading {filename}",
                    unit="B",
                    unit_scale=True,
                    unit_divisor=1024,
                ) as progress:
                    for data in response.iter_content(buffer_size):
                        f.write(data)
                        progress.update(len(data))
            os.replace(partial_path, file_path)

        # Extraction is no longer tied to "we just downloaded": an archive
        # left over from a previous run is unpacked too.
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(img_path)
        # Delete only after the archive handle is closed (removing an open
        # file fails on Windows).
        os.remove(file_path)

    with zipfile.ZipFile(inner_zip_path, 'r') as zip_ref:
        zip_ref.extractall(img_path)
    os.remove(inner_zip_path)

# From here on img_path points at the 'dataset2' folder inside tmp/.
img_path = os.path.join(img_path, 'dataset2')

## Load images

Definiamo la funzione per importare il dataset scaricato.
Questa prende in input il path della cartella contenente le immagini ed un modello TensorFlow che funge da encoder per eseguire feature extraction.
Per ogni sample processato la classe viene dedotta dal nome del file.

In [14]:
from tensorflow import keras
import numpy as np
from sklearn import preprocessing
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from pathlib import Path
import re

def prepare_targets(Y):
    """Encode class labels as integers.

    Fits a fresh scikit-learn LabelEncoder on ``Y`` and returns ``Y``
    transformed by that same encoder.

    Args:
        Y: Array-like of class labels.

    Returns:
        The encoded labels produced by ``LabelEncoder.transform``.
    """
    # fit() returns the encoder itself, so the two steps can be chained.
    return preprocessing.LabelEncoder().fit(Y).transform(Y)

def import_data(path, keras_encoder):
    """Load every .jpg in a folder and turn it into encoder features.

    Each image is resized to 224x224, run through the VGG16
    ``preprocess_input`` step and then through ``keras_encoder.predict``.
    The class label is the file stem with all digits removed.

    Args:
        path: Directory containing the .jpg files (not scanned recursively).
        keras_encoder: Model exposing ``predict``; used as feature extractor.

    Returns:
        tuple: ``(features, labels)`` where ``features`` is the per-image
        predictions stacked with ``np.vstack`` and ``labels`` is the output
        of ``prepare_targets`` on the collected label strings.

    Raises:
        ValueError: If ``path`` contains no .jpg file.
    """
    x_list, y_list = [], []

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded train/test split) reproducible.
    files = sorted(os.listdir(path))
    for f in tqdm(files):
        if not f.endswith('.jpg'):
            continue

        file_path = os.path.normpath(os.path.join(path, f))
        label = re.sub(r'\d+', '', Path(file_path).stem)

        img = image.load_img(file_path, target_size=(224, 224))
        # Add a leading batch axis: predict is called on one image at a time.
        x = np.expand_dims(image.img_to_array(img), axis=0)
        x = preprocess_input(x)
        # Use the encoder that was passed in rather than a global model.
        x = keras_encoder.predict(x)

        x_list.append(x)
        y_list.append(label)

    if not x_list:
        raise ValueError(f"No .jpg images found in {path!r}")

    Y = prepare_targets(np.array(y_list))

    return np.vstack(x_list), Y

## block5_pool

In [15]:
# VGG16 with ImageNet weights and include_top=False; per the section heading
# its output is used as the block5_pool feature set.
model = VGG16(weights='imagenet', include_top=False)
# x: stacked encoder outputs for every image; y: integer-encoded class labels.
x, y = import_data(img_path, model)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
100%|██████████| 1125/1125 [02:59<00:00,  6.28it/s]


In [19]:
# Save the numpy array for future training
from numpy import asarray
from numpy import savez_compressed
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for testing; the fixed seed keeps the split
# repeatable for a given sample order.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

save_path = os.path.join(base_path, 'data', 'block5_pool')
# savez_compressed does not create missing directories, so make sure the
# target folder exists before writing.
os.makedirs(save_path, exist_ok=True)

savez_compressed(os.path.join(save_path, 'x_train.npz'), x_train)
savez_compressed(os.path.join(save_path, 'y_train.npz'), y_train)
savez_compressed(os.path.join(save_path, 'x_test.npz'), x_test)
savez_compressed(os.path.join(save_path, 'y_test.npz'), y_test)

In [20]:
# Drop the references to the feature arrays so their memory can be reclaimed
# before further feature extraction with different VGG16 cuts.
x = y = x_train = y_train = x_test = y_test = None

## block4_pool

In [22]:
# Load the complete VGG16 model with ImageNet weights (include_top is left at
# its default). Later cells cut it at intermediate layers by building a Model
# from base_model.input to the chosen layer's output.
from tensorflow.keras.models import Model
base_model = VGG16(weights='imagenet')

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [23]:
# Cut VGG16 at block4_pool: the new model maps the original input to that
# layer's output.
model = Model(inputs=base_model.input, outputs=base_model.get_layer('block4_pool').output)
# summary() prints the table itself and returns None; wrapping it in print()
# only appends a stray "None" to the output.
model.summary()
print('block4_pool - extracting features...')
x, y = import_data(img_path, model)

# Same split parameters as the other feature sets (33% test, seed 42).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

save_path = os.path.join(base_path, 'data', 'block4_pool')
# Create the output folder if needed; savez_compressed will not do it.
os.makedirs(save_path, exist_ok=True)

print('block4_pool - saving dataset...')
savez_compressed(os.path.join(save_path, 'x_train.npz'), x_train)
savez_compressed(os.path.join(save_path, 'y_train.npz'), y_train)
savez_compressed(os.path.join(save_path, 'x_test.npz'), x_test)
savez_compressed(os.path.join(save_path, 'y_test.npz'), y_test)
print('block4_pool - Done!')

# Drop the references so the arrays can be reclaimed before the next cut.
x, y, x_train, y_train, x_test, y_test = None, None, None, None, None, None

  0%|          | 0/1125 [00:00<?, ?it/s]Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooli

## block2_pool

In [24]:
# Cut VGG16 at block2_pool: the new model maps the original input to that
# layer's output.
model = Model(inputs=base_model.input, outputs=base_model.get_layer('block2_pool').output)
# summary() prints the table itself and returns None; wrapping it in print()
# only appends a stray "None" to the output.
model.summary()
print('block2_pool - extracting features...')
x, y = import_data(img_path, model)

# Same split parameters as the other feature sets (33% test, seed 42).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

save_path = os.path.join(base_path, 'data', 'block2_pool')
# Create the output folder if needed; savez_compressed will not do it.
os.makedirs(save_path, exist_ok=True)

print('block2_pool - saving dataset...')
savez_compressed(os.path.join(save_path, 'x_train.npz'), x_train)
savez_compressed(os.path.join(save_path, 'y_train.npz'), y_train)
savez_compressed(os.path.join(save_path, 'x_test.npz'), x_test)
savez_compressed(os.path.join(save_path, 'y_test.npz'), y_test)
print('block2_pool - Done!')

# Drop the references so the arrays can be reclaimed afterwards.
x, y, x_train, y_train, x_test, y_test = None, None, None, None, None, None

  0%|          | 1/1125 [00:00<01:58,  9.46it/s]Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool