In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from PIL import Image
import Butterfly_identification.preprocessbutterfly as preproc
# from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [3]:
import os
from tensorflow.data import Dataset, AUTOTUNE
from tensorflow.strings import split
from tensorflow.io import decode_jpeg, read_file
from tensorflow import argmax
import matplotlib.pyplot as plt
from tensorflow.keras import Sequential, layers

In [4]:
from tensorflow.data.experimental import cardinality

# Get Data

In [5]:
# Load the train / validation / test frames from the project preprocessing module.
df_train, df_val, df_test = preproc.get_data()

In [6]:
df_train.shape

(36536, 15)

In [7]:
# Run the same feature-engineering step on each split, in train / val / test order.
df_train, df_val, df_test = (
    preproc.feature_engineering(split_frame)
    for split_frame in (df_train, df_val, df_test)
)

## Generator implementation

In [8]:
# Root folder that holds the Train/, Val/ and Test/ image directories.
# Override it with the BUTTERFLY_DATA_DIR environment variable; the default is the
# original location. The root is made absolute so the derived paths do not depend
# on the notebook's working directory.
DATA_ROOT = os.path.abspath(os.environ.get(
    'BUTTERFLY_DATA_DIR',
    '/home/zac/code/Em3line/Butterfly_identification/raw_data/IGM_labels',
))

# The trailing '' makes os.path.join end each path with a separator, which the
# plain string concatenation used later to build the image paths relies on.
PATH_TRAIN = os.path.join(DATA_ROOT, 'Train', '')
PATH_VAL = os.path.join(DATA_ROOT, 'Val', '')
PATH_TEST = os.path.join(DATA_ROOT, 'Test', '')

In [9]:
# Filter the training frame (get_data_minphoto), then resample it (resampling);
# the exact criteria live in preprocessbutterfly.
df_train = preproc.resampling(preproc.get_data_minphoto(df_train))

In [10]:
# Filter the validation and test frames against the (already resampled) training frame.
df_val, df_test = [
    preproc.filter_val_test(df_train, split_frame)
    for split_frame in (df_val, df_test)
]

In [11]:
def _relative_image_path(frame):
    """Build the "<species>/<image_name>" path of every row of ``frame``."""
    return frame['species'] + '/' + frame['image_name']


df_train['full_path'] = _relative_image_path(df_train)
df_val['full_path'] = _relative_image_path(df_val)
df_test['full_path'] = _relative_image_path(df_test)

In [12]:
# Full path (split directory + relative path) of every image kept in each split.
# NOTE(review): despite the name, these lists hold the files that are used, not excluded.
black_list = (PATH_TRAIN + df_train['full_path']).tolist()
black_list_val = (PATH_VAL + df_val['full_path']).tolist()
black_list_test = (PATH_TEST + df_test['full_path']).tolist()

In [13]:
# Unique training species in order of first appearance; a species' position in this
# Series is used as its integer class id. The original row index labels are kept.
train_names = df_train.loc[:, 'species'].drop_duplicates()

In [14]:
# The frames are no longer needed once the path lists and class names exist.
del df_train, df_val, df_test

In [15]:
# black_list* already hold complete paths (PATH_* + relative path), so they are used
# as-is. Re-joining them with PATH_* only worked because the paths are absolute
# (os.path.join then discards the first argument) and would double the prefix otherwise.
filtered_train = list(black_list)
filtered_val = list(black_list_val)
filtered_test = list(black_list_test)

In [16]:
# One dataset of file paths per split.
train_ds, val_ds, test_ds = map(
    Dataset.from_tensor_slices,
    (filtered_train, filtered_val, filtered_test),
)

2021-08-30 12:28:36.384467: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-08-30 12:28:36.631420: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-08-30 12:28:36.632641: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-08-30 12:28:36.652756: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [17]:
def get_label(file_path,class_names=train_names):
  """Return the integer class id encoded in ``file_path``.

  The class is the name of the directory that directly contains the image
  file; its id is the position of that name within ``class_names``.
  """
  # Second-to-last path component is the class directory.
  species_dir = split(file_path, os.path.sep)[-2]
  # The comparison yields a boolean mask over class_names (True where the name
  # matches); argmax converts that mask into the matching position.
  return argmax(species_dir == class_names)

In [18]:
def decode_img(img):
  """Decode a JPEG-compressed string into a 3-channel image tensor.

  No resizing is done here; the image keeps the size stored in the file.
  """
  return decode_jpeg(img, channels=3)


In [19]:
def process_path(file_path, class_names=train_names):
    """Turn an image file path into an ``(image, label)`` pair.

    The file is read as a raw string and decoded with ``decode_img``; the label
    comes from ``get_label`` using the same ``class_names``.
    """
    raw_bytes = read_file(file_path)
    return decode_img(raw_bytes), get_label(file_path, class_names)

In [20]:
# Self-assignment: AUTOTUNE is already bound by the `tensorflow.data` import above,
# so this statement does not change anything.
AUTOTUNE = AUTOTUNE

In [21]:
# Replace each path with its decoded image and integer label.
train_ds, val_ds, test_ds = (
    path_ds.map(process_path, num_parallel_calls=AUTOTUNE)
    for path_ds in (train_ds, val_ds, test_ds)
)

In [22]:
def configure_for_performance(ds, shuffle=True, cache=True, batch_size=32, shuffle_buffer=1000):
    """Optionally cache and shuffle a dataset, then batch and prefetch it.

    Args:
        ds: dataset of individual (image, label) elements.
        shuffle: shuffle the elements; only useful for training data.
        cache: cache the elements after their first pass; pointless for a
            split that is only iterated once.
        batch_size: number of elements per batch.
        shuffle_buffer: size of the shuffle buffer.

    Returns:
        The configured, batched dataset.
    """
    if cache:
        ds = ds.cache()
    if shuffle:
        # Shuffle after cache() so the cached elements are reused while their order varies.
        ds = ds.shuffle(buffer_size=shuffle_buffer)
    ds = ds.batch(batch_size=batch_size)
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds

train_ds = configure_for_performance(train_ds)
# Evaluation splits gain nothing from shuffling, but they must be batched too:
# the model is fed batches, and test_ds was previously left unbatched.
val_ds = configure_for_performance(val_ds, shuffle=False)
test_ds = configure_for_performance(test_ds, shuffle=False, cache=False)

# Data Augmentation

In [23]:
# Augmentation pipeline: random flips along both axes, then random rotations.
data_augmentation = Sequential()
data_augmentation.add(layers.experimental.preprocessing.RandomFlip("horizontal_and_vertical"))
data_augmentation.add(layers.experimental.preprocessing.RandomRotation(0.2))

In [24]:
IMG_SIZE = 128

# Resize every image to IMG_SIZE x IMG_SIZE, then multiply pixel values by 1/255.
resize_and_rescale = Sequential()
resize_and_rescale.add(layers.experimental.preprocessing.Resizing(height=IMG_SIZE, width=IMG_SIZE))
resize_and_rescale.add(layers.experimental.preprocessing.Rescaling(scale=1./255))


# Earlier ImageDataGenerator-based augmentation, kept for reference only:
# augmentation = ImageDataGenerator(rotation_range=25, width_shift_range=0.2,
#                   height_shift_range=0.2,
#                   zoom_range=0.1, horizontal_flip=True)

In [25]:
batch_size = 32
def prepare(ds, shuffle=False, augment=False):
    """Resize and rescale a dataset, optionally shuffling and augmenting it.

    Args:
        ds: dataset yielding (image, label) pairs.
        shuffle: shuffle with a buffer of 1000 elements when True.
        augment: apply ``data_augmentation`` in training mode when True.

    Returns:
        The transformed dataset with prefetching enabled.
    """
    def _resize(image, label):
        return resize_and_rescale(image), label

    def _augment(image, label):
        return data_augmentation(image, training=True), label

    # Every split is resized and rescaled.
    pipeline = ds.map(_resize, num_parallel_calls=AUTOTUNE)

    if shuffle:
        pipeline = pipeline.shuffle(1000)

    # Augmentation is meant for the training split only.
    if augment:
        pipeline = pipeline.map(_augment, num_parallel_calls=AUTOTUNE)

    # Buffered prefetching for every split.
    return pipeline.prefetch(buffer_size=AUTOTUNE)


In [26]:
#train_ds = prepare(train_ds, shuffle=False, augment=True)

image_batch, label_batch = next(iter(train_ds))
print(len(label_batch))
# Printed once: the batch shape does not change inside the plotting loop.
print(image_batch.shape)
plt.figure(figsize=(10, 10))
# Show at most 9 images so a batch shorter than 9 cannot raise an index error.
for i in range(min(9, len(label_batch))):
    ax = plt.subplot(3, 3, i + 1)
    plt.imshow(image_batch[i].numpy().astype("uint8"))
    # Labels are positions within train_names (they come from argmax), while
    # train_names keeps the original DataFrame index labels after drop_duplicates,
    # so the lookup must be positional and use a plain Python int.
    label = int(label_batch[i])
    plt.title(train_names.iloc[label])
    plt.axis("off")

In [27]:
#train_ds = train_ds.map(augmentation.flow, num_parallel_calls=AUTOTUNE)

# Model

In [28]:
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras import optimizers, layers, models
from tensorflow.keras.callbacks import EarlyStopping

In [29]:
def model_VGG16(train_ds, val_ds, test_ds, image_size=(128, 128, 3),  patience=2, learning_rate=0.001, nb_epochs=15, nb_couches_dense_layer=130, num_classes=None):
    """Train and evaluate a classifier built on a frozen, ImageNet-pretrained VGG16.

    Args:
        train_ds: batched training dataset of (images, integer labels).
        val_ds: batched validation dataset, monitored for early stopping.
        test_ds: batched test dataset used for the final evaluation.
        image_size: input shape of the VGG16 base; it has to agree with the
            output of ``resize_and_rescale``, which feeds the base.
        patience: EarlyStopping patience on ``val_loss``.
        learning_rate: learning rate of the Adam optimizer.
        nb_epochs: maximum number of training epochs.
        nb_couches_dense_layer: number of units of the hidden dense layer.
        num_classes: number of output classes; defaults to ``len(train_names)``.

    Returns:
        A tuple ``(summary, history)``: a string reporting the test accuracy
        and the object returned by ``model.fit``.
    """
    if num_classes is None:
        num_classes = len(train_names)

    # Pretrained convolutional base, frozen so that only the new head is trained.
    base_model = VGG16(weights="imagenet", include_top=False, input_shape=image_size)
    base_model.trainable = False

    model = models.Sequential([
        data_augmentation,
        resize_and_rescale,
        base_model,
        layers.Flatten(),
        layers.Dense(nb_couches_dense_layer, activation='relu'),
        # One output unit per class: a softmax over a single unit always returns 1.
        layers.Dense(num_classes, activation='softmax'),
    ])

    # Labels are integer class ids (not one-hot vectors), hence the sparse loss.
    opt = optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    # Stop when val_loss stops improving and roll back to the best weights.
    es = EarlyStopping(monitor='val_loss', patience=patience, verbose=1, restore_best_weights=True)

    history = model.fit(train_ds,
                        validation_data=val_ds,
                        epochs=nb_epochs,
                        callbacks=[es])

    model.summary()

    # evaluate() returns [loss, accuracy]; the accuracy is the last entry.
    res_vgg = model.evaluate(test_ds)
    test_accuracy_vgg = res_vgg[-1]

    return (f"test_accuracy_vgg = {round(test_accuracy_vgg,2)*100} %"), history

In [30]:
from Butterfly_identification.trainer import plot_history

In [None]:
# NOTE(review): model_VGG16 returns (test-accuracy summary string, fit history),
# so `model` holds that tuple rather than a Keras model.
model = model_VGG16(train_ds, val_ds, test_ds, nb_epochs=2, nb_couches_dense_layer=130)

Epoch 1/2


2021-08-30 12:28:40.183677: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-08-30 12:28:50.340134: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 786 of 1000
2021-08-30 12:28:52.937875: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:228] Shuffle buffer filled.
2021-08-30 12:28:54.023621: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8202
2021-08-30 12:28:55.729935: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.04GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


