<img src="../images/BDG_LOGO.png" alt="drawing" align="right" width="200"/>

# H2020 RIA BigDataGrapes - Predictive Data Analytics (T4.3)

## Wine Making Pilot

### This pilot is described in the deliverable D4.3 (Pilot 2).

The specific goal of this pilot is to develop a machine-learned pipeline that counts leaves from side-view grapevine images taken in the imaging cabin of the PhenoArch platform (see picture below) managed by INRA.

In [1]:
# Install the third-party packages imported below. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
%pip install --user pandas tqdm keras_tqdm keras tensorflow scikit-image matplotlib



In [2]:
%matplotlib inline

import os
import json
import traceback

from matplotlib import pyplot as plt
from IPython import display

import numpy as np
import pandas as pd
import scipy as sc

import skimage
from skimage import io, transform

import pickle
import random

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Dense, Input, Subtract, BatchNormalization, InputLayer
from keras import backend as K, Model, regularizers, backend, optimizers
from keras.constraints import maxnorm, Constraint
from keras import initializers
import tensorflow as tf

from keras.preprocessing.image import ImageDataGenerator

import tqdm
from keras_tqdm import TQDMNotebookCallback

Using TensorFlow backend.


In [3]:
# Pandas display options: 4 decimals for floats, up to 100 columns shown.
# The fully qualified option name is used on purpose: on newer pandas releases
# the bare 'precision' pattern is ambiguous (it also matches
# 'styler.format.precision') and raises an OptionError.
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', 100)

# Draw every figure on a white background.
plt.rcParams.update({'figure.facecolor': '#FFFFFF'})

# Set the dataset to use

In [4]:
# Identifier of the dataset to process (swap the two lines to switch).
dataset = "ARCH2012-03-14"
# dataset = "ARCH2013-03-17"

# Annotation file associated with the selected dataset.
json_file = f"{dataset}_images_phenology_enriched.json"

# Both supported datasets keep their images under /data/<dataset>/;
# anything else is reported and leaves the image folder undefined.
path_dataset = None
if dataset in ("ARCH2012-03-14", "ARCH2013-03-17"):
    path_dataset = f"/data/{dataset}/"
else:
    print("Dataset not supported!!")

# Load the dataset from json

## Loading Utilities

In [15]:
class ImageDataset(object):
    """Set of annotated images stored as five parallel NumPy arrays.

    Entry ``i`` of every array refers to the same sample.

    Attributes:
        image_list: array of image locations (one per sample).
        labels: array of labels (one per sample).
        uris: array of image URIs.
        plants: array of integer plant identifiers (one per sample).
        dates: array of dates.
    """

    def __init__(self, image_list, labels, uris, plants, dates):
        self.image_list = image_list
        self.labels = labels
        self.uris = uris
        self.plants = plants
        self.dates = dates

    def __len__(self):
        """Return the number of samples (taken from ``labels``)."""
        return self.labels.size

    def size(self):
        """Return the number of samples; alias of ``len(self)``."""
        return len(self)

    def num_plants(self):
        """Return the number of distinct plant identifiers."""
        return np.unique(self.plants).size

    def X_is_loaded(self):
        """Tell whether pixel data is held in memory; this class never holds any."""
        return False

    def _subset(self, indexes):
        """Build a new ``ImageDataset`` restricted to the samples at ``indexes``."""
        return ImageDataset(
            self.image_list[indexes],
            self.labels[indexes],
            self.uris[indexes],
            self.plants[indexes],
            self.dates[indexes],
        )

    @staticmethod
    def _split_points(size, vali_ratio, test_ratio):
        """Return the positions where the validation and the test parts start.

        Raises:
            ValueError: if a ratio is negative or the two ratios exceed 1.
        """
        if vali_ratio < 0 or test_ratio < 0 or vali_ratio + test_ratio > 1:
            raise ValueError(
                "vali_ratio and test_ratio must be non-negative and sum to at most 1"
            )
        idx_start_vali = int(np.round(size * (1.0 - vali_ratio - test_ratio)))
        idx_start_test = int(np.round(size * (1.0 - test_ratio)))
        return idx_start_vali, idx_start_test

    def split(self, vali_ratio=0.2, test_ratio=0.2, shuffle=True, seed=42):
        """Split the samples into train, validation and test datasets.

        Args:
            vali_ratio: fraction of the samples assigned to validation.
            test_ratio: fraction of the samples assigned to test.
            shuffle: when True, samples are permuted before being split.
            seed: seed passed to ``np.random.seed`` before permuting
                (this re-seeds NumPy's global generator).

        Returns:
            Tuple ``(train, vali, test)`` of ``ImageDataset`` objects.
        """
        size = self.size()
        indexes = np.arange(size)
        if shuffle:
            np.random.seed(seed)
            indexes = np.random.permutation(indexes)
        idx_start_vali, idx_start_test = self._split_points(size, vali_ratio, test_ratio)

        return (
            self._subset(indexes[:idx_start_vali]),
            self._subset(indexes[idx_start_vali:idx_start_test]),
            self._subset(indexes[idx_start_test:]),
        )

    def split_by_plants(self, vali_ratio=0.2, test_ratio=0.2, shuffle=True, seed=42):
        """Split into train, validation and test so that no plant spans two parts.

        The ratios apply to the number of distinct plants, not to the number
        of samples. Plant identifiers may be arbitrary integers: they need not
        be contiguous, start at zero, or be sorted in ``self.plants``.

        Args:
            vali_ratio: fraction of the plants assigned to validation.
            test_ratio: fraction of the plants assigned to test.
            shuffle: when True, plants are permuted before being split.
            seed: seed passed to ``np.random.seed`` before permuting
                (this re-seeds NumPy's global generator).

        Returns:
            Tuple ``(train, vali, test)`` of ``ImageDataset`` objects. Inside
            each part, samples are grouped plant by plant.
        """
        # Replace every plant id by its rank among the distinct ids, so the
        # ranks are exactly 0..num_plants-1 whatever the raw ids look like.
        distinct_plants, plant_rank = np.unique(self.plants, return_inverse=True)
        plant_rank = plant_rank.ravel()
        num_plants = distinct_plants.size

        ranks = np.arange(num_plants)
        if shuffle:
            np.random.seed(seed)
            ranks = np.random.permutation(ranks)
        idx_start_vali, idx_start_test = self._split_points(num_plants, vali_ratio, test_ratio)

        # Stable sort: sample positions grouped by plant, original order kept
        # inside each plant. This makes searchsorted valid on unsorted input.
        order = np.argsort(plant_rank, kind='stable')
        sorted_rank = plant_rank[order]

        def samples_of(selected_ranks):
            # Positions of all the samples belonging to the selected plants.
            starts = np.searchsorted(sorted_rank, selected_ranks, side='left')
            ends = np.searchsorted(sorted_rank, selected_ranks, side='right')
            chunks = [order[start:end] for start, end in zip(starts, ends)]
            if not chunks:
                # np.concatenate refuses an empty list
                return np.empty(0, dtype=np.intp)
            return np.concatenate(chunks)

        return (
            self._subset(samples_of(ranks[:idx_start_vali])),
            self._subset(samples_of(ranks[idx_start_vali:idx_start_test])),
            self._subset(samples_of(ranks[idx_start_test:])),
        )

In [7]:
def read_annotated_json(json_file, path_dataset, skip_top=True):
    """Load the image annotations from a JSON-lines file.

    Every non-blank line of ``json_file`` is parsed as one JSON object with
    the keys ``'images'``, ``'leafNumber'`` and ``'date'``. Each entry of
    ``'images'`` provides ``'labelView'``, ``'filename'`` and ``'uri'``. One
    sample is produced per retained image; it inherits the ``'leafNumber'``
    and ``'date'`` of its line, and the zero-based line number is used as its
    plant identifier.

    Args:
        json_file: path of the JSON-lines annotation file.
        path_dataset: folder the image files are expected in; only the base
            name of each ``'filename'`` is appended to it.
        skip_top: when True, images whose ``'labelView'`` contains ``'top'``
            are discarded.

    Returns:
        An ``ImageDataset`` holding the paths, labels, URIs, plant ids and dates.
    """
    labels = []
    images = []
    uris = []
    plants = []
    dates = []
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(json_file, 'r', encoding='utf-8') as reader:
        for plant_num, line in enumerate(reader):
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            obj = json.loads(line)
            for image in obj['images']:
                if skip_top and 'top' in image['labelView']:
                    continue
                labels.append(obj['leafNumber'])
                dates.append(obj['date'])
                images.append(os.path.join(
                    path_dataset,
                    os.path.basename(image['filename'])
                ))
                uris.append(image['uri'])
                plants.append(plant_num)

    # `np.object` was only an alias of the builtin `object` and no longer
    # exists in recent NumPy releases.
    return ImageDataset(
        np.array(images, dtype=object),
        np.array(labels, dtype=np.float32),
        np.array(uris, dtype=object),
        np.array(plants, dtype=np.uint16),
        np.array(dates, dtype=object)
    )

## Read the dataset and split it into Train, Validation and Test sets

In [8]:
# Read every annotated image (top views are kept: skip_top=False) ...
image_dataset = read_annotated_json(json_file, path_dataset, skip_top=False)

# ... and partition the samples plant by plant.
train_dataset, vali_dataset, test_dataset = image_dataset.split_by_plants()

print("Train Size: {}\nVali Size: {}\nTest Size: {}".format(
    *(part.size() for part in (train_dataset, vali_dataset, test_dataset))
))

Train Size: 48810
Vali Size: 16305
Test Size: 16248


# Neural Network

## NN Utilities

In [9]:
def set_seed(seed_value=0):
    """Seed the random number generators used by the notebook.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow, and restricts TensorFlow to a single thread for intra- and
    inter-op parallelism. Works with both the TensorFlow 1.x API
    (``tf.set_random_seed`` / sessions) and the 2.x API
    (``tf.random.set_seed`` / ``tf.config.threading``).

    Args:
        seed_value: integer seed applied to every generator.
    """
    # 1. Export `PYTHONHASHSEED`. Hash randomisation is decided at interpreter
    #    start-up, so this only affects processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # 2. Python built-in pseudo-random generator
    random.seed(seed_value)

    # 3. NumPy global pseudo-random generator
    np.random.seed(seed_value)

    # 4. TensorFlow pseudo-random generator (the 1.x name is tried first)
    if hasattr(tf, 'set_random_seed'):
        tf.set_random_seed(seed_value)
    else:
        tf.random.set_seed(seed_value)

    # 5. Single-threaded execution
    if hasattr(tf, 'ConfigProto'):
        # TensorFlow 1.x: install a new global session with the thread limits
        session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
        sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
        K.set_session(sess)
    else:
        # TensorFlow 2.x has no global session; thread limits are runtime
        # settings that can only be changed before the runtime is initialised.
        try:
            tf.config.threading.set_intra_op_parallelism_threads(1)
            tf.config.threading.set_inter_op_parallelism_threads(1)
        except RuntimeError as error:
            import warnings
            warnings.warn("Could not limit TensorFlow threads: {}".format(error))

In [21]:
class ImageGeneratorBatch(keras.utils.Sequence):
    """Keras ``Sequence`` yielding ``(X, y)`` batches of cropped, rescaled images.

    Images are read from disk on demand with ``skimage.io.imread``, cropped
    to ``cropped_dim`` and rescaled by ``rescale``.

    Args:
        image_dataset: object exposing ``image_list``, ``labels``, ``size()``
            and ``X_is_loaded()`` (e.g. an ``ImageDataset``).
        cropped_dim: ``(height, width)`` of the crop, in pixels.
        rescale: scale factor applied to the crop.
        n_channels: number of channels of the produced images; when 3, any
            channel beyond the third is dropped.
        batch_size: number of samples per batch (the last one may be smaller).
        shuffle: when True, the sample order is reshuffled at every epoch end.
        seed: seed of the generator's private shuffling RNG.
    """

    def __init__(self, image_dataset, cropped_dim=(1800,800), rescale=0.25, n_channels=3, batch_size=32, shuffle=True, seed=42):
        super().__init__()
        self.cropped_dim = cropped_dim
        self.rescale = rescale
        self.final_dim = (int(self.cropped_dim[0]*self.rescale), int(self.cropped_dim[1]*self.rescale))
        self.image_dataset = image_dataset
        self.n_channels = n_channels
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Private RNG: creating another generator (or seeding NumPy elsewhere)
        # no longer alters this generator's shuffling sequence.
        self._rng = np.random.RandomState(seed)
        # Index of the batch most recently requested (used in error messages)
        self.cur_idx = 0
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch."""
        return int(np.ceil(self.image_dataset.size() / self.batch_size))

    def __getitem__(self, index):
        """Build batch number ``index`` and return it as ``(X, y)``."""
        self.cur_idx = index

        # Slicing truncates at the end of the array, so the last batch simply
        # comes out shorter.
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]

        # Locations of the images of this batch
        list_images_batch = self.image_dataset.image_list[indexes]

        if self.image_dataset.X_is_loaded():
            X = self.image_dataset.X[indexes]
        else:
            X = self.__data_generation(list_images_batch)

        y = self.image_dataset.labels[indexes]

        return X, y

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(self.image_dataset.size())
        if self.shuffle:
            self._rng.shuffle(self.indexes)

    def __data_generation(self, list_images_batch):
        """Load and preprocess the given images.

        Returns:
            Array of shape ``(len(list_images_batch), *final_dim, n_channels)``.
            An image that cannot be read or processed is reported on stdout
            and replaced by zeros.
        """
        new_h, new_w = self.cropped_dim[0], self.cropped_dim[1]

        # One row per requested image, so a short last batch does not carry
        # uninitialised rows and X keeps the same length as y.
        X = np.empty((len(list_images_batch), *self.final_dim, self.n_channels))

        for i, image_path in enumerate(list_images_batch):

            try:
                img = skimage.io.imread(image_path)
                if self.n_channels == 3:
                    # remove alpha channel
                    img = img[:,:,:3]

                # Drop 20 rows at the top and whatever exceeds new_h at the
                # bottom; centre the crop horizontally. The right margin is
                # the remainder, so the width is exactly new_w even when the
                # excess width is odd.
                excess_w = img.shape[1] - new_w
                left = excess_w // 2
                cropped_img = skimage.util.crop(img, (
                                                   (20, img.shape[0] - new_h - 20),
                                                   (left, excess_w - left),
                                                   (0,0)
                                               ),
                                               copy=False)

                X[i,] = skimage.img_as_ubyte(skimage.transform.rescale(cropped_img, self.rescale, anti_aliasing=True, preserve_range=False, multichannel=True), force_copy=False)
            except Exception:
                # Best effort: report and feed a black image. `Exception`
                # (not a bare except) lets KeyboardInterrupt stop the run.
                print("Unexpected error on index {} for image {}: {}".format(self.cur_idx, list_images_batch[i], traceback.format_exc()))
                X[i,] = np.zeros((*self.final_dim, self.n_channels), dtype=np.uint8)

        return X

In [22]:
def create_model(conv_layers, dense_layers, height, width, channels, seed=0):
    """Build and compile the sequential convolutional regression network.

    Args:
        conv_layers: list of dicts, one per convolutional block. Accepted
            keys: ``n_conv``, ``filters``, ``kernel_size`` (required) and
            ``strides``, ``padding``, ``kernel_initializer``, ``activation``,
            ``bnorm``, ``dropout`` (optional).
        dense_layers: list of dicts, one per dense block. Accepted keys:
            ``size`` (required) and ``kernel_initializer``, ``activation``,
            ``bnorm``, ``dropout``, ``bias_initializer`` (optional).
        height, width, channels: shape of the input images.
        seed: value passed to ``tf.random.set_seed`` before the layers are
            created.

    ``is_last`` is supplied by this function and must not appear in the dicts.

    Returns:
        The compiled model (MAE loss, Adadelta optimizer, MAE and MSE metrics).
    """

    def add_conv_block(model, n_conv, filters, kernel_size, strides=(1,1), padding='same', kernel_initializer='he_uniform', activation='relu', is_last=False, bnorm=False, dropout=0.0):
        # n_conv x (Conv2D [+ BatchNorm] + activation), then pooling [+ dropout]
        for _ in range(n_conv):
            model.add(keras.layers.Conv2D(filters, kernel_size, strides=strides, padding=padding, kernel_initializer=kernel_initializer))
            if bnorm:
                model.add(keras.layers.BatchNormalization())
            model.add(keras.layers.Activation(activation))

        # The last block pools globally, turning the feature maps into a vector
        if is_last:
            model.add(keras.layers.GlobalMaxPooling2D())
        else:
            model.add(keras.layers.MaxPooling2D())

        if dropout > 0:
            model.add(keras.layers.Dropout(dropout))

    def add_dense_block(model, size, kernel_initializer='glorot_uniform', activation='relu', is_last=False, bnorm=False, dropout=0.0, bias_initializer=0):
        # Dense layer; the last one is left linear (no BatchNorm/activation/dropout)
        model.add(keras.layers.Dense(size, kernel_initializer=kernel_initializer, bias_initializer=keras.initializers.Constant(value=bias_initializer)))

        if is_last:
            return

        if bnorm:
            model.add(keras.layers.BatchNormalization())

        model.add(keras.layers.Activation(activation))

        if dropout > 0:
            model.add(keras.layers.Dropout(dropout))

    # reset seed of keras
    tf.random.set_seed(seed)

    model = keras.models.Sequential(name='Leaf Counter')

    model.add(
        keras.layers.InputLayer(
            input_shape=(height, width, channels),
            name='input'
        )
    )

    last_conv = len(conv_layers) - 1
    for position, params in enumerate(conv_layers):
        add_conv_block(model, is_last=(position == last_conv), **params)

    last_dense = len(dense_layers) - 1
    for position, params in enumerate(dense_layers):
        add_dense_block(model, is_last=(position == last_dense), **params)

    # Compile model
    model.compile(loss='mae',
                  optimizer=keras.optimizers.Adadelta(),# Adam(),
                  metrics=[keras.metrics.mae, keras.metrics.mse])

    return model

In [18]:
def describe_model(model):
    """Display a per-layer summary table of ``model`` and its parameter counts.

    The table lists, for every layer, its class name, name, input and output
    shapes, number of parameters and trainable flag. The totals (all,
    non-trainable, trainable) are printed below it.

    Note: this changes the global pandas options ``display.max_colwidth`` and
    ``display.max_rows`` so the table is neither truncated nor cut.
    """
    # None means "no limit"; the former -1 is deprecated since pandas 1.0 and
    # rejected by pandas 2.
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.max_rows', 999)
    layers = [(layer.__class__.__name__, layer.name, layer.input_shape, layer.output_shape, layer.count_params(), layer.trainable) for layer in model.layers]
    df = pd.DataFrame(layers, columns=['Layer Type', 'Layer Name', 'Input Shape', 'Output Shape', "# Params", 'Trainable'])
    display.display(df)

    trainable = df['Trainable'].astype(bool)
    print("Total params: {:,}".format(df['# Params'].sum()))
    print("Non-trainable params: {:,}".format(df.loc[~trainable, '# Params'].sum()))
    print("Trainable params: {:,}".format(df.loc[trainable, '# Params'].sum()))

## Create the Neural Network model

In [19]:
# Shape of the images fed to the network
WIDTH_IMAGES = 200
HEIGHT_IMAGES = 450
CHANNELS = 3

# Eight convolutional blocks that differ only in their number of filters
# (doubling from 4 to 512), followed by three 1024-unit dense blocks and a
# single-unit output layer.
model = create_model(
    conv_layers=[
        {'n_conv':2, 'filters':n_filters, 'kernel_size':(3,3), 'strides':(1, 1), 'bnorm':True, 'activation':'relu'}
        for n_filters in (4, 8, 16, 32, 64, 128, 256, 512)
    ],
    dense_layers=[
        {'size':1024, 'bnorm':True, 'activation':'relu'} for _ in range(3)
    ] + [
        {'size':1, 'bnorm':False},
    ],
    width=WIDTH_IMAGES, height=HEIGHT_IMAGES, channels=CHANNELS,
)

print ('Model Summary')
describe_model(model)

Model Summary


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Layer Type,Layer Name,Input Shape,Output Shape,# Params,Trainable
0,Conv2D,conv2d_17,"(None, 450, 200, 3)","(None, 450, 200, 4)",112,True
1,BatchNormalization,batch_normalization_20,"(None, 450, 200, 4)","(None, 450, 200, 4)",16,True
2,Activation,activation_20,"(None, 450, 200, 4)","(None, 450, 200, 4)",0,True
3,Conv2D,conv2d_18,"(None, 450, 200, 4)","(None, 450, 200, 4)",148,True
4,BatchNormalization,batch_normalization_21,"(None, 450, 200, 4)","(None, 450, 200, 4)",16,True
5,Activation,activation_21,"(None, 450, 200, 4)","(None, 450, 200, 4)",0,True
6,MaxPooling2D,max_pooling2d_8,"(None, 450, 200, 4)","(None, 225, 100, 4)",0,True
7,Conv2D,conv2d_19,"(None, 225, 100, 4)","(None, 225, 100, 8)",296,True
8,BatchNormalization,batch_normalization_22,"(None, 225, 100, 8)","(None, 225, 100, 8)",32,True
9,Activation,activation_22,"(None, 225, 100, 8)","(None, 225, 100, 8)",0,True


Total params: 7,366,581
Non-trainable params: 0
Trainable params: 7,366,581


## Train the Neural Network model

In [None]:
# Training hyper-parameters
BATCH_SIZE=16
nb_epoch=500

# Image preprocessing: crop to 1800x800 (height x width), then scale by 0.25
CROPPED_WIDTH_IMAGES = 800
CROPPED_HEIGHT_IMAGES = 1800
RESCALE = 0.25
CHANNELS = 3

# Options shared by the training and the validation generators
_generator_kwargs = dict(
    cropped_dim=(CROPPED_HEIGHT_IMAGES, CROPPED_WIDTH_IMAGES),
    rescale=RESCALE,
    n_channels=CHANNELS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    seed=42,
)
train_generator = ImageGeneratorBatch(train_dataset, **_generator_kwargs)
vali_generator  = ImageGeneratorBatch(vali_dataset, **_generator_kwargs)

# Stop after 50 epochs without improvement of the validation MAE and roll the
# weights back to the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_mean_absolute_error',
    min_delta=0,
    patience=50,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)

# Optional notebook progress bar (created but not passed to the fit call below)
progress_bar = TQDMNotebookCallback(
    leave_inner=True,
    leave_outer=True,
    metric_format="{name}: {value:0.4f}",
)

history = model.fit_generator(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=nb_epoch,
    verbose=1,
    callbacks=[early_stopping], #, progress_bar
    validation_data=vali_generator,
    validation_steps=len(vali_generator),
    max_queue_size=10,
    workers=1,
    use_multiprocessing=False,
    shuffle=True,
)

## Save the model weights to file

In [49]:
# Store the trained weights in a folder named after the dataset. The folder is
# created first, since saving into a missing directory fails.
os.makedirs(dataset, exist_ok=True)
model.save_weights(f"{dataset}/plant_splitting_big_model_0.87_valid.h5")

## Load the model weights from file

In [24]:
# Shape of the images fed to the network
WIDTH_IMAGES = 200
HEIGHT_IMAGES = 450
CHANNELS = 3

# The architecture must match the one the weights were trained with:
# eight convolutional blocks (4 to 512 filters), three 1024-unit dense blocks
# and a single-unit output layer.
model = create_model(
    conv_layers=[
        {'n_conv':2, 'filters':n_filters, 'kernel_size':(3,3), 'strides':(1, 1), 'bnorm':True, 'activation':'relu'}
        for n_filters in (4, 8, 16, 32, 64, 128, 256, 512)
    ],
    dense_layers=[
        {'size':1024, 'bnorm':True, 'activation':'relu'} for _ in range(3)
    ] + [
        {'size':1, 'bnorm':False},
    ],
    width=WIDTH_IMAGES, height=HEIGHT_IMAGES, channels=CHANNELS,
)

# The save cell writes the weights to "<dataset>/<file>": look there first and
# fall back to the working directory, where this cell used to read from.
weights_file = "plant_splitting_big_model_0.87_valid.h5"
weights_path = "./" + weights_file
if isinstance(dataset, str) and os.path.isfile(os.path.join(dataset, weights_file)):
    weights_path = os.path.join(dataset, weights_file)

model.load_weights(weights_path)

## Compute the predictions

In [45]:
# Predictions of the model on each split, keyed by the split's dataset object.
preds = {}
datasets = [train_dataset, vali_dataset, test_dataset]

# The loop variable is deliberately NOT called `dataset`: that global holds the
# dataset name used to build file paths elsewhere in the notebook.
for split_dataset in datasets:
    # shuffle=False keeps the predictions aligned with `split_dataset.labels`
    dataset_generator = ImageGeneratorBatch(split_dataset, cropped_dim=(CROPPED_HEIGHT_IMAGES, CROPPED_WIDTH_IMAGES), rescale=RESCALE, n_channels=CHANNELS, batch_size=BATCH_SIZE, shuffle=False)
    preds[split_dataset] = model.predict_generator(dataset_generator, max_queue_size=1, workers=1, use_multiprocessing=False).ravel()

## Compute the Mean Absolute Error (MAE)

In [39]:
# Mean absolute error of the predictions on each split, in the order of
# `datasets` (train, validation, test). A comprehension is used so that no loop
# variable overwrites the global `dataset` name string.
mae = [
    np.abs(split_dataset.labels - preds[split_dataset]).mean()
    for split_dataset in datasets
]

print(f"Train MAE: {mae[0]:.4f}\nVali  MAE: {mae[1]:.4f}\nTest  MAE: {mae[2]:.4f}")

Train MAE: 0.2940
Vali  MAE: 0.8674
Test  MAE: 0.8776
