In [1]:
import tensorflow as tf
# Build a TF1-compat session whose GPU options have allow_growth enabled, so
# GPU memory is requested as needed instead of being reserved up front.
configuration = tf.compat.v1.ConfigProto()
configuration.gpu_options.allow_growth = True
# NOTE(review): `session` is not referenced by any later visible cell — confirm
# that merely creating it is enough for Keras to pick up this configuration.
session = tf.compat.v1.Session(config=configuration)

from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPooling2D
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.optimizers import SGD, Adam, schedules
from tensorflow.keras import utils
from tensorflow.keras.regularizers import l2
import keras
from keras import backend as K

import numpy as np
import h5py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import datetime

In [2]:
# %load_ext tensorboard

In [3]:
# Load the labelled spectrogram tables (one HDF5 key per country) into a single
# DataFrame named `reread`.
#
# Raw strings are used for the Windows paths: sequences such as "\F", "\S" and
# "\l" are invalid escapes in ordinary string literals and trigger a
# DeprecationWarning / SyntaxWarning on recent Python versions. The resulting
# path text is unchanged.
folder = r"D:\FARM_data\Soil_Spectra_Label"
labeled_path = folder + r"\labeled_data.hdf5"

# Kept open on purpose: a later cell inspects the file through `f.keys()`.
f = h5py.File(labeled_path, 'r')

countries = ['AT', 'BE', 'BG', 'CY', 'CZ', 'DE', 'DK', 'EE', 'EL', 'ES', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'MT', 'NL', 'PL', 'PT', 'RO', 'SE', 'SI', 'SK', 'UK']

# Read 'FR' first and then the remaining countries, i.e. the same row order as
# before, but concatenate once instead of re-copying the growing frame on
# every loop iteration.
country_frames = [pd.read_hdf(labeled_path, key=country) for country in ['FR'] + countries]
reread = pd.concat(country_frames, ignore_index=True)
del country_frames  # drop the extra references to the per-country frames
print('done')

done


In [4]:
# Attach the land-cover description (LC1_Desc) of every sample by joining the
# LUCAS 2015 topsoil CSV on Point_ID.
#
# Raw strings avoid the invalid "\F" / "\L" escape sequences of the original
# literals; the path text itself is unchanged.
folder1 = r"D:\FARM_data\LUCAS2015_topsoildata_20200323"
tempdf = pd.read_csv(folder1 + r"\LUCAS_Topsoil_2015_20200323.csv", usecols=["Point_ID", "LC1_Desc"])

# Only merge when the column is not there yet. Re-running the cell would
# otherwise produce LC1_Desc_x / LC1_Desc_y and make the lookup below fail.
if 'LC1_Desc' not in reread.columns:
    # Left join: every spectrogram row is kept, even without a CSV match.
    reread = pd.merge(reread, tempdf, on='Point_ID', how='left')

# Distinct land-cover classes, in order of first appearance.
all_landscapes = reread["LC1_Desc"].unique()

print(len(reread))

20559


In [5]:
# Drop organic-carbon (OC) outliers separately for each land-cover class.
# A row is kept when its OC lies strictly inside
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR) of its own class.
#
# The per-class results are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.
#
# NOTE(review): because both comparisons are strict, a class whose IQR is 0
# (e.g. a single sample) is removed entirely, and rows whose LC1_Desc is NaN
# never match the equality test below, so they are dropped as well. This is
# unchanged from the previous behaviour — confirm it is intended.
kept_groups = []

for landscape in all_landscapes:
    temppd = reread.loc[reread['LC1_Desc'] == landscape]
    Q1 = temppd['OC'].quantile(0.25)
    Q3 = temppd['OC'].quantile(0.75)
    IQR = Q3 - Q1
    inside_fences = (temppd['OC'] < Q3 + IQR * 1.5) & (temppd['OC'] > Q1 - IQR * 1.5)
    kept_groups.append(temppd[inside_fences])

# pd.concat raises on an empty list, so fall back to an empty frame that still
# carries the original columns.
filtered_df = pd.concat(kept_groups) if kept_groups else reread.iloc[0:0]

reread = filtered_df
filtered_df = None
print(len(reread))

19097


In [6]:
# Show the filtered frame; the original row index is preserved by the filter.
reread

Unnamed: 0,Point_ID,OC,NUTS_0,OC_state,spectogram,LC1_Desc
3,37802434,77.4,FR,60-80,"[[[57, 86, 140], [57, 86, 140], [57, 86, 140],...",Grassland without tree/shrub cover
5,38342416,26.9,FR,20-40,"[[[66, 65, 134], [66, 65, 134], [66, 65, 134],...",Grassland without tree/shrub cover
12,37742430,60.7,FR,60-80,"[[[55, 91, 141], [55, 91, 141], [55, 91, 141],...",Grassland without tree/shrub cover
22,38022434,61.4,FR,60-80,"[[[63, 71, 136], [63, 71, 136], [63, 71, 136],...",Grassland without tree/shrub cover
27,38122428,85.6,FR,80-100,"[[[61, 78, 138], [61, 78, 138], [61, 78, 138],...",Grassland without tree/shrub cover
...,...,...,...,...,...,...
14569,39683304,21.5,NL,20-40,"[[[65, 66, 135], [65, 66, 135], [65, 66, 135],...",Floriculture and ornamental plants
14588,40343302,14.7,NL,0-20,"[[[62, 73, 137], [62, 73, 137], [62, 73, 137],...",Floriculture and ornamental plants
14593,40363308,13.6,NL,0-20,"[[[64, 70, 136], [64, 70, 136], [64, 70, 136],...",Floriculture and ornamental plants
14711,51123106,29.1,PL,20-40,"[[[59, 82, 139], [59, 82, 139], [59, 82, 139],...",Floriculture and ornamental plants


In [7]:
# Q1 = reread['OC'].quantile(0.25)
# Q3 = reread['OC'].quantile(0.75)
# IQR = Q3 - Q1
# print(Q1, Q3, IQR)

In [8]:
# reread = reread[reread['OC'] < Q3 + IQR * 3]
# reread = reread[reread['OC'] > Q1 - IQR * 3]
# # reread

In [9]:
# reread = reread.reset_index()

In [10]:
# List the top-level keys of the opened HDF5 file.
f.keys()

<KeysViewHDF5 ['AT', 'BE', 'BG', 'CY', 'CZ', 'DE', 'DK', 'EE', 'EL', 'ES', 'FR', 'HR', 'HU', 'IE', 'IT', 'LT', 'LU', 'LV', 'MT', 'NL', 'PL', 'PT', 'RO', 'SE', 'SI', 'SK', 'UK']>

In [11]:
# Collect the per-row spectrogram arrays and stack them into one NumPy array
# (first axis = sample).
X = np.asarray(reread['spectogram'].tolist())

In [12]:
# Classification target: the OC range label of every sample (e.g. '60-80').
y = reread['OC_state'].values

In [13]:
# One-hot encode the OC range labels.
# The binarizer is fitted on the source column rather than on `y` itself, so
# re-running this cell gives the same result; previously a second run would
# have fitted on the already binarised matrix.
lb = LabelBinarizer()
y = lb.fit_transform(reread['OC_state'].values)

In [14]:
# Split into training and test sets with scikit-learn's default test fraction.
# A fixed random_state makes the split, and therefore the reported test
# metrics, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [15]:
# X = None
# y = None
# reread = None

In [16]:
# Test labels: (number of test samples, number of one-hot classes).
y_test.shape

(4775, 22)

In [17]:
# Training labels: (number of training samples, number of one-hot classes).
y_train.shape

(14322, 22)

In [18]:
# Training images: (samples, height, width, colour channels).
X_train.shape

(14322, 217, 335, 3)

In [19]:
# Same check as two cells earlier: shape of the training labels.
y_train.shape

(14322, 22)

## First models

In [20]:
# model=Sequential() # Linear stacking of layers

# # Convolution Layer 1
# model.add(Conv2D(16,(5,5),input_shape=(217,335,3),
#     padding='same',activation='relu',
#     kernel_constraint=MaxNorm(3)))
# model.add(Dropout(0.2))

# # Convolution Layer 2
# model.add(Conv2D(8,(7,7), activation='relu', padding='same', kernel_constraint=MaxNorm(3)))
# model.add(MaxPooling2D(pool_size=(9,9)))
# model.add(Flatten())

# # Fully Connected Layer 3
# model.add(Dense(512,activation='relu',kernel_constraint=MaxNorm(3)))

# # Fully Connected Layer 4
# model.add(Dropout(0.5))
# model.add(Dense(len(y[1]), activation='relu'))

# model.compile(loss='categorical_crossentropy', metrics='accuracy', optimizer='adam') # categorical_crossentropy

In [21]:
# model.summary()

In [22]:
# model=Sequential() # Linear stacking of layers
# model.add(Conv2D(16, (7, 7), strides=(2, 2), padding="valid", kernel_regularizer=reg,
#             input_shape=(217,335,3)))
#         # here we stack two CONV layers on top of each other where
#         # each layerswill learn a total of 32 (3x3) filters
# model.add(Conv2D(32, (3, 3), padding="same",
#             kernel_initializer=init, kernel_regularizer=reg))
# model.add(Activation("relu"))
# model.add(BatchNormalization(axis=chanDim))
# model.add(Conv2D(32, (3, 3), strides=(2, 2), padding="same",
#             kernel_initializer=init, kernel_regularizer=reg))
# model.add(Activation("relu"))
# model.add(BatchNormalization(axis=chanDim))
# model.add(Dropout(0.25))
#         # stack two more CONV layers, keeping the size of each filter
#         # as 3x3 but increasing to 64 total learned filters
# model.add(Conv2D(64, (3, 3), padding="same",
#             kernel_initializer=init, kernel_regularizer=reg))
# model.add(Activation("relu"))
# model.add(BatchNormalization(axis=chanDim))
# model.add(Conv2D(64, (3, 3), strides=(2, 2), padding="same",
#             kernel_initializer=init, kernel_regularizer=reg))
# model.add(Activation("relu"))
# model.add(BatchNormalization(axis=chanDim))
# model.add(Dropout(0.25))
#         # increase the number of filters again, this time to 128
# model.add(Conv2D(128, (3, 3), padding="same",
#             kernel_initializer=init, kernel_regularizer=reg))
# model.add(Activation("relu"))
# model.add(BatchNormalization(axis=chanDim))
# model.add(Conv2D(128, (3, 3), strides=(2, 2), padding="same",
#             kernel_initializer=init, kernel_regularizer=reg))
# model.add(Activation("relu"))
# model.add(BatchNormalization(axis=chanDim))
# model.add(Dropout(0.25))
# # fully-connected layer
# model.add(Flatten())
# model.add(Dense(512, kernel_initializer=init))
# model.add(Activation("relu"))
# model.add(BatchNormalization())
# model.add(Dropout(0.5))
#         # softmax classifier
# model.add(Dense(len(y[1])))
# model.add(Activation("softmax"))
# model.compile(loss='categorical_crossentropy', metrics='accuracy', optimizer='adam')

In [23]:
# Settings shared by the layers of the model built in the next cell.
reg=l2(0.0005)  # L2 penalty passed as kernel_regularizer to the Conv2D layers
init="he_normal"  # kernel initializer name used by the Conv2D and hidden Dense layers
chanDim = -1  # axis given to BatchNormalization; last axis = channels for the (217, 335, 3) input

In [24]:
# CNN classifier for the 217x335x3 spectrogram images.
# Spatial down-sampling is done with 2x2 strided convolutions; no pooling
# layers are used.
model = Sequential()

# Stem: a single 7x7 convolution with stride 2 on the raw input.
model.add(Conv2D(16, (7, 7), strides=(2, 2), padding="valid",
            kernel_initializer=init, kernel_regularizer=reg,
            input_shape=(217,335,3)))

# Three stages with 32, 64 and 128 filters. Each stage stacks two 3x3 CONV
# layers (the second one strided to halve the resolution), each followed by
# ReLU and batch normalisation, and ends with dropout. Layers are added in the
# same order as in the previous hand-unrolled version.
for n_filters in (32, 64, 128):
    model.add(Conv2D(n_filters, (3, 3), padding="same",
        kernel_initializer=init, kernel_regularizer=reg))
    model.add(Activation("relu"))
    model.add(BatchNormalization(axis=chanDim))
    model.add(Conv2D(n_filters, (3, 3), strides=(2, 2), padding="same",
        kernel_initializer=init, kernel_regularizer=reg))
    model.add(Activation("relu"))
    model.add(BatchNormalization(axis=chanDim))
    model.add(Dropout(0.25))

# fully-connected layer
model.add(Flatten())
model.add(Dense(512, kernel_initializer=init))
model.add(Activation("relu"))
model.add(BatchNormalization())
model.add(Dropout(0.5))

# softmax classifier: one output unit per one-hot label column. The count is
# taken from the training labels, so the cell does not rely on the full `y`
# array still being in memory.
n_classes = y_train.shape[1]
model.add(Dense(n_classes))
model.add(Activation("softmax"))

# `metrics` is passed as a list, which is the documented form of the argument.
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [25]:
# Print each layer with its output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 106, 165, 16)      2368      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 106, 165, 32)      4640      
_________________________________________________________________
activation (Activation)      (None, 106, 165, 32)      0         
_________________________________________________________________
batch_normalization (BatchNo (None, 106, 165, 32)      128       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 53, 83, 32)        9248      
_________________________________________________________________
activation_1 (Activation)    (None, 53, 83, 32)        0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 53, 83, 32)        1

In [26]:
# log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [27]:
# Train on the training split; no validation data is passed to fit().
fit_options = dict(batch_size=64, epochs=10, verbose=1, shuffle=True)
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# %tensorboard --logdir logs/fit

In [29]:
# Evaluate on the held-out split: entry 0 is the loss, entry 1 the accuracy
# metric the model was compiled with.
score = model.evaluate(X_test, y_test)
test_loss, test_accuracy = score[0], score[1]
print('Test score:', test_loss)
print('Test accuracy:', test_accuracy)

Test score: 1.1298558712005615
Test accuracy: 0.6467015743255615


## Research models

### What is our input data?

TODO: add code here that reads one of the arrays loaded above and displays it as a spectrogram.

Our input data consists of 217x335x3 arrays, which represent the RGB values of every pixel in a spectrogram. That spectrogram is created with spectral data from satellites.

So now we have to research what hypothetically is a good model to train on our data.

#### Filters
In this [source](https://datascience.stackexchange.com/questions/55545/in-cnn-why-do-we-increase-the-number-of-filters-in-deeper-convolution-layers-fo) you can see why we should start with a lower number of filters and increase that number in the following layers.

#### Kernel size


You can see in this [source](https://www.sicara.ai/blog/2019-10-31-convolutional-layer-convolution-kernel#:~:text=A%20common%20choice%20is%20to,%3A%203%2C%201%20by%20color.) that the most commonly used kernel sizes are 3x3 and 5x5. About halfway down that page you can see that a 3x3 kernel size gives a higher accuracy.

So let's start with 3x3; if that doesn't work well, we can always try a 5x5 kernel size.

#### Strides
[Here](https://www.pyimagesearch.com/2018/12/31/keras-conv2d-and-convolutional-layers/) it is suggested that you can use 2x2 strides instead of max pooling. The source also references a paper arguing that strided convolutions may actually work better than pooling layers and can increase accuracy.

So we'll try 2x2 strides and use no max pooling

#### Padding
It's probably better to use padding; see the discussion [here](https://stats.stackexchange.com/questions/246512/convolutional-layers-to-pad-or-not-to-pad).

In [30]:
# reg=l2(0.0009)
# init="he_normal"
# chanDim = -1

In [31]:
# def root_mean_squared_error(y_true, y_pred):
#         return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [32]:
# reg=l2(0.0005)
# init="he_normal"
# chanDim = -1

# # Conv2D Layer 1
# cnn = Sequential()
# cnn.add(Conv2D(16, (3, 3), strides=(2, 2), padding="valid", kernel_regularizer=reg, kernel_initializer=init,
#             input_shape=(217,335,3)))
# cnn.add(Activation("relu"))

# # Conv2D Layer 2
# cnn.add(Conv2D(32, (3, 3), strides=(2, 2), padding="same", kernel_regularizer=reg))
# cnn.add(Activation("relu"))
# cnn.add(BatchNormalization())
# cnn.add(Dropout(0.2))

# # Conv2D Layer 3
# cnn.add(Conv2D(64, (3, 3), strides=(2, 2), padding="same", kernel_regularizer=reg))
# cnn.add(Activation("relu"))
# cnn.add(BatchNormalization())
# cnn.add(Dropout(0.2))

# # Conv2D Layer 4
# cnn.add(Conv2D(128, (3, 3), strides=(2, 2), padding="same", kernel_regularizer=reg))
# cnn.add(Activation("relu"))
# cnn.add(BatchNormalization())
# cnn.add(Dropout(0.2))
# cnn.add(Flatten())

# # Fully connected layer 1
# cnn.add(Dense(194))
# cnn.add(Activation("relu"))
# cnn.add(BatchNormalization())
# cnn.add(Dropout(0.2))

# # Fully connected layer 2
# cnn.add(Dense(len(y[1])))
# cnn.add(Dropout(0.5))
# cnn.add(Activation("relu"))
# cnn.compile(loss='categorical_crossentropy', metrics='accuracy', optimizer='adam')

In [33]:
# cnn.summary()

In [34]:
# history = cnn.fit(X_train, y_train,
#                 batch_size=32,
#                 epochs=10,
#                 verbose=1)

In [35]:
# score = cnn.evaluate(X_test, y_test)
# print('Test score:', score[0])
# print('Test mse:', score[1])