In [7]:

import tensorflow as tf
# Configure TensorFlow through the v1-compat API: with `allow_growth` enabled,
# GPU memory is allocated as it is needed instead of being reserved up front.
# A session is then opened with that configuration.
# NOTE(review): `session` is not referenced again in the visible cells.
configuration = tf.compat.v1.ConfigProto()
configuration.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=configuration)

from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPooling2D
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.optimizers import SGD, Adam, schedules
from tensorflow.keras import utils
from tensorflow.keras.regularizers import l2
import keras

import numpy as np
import h5py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
!pip install --upgrade tables

Collecting tables
[?25l  Downloading https://files.pythonhosted.org/packages/0f/cb/4097be890a773af95343389faa8c283b0d9ff606f144227a548461dcbdd5/tables-3.6.1-cp37-cp37m-manylinux1_x86_64.whl (4.3MB)
[K     |████████████████████████████████| 4.3MB 7.3MB/s 
Installing collected packages: tables
  Found existing installation: tables 3.4.4
    Uninstalling tables-3.4.4:
      Successfully uninstalled tables-3.4.4
Successfully installed tables-3.6.1


In [8]:
"""Mount Google Drive so the data files stored there can be read. This cell is only needed on Google Colab; when running this code offline or locally, remove this cell."""
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
def data_setup(folder="/content/drive/My Drive/", random_state=None):
  """Load the labelled data and prepare the train/test arrays for the CNN model.

  The function
    1. reads one table per country from ``labeled_data_pickle4.hdf5`` and
       stacks them into a single DataFrame,
    2. left-joins the ``LC1_Desc`` (landscape description) column from
       ``LUCAS_Topsoil_2015_20200323.csv`` on ``Point_ID``,
    3. removes, per landscape type, the rows whose ``OC`` value lies outside
       ``(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)`` (bounds are exclusive),
    4. turns the ``spectogram`` column into the input array, binarizes the
       ``OC_state`` labels and splits everything into train and test sets.

  Args:
    folder: Directory that contains both the hdf5 and the csv data file.
      Defaults to the Google Drive location used by this notebook.
    random_state: Forwarded to ``train_test_split``. The default ``None``
      keeps the original unseeded split; pass an int for a reproducible one.

  Returns:
    Tuple ``(label_lenght, X_train, X_test, y_train, y_test)`` where
    ``label_lenght`` is the number of columns of the binarized labels.

  Raises:
    ValueError: If no rows are left after the outlier filtering.
  """
  import os  # local import: only used here to build the data file paths

  # Load data
  # Read the dataset file (hdf5) country by country and combine everything
  # into one pandas dataframe. The hdf5 file was created with the pickle 4
  # protocol to support python 3.7.
  print('loading data...')
  hdf_path = os.path.join(folder, "labeled_data_pickle4.hdf5")
  countries = ['FR', 'AT', 'BE', 'BG', 'CY', 'CZ', 'DE', 'DK', 'EE', 'EL', 'ES', 'HR', 'HU', 'IE', 'LT', 'LU', 'LV', 'MT', 'NL', 'PL', 'PT', 'RO', 'SE', 'SI', 'SK', 'UK', 'IT']
  # Collect the per-country frames first and concatenate once, instead of
  # re-copying the growing frame on every iteration.
  country_frames = [pd.read_hdf(hdf_path, key=country) for country in countries]
  reread = pd.concat(country_frames, ignore_index=True)
  country_frames = None  # drop the references so the memory can be reclaimed
  print('loading data: done\n')

  # Add landscape column
  # Adds the column 'LC1_Desc', which tells which type of landscape the
  # data point is.
  print("Adding landscape column...")
  csv_path = os.path.join(folder, "LUCAS_Topsoil_2015_20200323.csv")
  tempdf = pd.read_csv(csv_path, usecols=["Point_ID", "LC1_Desc"])
  reread = pd.merge(reread, tempdf, on='Point_ID', how='left')

  all_landscapes = reread["LC1_Desc"].unique()
  print("Adding lanscape column: Done\n")

  # Remove outliers
  # For each landscape type, drop the data points whose OC value is an
  # outlier within that landscape type.
  print("Removing outliers...")
  kept_frames = []
  for landscape in all_landscapes:
    # A missing (NaN) landscape never compares equal, so such rows match no
    # landscape and are dropped, exactly as in the original loop.
    subset = reread.loc[reread['LC1_Desc'] == landscape]
    if subset.empty:
      continue
    q1 = subset['OC'].quantile(0.25)
    q3 = subset['OC'].quantile(0.75)
    iqr = q3 - q1
    inside = (subset['OC'] < q3 + iqr * 1.5) & (subset['OC'] > q1 - iqr * 1.5)
    kept = subset[inside]
    if not kept.empty:
      kept_frames.append(kept)

  # DataFrame.append no longer exists in pandas 2.x; a single concat over the
  # collected pieces gives the same rows in the same order.
  if not kept_frames:
    raise ValueError("No data points left after removing the OC outliers")
  reread = pd.concat(kept_frames)
  kept_frames = None
  print("Removing outliers: Done\n")

  # Setup data
  # Split the data into input values and labels (X and y), binarize the
  # labels, then split everything into train and test data.
  print("Setup data...")
  X = np.array(list(reread['spectogram'].values))
  y = reread['OC_state'].values
  reread = None
  lb = LabelBinarizer()
  y = lb.fit_transform(y)

  label_lenght = len(y[0])
  X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
  X = None
  y = None
  print("Setup data: Done\n")
  return label_lenght, X_train, X_test, y_train, y_test

In [20]:
def setup_model(label_train_length, input_shape=(217, 335, 3)):
  """Build the (uncompiled) CNN used to classify the input images.

  The network is a Sequential stack of:
    * a 7x7, stride-2, "valid"-padded convolution with 32 filters,
    * five 3x3 "same"-padded convolutions (32, 32, 64, 64, 128 filters), each
      followed by relu and batch normalization; the second 32- and 64-filter
      convolutions use stride 2 and are followed by 25% dropout,
    * a flattened 512-unit dense layer with selu, batch normalization and
      50% dropout,
    * a softmax output layer with ``label_train_length`` units.

  Every convolution uses "he_normal" initialisation and an l2(0.0005) kernel
  regularizer.

  Args:
    label_train_length: Number of units of the softmax output layer.
    input_shape: Shape of one input sample, passed to the first convolution.
      Defaults to the (217, 335, 3) shape this notebook was written for.

  Returns:
    The assembled, not yet compiled, Sequential model.
  """
  print("Making model...")

  # Setup key parameters
  reg = l2(0.0005)
  init = "he_normal"
  chanDim = -1  # batch-normalize over the last axis

  model = Sequential()

  def add_conv_relu_bn(filters, **conv_kwargs):
    # 3x3 "same" convolution followed by relu and batch normalization.
    model.add(Conv2D(filters, (3, 3), padding="same",
        kernel_initializer=init, kernel_regularizer=reg, **conv_kwargs))
    model.add(Activation("relu"))
    model.add(BatchNormalization(axis=chanDim))

  # Entry convolution; note that no activation or normalization is placed
  # between it and the next convolution.
  model.add(Conv2D(32, (7, 7), strides=(2, 2), padding="valid",
              kernel_initializer=init, kernel_regularizer=reg,
              input_shape=input_shape))

  # two stacked CONV layers that each learn 32 (3x3) filters
  add_conv_relu_bn(32)
  add_conv_relu_bn(32, strides=(2, 2))
  model.add(Dropout(0.25))

  # two more CONV layers, still 3x3 but now with 64 filters each
  add_conv_relu_bn(64)
  add_conv_relu_bn(64, strides=(2, 2))
  model.add(Dropout(0.25))

  # increase the number of filters again, this time to 128
  add_conv_relu_bn(128)

  # fully-connected layer
  model.add(Flatten())
  model.add(Dense(512, kernel_initializer=init))
  model.add(Activation("selu"))
  model.add(BatchNormalization())
  model.add(Dropout(0.5))

  # softmax classifier
  model.add(Dense(label_train_length))
  model.add(Activation("softmax"))
  print("Making model: Done\n")
  return model

def train_model(model, X_train, y_train, epochs=8, batch_size=64):
  """Compile the model and fit it on the training data.

  The model is compiled with categorical cross-entropy loss, the "adamax"
  optimizer and accuracy as metric, then fitted with shuffling enabled.

  Args:
    model: The model to train; it is compiled and fitted in place.
    X_train: Training inputs.
    y_train: Training labels.
    epochs: Number of training epochs (default 8).
    batch_size: Number of samples per gradient update (default 64).

  Returns:
    The same model object, after training.
  """
  print("Training model...")
  # `metrics` is passed as a list: a bare string is not accepted by every
  # Keras version.
  model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer="adamax")
  model.fit(X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1, shuffle=True)
  # Report completion before returning; placed after the return statement
  # this line could never run.
  print("Training model: Done\n")
  return model

def score_model(model, X_test, y_test):
  """Evaluate the model on the test data and print the first two results.

  The first value returned by ``model.evaluate`` is reported as the test
  score and the second one as the test accuracy.
  """
  results = model.evaluate(X_test, y_test)

  for caption, position in (('Test score:', 0), ('Test accuracy:', 1)):
    print(caption, results[position])



In [11]:
# Run data_setup() to build the label length and the train/test arrays.
# Re-running this cell is only needed if these variables are not already
# defined in the kernel.
label_length, X_train, X_test, y_train, y_test = data_setup()

loading data...
loading data: done

Adding landscape column...
Adding lanscape column: Done

Removing outliers...
Removing outliers: Done

Setup data...
Setup data: Done



In [21]:

# Train and score model
# `label_length` is the name the data_setup() result is bound to in the data
# setup cell; the previously used `lenght_label` is never defined.
model = setup_model(label_length)
model = train_model(model, X_train, y_train)
score_model(model, X_test, y_test)



Making model...
Making model: Done

Training model...
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Test score: 1.1450220346450806
Test accuracy: 0.6561256647109985


In [1]:
#Test score: 0.9720392227172852
#Test accuracy: 0.6349738240242004

#Test score: 1.0995421409606934
#Test accuracy: 0.6544502377510071
