In [3]:
from keras import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.regularizers import l2, l1, l1_l2
from keras.optimizers import SGD

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import pandas
import numpy as np
import statistics

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
dataset = pandas.read_csv("data/and/table.csv", sep="\t")
dataset.head()

Unnamed: 0,relevant,depth,number_bold,number_br,number_div,number_img,number_li,number_links,number_p,number_relevants,number_td,number_th,number_tr,relevants_ratio
0,0,5,0,0,0,0,0,1,0,3,4,0,2,0.12
1,0,7,9,30,0,0,0,0,3,42,5,0,1,0.19
2,0,5,0,0,0,0,0,0,1,24,1,0,1,0.32
3,0,5,0,0,1,1,0,0,1,9,1,0,1,0.27
4,0,6,3,0,6,0,0,0,0,21,14,0,2,0.23


In [5]:
# Dataset to numpy arrays

TARGET_FEATURE = 'relevant'

CHOSEN_FEATURES = [
    'depth',
    'number_bold', 
    'number_br',
    'number_div',
    'number_img',
    'number_li',
    'number_links',
    'number_p',
    'number_relevants', 
    'number_td', 
    'number_th',
    'number_tr', 
    'relevants_ratio'
]

CHOSEN_FEATURES = sorted(CHOSEN_FEATURES) # XXX as convention we sort the features based on their names
print ("Sorted features:\n", CHOSEN_FEATURES)

if (TARGET_FEATURE in CHOSEN_FEATURES): CHOSEN_FEATURES.remove(TARGET_FEATURE) # ensure we do not use target feature

data  = np.ndarray((len(dataset), len(CHOSEN_FEATURES)), np.float32)
label = np.ndarray((len(dataset), 2), np.float32)

for i, row in dataset.iterrows():
    label[i] = np.zeros(2)
    label[i][int(row[TARGET_FEATURE])] = 1

    data[i]  = np.fromiter([row[feature] for feature in CHOSEN_FEATURES], np.float32)
    
    
print ("\nExample data vector:\n", data[0])
print ("\nExample label vector:\n", label[0])

Sorted features:
 ['depth', 'number_bold', 'number_br', 'number_div', 'number_img', 'number_li', 'number_links', 'number_p', 'number_relevants', 'number_td', 'number_th', 'number_tr', 'relevants_ratio']

Example data vector:
 [5.   0.   0.   0.   0.   0.   1.   0.   3.   4.   0.   2.   0.12]

Example label vector:
 [1. 0.]


In [6]:
# model definition
import matplotlib as plt
import keras.backend as K

def table_classifier(neuron, activation, input_shape=()):
    model = Sequential()
    model.add(Dense(neuron, input_shape=input_shape, 
                            activation=activation
                            ))
    model.add(Dense(2, activation='softmax'))
    
    model.compile(optimizer="rmsprop", 
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    return model

In [7]:
# train & test
import itertools

idx = np.random.permutation(len(data))
data, label = data[idx], label[idx]


loo = KFold(4)

val_percent = 0.3
epochs = [50]
batch_size = [32]
neurons = [8, 16, 32]
activation = ['tanh']

hyperparams = list(itertools.product(epochs, batch_size, neurons, activation))
hyperparams

[(50, 32, 8, 'tanh'), (50, 32, 16, 'tanh'), (50, 32, 32, 'tanh')]

In [8]:

early_stop_val_acc = EarlyStopping(monitor = "val_loss", verbose=True, mode='auto')

for epoch, batch, neuron, activation in hyperparams:

    res = []
    for (i, (train_index, test_index)) in enumerate(loo.split(data)):
        
        model = table_classifier(neuron, activation, input_shape=(len(CHOSEN_FEATURES),))
        
        data_train,   data_test =  data[train_index],  data[test_index]
        label_train, label_test = label[train_index], label[test_index]
        
        t = model.fit(data_train, label_train, epochs=epoch, batch_size=batch, shuffle=True, verbose=False,
                 validation_split=val_percent, callbacks=[early_stop_val_acc])
        
        r = model.evaluate(data_test, label_test, verbose=False)
        print(r)
        res.append(r)
        
    loss = statistics.mean(list(map(lambda x: x[0], res)))
    acc  = statistics.mean(list(map(lambda x: x[1], res)))
    acc_std = statistics.stdev(list(map(lambda x: x[1], res)))
    
    print()
    print()
    print ("Training on {} samples".format(len(data_train)))
    print ("Testing on {} samples, with {} of validation".format(len(data_test), 
                                                                 len(data_test) * val_percent))
    
    print("\n\n>>({}, {}, {}, {})  - loss: {}, acc: {} (std: {})".format(epoch, batch, neuron, activation,
                                                                   loss, acc, acc_std))
    

Epoch 00028: early stopping
[0.164133927528409, 0.9246376823687898]
Epoch 00040: early stopping
[0.19957964558532273, 0.9304347838180652]
Epoch 00038: early stopping
[0.1345936108218587, 0.939130433227705]
[0.24659416191819786, 0.8927536243977754]


Training on 1035 samples
Testing on 345 samples, with 103.5 of validation


>>(50, 32, 8, tanh)  - loss: 0.18622533646344708, acc: 0.9217391309530838 (std: 0.020220723397320388)
Epoch 00042: early stopping
[0.13127977312475012, 0.9449275346769803]
Epoch 00048: early stopping
[0.13999868406765703, 0.9507246373356253]
Epoch 00049: early stopping
[0.10916701868392419, 0.9449275346769803]
Epoch 00026: early stopping
[0.1350716984142428, 0.9304347838180652]


Training on 1035 samples
Testing on 345 samples, with 103.5 of validation


>>(50, 32, 16, tanh)  - loss: 0.12887929357264352, acc: 0.9427536226269128 (std: 0.00865529996182951)
Epoch 00027: early stopping
[0.0986589325942855, 0.9565217403398044]
Epoch 00023: early stopping
[0.1278029979139

In [18]:
predicted = model.predict(data_train)

yes = []
no = []

for i, l in enumerate(label_train):
    if l[0] == 1:
        no.append(predicted[i][0])
    else:
        yes.append(predicted[i][1])
        
yes = list(map(float, yes)) 
no  = list(map(float, no)) 

In [19]:
print ("YES: mean {} std {}".format(statistics.mean(yes), statistics.stdev(yes)))
print ("NO: mean {} std {}".format(statistics.mean(no), statistics.stdev(no)))

YES: mean 0.9225313375228656 std 0.15904805605788852
NO: mean 0.9505750839324558 std 0.1311598874429693


In [22]:
## set the best hyperparameters from the results above
best_epochs = int(statistics.mean([19, 15, 18, 22]))
best_batch_size = 32
best_neurons = 32
best_activation = 'tanh'

In [23]:
## retrain the model on the whole dataset and save it
model = table_classifier(best_neurons, best_activation, input_shape=(len(CHOSEN_FEATURES),))

model.fit(data, label, epochs=best_epochs, 
                       batch_size=best_epochs, 
                       shuffle=True, verbose=True)

model.save("models/table_classifier.h5")

Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18
