In [157]:
from keras import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import pandas
import numpy as np
import statistics

In [158]:
# Load the tab-separated feature table and preview its first rows.
dataset = pandas.read_csv("data/table2.csv", delimiter="\t")
dataset.head()

Unnamed: 0,relevant,depth,number_bold,number_br,number_div,number_img,number_li,number_links,number_p,number_relevants,number_td,number_th,number_tr
0,1,10,0,4,1,0,0,7,0,45,9,9,9
1,0,6,0,0,13,6,0,18,0,108,152,30,27
2,0,7,1,0,0,1,0,0,0,1,2,0,2
3,0,5,0,0,8,4,0,0,8,43,4,4,2
4,1,6,3,0,2,0,0,1,0,48,2,2,2


## Analisi delle feature
Addestrando una rete (epochs: 5, neurons: 16, batch_size: 16, activation: 'tanh')
    
con KFold(8) e utilizzando **una** sola feature alla volta, si è ottenuto:

       1. depth                loss: 0.338, acc: 0.068 (std: 0.137)
       2. number_links         loss: 0.293, acc: 0.437 (std: 0.350)
       3. number_relevants     loss: 0.303, acc: 0.240 (std: 0.216)
       4. number_td            loss: 0.289, acc: 0.440 (std: 0.262)
       5. number_th            loss: 0.302, acc: 0.292 (std: 0.305)
       6. number_tr            loss: 0.287, acc: 0.450 (std: 0.254)

In [159]:
# Dataset to numpy arrays

# choose the target feature and the features to train on
TARGET_FEATURE = 'relevant'
CHOSEN_FEATURES = ['number_img', 'number_td', 'number_tr', 'number_relevants', 'number_links', 'number_bold']

# As a convention the features are sorted by name, which fixes the column
# order of `data`. The target is filtered out first so it can never be used
# as an input feature.
CHOSEN_FEATURES = sorted(feature for feature in CHOSEN_FEATURES if feature != TARGET_FEATURE)
print(CHOSEN_FEATURES)

# Positional, vectorized conversion:
#  - the builtin `int` replaces the `np.int` alias (deprecated in NumPy 1.20,
#    removed in 1.24) and yields the same dtype;
#  - no uninitialised buffer is filled row by row, so the result does not
#    depend on the DataFrame index being exactly 0..n-1.
data  = np.asarray(dataset[CHOSEN_FEATURES], dtype=int)    # shape: (n_rows, n_features)
label = np.asarray(dataset[[TARGET_FEATURE]], dtype=int)   # shape: (n_rows, 1)

print(data[0])

['number_bold', 'number_img', 'number_links', 'number_relevants', 'number_td', 'number_tr']
[ 0  0  7 45  9  9]


In [160]:
# model definition

def table_classifier(neuron, activation, input_shape=(),
                     optimizer='rmsprop', loss='mean_squared_error'):
    """Build and compile a one-hidden-layer binary classifier.

    The network is a `Sequential` model made of a `Dense` hidden layer
    followed by a single sigmoid output unit, compiled with the
    'accuracy' metric.

    Parameters
    ----------
    neuron : int
        Number of units in the hidden layer.
    activation : str
        Activation function of the hidden layer (e.g. 'tanh', 'sigmoid').
    input_shape : tuple
        Shape of one input sample, e.g. `(n_features,)`.
    optimizer : str or optimizer instance, optional
        Optimizer passed to `compile`. Defaults to 'rmsprop'.
    loss : str or callable, optional
        Loss passed to `compile`. Defaults to 'mean_squared_error' so that
        existing callers keep their behaviour; 'binary_crossentropy' can be
        passed for the loss more commonly paired with a sigmoid output.

    Returns
    -------
    The compiled (untrained) model.
    """
    model = Sequential()
    model.add(Dense(neuron, input_shape=input_shape, activation=activation))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=['accuracy'])
    return model

In [167]:
# train & test
import itertools

# 8-fold splitter used for the cross-validation below
# (default settings: the folds are taken without shuffling).
loo = KFold(n_splits=8)

# Candidate values for every hyperparameter of the grid search.
epochs     = [30]
batch_size = [8, 16, 32]
neurons    = [16, 32]
activation = ['tanh', 'sigmoid']

# Every combination, as (epochs, batch_size, neurons, activation) tuples.
hyperparams = [*itertools.product(epochs, batch_size, neurons, activation)]
hyperparams

[(30, 8, 16, 'tanh'),
 (30, 8, 16, 'sigmoid'),
 (30, 8, 32, 'tanh'),
 (30, 8, 32, 'sigmoid'),
 (30, 16, 16, 'tanh'),
 (30, 16, 16, 'sigmoid'),
 (30, 16, 32, 'tanh'),
 (30, 16, 32, 'sigmoid'),
 (30, 32, 16, 'tanh'),
 (30, 32, 16, 'sigmoid'),
 (30, 32, 32, 'tanh'),
 (30, 32, 32, 'sigmoid')]

In [168]:
# Early stopping on the validation loss: training stops once `val_loss` has
# failed to improve by at least 0.005 for 5 consecutive epochs.
# NOTE: despite its name, this callback monitors "val_loss", not the accuracy.
early_stop_val_acc = EarlyStopping(monitor = "val_loss", min_delta=0.005, patience=5, verbose=True)

# The loop variables are deliberately NOT called `activation` etc., so the
# candidate lists used to build `hyperparams` are not overwritten by the loop.
for n_epochs, n_batch, n_neurons, act_fn in hyperparams:

    # one [loss, accuracy] pair per fold
    res = []
    for train_index, test_index in loo.split(data):

        # a fresh, untrained model for every fold
        model = table_classifier(n_neurons, act_fn, input_shape=(len(CHOSEN_FEATURES),))

        data_train,   data_test =  data[train_index],  data[test_index]
        label_train, label_test = label[train_index], label[test_index]

        # 30% of the training fold is held out as validation data for early stopping
        model.fit(data_train, label_train, epochs=n_epochs, batch_size=n_batch, shuffle=True, verbose=False,
                  validation_split=0.3, callbacks=[early_stop_val_acc])

        fold_score = model.evaluate(data_test, label_test, verbose=False)
        print(fold_score)
        res.append(fold_score)

    # aggregate the per-fold scores for this hyperparameter combination
    fold_losses     = [score[0] for score in res]
    fold_accuracies = [score[1] for score in res]

    loss = statistics.mean(fold_losses)
    acc  = statistics.mean(fold_accuracies)
    acc_std = statistics.stdev(fold_accuracies)

    print("({}, {}, {}, {})  - loss: {}, acc: {} (std: {})".format(n_epochs, n_batch, n_neurons, act_fn,
                                                                   loss, acc, acc_std))
    print()

Epoch 00026: early stopping
[0.058541366527246876, 0.9209302347759868]
[0.09062465134748193, 0.8790697696597077]
Epoch 00023: early stopping
[0.07324220472297004, 0.9116279091945915]
Epoch 00028: early stopping
[0.05457261478485063, 0.9162790719852891]
Epoch 00022: early stopping
[0.0573044523250225, 0.9395348859387774]
[0.05766247421781594, 0.9299065448413385]
[0.04022436021505115, 0.9626168207587483]
Epoch 00020: early stopping
[0.07889702620211049, 0.8831775728787217]
(30, 8, 16, tanh)  - loss: 0.06388364379281869, acc: 0.9178928512541451 (std: 0.027713848125633798)

[0.06581321376007657, 0.8930232527644135]
[0.07820115276547365, 0.8930232527644135]
[0.08962553718755412, 0.8744186049283937]
[0.06856921765693398, 0.9069767422454301]
Epoch 00028: early stopping
[0.07086370670518209, 0.9162790697674419]
[0.07098529954379965, 0.9158878532525535]
Epoch 00029: early stopping
[0.055776376694997895, 0.9345794375811782]
Epoch 00029: early stopping
[0.05267029332223339, 0.9299065403849165]
(3

In [180]:
## set the best hyperparameters from the results above
# training schedule: number of epochs and mini-batch size
best_epochs, best_batch_size = 20, 8
# architecture: hidden units and hidden-layer activation
best_neurons, best_activation = 32, 'tanh'

In [181]:
## retrain the model on the whole dataset and save it
model = table_classifier(best_neurons, best_activation, input_shape=(len(CHOSEN_FEATURES),))

# Train on every sample with the selected hyperparameters.
# The batch size must be `best_batch_size`: passing `best_epochs` here
# trained the final model with a batch size that was never selected.
model.fit(data, label, epochs=best_epochs,
                       batch_size=best_batch_size,
                       shuffle=True, verbose=True)

model.save("models/table_classifier.h5")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
