In [197]:
from keras import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import pandas
import numpy as np
import statistics

In [198]:
# Load the tab-separated feature table (one row per HTML table) and preview it.
dataset = pandas.read_csv("data/table2.csv", delimiter="\t")
dataset.head()

Unnamed: 0,relevant,depth,number_bold,number_br,number_div,number_img,number_li,number_links,number_p,number_relevants,number_td,number_th,number_tr,relevants_ratio
0,1,6,3,0,2,0,0,1,0,55,2,2,2,0.5
1,0,3,10,0,0,0,0,0,0,15,20,1,11,0.28
2,0,14,1,1,5,5,0,0,0,16,7,0,5,0.39
3,1,6,3,0,2,0,0,1,0,32,2,2,2,0.53
4,0,3,14,0,0,0,0,0,0,16,28,1,15,0.13


## Analisi delle feature
Addestrando una rete (epochs: 5, neurons: 16, batch_size: 16, activation: 'tanh')
con KFold(8) e utilizzando **una** sola feature alla volta si è ottenuto:

       1. depth                loss: 0.338, acc: 0.068 (std: 0.137)
       2. number_links         loss: 0.293, acc: 0.437 (std: 0.350)
       3. number_relevants     loss: 0.303, acc: 0.240 (std: 0.216)
       4. number_td            loss: 0.289, acc: 0.440 (std: 0.262)
       5. number_th            loss: 0.302, acc: 0.292 (std: 0.305)
       6. number_tr            loss: 0.287, acc: 0.450 (std: 0.254)

In [202]:
# Dataset to numpy arrays

# choose the target feature and the features to train on
TARGET_FEATURE = 'relevant'
CHOSEN_FEATURES = ['number_img', 'number_td', 'number_tr', 'number_relevants', 'number_links', 'number_bold', 'relevants_ratio']

# By convention the features are sorted by name, so the column order of `data`
# is stable. The target is filtered out *before* printing so the printed list is
# exactly the set of columns the model is trained on.
CHOSEN_FEATURES = sorted(feature for feature in CHOSEN_FEATURES if feature != TARGET_FEATURE)
print(CHOSEN_FEATURES)

# Vectorised column selection instead of filling uninitialised arrays row by row:
# this does not depend on the DataFrame index being 0..n-1 and never leaves
# unwritten (garbage) rows behind.
data  = dataset[CHOSEN_FEATURES].to_numpy(dtype=np.float32)    # shape: (n_rows, n_features)
label = dataset[[TARGET_FEATURE]].to_numpy(dtype=np.float32)   # shape: (n_rows, 1)

print(data[0])

['number_bold', 'number_img', 'number_links', 'number_relevants', 'number_td', 'number_tr', 'relevants_ratio']
[ 3.   0.   1.  55.   2.   2.   0.5]


In [204]:
# model definition

def table_classifier(neuron, activation, input_shape=()):
    """Build and compile a one-hidden-layer binary classifier.

    Args:
        neuron: number of units in the hidden Dense layer.
        activation: activation function name for the hidden layer.
        input_shape: shape of a single input sample, e.g. ``(n_features,)``.

    Returns:
        A compiled Keras ``Sequential`` model with a single sigmoid output unit,
        trained with the Adam optimizer on mean squared error and reporting
        accuracy as a metric.
    """
    hidden_layer = Dense(neuron, input_shape=input_shape, activation=activation)
    output_layer = Dense(1, activation='sigmoid')

    # NOTE(review): MSE on a sigmoid output works, but binary cross-entropy is the
    # conventional loss for this setup — kept as-is so reported results stay comparable.
    classifier = Sequential([hidden_layer, output_layer])
    classifier.compile(
        optimizer="adam",
        loss='mean_squared_error',
        metrics=['accuracy'],
    )
    return classifier

In [205]:
# train & test: cross-validation splitter and hyperparameter grid
import itertools

# NOTE: despite its name, `loo` is an 8-fold splitter, not leave-one-out.
loo = KFold(n_splits=8)

# candidate values for each hyperparameter
epochs, batch_size, neurons = [30], [8], [32]
activation = ['tanh', 'sigmoid']

# every (epochs, batch_size, neurons, activation) combination, in that order
hyperparams = [*itertools.product(epochs, batch_size, neurons, activation)]
hyperparams

[(30, 8, 32, 'tanh'), (30, 8, 32, 'sigmoid')]

In [207]:
# Early stopping monitors the validation *loss* (the variable name is historical):
# training halts once val_loss has not improved by at least 0.0005 for 5 epochs.
early_stop_val_acc = EarlyStopping(monitor = "val_loss", min_delta=0.0005, patience=5, verbose=True)

# The loop variable is `activation_name`, not `activation`, so that iterating does
# not overwrite the module-level list of candidate activations.
for epoch, batch, neuron, activation_name in hyperparams:

    res = []  # one [loss, accuracy] pair per fold
    for train_index, test_index in loo.split(data):

        # fresh, untrained model for every fold
        model = table_classifier(neuron, activation_name, input_shape=(len(CHOSEN_FEATURES),))

        data_train,   data_test =  data[train_index],  data[test_index]
        label_train, label_test = label[train_index], label[test_index]

        # 30% of the training fold is held out as validation data to drive early stopping
        model.fit(data_train, label_train, epochs=epoch, batch_size=batch, shuffle=True, verbose=False,
                  validation_split=0.3, callbacks=[early_stop_val_acc])

        r = model.evaluate(data_test, label_test, verbose=False)
        print(r)
        res.append(r)

    # aggregate per-fold results; stdev is the sample standard deviation (n - 1)
    fold_losses = [fold_result[0] for fold_result in res]
    fold_accs   = [fold_result[1] for fold_result in res]

    loss = statistics.mean(fold_losses)
    acc  = statistics.mean(fold_accs)
    acc_std = statistics.stdev(fold_accs)

    print("({}, {}, {}, {})  - loss: {}, acc: {} (std: {})".format(epoch, batch, neuron, activation_name,
                                                                   loss, acc, acc_std))
    print()

[0.040347829328051636, 0.9464285714285714]
[0.045884401884113725, 0.9282511213435186]
Epoch 00021: early stopping
[0.05437118991073472, 0.9327354265435395]
[0.03965112994604581, 0.9461883410744603]
[0.033167422786689126, 0.968609865470852]
Epoch 00019: early stopping
[0.06526916210159593, 0.9192825077360521]
Epoch 00023: early stopping
[0.07467027098742302, 0.8968609828050895]
Epoch 00024: early stopping
[0.07291455358066366, 0.901345287737825]
(30, 8, 32, tanh)  - loss: 0.053284495065664704, acc: 0.9299627630174886 (std: 0.024105019440806705)

Epoch 00021: early stopping
[0.04702323914638588, 0.9330357142857143]
Epoch 00020: early stopping
[0.061608531199094964, 0.905829592403275]
Epoch 00026: early stopping
[0.058574520118300694, 0.9058295929378458]
[0.050153423986092814, 0.9147982028033167]
[0.04481445311364037, 0.9417040361417248]
Epoch 00022: early stopping
[0.07069648347895241, 0.9013452880051104]
[0.0731907311509543, 0.8923766818816352]
Epoch 00027: early stopping
[0.07635830762

In [208]:
## best hyperparameters, picked by hand from the cross-validation results above
best_epochs, best_batch_size = 30, 8
best_neurons, best_activation = 32, 'tanh'

In [209]:
## retrain the model on the whole dataset and save it
model = table_classifier(best_neurons, best_activation, input_shape=(len(CHOSEN_FEATURES),))

# Train with the selected batch size. The previous code passed `best_epochs`
# as the batch size, so the saved model was trained with batch_size=30, not 8.
model.fit(data, label, epochs=best_epochs,
                       batch_size=best_batch_size,
                       shuffle=True, verbose=True)

# NOTE: the "models/" directory must already exist.
model.save("models/table_classifier.h5")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
