In [222]:
from keras import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import pandas
import numpy as np
import statistics

In [223]:
# Load the per-table feature sheet; the file is tab-delimited.
dataset = pandas.read_csv("data/table2.csv", delimiter="\t")
# Preview the first rows as the cell output.
dataset.head()

Unnamed: 0,relevant,depth,number_bold,number_br,number_div,number_img,number_li,number_links,number_p,number_relevants,number_td,number_th,number_tr,relevants_ratio
0,0,7,0,0,14,6,0,20,0,80,86,19,16,0.48
1,0,3,1,0,0,0,0,0,0,2,2,1,2,0.33
2,0,5,15,60,0,0,0,0,0,142,5,0,1,0.42
3,1,3,0,0,0,0,0,0,0,60,54,0,27,0.65
4,1,4,7,0,0,0,0,0,0,46,53,53,53,0.3


## Analisi delle feature
Addestrando una rete (epochs: 5, neurons: 16, batch_size: 16, activation: 'tanh')
con KFold(8) e utilizzando **una** sola feature alla volta, si è ottenuto:

       1. depth                loss: 0.338, acc: 0.068 (std: 0.137)
       2. number_links         loss: 0.293, acc: 0.437 (std: 0.350)
       3. number_relevants     loss: 0.303, acc: 0.240 (std: 0.216)
       4. number_td            loss: 0.289, acc: 0.440 (std: 0.262)
       5. number_th            loss: 0.302, acc: 0.292 (std: 0.305)
       6. number_tr            loss: 0.287, acc: 0.450 (std: 0.254)

In [224]:
# Dataset to numpy arrays

# choose the target feature and the features to train on
TARGET_FEATURE = 'relevant'
CHOSEN_FEATURES = ['number_img', 'number_td', 'number_tr', 'number_relevants', 'number_links', 'number_bold', 'relevants_ratio']
#CHOSEN_FEATURES = ["relevants_ratio"]

# By convention the features are sorted by name, which fixes the column order of `data`.
# The target is filtered out *before* printing, so the printed list is exactly the
# list of columns that ends up in `data`.
CHOSEN_FEATURES = sorted(feature for feature in CHOSEN_FEATURES if feature != TARGET_FEATURE)
print(CHOSEN_FEATURES)

# Select the columns in one shot instead of filling uninitialised arrays row by row:
# this is positional (it does not rely on the DataFrame index being 0..n-1) and avoids
# the per-row overhead of `iterrows()`.
#   data:  shape (n_rows, n_features), float32
#   label: shape (n_rows, 1), float32 (double brackets keep the column axis)
data  = np.asarray(dataset[CHOSEN_FEATURES], dtype=np.float32)
label = np.asarray(dataset[[TARGET_FEATURE]], dtype=np.float32)

print(data[0])

['number_bold', 'number_img', 'number_links', 'number_relevants', 'number_td', 'number_tr', 'relevants_ratio']
[ 0.    6.   20.   80.   86.   16.    0.48]


In [225]:
# model definition

def table_classifier(neuron, activation, input_shape=()):
    """Build and compile a network with one hidden layer and a single sigmoid output.

    Args:
        neuron: number of units of the hidden Dense layer.
        activation: activation function of the hidden Dense layer.
        input_shape: shape of one input sample, forwarded to the hidden layer.

    Returns:
        The compiled Sequential model (adam optimizer, mean squared error loss,
        accuracy metric).
    """
    hidden_layer = Dense(neuron, input_shape=input_shape, activation=activation)
    output_layer = Dense(1, activation='sigmoid')

    model = Sequential([hidden_layer, output_layer])
    model.compile(loss='mean_squared_error', optimizer="adam", metrics=['accuracy'])
    return model

In [226]:
# train & test: cross-validation splitter and hyperparameter grid
import itertools

# Candidate values for each hyperparameter; every combination is evaluated.
activation = ['tanh', 'sigmoid']
neurons = [32]
batch_size = [8]
epochs = [30]

# 8-fold cross-validation splitter (the name `loo` is kept because the training cell uses it).
loo = KFold(8)

# Cartesian product in the order (epochs, batch_size, neurons, activation).
hyperparams = [combo for combo in itertools.product(epochs, batch_size, neurons, activation)]
hyperparams

[(30, 8, 32, 'tanh'), (30, 8, 32, 'sigmoid')]

In [227]:
# Stop a fit early once the monitored quantity stops improving by at least `min_delta`
# for `patience` consecutive epochs.
# NOTE: despite its name, this callback monitors "val_loss", not the validation accuracy.
early_stop_val_acc = EarlyStopping(monitor = "val_loss", min_delta=0.0005, patience=5, verbose=True)

# The loop variable is called `act_fn` (not `activation`) so that it does not overwrite
# the `activation` grid list defined together with `hyperparams`.
for epoch, batch, neuron, act_fn in hyperparams:

    fold_scores = []  # one `model.evaluate` result per fold: [loss, accuracy]
    for train_index, test_index in loo.split(data):

        # fresh, untrained model for every fold
        model = table_classifier(neuron, act_fn, input_shape=(len(CHOSEN_FEATURES),))

        data_train,   data_test =  data[train_index],  data[test_index]
        label_train, label_test = label[train_index], label[test_index]

        # 30% of the training fold is held out as validation data for early stopping
        model.fit(data_train, label_train, epochs=epoch, batch_size=batch, shuffle=True, verbose=False,
                  validation_split=0.3, callbacks=[early_stop_val_acc])

        fold_score = model.evaluate(data_test, label_test, verbose=False)
        print(fold_score)
        fold_scores.append(fold_score)

    # aggregate the per-fold results for this hyperparameter combination
    fold_losses = [score[0] for score in fold_scores]
    fold_accs   = [score[1] for score in fold_scores]
    loss = statistics.mean(fold_losses)
    acc  = statistics.mean(fold_accs)
    acc_std = statistics.stdev(fold_accs)

    print("({}, {}, {}, {})  - loss: {}, acc: {} (std: {})".format(epoch, batch, neuron, act_fn,
                                                                   loss, acc, acc_std))
    print()

[0.02810156703528828, 0.9737991266375546]
[0.03290202643917507, 0.9650655021834061]
Epoch 00028: early stopping
[0.06643162058934765, 0.9126637554585153]
[0.0338356675492197, 0.9563318777292577]
[0.048616986844296516, 0.9473684210526315]
[0.03325533879953518, 0.9649122807017544]
Epoch 00029: early stopping
[0.023429387700417192, 0.9692982456140351]
[0.04417674664364313, 0.9342105263157895]
(30, 8, 32, tanh)  - loss: 0.03884366770011534, acc: 0.952956216961618 (std: 0.020731525178141065)

[0.031986238876170595, 0.9650655021834061]
[0.037131290933662235, 0.9563318777292577]
[0.06455956624353643, 0.9082969432314411]
[0.04231700355870299, 0.9388646288209607]
[0.05650026174752336, 0.9429824561403509]
[0.037979137871349065, 0.9605263157894737]
[0.025795557564778023, 0.9692982456140351]
[0.048817538245777156, 0.9385964912280702]
(30, 8, 32, sigmoid)  - loss: 0.04313582438018748, acc: 0.9474953075921244 (std: 0.019793675527809685)



In [229]:
## best hyperparameters, set by hand from the cross-validation results above
best_epochs, best_batch_size = 30, 8
best_neurons, best_activation = 32, 'tanh'

In [230]:
## retrain the model on the whole dataset and save it
model = table_classifier(best_neurons, best_activation, input_shape=(len(CHOSEN_FEATURES),))

# `batch_size` must be the selected batch size, not the epoch count, so that the final
# model is trained with the same configuration that was cross-validated.
model.fit(data, label, epochs=best_epochs,
                       batch_size=best_batch_size,
                       shuffle=True, verbose=True)

model.save("models/table_classifier.h5")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
