In [1]:
from keras import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.regularizers import l2, l1, l1_l2
from keras.optimizers import SGD

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import pandas
import numpy as np
import statistics

Using TensorFlow backend.


In [2]:
# Read the tab-separated feature table from disk, then preview its first rows.
dataset = pandas.read_csv("data/and/table.csv", delimiter="\t")
dataset.head()

Unnamed: 0,relevant,depth,number_bold,number_br,number_div,number_img,number_li,number_links,number_p,number_relevants,number_td,number_th,number_tr,relevants_ratio
0,0,5,0,0,0,0,0,1,0,3,4,0,2,0.12
1,0,7,9,30,0,0,0,0,3,42,5,0,1,0.19
2,0,5,0,0,0,0,0,0,1,24,1,0,1,0.32
3,0,5,0,0,1,1,0,0,1,9,1,0,1,0.27
4,0,6,3,0,6,0,0,0,0,21,14,0,2,0.23


In [3]:
# Dataset to numpy arrays
#
# Produces two float32 arrays aligned row-for-row with `dataset`:
#   data  -- one row per sample, one column per entry of CHOSEN_FEATURES
#   label -- one-hot encoding of TARGET_FEATURE over two classes

TARGET_FEATURE = 'relevant'

CHOSEN_FEATURES = [
    'depth',
    'number_bold', 
    'number_br',
    'number_div',
    'number_img',
    'number_li',
    'number_links',
    'number_p',
    'number_relevants', 
    'number_td', 
    'number_th',
    'number_tr', 
    'relevants_ratio'
]

# XXX as convention we sort the features based on their names
# The target is filtered out *before* anything is printed or extracted, so it
# can never appear among the model inputs.
CHOSEN_FEATURES = sorted(feature for feature in CHOSEN_FEATURES if feature != TARGET_FEATURE)
print ("Sorted features:\n", CHOSEN_FEATURES)

# Vectorised extraction: select the columns once and cast, instead of filling
# an uninitialised buffer row by row (which also relied on the frame's index
# labels being 0..n-1).
data = dataset[CHOSEN_FEATURES].values.astype(np.float32)

# The target is expected to be binary; fail loudly rather than let an
# unexpected class index wrap around or raise an opaque IndexError.
target_classes = dataset[TARGET_FEATURE].values.astype(int)
if not ((target_classes == 0) | (target_classes == 1)).all():
    raise ValueError("'{}' must only contain the classes 0 and 1".format(TARGET_FEATURE))

# One-hot encode: class k maps to row k of the 2x2 identity matrix.
label = np.eye(2, dtype=np.float32)[target_classes]


print ("\nExample data vector:\n", data[0])
print ("\nExample label vector:\n", label[0])

Sorted features:
 ['depth', 'number_bold', 'number_br', 'number_div', 'number_img', 'number_li', 'number_links', 'number_p', 'number_relevants', 'number_td', 'number_th', 'number_tr', 'relevants_ratio']

Example data vector:
 [ 5.    0.    0.    0.    0.    0.    1.    0.    3.    4.    0.    2.
  0.12]

Example label vector:
 [ 1.  0.]


In [4]:
# model definition
import matplotlib as plt
import keras.backend as K

def table_classifier(neuron, activation, input_shape=()):
    """Build and compile a two-class classifier with a single hidden layer.

    Args:
        neuron: number of units in the hidden Dense layer.
        activation: activation passed to the hidden Dense layer.
        input_shape: shape of one input sample, forwarded to the hidden layer.

    Returns:
        A compiled Sequential model: hidden Dense layer followed by a 2-unit
        softmax layer, using the "rmsprop" optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    hidden_layer = Dense(neuron, activation=activation, input_shape=input_shape)
    output_layer = Dense(2, activation='softmax')

    classifier = Sequential([hidden_layer, output_layer])
    classifier.compile(loss='categorical_crossentropy',
                       optimizer="rmsprop",
                       metrics=['accuracy'])
    return classifier

In [5]:
# train & test
import itertools

# Shuffle samples and labels with the same permutation. A seeded, local
# generator is used so the shuffle (and therefore the membership of each
# cross-validation fold) is identical on every Restart & Run All, without
# touching numpy's global random state.
# NOTE: this only fixes the data order; model weight initialisation is not
# seeded here.
RANDOM_SEED = 42
idx = np.random.RandomState(RANDOM_SEED).permutation(len(data))
data, label = data[idx], label[idx]


# NOTE: despite its name, `loo` is a 4-fold splitter, not leave-one-out.
loo = KFold(4)

# Fraction of each training fold that is held out for validation during fit.
val_percent = 0.3

# Hyperparameter grid: every combination of the values below is evaluated.
epochs = [50]
batch_size = [32]
neurons = [8, 16, 32]
activation = ['tanh']

hyperparams = list(itertools.product(epochs, batch_size, neurons, activation))
hyperparams

[(50, 32, 8, 'tanh'), (50, 32, 16, 'tanh'), (50, 32, 32, 'tanh')]

In [9]:

# Early-stopping callback shared by every fit below. No patience is given, so
# the Keras default applies.
# NOTE: the variable name says "val_acc", but the monitored quantity is val_loss.
early_stop_val_acc = EarlyStopping(monitor = "val_loss", verbose=True, mode='auto')

# Cross-validate every hyperparameter combination and report the mean loss,
# mean accuracy and accuracy standard deviation over the folds.
# The loop variable is `activation_fn` (not `activation`) so that it does not
# overwrite the module-level `activation` list the grid was built from.
for epoch, batch, neuron, activation_fn in hyperparams:

    res = []
    for train_index, test_index in loo.split(data):

        # A fresh model per fold so no weights leak from one fold to the next.
        model = table_classifier(neuron, activation_fn, input_shape=(len(CHOSEN_FEATURES),))

        data_train,   data_test =  data[train_index],  data[test_index]
        label_train, label_test = label[train_index], label[test_index]

        t = model.fit(data_train, label_train, epochs=epoch, batch_size=batch, shuffle=True, verbose=False,
                 validation_split=val_percent, callbacks=[early_stop_val_acc])

        # r is [loss, accuracy], matching the metrics the model was compiled with.
        r = model.evaluate(data_test, label_test, verbose=False)
        print(r)
        res.append(r)

    fold_losses     = [fold_result[0] for fold_result in res]
    fold_accuracies = [fold_result[1] for fold_result in res]

    loss = statistics.mean(fold_losses)
    acc  = statistics.mean(fold_accuracies)
    acc_std = statistics.stdev(fold_accuracies)

    # validation_split carves the validation set out of the *training* fold
    # (its trailing fraction), never out of the test fold, so the count is
    # derived from data_train.
    n_fitted = int(len(data_train) * (1. - val_percent))
    n_validation = len(data_train) - n_fitted

    print()
    print()
    print ("Training on {} samples, with {} of validation".format(len(data_train), n_validation))
    print ("Testing on {} samples".format(len(data_test)))

    print("\n\n>>({}, {}, {}, {})  - loss: {}, acc: {} (std: {})".format(epoch, batch, neuron, activation_fn,
                                                                   loss, acc, acc_std))
    

TypeError: softmax() got an unexpected keyword argument 'axis'

In [8]:
# Score the training samples of the last fitted fold and collect, for each
# sample, the probability the model assigned to that sample's true class.
predicted = model.predict(data_train)

# A one-hot label with a 1 in column 0 marks the "no" class; every other
# sample is treated as "yes" (column 1).
is_no = label_train[:, 0] == 1

yes = [float(probability) for probability in predicted[~is_no, 1]]
no  = [float(probability) for probability in predicted[is_no, 0]]

NameError: name 'model' is not defined

In [7]:
# Mean and sample standard deviation of the true-class probabilities,
# reported separately for the "yes" and the "no" samples.
yes_mean, yes_std = statistics.mean(yes), statistics.stdev(yes)
print ("YES: mean {} std {}".format(yes_mean, yes_std))

no_mean, no_std = statistics.mean(no), statistics.stdev(no)
print ("NO: mean {} std {}".format(no_mean, no_std))

NameError: name 'yes' is not defined

In [None]:
## set the best hyperparameters from the results above
best_activation = 'tanh'
best_neurons = 32
best_batch_size = 32

# Four manually recorded epoch counts -- presumably the epoch at which each
# cross-validation fold stopped (TODO confirm). Their mean, truncated to an
# integer, is used as the epoch budget for the final training run.
observed_epochs = [19, 15, 18, 22]
best_epochs = int(statistics.mean(observed_epochs))

In [None]:
## retrain the model on the whole dataset and save it
model = table_classifier(best_neurons, best_activation, input_shape=(len(CHOSEN_FEATURES),))

# No validation split here: every sample is used for the final fit, and the
# number of epochs is fixed to the value chosen from cross-validation.
# batch_size must be the selected batch size, not the epoch count.
model.fit(data, label, epochs=best_epochs, 
                       batch_size=best_batch_size, 
                       shuffle=True, verbose=True)

model.save("models/table_classifier.h5")