In [3]:
from keras import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.regularizers import l2
from keras.optimizers import SGD

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import pandas
import numpy as np
import statistics

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
dataset = pandas.read_csv("data/and/list.csv", sep="\t")

dataset = dataset.reindex(np.random.permutation(dataset.index))
dataset.head()

Unnamed: 0,relevant,avg_tag_in_li,depth,number_bold,number_br,number_div,number_img,number_links,number_p,number_relevants,number_row,relevants_ratio
1156,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.24
1356,1.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.04
362,0.0,8.33,8.0,0.0,0.0,42.0,6.0,12.0,0.0,7.0,18.0,0.14
1331,1.0,2.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.33
1228,1.0,4.0,4.0,3.0,3.0,0.0,0.0,0.0,0.0,14.0,4.0,0.35


In [5]:
dataset.keys()

Index(['relevant', 'avg_tag_in_li', 'depth', 'number_bold', 'number_br',
       'number_div', 'number_img', 'number_links', 'number_p',
       'number_relevants', 'number_row', 'relevants_ratio'],
      dtype='object')

In [6]:
# Dataset to numpy arrays

TARGET_FEATURE = 'relevant'

CHOSEN_FEATURES = [
    'avg_tag_in_li',
    'depth',
    'number_bold',
    'number_br',
    'number_div',
    'number_img',
    'number_links',
    'number_p',
    'number_relevants',
    'number_row',
    'relevants_ratio'
]


CHOSEN_FEATURES = sorted(CHOSEN_FEATURES) # XXX as convention we sort the features based on their names
print ("Sorted features:\n", CHOSEN_FEATURES)

if (TARGET_FEATURE in CHOSEN_FEATURES): CHOSEN_FEATURES.remove(TARGET_FEATURE) # ensure we do not use target feature

data  = np.ndarray((len(dataset), len(CHOSEN_FEATURES)), np.float32)
label = np.ndarray((len(dataset), 2), np.float32)

for i, row in dataset.iterrows():
    label[i] = np.zeros(2)
    label[i][int(row[TARGET_FEATURE])] = 1

    data[i]  = np.fromiter([row[feature] for feature in CHOSEN_FEATURES], np.float32)
    
    
print ("\nExample data vector:\n", data[0])
print ("\nExample label vector:\n", label[0])

Sorted features:
 ['avg_tag_in_li', 'depth', 'number_bold', 'number_br', 'number_div', 'number_img', 'number_links', 'number_p', 'number_relevants', 'number_row', 'relevants_ratio']

Example data vector:
 [20.    7.    0.    0.   42.    7.   21.    0.   26.    7.    0.18]

Example label vector:
 [1. 0.]


In [7]:
# model definition

def list_classifier(neuron, activation, input_shape=()):
    model = Sequential()
    model.add(Dense(neuron, input_shape=input_shape, 
                            activation=activation
                            ))
    model.add(Dense(2, activation='softmax'))
    
    model.compile(optimizer="rmsprop", 
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    return model

In [9]:
# train & test
import itertools

idx = np.random.permutation(len(data))
data, label = data[idx], label[idx]

loo = KFold(4)

val_percent = 0.3
epochs = [50]
batch_size = [32]
neurons = [8, 16, 32]
activation = ['tanh']

hyperparams = list(itertools.product(epochs, batch_size, neurons, activation))
hyperparams

[(50, 32, 8, 'tanh'), (50, 32, 16, 'tanh'), (50, 32, 32, 'tanh')]

In [10]:
early_stop_val_acc = EarlyStopping(monitor = "val_loss", verbose=True, mode='auto')

for epoch, batch, neuron, activation in hyperparams:

    res = []
    for (i, (train_index, test_index)) in enumerate(loo.split(data)):
        
        model = list_classifier(neuron, activation, input_shape=(len(CHOSEN_FEATURES),))
        
        data_train,   data_test =  data[train_index],  data[test_index]
        label_train, label_test = label[train_index], label[test_index]
        
        
        t = model.fit(data_train, label_train, epochs=epoch, batch_size=batch, shuffle=True, verbose=False,
                     validation_split=val_percent, callbacks=[early_stop_val_acc])
        
        r = model.evaluate(data_test, label_test, verbose=False)
        print (r)
        res.append(r)
        
    loss = statistics.mean(list(map(lambda x: x[0], res)))
    acc  = statistics.mean(list(map(lambda x: x[1], res)))
    acc_std = statistics.stdev(list(map(lambda x: x[1], res)))
    
    print()
    print()
    print ("Training on {} samples".format(len(data_train)))
    print ("Testing on {} samples, with {} of validation".format(len(data_test), 
                                                                 len(data_test) * val_percent))
    
    print("\n\n>>({}, {}, {}, {})  - loss: {}, acc: {} (std: {})".format(epoch, batch, neuron, activation,
                                                                   loss, acc, acc_std))

Epoch 00027: early stopping
[0.22408606601420697, 0.920993229336448]
Epoch 00042: early stopping
[0.1678971190081224, 0.9345372464533046]
Epoch 00028: early stopping
[0.15574721203130834, 0.9502262446135957]
Epoch 00036: early stopping
[0.1807196961538824, 0.9276018102244554]


Training on 1328 samples
Testing on 442 samples, with 132.6 of validation


>>(50, 32, 8, tanh)  - loss: 0.18211252330188002, acc: 0.933339632656951 (std: 0.012542570488791229)
Epoch 00026: early stopping
[0.17830832452254694, 0.9390519187358917]
Epoch 00032: early stopping
[0.16707095490352294, 0.9458239281688802]
Epoch 00023: early stopping
[0.16739055750326873, 0.9524886880525097]
Epoch 00018: early stopping
[0.16129552009957948, 0.9389140255310956]


Training on 1328 samples
Testing on 442 samples, with 132.6 of validation


>>(50, 32, 16, tanh)  - loss: 0.16851633925722953, acc: 0.9440696401220943 (std: 0.006473427333497107)
Epoch 00016: early stopping
[0.1739914400616564, 0.9390519187358917]
Epoch 00011: e

In [11]:
predicted = model.predict(data_train)

yes = []
no = []

for i, l in enumerate(label_train):
    if l[0] == 1:
        no.append(predicted[i][0])
    else:
        yes.append(predicted[i][1])
        
yes = list(map(float, yes)) 
no  = list(map(float, no)) 

In [12]:
print ("YES: mean {} std {}".format(statistics.mean(yes), statistics.stdev(yes)))
print ("NO: mean {} std {}".format(statistics.mean(no), statistics.stdev(no)))

YES: mean 0.89067799965481 std 0.10966267227792012
NO: mean 0.9057175528466611 std 0.2374656861714324


In [13]:
## set the best hyperparameters from the results above
best_epochs = int(statistics.mean([16, 11, 16, 16]))
best_batch_size = 32
best_neurons = 32
best_activation = 'tanh'

In [14]:
## retrain the model on the whole dataset and save it
model = list_classifier(best_neurons, best_activation, input_shape=(len(CHOSEN_FEATURES),))

model.fit(data, label, epochs=best_epochs, 
                       batch_size=best_epochs, 
                       shuffle=True, verbose=True)

model.save("models/list_classifier.h5")

Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14
