In [1]:
from keras import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import pandas
import numpy as np
import statistics

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
dataset = pandas.read_csv("data/list.csv", sep="\t")
dataset.head()

Unnamed: 0,relevant,avg_tag_in_li,depth,number_bold,number_br,number_div,number_img,number_links,number_p,number_relevants,number_row,relevants_ratio
0,1,0.0,2,2,26,0,0,0,0,28,1,0.12
1,1,0.0,1,0,0,0,0,0,0,20,5,0.51
2,1,1.0,2,0,0,0,0,0,0,29,6,0.52
3,1,1.38,5,0,0,4,0,1,0,35,8,0.48
4,1,2.5,5,0,0,4,0,1,0,19,2,0.59


In [4]:
# Dataset to numpy arrays

# choose the target feature and the features to train on
TARGET_FEATURE = 'relevant'
CHOSEN_FEATURES = ['avg_tag_in_li', 'number_img', 'number_relevants', 'number_links', 'number_bold', 'relevants_ratio']
#CHOSEN_FEATURES = ["relevants_ratio"]

CHOSEN_FEATURES = sorted(CHOSEN_FEATURES) # XXX as convention we sort the features based on their names
print(CHOSEN_FEATURES)

if (TARGET_FEATURE in CHOSEN_FEATURES): CHOSEN_FEATURES.remove(TARGET_FEATURE) # ensure we do not use target feature

data  = np.ndarray((len(dataset), len(CHOSEN_FEATURES)), np.float32)
label = np.ndarray((len(dataset), 2), np.float32)

for i, row in dataset.iterrows():
    label[i] = np.zeros(2)
    label[i][int(row[TARGET_FEATURE])] = 1
    
    data[i]  = np.fromiter([row[feature] for feature in CHOSEN_FEATURES], np.float32)
    
print(data[0])

['avg_tag_in_li', 'number_bold', 'number_img', 'number_links', 'number_relevants', 'relevants_ratio']
[ 0.    2.    0.    0.   28.    0.12]


In [5]:
# model definition

def list_classifier(neuron, activation, input_shape=()):
    model = Sequential()
    model.add(Dense(neuron, input_shape=input_shape, activation=activation))
    model.add(Dense(2, activation='softmax'))
    
    model.compile(optimizer="adam", 
          loss='binary_crossentropy',
          metrics=['accuracy'])
    
    return model

In [21]:
# train & test
import itertools

loo = KFold(6)

epochs = [30]
batch_size = [8]
neurons = [32, 64]
activation = ['tanh', 'relu', 'relu']

hyperparams = list(itertools.product(epochs, batch_size, neurons, activation))
hyperparams

[(30, 8, 32, 'tanh'),
 (30, 8, 32, 'relu'),
 (30, 8, 32, 'relu'),
 (30, 8, 64, 'tanh'),
 (30, 8, 64, 'relu'),
 (30, 8, 64, 'relu')]

In [22]:
early_stop_val_acc = EarlyStopping(monitor = "val_loss", min_delta=0.005, patience=5, verbose=True)

for epoch, batch, neuron, activation in hyperparams:

    res = []
    for (i, (train_index, test_index)) in enumerate(loo.split(data)):
        
        model = list_classifier(neuron, activation, input_shape=(len(CHOSEN_FEATURES),))
        
        data_train,   data_test =  data[train_index],  data[test_index]
        label_train, label_test = label[train_index], label[test_index]
        
        
        t = model.fit(data_train, label_train, epochs=epoch, batch_size=batch, shuffle=True, verbose=False,
                 validation_split=0.3, callbacks=[early_stop_val_acc])
        
        r = model.evaluate(data_test, label_test, verbose=False)
        print(r)
        res.append(r)
        
    loss = statistics.mean(list(map(lambda x: x[0], res)))
    acc  = statistics.mean(list(map(lambda x: x[1], res)))
    acc_std = statistics.stdev(list(map(lambda x: x[1], res)))
    
    print("({}, {}, {}, {})  - loss: {}, acc: {} (std: {})".format(epoch, batch, neuron, activation,
                                                                   loss, acc, acc_std))
    print()

Epoch 00011: early stopping
[0.09629759293150257, 0.9972972972972973]
Epoch 00010: early stopping
[0.11829464765819343, 0.9891891891891892]
Epoch 00011: early stopping
[0.10874255283458813, 0.9972972972972973]
Epoch 00013: early stopping
[0.5563361320946668, 0.8378378365490887]
Epoch 00012: early stopping
[0.6654756262495711, 0.8243243256130734]
Epoch 00012: early stopping
[0.5415318782265122, 0.8486486476820868]
(30, 8, 32, tanh)  - loss: 0.3477797383325057, acc: 0.9157657656046722 (std: 0.08674651237257427)

Epoch 00009: early stopping
[0.16091313366148924, 0.9945945945945946]
Epoch 00007: early stopping
[0.14000922344826364, 0.9891891879004401]
Epoch 00011: early stopping
[0.22237959061120008, 0.9918918918918919]
Epoch 00016: early stopping
[0.6651997522727863, 0.835135133846386]
Epoch 00009: early stopping
[0.6553851945980175, 0.818918920207668]
Epoch 00008: early stopping
[0.6560526139027364, 0.8243243236799498]
(30, 8, 32, relu)  - loss: 0.4166565847490822, acc: 0.909009008686821

In [23]:
## set the best hyperparameters from the results above
best_epochs = 30    
best_batch_size = 8
best_neurons = 32
best_activation = 'tanh'

In [24]:
## retrain the model on the whole dataset and save it
model = list_classifier(best_neurons, best_activation, input_shape=(len(CHOSEN_FEATURES),))

model.fit(data, label, epochs=best_epochs, 
                       batch_size=best_epochs, 
                       shuffle=True, verbose=True)

model.save("models/list_classifier.h5")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
