In [32]:
import itertools
import statistics

import numpy as np
import pandas

from keras import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import Dense
from keras.optimizers import SGD

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [33]:
# Read the tab-separated feature table and preview its first rows.
dataset = pandas.read_csv(
    "data/list.csv",
    sep="\t",
)
dataset.head()

Unnamed: 0,relevant,avg_tag_in_li,depth,number_bold,number_br,number_div,number_img,number_links,number_p,number_relevants,number_row,relevants_ratio
0,1,0.0,2,2,26,0,0,0,0,28,1,0.12
1,1,0.0,1,0,0,0,0,0,0,20,5,0.51
2,1,1.0,2,0,0,0,0,0,0,29,6,0.52
3,1,1.38,5,0,0,4,0,1,0,35,8,0.48
4,1,2.5,5,0,0,4,0,1,0,19,2,0.59


In [34]:
# Dataset to numpy arrays

# choose the target feature and the features to train on
TARGET_FEATURE = 'relevant'
CHOSEN_FEATURES = ['avg_tag_in_li', 'number_img', 'number_relevants', 'number_links', 'number_bold', 'relevants_ratio']
#CHOSEN_FEATURES = ["relevants_ratio"]

# XXX as convention we sort the features based on their names.
# The target feature is filtered out *before* printing so the printed list is
# exactly the list of columns that ends up in `data`.
CHOSEN_FEATURES = sorted(feature for feature in CHOSEN_FEATURES if feature != TARGET_FEATURE)
print(CHOSEN_FEATURES)

# Select the columns by name and convert them in one step. Unlike filling a
# pre-allocated array row by row with the iterrows() label as position, this
# does not depend on the frame having a default 0..n-1 index and never leaves
# uninitialised rows behind.
data  = np.ascontiguousarray(dataset[CHOSEN_FEATURES], dtype=np.float32)    # shape: (rows, features)
label = np.ascontiguousarray(dataset[[TARGET_FEATURE]], dtype=np.float32)   # shape: (rows, 1)

print(data[0])

['avg_tag_in_li', 'number_bold', 'number_img', 'number_links', 'number_relevants', 'relevants_ratio']
[ 0.    2.    0.    0.   28.    0.12]


In [35]:
# model definition

def list_classifier(neuron, activation, input_shape=()):
    """Build and compile a network with one hidden layer and one sigmoid output.

    Args:
        neuron: number of units in the hidden Dense layer.
        activation: activation function used by the hidden Dense layer.
        input_shape: shape of a single input sample, forwarded to the hidden layer.

    Returns:
        The compiled Sequential model (adam optimizer, mean squared error loss,
        accuracy metric).
    """
    hidden_layer = Dense(neuron, input_shape=input_shape, activation=activation)
    output_layer = Dense(1, activation='sigmoid')

    net = Sequential()
    for layer in (hidden_layer, output_layer):
        net.add(layer)

    compile_options = {
        "optimizer": "adam",
        "loss": 'mean_squared_error',
        "metrics": ['accuracy'],
    }
    net.compile(**compile_options)
    return net

In [36]:
# train & test

# NOTE: despite the name `loo`, this is an 8-fold split, not leave-one-out.
# The name is kept because the cross-validation cell refers to it.
# NOTE(review): the folds are taken in file order (no shuffling is requested);
# confirm the rows of the dataset are not sorted in a way that biases the folds.
loo = KFold(n_splits=8)

# Candidate values for each hyperparameter; the grid is their cartesian product.
epochs = [30]
batch_size = [8]
neurons = [32]
activation = ['tanh']

# Each entry is an (epochs, batch_size, neurons, activation) tuple.
hyperparams = list(itertools.product(epochs, batch_size, neurons, activation))
hyperparams

[(30, 8, 32, 'tanh')]

In [37]:
# Stop training a fold once the validation loss has not improved by at least
# min_delta for `patience` epochs.
# NOTE: despite its name, this callback monitors "val_loss", not accuracy.
early_stop_val_acc = EarlyStopping(monitor="val_loss", min_delta=0.0005, patience=5, verbose=True)

# The activation is unpacked as `activation_name` so the module-level
# `activation` list of candidate values is not rebound to a single string.
for epoch, batch, neuron, activation_name in hyperparams:

    res = []  # the [loss, accuracy] pair returned by model.evaluate for each fold
    for train_index, test_index in loo.split(data):

        # a fresh, untrained model for every fold
        model = list_classifier(neuron, activation_name, input_shape=(len(CHOSEN_FEATURES),))

        data_train,   data_test =  data[train_index],  data[test_index]
        label_train, label_test = label[train_index], label[test_index]

        # part of the training fold is held out as validation data for early stopping
        model.fit(data_train, label_train, epochs=epoch, batch_size=batch, shuffle=True, verbose=False,
                  validation_split=0.3, callbacks=[early_stop_val_acc])

        r = model.evaluate(data_test, label_test, verbose=False)
        print(r)
        res.append(r)

    fold_losses     = [fold_result[0] for fold_result in res]
    fold_accuracies = [fold_result[1] for fold_result in res]

    loss = statistics.mean(fold_losses)
    acc  = statistics.mean(fold_accuracies)
    acc_std = statistics.stdev(fold_accuracies)

    print("({}, {}, {}, {})  - loss: {}, acc: {} (std: {})".format(epoch, batch, neuron, activation_name,
                                                                   loss, acc, acc_std))
    print()

Epoch 00016: early stopping
[0.008007324555312558, 0.9964028776978417]
Epoch 00018: early stopping
[0.012075067698794732, 0.9892086330935251]
Epoch 00013: early stopping
[0.010577169039266572, 0.9928057541092523]
Epoch 00013: early stopping
[0.006531273885864577, 1.0]
Epoch 00018: early stopping
[0.16353614278648734, 0.8194945861286205]
Epoch 00011: early stopping
[0.14159637625036686, 0.8519855606426832]
Epoch 00022: early stopping
[0.11454717214521867, 0.8808664259927798]
Epoch 00017: early stopping
[0.13945136388716714, 0.848375452339434]
(30, 8, 32, tanh)  - loss: 0.07454023628105981, acc: 0.9223924112505171 (std: 0.07898547607410975)



In [38]:
## set the best hyperparameters from the results above
# order: epochs, batch size, hidden neurons, hidden activation
best_epochs, best_batch_size, best_neurons, best_activation = 30, 8, 32, 'tanh'

In [39]:
## retrain the model on the whole dataset and save it
model = list_classifier(best_neurons, best_activation, input_shape=(len(CHOSEN_FEATURES),))

# batch_size must be best_batch_size (not best_epochs), so the saved model is
# trained with the same configuration that was selected by cross-validation.
model.fit(data, label, epochs=best_epochs,
                       batch_size=best_batch_size,
                       shuffle=True, verbose=True)

model.save("models/list_classifier.h5")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
