In [2]:
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
import keras.backend as K

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import pickle
import pandas
import numpy as np
import statistics

import sys
sys.path.append("../")
from carbonaraextractor import DefaultFeatures

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [11]:
dataset_name = 'table'
dataset = pandas.read_csv(f"../data/and/{dataset_name}.csv", sep="\t")
dataset.head()

Unnamed: 0,relevant,depth,number_bold,number_br,number_div,number_img,number_li,number_links,number_p,number_relevants,number_td,number_th,number_tr,relevants_ratio
0,0,5,0,0,0,0,0,1,0,3,4,0,2,0.12
1,0,7,9,30,0,0,0,0,3,42,5,0,1,0.19
2,0,5,0,0,0,0,0,0,1,24,1,0,1,0.32
3,0,5,0,0,1,1,0,0,1,9,1,0,1,0.27
4,0,6,3,0,6,0,0,0,0,21,14,0,2,0.23


In [12]:
# for table classifier:
CHOSEN_FEATURES = DefaultFeatures.table_selected
# for list classifier:
#CHOSEN_FEATURES = DefaultFeatures.list_selected

In [13]:
TARGET_FEATURE = 'relevant'

if (TARGET_FEATURE in CHOSEN_FEATURES): 
    CHOSEN_FEATURES.remove(TARGET_FEATURE)

In [14]:
data  = dataset.loc[:, CHOSEN_FEATURES].values.astype(np.float32)
label = np.ndarray((len(dataset), 2), np.float32)

for i, row in dataset.iterrows():
    label[i] = np.zeros(2)
    label[i][int(row[TARGET_FEATURE])] = 1.
    
shuffle_idx = np.random.permutation(len(data))
data, label = data[shuffle_idx], label[shuffle_idx]

In [15]:
def simple_classifier(neuron, activation, input_shape=()):
    model = Sequential()
    model.add(Dense(neuron, input_shape = input_shape, activation = activation))
    model.add(Dropout(0.3))
    model.add(Dense(2, activation = 'softmax'))
    model.compile(optimizer = "adam", loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [16]:
# train & test
import itertools

kfold = KFold(4)

val_percent = 0.3
epochs = [50]
batch_size = [32, 64]
neurons = [8, 16, 32]
activation = ['tanh']

hyperparams = list(itertools.product(epochs, batch_size, neurons, activation))
hyperparams

[(50, 32, 8, 'tanh'),
 (50, 32, 16, 'tanh'),
 (50, 32, 32, 'tanh'),
 (50, 64, 8, 'tanh'),
 (50, 64, 16, 'tanh'),
 (50, 64, 32, 'tanh')]

In [17]:
early_stop_val_acc = EarlyStopping(monitor = "val_loss", verbose = True, mode = 'auto')

for epoch, batch, neuron, activation in hyperparams:
    print(f"Evaluating: epoch:{epoch}, batch:{batch}, neuron:{neuron}, activation:{activation}")

    results = []
    for (i, (train_index, test_index)) in enumerate(kfold.split(data)):  
        print (f"{i}) ", end='')
        
        data_train,   data_test =  data[train_index],  data[test_index]
        label_train, label_test = label[train_index], label[test_index]
        
        scaler = preprocessing.StandardScaler().fit(data_train)
        data_train = scaler.transform(data_train)
        data_test  = scaler.transform(data_test)
        
        model = simple_classifier(neuron, activation, input_shape=(len(CHOSEN_FEATURES),))
        model.fit(
            data_train, 
            label_train, 
            validation_split = val_percent,
            epochs = epoch, 
            batch_size = batch, 
            shuffle = True, 
            verbose = False,
            callbacks=[early_stop_val_acc]
        )
        
        result = model.evaluate(data_test, label_test, verbose = False)
        results.append(result)
        
    loss = np.mean(list(map(lambda x: x[0], results)))
    acc  = np.mean(list(map(lambda x: x[1], results)))
    acc_std = np.std(list(map(lambda x: x[1], results)))
    
    print()
    print(f"Trained on {len(data_train)} samples")
    print(f"Tested on {len(data_test)} samples, with {len(data_test) * val_percent:.3f} of validation")
    print(f"Result: loss: {loss:.3f}, acc: {acc:.3f} (std: {acc_std:.3f})")
    print()
    print()

Evaluating: epoch:50, batch:32, neuron:8, activation:tanh
0) 1) 2) 3) Epoch 00050: early stopping

Trained on 1035 samples
Tested on 345 samples, with 103.500 of validation
Result: loss: 0.205, acc: 0.925 (std: 0.010)


Evaluating: epoch:50, batch:32, neuron:16, activation:tanh
0) 1) Epoch 00046: early stopping
2) 3) Epoch 00038: early stopping

Trained on 1035 samples
Tested on 345 samples, with 103.500 of validation
Result: loss: 0.194, acc: 0.930 (std: 0.011)


Evaluating: epoch:50, batch:32, neuron:32, activation:tanh
0) Epoch 00046: early stopping
1) Epoch 00033: early stopping
2) Epoch 00038: early stopping
3) Epoch 00028: early stopping

Trained on 1035 samples
Tested on 345 samples, with 103.500 of validation
Result: loss: 0.193, acc: 0.932 (std: 0.016)


Evaluating: epoch:50, batch:64, neuron:8, activation:tanh
0) 1) 2) 3) 
Trained on 1035 samples
Tested on 345 samples, with 103.500 of validation
Result: loss: 0.245, acc: 0.914 (std: 0.016)


Evaluating: epoch:50, batch:64, ne

In [None]:
## set the best hyperparameters from the results above
## retrain with the whole dataset

best_epochs = int(np.mean([46, 33, 28, 38]))
best_batch_size = 32
best_neurons = 32
best_activation = 'tanh'

scaler = preprocessing.StandardScaler()
data = scaler.fit_transform(data)

model = simple_classifier(best_neurons, best_activation, input_shape=(len(CHOSEN_FEATURES),))
model.fit(data, label, epochs = best_epochs, batch_size = best_batch_size, shuffle = True, verbose = True)

In [None]:
model.save(f"../models/{dataset_name}_classifier.h5")

In [None]:
pickle.dump(scaler, open(f"../models/{dataset_name}_scaler.pkl", "wb"))