In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import pickle
from tensorflow.keras.optimizers import Adadelta
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, concatenate, Dropout, Input

In [2]:
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Flatten, Dropout, InputLayer, Embedding, Input, Concatenate
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import model_from_json
import numpy as np
import pandas as pd

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.math import exp

In [3]:
def shuffle_dataset(X, y):
    
    idx = np.random.permutation(len(X))

    X = X[idx]
    y = np.array(y)[idx]
    
    return X, y

In [4]:
from tensorflow.keras.callbacks import Callback

class TerminateOnBaseline(Callback):
    """Callback that terminates training when either acc or val_acc reaches a specified baseline
    """
    def __init__(self, monitor='accuracy', baseline=0.9):
        super(TerminateOnBaseline, self).__init__()
        self.monitor = monitor
        self.baseline = baseline

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        acc = logs.get(self.monitor)
        if acc is not None:
            if acc >= self.baseline:
                print('Epoch %d: Reached baseline, terminating training' % (epoch))
                self.model.stop_training = True

In [5]:
def scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * exp(-0.1)

In [29]:
parameters_mlp = { 
    'introduction':  [ 0.2, 200, 8],
    'materials':     [ 0.2, 200, 8],
    'conclusion':  [ 0.2, 200, 8],
    'concat':  [ 0.2, 200, 8]}

In [27]:
columns = list(range(0, 383))
columns = list(map(str, columns))

section='concat'

folder_to_save = 'models_v1'
path_to_save = "/scratch/cinthiasouza/mv-text-summarizer/notebook/{}".format(folder_to_save)

#X_embedd = pd.read_csv("dataset/embed_bert_{}_train.csv".format(section))

#y_embedd = X_embedd['label']
#X_embedd = X_embedd[columns]

In [31]:
with open('dataset/dataset_{}.pkl'.format('features'), 'rb') as fp:
    dataset = pickle.load(fp)

path_to_save = "/scratch/cinthiasouza/mv-text-summarizer/notebook/models_v6"
class_weight={0:1, 1:1}

section='concat'

X = dataset[section][0]
y = dataset[section][2]


model = Sequential()
model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
model.add(Dropout(.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(.3))
model.add(Dense(512, activation='relu'))
model.add(Dropout(.3))
model.add(Dense(512, activation='relu'))
model.add(Dropout(.3))
model.add(Dense(512, activation='relu'))
model.add(Dropout(.3))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(.5))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(.3))
model.add(Dense(512, activation='relu'))
model.add(Dropout(.3))
model.add(Dense(512, activation='relu'))
model.add(Dropout(.3))
model.add(Dense(512, activation='relu'))
model.add(Dropout(.3))
model.add(Dense(256, activation='relu'))
model.add(Dropout(.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(.2))
model.add(Dense(2, activation='sigmoid'))


model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(
                learning_rate=0.001), metrics=[keras.metrics.Precision()])

one_hot_label = to_categorical(y)

test_size, epochs, batch_size =parameters_mlp.get(section)

X_train, X_valid, y_train, y_valid = train_test_split(X, one_hot_label, stratify=one_hot_label, test_size=test_size)

my_callbacks = [EarlyStopping(patience=20)]

my_callbacks = [EarlyStopping(patience=10, restore_best_weights=True),
                keras.callbacks.LearningRateScheduler(scheduler),
                TerminateOnBaseline(monitor='val_precision', baseline=0.9)]
model.fit(
            np.array(X_train), np.array(y_train), validation_data=(X_valid,y_valid), epochs=5,
             batch_size=batch_size, class_weight=class_weight, callbacks=my_callbacks, verbose=True)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f8a0018c220>

In [None]:
path_to_save = "/scratch/cinthiasouza/mv-text-summarizer/notebook/models_v1"
model_json = model.to_json()
with open('{}/mlp_embed_{}.json'.format(path_to_save, section), "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights('{}/mlp_embed_{}.h5'.format(path_to_save, section))
print("Saved model to disk")

In [17]:
path_to_save = "/scratch/cinthiasouza/mv-text-summarizer/notebook/models_v1"
model_json = model.to_json()
with open('{}/mlp_{}.json'.format(path_to_save, section), "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights('{}/mlp_{}.h5'.format(path_to_save, section))
print("Saved model to disk")

Saved model to disk


In [20]:
json_file = open('{}/mlp_{}.json'.format(path_to_save, section), 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights('{}/mlp_{}.h5'.format(path_to_save, section))
print("Loaded model from disk")
loaded_model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(
                learning_rate=0.001), metrics=[keras.metrics.Precision()])

Loaded model from disk
