In [1]:
import sys
sys.path.insert(1, '/scratch/cinthiasouza/mv-text-summarizer')

import itertools
import re
import pickle
import json
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import BatchNormalization
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Flatten, concatenate, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import model_from_json


from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler

In [3]:
%load_ext autoreload
%autoreload 2

from src import preprocess
from src import extract_features
from src import tokenizer
from src import create_features_df
from src import transform_data
from src import loader
from src import utils
#from src import ensemble_tree_models
from src import tunning_hyperparametrs as th
from src import mlp_classifier
#from src import summarization
#from src import normalization
from src import ensemble_tree_models as classifiers
from src import utils_classification as utils_clf
from src import evaluate_classifiers as ev
from src import prepare_data
from src import display_results as dr
import joblib
from joblib import Parallel, delayed
#from tensorflow.keras.utils import to_categorical
#from src import pipeline_extract_features as pef

In [4]:
# Paper section whose sentences are used throughout the notebook.
section = 'conclusion'

# Output location settings (absolute path on the scratch volume).
folder_to_save = 'models_v1'
path_to_save = "/scratch/cinthiasouza/mv-text-summarizer/notebook/{}".format(folder_to_save)

# Names of the embedding columns in the CSV: "0", "1", ..., "382".
columns = [str(idx) for idx in range(383)]

# View 2: feature matrix and labels, stored per section inside one pickle.
# NOTE(review): pickle.load executes arbitrary code; only load trusted files.
with open('dataset/dataset_{}.pkl'.format('features'), 'rb') as fp:
    dataset = pickle.load(fp)

section_data = dataset[section]
X_features = section_data[0]
y_features = section_data[2]

# View 1: embeddings CSV; the 'label' column is split off from the
# embedding columns.
embedd_frame = pd.read_csv("dataset/embed_bert_{}_train.csv".format(section))
y_embedd = embedd_frame['label']
X_embedd = embedd_frame[columns]

In [5]:
# Width of the shared bottleneck layer; also used in the names of the
# output folders the models are saved to.
bottleneck_dim = 64

In [7]:
# Two-view autoencoder: view 1 (X_embedd) and view 2 (X_features) are encoded,
# merged into a shared bottleneck of size `bottleneck_dim`, and each view is
# then reconstructed by its own decoder. Layers are created in a fixed order
# so that Keras' auto-generated layer names stay stable.
embedd_dim = X_embedd.shape[1]
features_dim = X_features.shape[1]

# encoder vision 1
# float32, not int32: the views are real-valued, and declaring an integer
# input dtype makes Keras cast the fed data to integers, dropping the
# fractional part before it reaches the first Dense layer.
sequence_input = Input(shape=(embedd_dim,), dtype='float32')

e_1 = Dense(embedd_dim * 2)(sequence_input)
e_1 = BatchNormalization()(e_1)
e_1 = LeakyReLU()(e_1)

e_2 = Dense(embedd_dim)(e_1)
e_2 = BatchNormalization()(e_2)
e_2 = LeakyReLU()(e_2)


# encoder vision 2
sequence_input2 = Input(shape=(features_dim,), dtype='float32')

e_3 = Dense(features_dim * 2)(sequence_input2)
e_3 = BatchNormalization()(e_3)
e_3 = LeakyReLU()(e_3)

# NOTE(review): e_4 is a second branch built directly on sequence_input2
# (parallel to e_3, not stacked on it) -- confirm this is intended.
e_4 = Dense(features_dim * 2)(sequence_input2)
e_4 = BatchNormalization()(e_4)
e_4 = LeakyReLU()(e_4)


# Concatenate visions: each stage sees view 1 plus everything merged so far.
v_1 = e_2

v_2_concat = concatenate([v_1, e_3])
v_2 = Dense(256, activation='relu')(v_2_concat)

v_3_concat = concatenate([v_1, v_2, e_4])
v_3 = Dense(256, activation='relu')(v_3_concat)

out_concat = concatenate([v_1, v_2, v_3])

# Shared representation; `bottleneck` is the tensor later exposed as the
# encoder output.
shared_input = Dense(bottleneck_dim)(out_concat)
bottleneck = Dense(bottleneck_dim)(shared_input)

# decoder vision 1
d_1 = Dense(embedd_dim)(bottleneck)
d_1 = BatchNormalization()(d_1)
d_1 = LeakyReLU()(d_1)
dropout1 = Dropout(.2)(d_1)

d_2 = Dense(embedd_dim)(dropout1)
d_2 = BatchNormalization()(d_2)
d_2 = LeakyReLU()(d_2)
dropout2 = Dropout(.2)(d_2)

d_v1 = Dense(embedd_dim)(dropout2)
d_v1 = BatchNormalization()(d_v1)
d_v1 = LeakyReLU()(d_v1)

# decoder vision 2
d_5 = Dense(features_dim)(bottleneck)
d_5 = BatchNormalization()(d_5)
d_5 = LeakyReLU()(d_5)
dropout3 = Dropout(.2)(d_5)

# NOTE(review): this hidden layer uses the view-1 width (embedd_dim) inside
# the view-2 decoder; kept as in the original architecture -- confirm it is
# not a copy/paste slip for features_dim.
d_4 = Dense(embedd_dim)(dropout3)
d_4 = BatchNormalization()(d_4)
d_4 = LeakyReLU()(d_4)
dropout4 = Dropout(.2)(d_4)

d_v2 = Dense(features_dim)(dropout4)
d_v2 = BatchNormalization()(d_v2)
d_v2 = LeakyReLU()(d_v2)

# Linear heads: one reconstruction per view.
output_v1 = Dense(embedd_dim, activation='linear')(d_v1)
output_v2 = Dense(features_dim, activation='linear')(d_v2)

model = Model(inputs=[sequence_input, sequence_input2], outputs=[output_v1, output_v2])

# The same MSE loss is applied to both reconstruction outputs.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),
              loss=keras.metrics.mean_squared_error)

In [8]:
# Split BOTH views with one call so they share the same row permutation.
# Two separate unseeded train_test_split calls shuffle each view
# independently, and model.fit pairs the inputs by position, so the views
# would no longer describe the same samples.
# Assumes row i of X_embedd and row i of X_features refer to the same
# sentence -- TODO confirm against the dataset builders.
if len(X_embedd) != len(X_features):
    raise ValueError(
        "X_embedd and X_features must have the same number of rows "
        "({} != {})".format(len(X_embedd), len(X_features)))

one_hot_label = to_categorical(y_features)

# train_test_split returns a (train, valid) pair per input array, in order.
# random_state makes the split reproducible across kernel restarts.
(X_train_embedd, X_valid_embedd,
 X_train_features, X_valid_features,
 y_train_embedd, y_valid_embedd,
 y_train_features, y_valid_features) = train_test_split(
    X_embedd, X_features, y_embedd, one_hot_label,
    stratify=y_embedd, shuffle=True, test_size=0.2, random_state=42)

# Autoencoder training: each view is its own reconstruction target.
history = model.fit(
    x=[X_train_embedd, X_train_features],
    y=[X_train_embedd, X_train_features],
    epochs=5,
    validation_data=([X_valid_embedd, X_valid_features],
                     [X_valid_embedd, X_valid_features]),
    shuffle=True, batch_size=4)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
# Encoder = the trained graph cut at the shared bottleneck; it takes both
# views as input and returns the bottleneck activations.
encoder_path = 'autoencoder_{}dim/encoder_{}.h5'.format(bottleneck_dim, section)
encoder = Model(
    inputs=[sequence_input, sequence_input2],
    outputs=bottleneck,
)
encoder.save(encoder_path)

In [10]:
# Persist the full autoencoder as two files: architecture (JSON) and
# weights (HDF5).
json_path = 'autoencoder_{}dim/autoencoder_{}.json'.format(bottleneck_dim, section)
weights_path = 'autoencoder_{}dim/autoencoder_{}.h5'.format(bottleneck_dim, section)

model_json = model.to_json()
with open(json_path, "w") as json_file:
    json_file.write(model_json)

model.save_weights(weights_path)
print("Saved model to disk")

Saved model to disk


In [12]:
# Store the training/validation loss curves next to the other artefacts of
# this run. The folder name is derived from bottleneck_dim (not hard-coded
# to 16) so the history lands in the same directory as the encoder and
# autoencoder files saved for this bottleneck size.
with open('autoencoder_{}dim/history_{}.pkl'.format(bottleneck_dim, section), 'wb') as fp:
    pickle.dump(history.history, fp)

In [6]:
# Single-output variant: both views are encoded and merged into the shared
# bottleneck, but only view 2 (X_features) is reconstructed. Layers are
# created in a fixed order so that Keras' auto-generated layer names stay
# stable.
embedd_dim = X_embedd.shape[1]
features_dim = X_features.shape[1]

# encoder vision 1
# float32, not int32: the views are real-valued, and declaring an integer
# input dtype makes Keras cast the fed data to integers, dropping the
# fractional part before it reaches the first Dense layer.
sequence_input = Input(shape=(embedd_dim,), dtype='float32')

e_1 = Dense(embedd_dim * 2)(sequence_input)
e_1 = BatchNormalization()(e_1)
e_1 = LeakyReLU()(e_1)

e_2 = Dense(embedd_dim)(e_1)
e_2 = BatchNormalization()(e_2)
e_2 = LeakyReLU()(e_2)


# encoder vision 2: two parallel branches, each projecting the features
# up to 2 * features_dim and then to the embedding width.
sequence_input2 = Input(shape=(features_dim,), dtype='float32')

e_3 = Dense(features_dim * 2)(sequence_input2)
e_3 = BatchNormalization()(e_3)
e_3 = LeakyReLU()(e_3)

e_4 = Dense(embedd_dim)(e_3)
e_4 = BatchNormalization()(e_4)
e_4 = LeakyReLU()(e_4)

e_5 = Dense(features_dim * 2)(sequence_input2)
e_5 = BatchNormalization()(e_5)
e_5 = LeakyReLU()(e_5)

e_6 = Dense(embedd_dim)(e_5)
e_6 = BatchNormalization()(e_6)
e_6 = LeakyReLU()(e_6)


# Concatenate visions: each stage sees view 1 plus everything merged so far.
v_1 = e_2

v_2_concat = concatenate([v_1, e_4])
v_2 = Dense(256, activation='relu')(v_2_concat)

v_3_concat = concatenate([v_1, v_2, e_6])
v_3 = Dense(256, activation='relu')(v_3_concat)

out_concat = concatenate([v_1, v_2, v_3])

# Shared representation; `bottleneck` is the tensor later exposed as the
# encoder output.
shared_input = Dense(bottleneck_dim)(out_concat)
bottleneck = Dense(bottleneck_dim)(shared_input)

# decoder vision 2 (this variant has no decoder for view 1)
d_5 = Dense(features_dim)(bottleneck)
d_5 = BatchNormalization()(d_5)
d_5 = LeakyReLU()(d_5)
dropout2 = Dropout(.2)(d_5)

d_v2 = Dense(features_dim)(dropout2)
d_v2 = BatchNormalization()(d_v2)
d_v2 = LeakyReLU()(d_v2)

# Linear head reconstructing the feature view.
output_v2 = Dense(features_dim, activation='linear')(d_v2)

model = Model(inputs=[sequence_input, sequence_input2], outputs=output_v2)

model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001),
              loss=keras.metrics.mean_squared_error)

In [8]:
# Split BOTH views with one call so they share the same row permutation.
# Two separate unseeded train_test_split calls shuffle each view
# independently, and model.fit pairs the inputs and the target by position,
# so the embedding rows would no longer match the feature rows.
# Assumes row i of X_embedd and row i of X_features refer to the same
# sentence -- TODO confirm against the dataset builders.
if len(X_embedd) != len(X_features):
    raise ValueError(
        "X_embedd and X_features must have the same number of rows "
        "({} != {})".format(len(X_embedd), len(X_features)))

one_hot_label = to_categorical(y_features)

# train_test_split returns a (train, valid) pair per input array, in order.
# random_state makes the split reproducible across kernel restarts.
(X_train_embedd, X_valid_embedd,
 X_train_features, X_valid_features,
 y_train_embedd, y_valid_embedd,
 y_train_features, y_valid_features) = train_test_split(
    X_embedd, X_features, y_embedd, one_hot_label,
    stratify=y_embedd, shuffle=True, test_size=0.2, random_state=42)

# Both views go in; only the feature view is the reconstruction target.
history = model.fit(
    x=[X_train_embedd, X_train_features],
    y=X_train_features,
    epochs=5,
    validation_data=([X_valid_embedd, X_valid_features], X_valid_features),
    shuffle=True, batch_size=4)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
# Encoder of the single-output model: the trained graph cut at the shared
# bottleneck, taking both views as input.
encoder_path = 'autoencoder_oneoutput{}dim/encoder_{}.h5'.format(bottleneck_dim, section)
encoder = Model(
    inputs=[sequence_input, sequence_input2],
    outputs=bottleneck,
)
encoder.save(encoder_path)

In [12]:
# Persist the single-output autoencoder as two files: architecture (JSON)
# and weights (HDF5).
json_path = 'autoencoder_oneoutput{}dim/autoencoder_{}.json'.format(bottleneck_dim, section)
weights_path = 'autoencoder_oneoutput{}dim/autoencoder_{}.h5'.format(bottleneck_dim, section)

model_json = model.to_json()
with open(json_path, "w") as json_file:
    json_file.write(model_json)

model.save_weights(weights_path)
print("Saved model to disk")

Saved model to disk
