In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import keras
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import pickle
from keras.callbacks import EarlyStopping
from sklearn.utils import shuffle


## Data processing

In [2]:
file = open('data_pickles/train_test_val_Human_posTags', 'rb')
X_human_train_posTags, X_human_test_posTags, X_human_val_posTags = pickle.load(file)
file.close()

Y_human_train = np.zeros(len(X_human_train_posTags))
Y_human_test = np.zeros(len(X_human_test_posTags))
Y_human_val = np.zeros(len(X_human_val_posTags))


file = open('data_pickles/train_test_val_GPT_posTags', 'rb')
X_gpt_train_posTags, X_gpt_test_posTags, X_gpt_val_posTags = pickle.load(file)
file.close()

Y_gpt_train = np.ones(len(X_gpt_train_posTags))
Y_gpt_test = np.ones(len(X_gpt_test_posTags))
Y_gpt_val = np.ones(len(X_gpt_val_posTags))

In [3]:
len(X_human_train_posTags), len(Y_human_train), len(X_human_test_posTags), len(Y_human_test), len(X_human_val_posTags), len(Y_human_val)

(435, 435, 95, 95, 93, 93)

In [5]:
X_train = np.hstack((X_human_train_posTags, X_gpt_train_posTags))
Y_train = np.hstack((Y_human_train, Y_gpt_train))

X_test = np.hstack((X_human_test_posTags, X_gpt_test_posTags))
Y_test = np.hstack((Y_human_test, Y_gpt_test))

X_val = np.hstack((X_human_val_posTags, X_gpt_val_posTags))
Y_val = np.hstack((Y_human_val, Y_gpt_val))

full_dataset = np.hstack((X_train, X_test, X_val))

In [6]:
X_train = [' '.join(data) for data in X_train]
X_test = [' '.join(data) for data in X_test]
X_val = [' '.join(data) for data in X_val]

full_dataset = [' '.join(data) for data in full_dataset]


In [7]:
vocab_size = 55
oov_tok = ''
embedding_dim = 150
max_length = 200

padding_type='post'
trunc_type='post'

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)

tokenizer.fit_on_texts(full_dataset)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

val_sequences = tokenizer.texts_to_sequences(X_val)
val_padded = pad_sequences(val_sequences, padding='post', maxlen=max_length)

In [8]:
train_padded, Y_train = shuffle(train_padded, Y_train)
test_padded, Y_test = shuffle(test_padded, Y_test)
val_padded, Y_val = shuffle(val_padded, Y_val)

In [9]:
lstm_dataset_test= [test_padded, Y_test]

filename = 'lstm_dataset_test'
file = open(filename, 'wb')
pickle.dump(lstm_dataset_test, file)
file.close()

In [9]:
train_padded.shape, val_padded.shape

((870, 200), (186, 200))

## LSTM Model

In [10]:
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])


model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 150)          8250      
                                                                 
 bidirectional (Bidirectiona  (None, 128)              110080    
 l)                                                              
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 122,491
Trainable params: 122,491
Non-trainable params: 0
_________________________________________________________________


In [11]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

callback = EarlyStopping(monitor='accuracy', patience=5)

In [None]:
num_epochs = 27
history = model.fit(train_padded, Y_train, 
                    epochs=num_epochs, 
                    verbose=1, 
                    shuffle=True,
                    validation_data=(val_padded, Y_val),
                    callbacks=[callback])

In [14]:
model.save('modelo_9437_9140_9157.h5')

In [13]:
score = model.evaluate(test_padded, Y_test, verbose=0)
print('Evaluation loss:', score[0])
print('Evaluation accuracy:', score[1])

Evaluation loss: 0.3139588236808777
Evaluation accuracy: 0.9157894849777222
