In [1]:
from exp8_feature_extraction import get_balanced_dataset
from scripts.cross_validate import run_cross_validate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.activations import sigmoid

import numpy as np
import gensim

import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# Load the review objects used for the whole experiment. Each item is read
# below through its `.review_content` and `.label` attributes.
# NOTE(review): presumably class-balanced, going by the helper's name — confirm
# in exp8_feature_extraction.
all_reviews = get_balanced_dataset()

In [3]:
# Split every review into its text (model input) and a 0/1 target derived
# from the truthiness of its label.
reviews_contents = [review.review_content for review in all_reviews]
labels = [int(bool(review.label)) for review in all_reviews]

In [4]:
# Build the word -> integer-index vocabulary from the full review corpus.
# The fitted tokenizer is reused below for both `word_index` and
# `texts_to_sequences`, so it must be fitted before those cells run.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews_contents)

In [None]:
# Mapping of word -> integer index learned by the tokenizer.
corpus_words = tokenizer.word_index
# +1 so the largest word index is still a valid row when this size is used for
# the embedding matrix / Embedding layer (Keras Tokenizer indices start at 1;
# index 0 is reserved and is what pad_sequences pads with by default).
corpus_vocab_size = len(corpus_words)+1

In [None]:
# Encode each review as a sequence of word indices and pad them all to the
# length of the longest review, yielding a 2-D (num_reviews, max_len) array.
predictors_sequences = pad_sequences(tokenizer.texts_to_sequences(reviews_contents))
# Every row already has the same (padded) length, so read it from the array
# shape instead of scanning all rows in Python.
max_len = predictors_sequences.shape[1]

In [None]:
# Load the pretrained word2vec vectors from the binary word2vec-format file.
# This is the expensive I/O step of the notebook; keep it in its own cell so
# later cells can be re-run without reloading.
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(
    "../../data/GoogleNews-vectors-negative300.bin",
    binary=True,
)

In [None]:
# Build the (corpus_vocab_size, embedding_length) lookup table for the
# Embedding layer: row i holds the pretrained vector of the word whose
# tokenizer index is i. Rows for words missing from the pretrained vocabulary
# (and the unused row 0) stay all-zero.
embedding_length = word_vectors.vector_size
embedding_matrix = np.zeros((corpus_vocab_size, embedding_length))
for word, index in corpus_words.items():
  # Membership is tested on the KeyedVectors object itself rather than on
  # `.vocab`: `.vocab` was removed in gensim 4, while `in` works on both
  # gensim 3.x and 4.x.
  if word in word_vectors:
    # Direct assignment; numpy casts the vector to the matrix dtype, so no
    # intermediate copy is needed.
    embedding_matrix[index] = word_vectors[word]

In [None]:
def get_lstm_wv_model():
  """Build and compile a new LSTM binary classifier on frozen embeddings.

  The network is: an Embedding layer initialised from `embedding_matrix` and
  kept non-trainable, a 10-unit LSTM, and a single sigmoid output unit.
  It reads the notebook globals `corpus_vocab_size`, `embedding_length`,
  `embedding_matrix` and `max_len`, so those cells must have run first.

  Returns:
    A freshly constructed model compiled with the Adam optimizer,
    binary cross-entropy loss and an accuracy metric. Each call creates a
    new model, which lets it be passed around as a model factory.
  """
  # Pretrained vectors are frozen (trainable=False): only the LSTM and the
  # output layer are fitted.
  frozen_embedding = Embedding(
      corpus_vocab_size,
      embedding_length,
      weights=[embedding_matrix],
      input_length=max_len,
      trainable=False,
  )
  recurrent = LSTM(10)
  classifier_head = Dense(1, activation=sigmoid)

  model = Sequential([frozen_embedding, recurrent, classifier_head])
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# Cross-validate the LSTM model. The factory (not a model instance) is handed
# over so the helper can obtain a fresh model when it needs one.
# NOTE(review): cv / epochs / batch_size / shuffle / verbose are interpreted by
# scripts.cross_validate.run_cross_validate — presumably forwarded to Keras
# `fit`; confirm there.
rnn_wv_scores = run_cross_validate(
    get_lstm_wv_model,
    predictors_sequences,
    labels,
    cv=5,
    verbose=1,
    epochs=12,
    batch_size=64,
    shuffle=True,
)
print(rnn_wv_scores)

Fitting with:  (128745, 997) labels (128745,)
Train on 90121 samples, validate on 38624 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Fitting with:  (128746, 997) labels (128746,)
Train on 90122 samples, validate on 38624 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12

In [14]:
# Force a full garbage-collection pass to release memory held by objects that
# are no longer referenced; the displayed value is gc.collect()'s return value
# (the number of unreachable objects it found).
# NOTE(review): this import sits mid-notebook rather than in the top import cell.
import gc
gc.collect()

211

In [11]:
# NOTE(review): looks like a leftover check that the kernel was still
# responsive — not part of the analysis; candidate for removal.
print("test")

test


In [12]:
# Display the cross-validation scores held in the kernel.
# NOTE(review): execution counts in this notebook are out of order and the
# training cell shows an unfinished run, so this value presumably comes from an
# earlier completed execution — re-run top to bottom to confirm.
rnn_wv_scores

{'accuracies': [0.6963464645357537,
  0.697424426001242,
  0.6930963773291262,
  0.693034238503599,
  0.6973218169575607]}