In [1]:
import numpy as np
import pandas as pd
from numpy import array
from numpy import cumsum
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.layers import Embedding

# word embedding
from gensim.models import Word2Vec
import multiprocessing

from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Reference: Machine Learning Mastery tutorial on developing a bidirectional LSTM for sequence classification with Keras — https://machinelearningmastery.com/develop-bidirectional-lstm-sequence-classification-python-keras/

In [2]:
# Load the pre-split train/test frames from the parent directory.
# Unpickling can execute arbitrary code, so only load files from a trusted source.
X_train, X_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('../X_train.pickle', '../X_test.pickle')
)

In [3]:
# Collapse each token list into one space-separated string, the form the Keras
# Tokenizer is fitted on in the following cells.
# Rows that are already strings are left untouched so that re-running this cell is a
# no-op (joining a str would otherwise insert a space between every character).
X_train['tokenized_text'] = X_train['tokenized_text'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)
X_test['tokenized_text'] = X_test['tokenized_text'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

In [4]:
# Vocabulary cap handed to Tokenizer(num_words=...) in the next cell.
# NOTE(review): 100 looks very small next to the 85,973-word Word2Vec vocabulary
# loaded further down — confirm this value is intentional.
max_features = 100

In [5]:
# Learn the word -> integer index on the training texts only, then encode both
# splits with that same index.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train['tokenized_text'].tolist())
list_tokenized_train = tokenizer.texts_to_sequences(texts=X_train['tokenized_text'])
list_tokenized_test = tokenizer.texts_to_sequences(texts=X_test['tokenized_text'])

In [6]:
# Bring every encoded text to a fixed length of 300, padding at the end ('post').
pad_train, pad_test = (
    pad_sequences(sequences, maxlen=300, padding='post')
    for sequences in (list_tokenized_train, list_tokenized_test)
)

In [7]:
# One extra row on top of the tokenizer's word index (index 0 is not assigned to any word).
vocab_size = 1 + len(tokenizer.word_index)

In [8]:
# load model
# Pretrained Word2Vec model stored one directory up; the printed summary below
# reports its vocabulary size and vector size.
cbow = Word2Vec.load('../CBOW300.bin')
print(cbow)

Word2Vec(vocab=85973, size=300, alpha=0.025)


In [9]:
# Keep a handle on the model's word vectors (`wv`); this is the object that is
# indexed by word when the embedding weight matrix is built below.
word_vec = cbow.wv

In [10]:
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    """Build an embedding weight matrix aligned with a word -> index mapping.

    Parameters
    ----------
    embedding : mapping-like
        Object supporting ``embedding[word]`` lookups that raises ``KeyError``
        for unknown words. If it exposes a ``vector_size`` attribute, that
        value is used as the matrix width; otherwise 300 is assumed.
    vocab : dict
        Maps each word to its integer row index (indices are expected to lie
        in ``1..len(vocab)``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab) + 1, dim)``. Row ``i`` holds the vector of
        the word mapped to ``i``; row 0 and the rows of words missing from
        ``embedding`` stay all-zero.

    Raises
    ------
    ValueError
        If a looked-up vector does not match the matrix width.
    """
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # take the width from the embedding when it advertises one
    dim = getattr(embedding, 'vector_size', 300)
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, dim))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        try:
            weight_matrix[i] = embedding[word]
        except KeyError:
            # word has no pretrained vector: leave its row at zero.
            # Any other error (e.g. a shape mismatch) is a real problem and
            # is deliberately allowed to propagate.
            continue
    return weight_matrix

# Row i of the result is the pretrained vector of the word the tokenizer maps to i.
embedding_vectors = get_weight_matrix(
    embedding=word_vec,
    vocab=tokenizer.word_index,
)

In [11]:
# Frozen embedding layer initialised from the pretrained weight matrix:
# 300-dimensional vectors over input sequences of length 300.
embedding = Embedding(
    input_dim=vocab_size,
    output_dim=300,
    weights=[embedding_vectors],
    input_length=300,
    trainable=False,
)

In [None]:
# define problem properties
n_timesteps = 300  # length of the padded input sequences
# define LSTM
# The training target is a single value per row (X_train['target']), so the network
# must emit one prediction per sequence, not one per timestep: the bidirectional LSTM
# returns only its final state and a single sigmoid unit scores the whole text.
# The earlier input_shape=(n_timesteps, 1) argument was dropped because the layer's
# real input is the embedding output, not a 1-feature sequence.
# NOTE(review): assumes 'target' is a per-document binary label — confirm.
model = Sequential()
model.add(embedding)
model.add(Bidirectional(LSTM(20)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# train LSTM
# One pass over the padded training sequences, one sample per batch,
# holding out 20% of the rows for validation.
model.fit(
    x=pad_train,
    y=X_train['target'],
    batch_size=1,
    epochs=1,
    validation_split=0.2,
    verbose=2,
)

In [None]:
# evaluate LSTM
# Predict on the padded integer sequences (the model cannot consume the raw
# DataFrame) and threshold the sigmoid output at 0.5, which is what
# Sequential.predict_classes did for a single-unit output before it was removed.
yhat = (model.predict(pad_test, verbose=0) > 0.5).astype('int32')
# NOTE(review): assumes X_test carries the same 'target' column as X_train — confirm.
expected_first = X_test['target'].iloc[0]
# Show every prediction made for the first test sample next to that sample's label.
for predicted in yhat[0]:
    print('Expected:', expected_first, 'Predicted', predicted)