In [1]:
import os
import sys
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, History, CSVLogger
import operator
import joblib
import pandas as pd
import nltk as nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import random
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
import feather
# Shared Porter stemmer instance.
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
ps = PorterStemmer()
# import keras.backend as K
# English stop words from the NLTK corpus, stored as a set.
stop_words = set(stopwords.words('english'))

In [2]:
# Directory that holds the pretrained GloVe vector files.
GLOVE_DIR = '../../data/embeddings'
# Length of the integer sequences accepted by the model's Input layer.
MAX_SEQUENCE_LENGTH = 256
# Upper bound on the vocabulary size / number of embedding-matrix rows.
MAX_NUM_WORDS = 20000
# Width of each embedding vector; matches the 100-d GloVe file loaded below.
EMBEDDING_DIM = 100
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
VALIDATION_SPLIT = 0.2

# Load preprocessed data

In [3]:
# Review DataFrame read from a Feather file (balanced, judging by the file name).
# NOTE(review): not referenced again in the visible cells — confirm it is needed.
df_rev_balanced = feather.read_dataframe("../../assets/balanced_reviews.feather")

In [4]:
# Restore the saved tokenizer; its word_index drives the embedding matrix below.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
tokenizer = joblib.load('../../assets/tokenizer.pickle')

In [5]:
# Read the train/test arrays from the HDF5 store.
# The store is opened read-only: the default mode ('a') needs write access and
# creates an empty file when the path is missing, which would hide the real
# problem behind a KeyError on the first lookup.
with pd.HDFStore('../../assets/yelp_x_y_test_train.h5', mode='r') as h:
    X_train = h['X_train'].values
    X_test = h['X_test'].values
    y_train = h['y_train'].values
    y_test = h['y_test'].values
# (word, index) pairs ordered by ascending tokenizer index.
WORD_INDEX_SORTED = sorted(tokenizer.word_index.items(), key=operator.itemgetter(1))

# Load pretrained embeddings

In [6]:
print('Indexing word vectors.')

# Maps each GloVe token to its float32 coefficient vector.
embeddings_index = {}
# The GloVe text files are UTF-8, so the encoding is passed explicitly instead
# of relying on the platform default. The context manager closes the file even
# when parsing a line raises.
with open(os.path.join(GLOVE_DIR, './glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line has the form "<token> <v1> <v2> ...".
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline at end of file).
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [7]:
# Prepare the embedding matrix.
# Keras Tokenizer indices start at 1 (index 0 is reserved), so holding every
# word takes len(word_index) + 1 rows, capped at MAX_NUM_WORDS.
num_words = min(MAX_NUM_WORDS, len(tokenizer.word_index) + 1)
# Size the matrix with num_words (not MAX_NUM_WORDS) so its shape always matches
# the vocabulary size handed to the Embedding layer, even for small vocabularies.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        # Index falls outside the retained vocabulary.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    # Words missing from the GloVe index keep their all-zero row.

In [8]:
# Wrap the pretrained vectors in an Embedding layer. trainable=False freezes
# the weights so the embeddings stay fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# Model

In [9]:
# Pipeline: int32 token ids -> frozen embeddings -> LSTM -> sigmoid output.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# 64-unit LSTM; input and recurrent kernels both use Glorot-normal init.
recurrent_layer = LSTM(
    units=64,
    kernel_initializer='glorot_normal',
    recurrent_initializer='glorot_normal',
)
x = recurrent_layer(embedded_sequences)

# Single sigmoid unit producing the binary prediction.
output_layer = Dense(units=1, activation='sigmoid')
preds = output_layer(x)

model = Model(inputs=sequence_input, outputs=preds)

In [10]:
# Binary-classification setup: Adam optimiser, cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [11]:
# Train for 20 epochs in batches of 512 samples.
# NOTE(review): the test split is passed as validation data, so the reported
# validation metrics are not an independent hold-out estimate — confirm this is
# intended (VALIDATION_SPLIT is defined in the config cell but unused here).
model.fit(X_train, y_train,
          batch_size=512,
          epochs=20,
          validation_data=(X_test, y_test))

Train on 96308 samples, validate on 24077 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f26203d01d0>

# Save Model

In [18]:
# Create the output directory (and any missing parents) from Python instead of
# shelling out to `mkdir -p`, so the cell also works where that command is
# unavailable. exist_ok=True keeps the cell safe to re-run.
os.makedirs('../../assets/sentiment_tensorflow', exist_ok=True)

In [23]:
# Persist the weights on their own, then the full model, as HDF5 files.
model.save_weights("../../assets/sentiment_tensorflow/model_weights.h5")
model.save("../../assets/sentiment_tensorflow/model.h5")
print("Saved model to disk")

Saved model to disk
