In [1]:
from __future__ import print_function

import os
import sys
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, Dropout
from keras.models import Model
from keras.callbacks import CSVLogger
import operator
import joblib
import pandas as pd
import nltk as nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import random
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
# Porter stemmer instance; not referenced again in the visible cells.
ps = PorterStemmer()
import keras.backend as K
from keras.callbacks import ModelCheckpoint, CSVLogger, History
# English stop-word set taken from NLTK's stopwords corpus; not referenced again in the visible cells.
stop_words = set(stopwords.words('english'))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Project root; not referenced by any other visible cell.
BASE_DIR = './'
# Directory that holds the extracted GloVe vector file.
GLOVE_DIR = './embeddings'
# Length passed to pad_sequences as maxlen; also the width of the model's input layer.
MAX_SEQUENCE_LENGTH = 256
# Vocabulary cap handed to the Tokenizer and used to bound the embedding matrix.
MAX_NUM_WORDS = 20000
# Size of each word vector; has to match the GloVe file that is loaded (the 100d variant).
EMBEDDING_DIM = 100
# Hold-out fraction. NOTE(review): the visible train/test split cell hard-codes 0.2 instead of reading this.
VALIDATION_SPLIT = 0.2

Download the GloVe embeddings from https://nlp.stanford.edu/projects/glove/ and extract `glove.6B.100d.txt` into the `./embeddings` directory.

In [3]:
print('Indexing word vectors.')

# Map every GloVe token to its coefficient vector. Each line of the file is
# split on whitespace: the first field is the token, the rest are the floats.
embeddings_index = {}
# A context manager releases the file handle even if parsing fails midway.
with open(os.path.join(GLOVE_DIR, './glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


# Data preprocessing

In [4]:
def minority_balance_dataframe_by_multiple_categorical_variables(df, categorical_columns=None, downsample_by=0.1,
                                                                 random_state=None):
    """
    Down-sample ``df`` so that every combination of values of the given
    categorical columns is represented by the same number of rows.

    The number of rows kept per combination is the size of the rarest
    combination multiplied by ``downsample_by``, truncated to an integer.

    :param df: pandas.DataFrame
    :param categorical_columns: iterable of categorical columns names contained in {df}
    :param downsample_by: fraction of the rarest combination's size to keep per combination
    :param random_state: optional seed forwarded to ``DataFrame.sample`` so the draw can be reproduced
    :return: balanced pandas.DataFrame indexed by the original row labels (index named
             ``level_1``), sorted by that index, with the categorical columns placed first
    :raises ValueError: if no column is given or one of the columns is not in ``df``
    """
    group_columns = list(categorical_columns) if categorical_columns is not None else []
    # An empty list must be rejected explicitly: all() over nothing is True.
    if not group_columns or not all(c in df.columns for c in group_columns):
        raise ValueError('Please provide one or more columns containing categorical variables')

    grouped = df.groupby(group_columns)
    rows_per_combination = int(grouped.size().min() * downsample_by)

    # Sample group by group instead of groupby.apply + reset_index: the old
    # set_index('level_1') only existed for exactly one grouping column, so
    # balancing by several columns raised a KeyError.
    sampled_groups = [
        group.sample(rows_per_combination, random_state=random_state)
        for _, group in grouped
    ]
    balanced = pd.concat(sampled_groups)

    # Keep the layout the single-column version produced: grouping columns
    # first, remaining columns after, index labelled 'level_1'.
    other_columns = [c for c in balanced.columns if c not in group_columns]
    balanced = balanced[group_columns + other_columns].rename_axis('level_1')

    return balanced.sort_index()


def get_features_for_layer(X, trained_model, layer_number, batches=256):
    """
    Run ``X`` through ``trained_model`` and return the output of one of its layers.

    :param X: Batch with dimensions according to the models first layer input-shape
    :param trained_model: Model to extract data from
    :param layer_number: Index of the layer we want to extract features from.
    :param batches: Chunk size used to feed X piecewise to save (gpu)memory;
                    a falsy value feeds X in a single call
    :return: With batching, the per-chunk results concatenated along axis 1 and
             then indexed with [0]; without batching, whatever the backend
             function returns, unmodified.
    """
    # Backend function from (input of layer 0, learning-phase flag) to the
    # requested layer's output.
    layer_fn = K.function(
        [trained_model.layers[0].input, K.learning_phase()],
        [trained_model.layers[layer_number].output]
    )

    if not batches:
        # The second list element is the learning-phase flag, fixed to 0 here.
        return layer_fn([X, 0])

    per_chunk = [layer_fn([chunk, 0]) for chunk in array_batch_yield(X, batches)]
    return np.concatenate(per_chunk, axis=1)[0]


def array_batch_yield(X, group_size):
    """
    Yield consecutive slices of ``X`` that hold at most ``group_size`` items each.

    :param X: any sliceable sequence that supports ``len``
    :param group_size: number of items per slice; the last slice may be shorter
    :return: generator over the slices, in order
    """
    # `range` instead of the Python-2-only `xrange`, so this runs on Python 3 too.
    for i in range(0, len(X), group_size):
        yield X[i:i+group_size]
        
# Number of strings passed to safe_detect so far (module-level progress counter).
langdetect_count = 0
def safe_detect(s):
    """
    Best-effort language detection that never raises.

    Increments the module-level ``langdetect_count`` on every call and prints a
    progress line every 10000 calls.

    :param s: text whose language should be detected
    :return: result of ``langdetect.detect(s)``, or 'unknown' if detection fails
    """
    global langdetect_count
    # The counter used to be incremented through an undefined local name; the
    # resulting error was swallowed below, so every call returned 'unknown'.
    langdetect_count += 1
    if langdetect_count % 10000 == 0:
        print("Detected languages for  {} reviews".format(langdetect_count))
    try:
        # NOTE(review): `langdetect` is not imported in the visible cells; until
        # it is, the NameError is caught here and 'unknown' is returned.
        return langdetect.detect(s)
    except Exception:
        # Deliberately best-effort, but no longer traps KeyboardInterrupt/SystemExit.
        return 'unknown'

Download the Yelp Dataset from https://www.yelp.com/dataset and place its `review.json` file in `./data/`; the next cell reads it from there.

In [6]:
# Read the reviews (one JSON document per line) and record each text's length.
df_reviews = pd.read_json('./data/review.json', lines=True, encoding='utf-8')
df_reviews['len'] = df_reviews['text'].str.len()

# Keep only reviews whose length lies between 10 and 4000 characters.
has_usable_length = df_reviews['len'].between(10, 4000)
df_reviews = df_reviews.loc[has_usable_length]

# Equalise the number of reviews per star rating, then cache the result on disk.
df_rev_balanced = minority_balance_dataframe_by_multiple_categorical_variables(
    df_reviews, categorical_columns=['stars'], downsample_by=0.1
)
df_rev_balanced.to_csv('balanced_reviews.csv', encoding='utf-8')

In [7]:
# Fit the vocabulary on the balanced review texts and persist the tokenizer.
review_texts = df_rev_balanced['text']
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(review_texts.tolist())
joblib.dump(tokenizer, 'tokenizer.pickle')

# (word, index) pairs ordered by index.
WORD_INDEX_SORTED = sorted(tokenizer.word_index.items(), key=lambda pair: pair[1])

# Integer sequences padded to a fixed length form the model input.
seqs = tokenizer.texts_to_sequences(review_texts.values)
X = pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)

# Binary target: 1 for more than 3 stars, otherwise 0.
Y = df_rev_balanced['stars'].values.astype(int)
Y_cat = [int(stars > 3) for stars in Y]
assert X.shape[0] == Y.shape[0]

In [9]:
# Fixed-seed 80/20 split of the padded sequences and binary labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y_cat, test_size=0.2, random_state=9
)

# Store each part as a DataFrame under its own key in one HDF5 file.
splits = (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
with pd.HDFStore('x_y_test_train.h5') as store:
    for split_name, split_values in splits:
        store[split_name] = pd.DataFrame(split_values)

# Load pre-processed data

In [5]:
# Restore the cached artefacts written by the preprocessing cells.
df_rev_balanced = pd.read_csv('balanced_reviews.csv')
tokenizer = joblib.load('tokenizer.pickle')

# Pull the four arrays back out of the HDF5 store as plain numpy values.
with pd.HDFStore('x_y_test_train.h5') as store:
    X_train, X_test, y_train, y_test = [
        store[split_name].values
        for split_name in ('X_train', 'X_test', 'y_train', 'y_test')
    ]

# (word, index) pairs ordered by index.
WORD_INDEX_SORTED = sorted(tokenizer.word_index.items(), key=lambda pair: pair[1])

In [6]:
# prepare embedding matrix
# Tokenizer word indices start at 1 (0 is the padding value), so a vocabulary
# of N words needs N + 1 rows, capped at MAX_NUM_WORDS.
num_words = min(MAX_NUM_WORDS, len(WORD_INDEX_SORTED) + 1)
# Size the matrix with num_words, not MAX_NUM_WORDS: the Embedding layer is
# built with num_words rows and rejects a weight matrix of a different shape.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        # Index falls outside the capped vocabulary.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [7]:
# Wrap the pre-trained vectors in an Embedding layer.
# trainable=False keeps the vectors fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False
)

In [8]:
# Integer token ids -> frozen embeddings -> LSTM -> single sigmoid unit.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

lstm_layer = LSTM(
    units=64,
    kernel_initializer='glorot_normal',
    recurrent_initializer='glorot_normal'
)
x = lstm_layer(embedded_sequences)

output_layer = Dense(units=1, activation='sigmoid')
preds = output_layer(x)

model = Model(inputs=sequence_input, outputs=preds)

In [9]:
# Binary cross-entropy matches the single sigmoid output; accuracy is reported per epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [10]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 256)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 256, 100)          2000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,042,305
Trainable params: 42,305
Non-trainable params: 2,000,000
_________________________________________________________________


In [11]:
# Train for 20 epochs; the held-out split is passed as validation data.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=20,
    validation_data=(X_test, y_test)
)

Train on 174432 samples, validate on 43608 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f23705cefd0>

# Save Model

In [15]:
# Architecture only, as YAML text.
model_yaml = model.to_yaml()
with open("./output/model.yaml", "w") as yaml_out:
    yaml_out.write(model_yaml)

# Weights only, as HDF5.
model.save_weights("./output/model_weights.h5")

# Complete model in one HDF5 file; the TensorFlow.js converter cell reads this file.
model.save("./output/model.h5")

print("Saved model to disk")

Saved model to disk


# Load Model

In [16]:
from keras.models import model_from_yaml
# load YAML and create model
# The context manager closes the file even if reading raises.
with open('./output/model.yaml', 'r') as yaml_file:
    loaded_model_yaml = yaml_file.read()
# NOTE(review): this deserializes YAML into a model; only feed it files this
# notebook wrote itself, never untrusted input.
model = model_from_yaml(loaded_model_yaml)
# load weights into new model
model.load_weights("./output/model_weights.h5")
print("Loaded model from disk")

Loaded model from disk


# Keras To CoreML

In [17]:
import coremltools

In [18]:
# Convert the Keras model into a Core ML classifier with the labels 0 and 1.
coreml_model = coremltools.converters.keras.convert(
    model,
    input_names='input',
    output_names='output',
    class_labels=[0, 1]
)

# Model-level metadata stored in the exported file.
coreml_model.author = 'Danial Khosravi'
coreml_model.license = 'MIT'
coreml_model.short_description = 'Predicts the sentiment of a tokenized string'

# Descriptions of the input and output features.
coreml_model.input_description['input'] = 'A String mapped according to the pre-deifned mapping'
coreml_model.output_description['output'] = 'Whether the sentence was positive or negative'

coreml_model.save('./output/model.mlmodel')

0 : input_1, <keras.engine.topology.InputLayer object at 0x7f2301760080>
1 : embedding_1, <keras.layers.embeddings.Embedding object at 0x7f23017601d0>
2 : lstm_1, <keras.layers.recurrent.LSTM object at 0x7f2301760390>
3 : dense_1, <keras.layers.core.Dense object at 0x7f23017602b0>
4 : dense_1__activation__, <keras.layers.core.Activation object at 0x7f2214d572b0>


# Keras to TensorflowJS

In [19]:
import tensorflowjs as tfjs

Instructions for updating:
Use the retry module or similar alternatives.


Instructions for updating:
Use the retry module or similar alternatives.


In [62]:
# for some reason models converted with tfjs.converters.save_keras_model are currently giving an error in the browser
# so we're using the bash command tensorflowjs_converter 
# tfjs.converters.save_keras_model(model, './output/sentiment_js_model')

In [21]:
# Remove any previous TensorFlow.js export before the converter writes to this directory again.
!rm -rf ./output/sentiment_js_model

In [22]:
# Convert the saved Keras HDF5 model (./output/model.h5) and write the result to ./output/sentiment_js_model.
!tensorflowjs_converter --input_format keras ./output/model.h5 ./output/sentiment_js_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
Instructions for updating:
Use the retry module or similar alternatives.


# Keras to Tensorflow (Android)

In [23]:
import os
from keras import backend as K
import tensorflow as tf
from tensorflow.python.tools import freeze_graph
from tensorflow.python.tools import optimize_for_inference_lib

In [24]:
# Base name used by export_model for the graph, checkpoint, frozen and optimized files under output/.
MODEL_NAME = 'SentimentModel'

In [25]:
def export_model(saver, model, input_node_names, output_node_name):
    """
    Write the current Keras session's graph to disk, freeze it and save an
    inference-optimized copy, all under ``output/`` and named after MODEL_NAME.

    :param saver: object whose ``save(session, path)`` writes the checkpoint
    :param model: not used by this function
    :param input_node_names: list of graph input node names
    :param output_node_name: name of the single graph output node
    :return: None; prints "graph saved!" when done
    """
    session = K.get_session()

    graph_file_name = MODEL_NAME + '_graph.pbtxt'
    checkpoint_path = 'output/' + MODEL_NAME + '.chkp'
    frozen_graph_path = 'output/frozen_' + MODEL_NAME + '.pb'
    optimized_graph_path = 'output/opt_' + MODEL_NAME + '.pb'

    # 1) Graph definition and a checkpoint of the session.
    tf.train.write_graph(session.graph_def, 'output', graph_file_name)
    saver.save(session, checkpoint_path)

    # 2) Freeze graph + checkpoint into a single binary graph file.
    freeze_graph.freeze_graph(
        input_graph='output/' + graph_file_name,
        input_saver=None,
        input_binary=False,
        input_checkpoint=checkpoint_path,
        output_node_names=output_node_name,
        restore_op_name="save/restore_all",
        filename_tensor_name="save/Const:0",
        output_graph=frozen_graph_path,
        clear_devices=True,
        initializer_nodes=""
    )

    # 3) Read the frozen graph back and run the inference optimizer on it.
    frozen_graph_def = tf.GraphDef()
    with tf.gfile.Open(frozen_graph_path, "rb") as frozen_file:
        frozen_graph_def.ParseFromString(frozen_file.read())

    optimized_graph_def = optimize_for_inference_lib.optimize_for_inference(
        frozen_graph_def,
        input_node_names,
        [output_node_name],
        tf.float32.as_datatype_enum
    )

    with tf.gfile.FastGFile(optimized_graph_path, "wb") as optimized_file:
        optimized_file.write(optimized_graph_def.SerializeToString())

    print("graph saved!")


In [26]:
# List the layer names of the reloaded model; the export call uses "input_1" and "dense_1".
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 256)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 256, 100)          2000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,042,305
Trainable params: 42,305
Non-trainable params: 2,000,000
_________________________________________________________________


In [27]:
# Export with "input_1" as the input node and "dense_1/Sigmoid" as the output node
# (presumably the sigmoid op of the final Dense layer — confirm against the graph).
export_model(tf.train.Saver(), model, ["input_1"], "dense_1/Sigmoid")

INFO:tensorflow:Restoring parameters from output/SentimentModel.chkp


INFO:tensorflow:Restoring parameters from output/SentimentModel.chkp


INFO:tensorflow:Froze 6 variables.


INFO:tensorflow:Froze 6 variables.


Converted 6 variables to const ops.
graph saved!


# Word Index to SQLite (Mobile)

In [28]:
import sqlite3
import sys
import re
import joblib

In [29]:
# Path of the SQLite database file that receives the tokenizer's word index.
sqlite_file = './output/sentiment_db.sqlite'

In [30]:
# Delete any existing database file; the CREATE TABLE that follows has no IF NOT EXISTS clause.
!rm -rf ./output/sentiment_db.sqlite

In [31]:
# Connecting to the database file
conn = sqlite3.connect(sqlite_file)
c = conn.cursor()

# Creating a new SQLite table with 1 column
c.execute('CREATE TABLE word_index (key STRING, value INTEGER)')

# Committing changes and closing the connection to the database file
conn.commit()
conn.close()

In [32]:
# Word -> integer index mapping held by the tokenizer; written to SQLite in the following cell.
word_index = tokenizer.word_index

In [33]:
conn = sqlite3.connect(sqlite_file)
try:
    c = conn.cursor()

    # Skip empty keys, as before. Values go through "?" placeholders rather
    # than string formatting, so quotes inside a word cannot break the statement.
    rows = [(key, int(value)) for key, value in word_index.items() if key]
    c.executemany("INSERT INTO word_index (key, value) VALUES (?, ?)", rows)

    # Without a commit the inserted rows are discarded when the connection goes away.
    conn.commit()
finally:
    conn.close()

# Number of word-index entries visited (including any skipped empty key).
count = len(word_index)

# Word Index to JSON (Web)

In [34]:
import json

# Serialize the tokenizer's word -> index mapping as JSON for the web client.
with open('./output/word_index.json', 'w') as json_out:
    json_out.write(json.dumps(tokenizer.word_index))