In [3]:
%run config.ipynb
# %run keras-bert.py
import os
import random
import math
import tensorflow as tf
import tensorflow_hub as hub

from official.modeling import tf_utils
from official import nlp
from official.nlp import optimization
from official.nlp import bert
from official.nlp.bert.tokenization import FullTokenizer
from official.nlp.bert.configs import BertConfig
from official.nlp.bert.bert_models import classifier_model

from keras_bert.bert import get_model
from keras_bert.loader import load_trained_model_from_checkpoint

from transformers import BertForSequenceClassification
from transformers import BertTokenizer

import tensorflow as tf
print("Num GPUs Available: ", tf.config.experimental.list_physical_devices('GPU'))
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, Dropout, Conv1D, GlobalMaxPool1D, GlobalAvgPool1D
from tensorflow.keras.models import Model, clone_model, Sequential, load_model
from tensorflow.keras.metrics import RootMeanSquaredError

import json
import pickle
# from keras import backend as K

Num GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Load Data

In [2]:
%%time
import pickle
with open('data/train_test_bert.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

for key, value in X_train.items():
  print(f'{key:15s} shape: {value.shape}')

print(f'X_train_labels shape: {y_train.shape}')

for key, value in X_test.items():
  print(f'{key:15s} shape: {value.shape}')

print(f'X_train_labels shape: {y_test.shape}')

# Set up epochs and steps
epochs = 3
batch_size = 128
eval_batch_size = 32

train_data_size = len(y_train)
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(2e-4, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

bert_folder = "data/bert_custom/"
tf.io.gfile.listdir(bert_folder)

INFO:absl:using Adamw optimizer


input_word_ids  shape: (268540, 307)
input_mask      shape: (268540, 307)
input_type_ids  shape: (268540, 307)
X_train_labels shape: (268540,)
input_word_ids  shape: (132267, 273)
input_mask      shape: (132267, 273)
input_type_ids  shape: (132267, 273)
X_train_labels shape: (132267,)
Wall time: 4.54 s


['bert_config.json',
 'bert_model.ckpt.data-00000-of-00001',
 'bert_model.ckpt.index',
 'vocab.txt']

# Split Data

In [3]:
X = df["review_text_after_cleaning"]
y = df["review_stars"]
y = y.apply(lambda x : x - 1) # convert range(1,6) to range(0,5) for more precise when adding dense layer
# y = y.apply(lambda x : 1 if x else 0)
df_train, df_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [19]:
X_train

{'input_word_ids': <tf.Tensor: shape=(268540, 307), dtype=int32, numpy=
 array([[  101,  6187,  2271, ...,     0,     0,     0],
        [  101, 24970,  2833, ...,     0,     0,     0],
        [  101,  2173, 12721, ...,     0,     0,     0],
        ...,
        [  101,  2613,  3669, ...,     0,     0,     0],
        [  101,  2293,  2717, ...,     0,     0,     0],
        [  101,  2253,  2265, ...,     0,     0,     0]])>,
 'input_mask': <tf.Tensor: shape=(268540, 307), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])>,
 'input_type_ids': <tf.Tensor: shape=(268540, 307), dtype=int32, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])>}

In [45]:
import tensorflow.compat.v1 as tf
class BertLayer(tf.layers.Layer):
    def __init__(self, n_fine_tune_layers=4, **kwargs):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 256
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        config_file = os.path.join(bert_folder, 'bert_config.json')
        checkpoint_file = os.path.join(bert_folder, 'bert_model.ckpt')
        self.bert = load_trained_model_from_checkpoint(config_file, checkpoint_file, training=True, seq_len=256)
#         self.bert = hub.Module(
#             bert_folder,
#             trainable=self.trainable,
#             name="{}_module".format(self.name)
#         )
        trainable_vars = self.bert.variables
        
        # Remove unused layers
        trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
        
        # Select how many layers to fine tune
        trainable_vars = trainable_vars[-self.n_fine_tune_layers :]
        
        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)
        
        # Add non-trainable weights
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)
        
        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
            "pooled_output"
        ]
        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

In [46]:
# Build model
max_seq_length = 256
in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
bert_inputs = [in_id, in_mask, in_segment]

# Instantiate the custom Bert Layer defined above
bert_output = BertLayer()(bert_inputs)

# Build the rest of the classifier 
dense = tf.keras.layers.Dense(256, activation='relu')(bert_output)
pred = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit(
#     [train_input_ids, train_input_masks, train_segment_ids], 
#     train_labels,
#     validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels),
#     epochs=1,
#     batch_size=32
# )

OperatorNotAllowedInGraphError: using a `tf.Tensor` as a Python `bool` is not allowed in Graph execution. Use Eager execution or decorate this function with @tf.function.

In [5]:
bert_config_file = os.path.join(bert_folder, "bert_config.json")
config_dict = json.loads(tf.io.gfile.GFile(bert_config_file).read())
bert_config = bert.configs.BertConfig.from_dict(config_dict)
config_dict

{'hidden_size': 312,
 'hidden_act': 'relu',
 'initializer_range': 0.02,
 'vocab_size': 30522,
 'hidden_dropout_prob': 0.2,
 'num_attention_heads': 4,
 'type_vocab_size': 2,
 'max_position_embeddings': 308,
 'num_hidden_layers': 4,
 'intermediate_size': 768,
 'attention_probs_dropout_prob': 0.2}

In [6]:
# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(2e-4, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

bert_classifier1, bert_encoder1 = bert.bert_models.classifier_model(bert_config, num_labels=5)
bert_classifier1.compile(optimizer=optimizer, loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError('rmse'), 'mae'])
bert_classifier1.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=batch_size, epochs=epochs)

INFO:absl:using Adamw optimizer


Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x169f9b16b20>

# Load Data for LSTM

In [5]:
df = pd.read_csv("data/yelp_academic_dataset_final.csv")
df.head()

Unnamed: 0,review_id,user_id,business_id,review_stars,user_review_count,user_elite,user_friends,user_fans,user_average_stars,user_total_compliments,business_name,business_stars,review_text_after_cleaning,num_user_friends
0,lXSEWDtaiaQcxj5Nuxx6JA,fWqtOUpCFv6rmaacYZdkEQ,0z010Dfuv-PLUIukFhYEHQ,5,55,0,,0,2.95,2,Hydrate Salon and Spa,4.0,great spot fair price good servic profession s...,0
1,Au1-zoHWO3VIlI48blfixA,DlGBaNwQsAeKAW75iHVEzg,0z010Dfuv-PLUIukFhYEHQ,5,15,0,"MCX2QsAl79d4-Z-H9YJPEw, DN0YgFN7S7teowvOC0mwNA...",2,3.6,1,Hydrate Salon and Spa,4.0,hair facial alway feel incred welcom everyon g...,531
2,dw6k8HwpXBZ2xD_MC-9dDg,YBqPdDh0KOdAfMv_9U4jOw,0z010Dfuv-PLUIukFhYEHQ,2,8,0,"FC8HuS-i-8XDv0HFGVgnbQ, 66MQLCs9yP2PgKZLTck66Q...",0,1.75,0,Hydrate Salon and Spa,4.0,schedul minut late also schedul color girl fro...,22
3,T8hb6_yy0iJoM-TsNtwx2Q,5kDWszBfoqNjwohHZ3vpAw,0z010Dfuv-PLUIukFhYEHQ,5,9,0,"LLapZiT6_844J5rfLQgQrg, a4_sKXaN2Cfvhb9v52WSyg...",0,4.78,0,Hydrate Salon and Spa,4.0,thier custom open door thrill attent gave hair...,4
4,QSOdfF9TSGyOMU1cUJUe5g,lNyihs-KKOviAvBBXa1xlw,0z010Dfuv-PLUIukFhYEHQ,5,52,0,"hnvx0UISDCcrJpc5ysLmnA, Fhzf1hxz2o6GzyrTncB59Q...",1,3.76,2,Hydrate Salon and Spa,4.0,want great cut color word you quach start see ...,5


In [6]:
X = df["review_text_after_cleaning"]
y = df["review_stars"].apply(lambda x : x-1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
MAX_VOCAB_SIZE = 50000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

In [13]:
word2idx = tokenizer.word_index
V = len(word2idx)
print('Found %s unique tokens.' % V)

Found 222351 unique tokens.


In [14]:
X_train_text = pad_sequences(sequences_train, maxlen=150)
print('Shape of data train tensor:', X_train_text.shape)

T = X_train_text.shape[1]

X_test_text = pad_sequences(sequences_test, maxlen=T)
print('Shape of data test tensor:', X_test_text.shape)

Shape of data train tensor: (671373, 150)
Shape of data test tensor: (330677, 150)


# Load LSTM Model

In [4]:
lstm = load_model("models/lstm.h5")
lstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 150, 150)          7500000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 150, 150)          135600    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 150)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 151       
Total params: 7,635,751
Trainable params: 7,635,751
Non-trainable params: 0
_________________________________________________________________


In [15]:
pred = lstm.predict(X_test_text)

In [16]:
pred

array([[ 3.21576   ],
       [ 3.3574724 ],
       [ 3.8535378 ],
       ...,
       [-0.15520087],
       [ 3.7164297 ],
       [ 3.9894316 ]], dtype=float32)