In [23]:
%run config.ipynb
import os
import tensorflow as tf
import tensorflow_hub as hub

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

# Load the required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks

from keras_bert.bert import get_model
from keras_bert.loader import load_trained_model_from_checkpoint

from transformers import BertForSequenceClassification
from transformers import BertTokenizer

import tensorflow as tf
# Report the GPUs TensorFlow can see. The label promises a count, so print the
# count first and keep the device list after it for reference.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices), gpu_devices)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, Dropout
from tensorflow.keras.models import Model, clone_model, Sequential
from tensorflow.keras.metrics import RootMeanSquaredError

from keras import backend as K

Num GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# Read the filtered Yelp review sample from disk and preview its first row.
reviews_path = "data/yelp_academic_dataset_sample005_filter.csv"
df = pd.read_csv(reviews_path)
df.head(n=1)

Unnamed: 0,review_id,user_id,business_id,review_stars,review_text,review_text_after_cleaning,user_review_count,user_elite,user_friends,user_fans,user_average_stars,user_total_compliments,business_name,business_categories,business_stars,to_recommend,num_user_friends
0,yNB39szX3M8mTEzTtsgoCw,Y1iCYGvLf4ifPoXlKLGq-w,o2Qh4SiGYJ7BK4hP7dfkrw,5,This is an amazing indian Bistro!!I If I do sa...,amaz bistroi say myself never cuisin glad expe...,1,0,,0,5.0,0,Saffron Indian Bistro,"Restaurants, Indian",4.5,True,0


In [3]:
# Model input is the pre-cleaned review text; the target is the review's star rating.
X, y = df["review_text_after_cleaning"], df["review_stars"]

In [4]:
# Hold out 33% of the reviews for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [5]:
# Directory that holds the pre-trained BERT files (config, checkpoint, vocabulary);
# list it to confirm the expected files are present.
bert_folder = "data/uncased_L-2_H-128_A-2"
tf.io.gfile.listdir(bert_folder)

['bert_config.json',
 'bert_model.ckpt.data-00000-of-00001',
 'bert_model.ckpt.index',
 'vocab.txt']

In [6]:
# Build the BERT tokenizer from the vocabulary file stored in bert_folder
# (lower-casing enabled) and report how many entries the vocabulary has.
vocab_path = os.path.join(bert_folder, "vocab.txt")
tokenizer = bert.tokenization.FullTokenizer(vocab_file=vocab_path, do_lower_case=True)
print("Vocab size:", len(tokenizer.vocab))

Vocab size: 30522


In [7]:
def tokenize_review(text):
    """Convert one review string into a list of vocabulary ids.

    Args:
        text: the review text to encode.

    Returns:
        A list with one vocabulary id per token produced by ``tokenizer``.
    """
    tokens = tokenizer.tokenize(text)
    return tokenizer.convert_tokens_to_ids(tokens)


# Encode BOTH splits: fit_model passes X_test to Keras as validation data, so it
# has to be in the same token-id form as X_train.
# NOTE: this overwrites the raw text in place, so this cell must not be re-run
# without re-creating the train/test split first.
X_train = X_train.apply(tokenize_review)
X_test = X_test.apply(tokenize_review)

In [8]:
X_train

367564    [6187, 2271, 2868, 9530, 15900, 2072, 7661, 38...
101149    [24970, 2833, 2307, 14262, 7903, 15180, 25426,...
277913    [2173, 12721, 2196, 2272, 2067, 2153, 7309, 23...
122649    [2131, 2292, 5660, 2833, 2272, 1043, 12541, 20...
236181    [2292, 2707, 2360, 2293, 8840, 11266, 2131, 59...
                                ...                        
259178    [5156, 2175, 8840, 11266, 17704, 2126, 2188, 2...
365838    [3129, 11703, 3593, 3449, 7361, 2025, 2594, 24...
131932    [2613, 3669, 4569, 11937, 16643, 10733, 14736,...
146867    [2293, 2717, 21159, 6090, 10424, 2072, 2053, 7...
121958    [2253, 2265, 3129, 2214, 2145, 8648, 2239, 397...
Name: review_text_after_cleaning, Length: 268540, dtype: object

### Create a BERT Embedding Layer

In [42]:
class BertLayer(tf.keras.layers.Layer):
    """Keras layer that wraps the TF-Hub ``bert_uncased_L-12_H-768_A-12`` module.

    The layer takes a list of three tensors ``[input_ids, input_mask,
    segment_ids]`` and returns one vector of size ``output_size`` per example,
    obtained either from the module's ``pooled_output`` (``pooling="first"``)
    or from a mask-weighted mean of its ``sequence_output``
    (``pooling="mean"``).

    Args:
        n_fine_tune_layers: number of encoder layers, counted down from
            ``layer_11``, whose variables are registered as trainable.
        pooling: ``"first"`` or ``"mean"``; any other value makes ``build`` /
            ``call`` raise ``NameError``.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``,
            ``trainable``).

    NOTE(review): ``hub.Module`` is the TF1-style hub API. The RuntimeError
    recorded at the end of this notebook ("variable_scope ..._module/ was
    unused but the corresponding name_scope was already taken") is raised while
    this layer is being built -- presumably because the notebook runs with TF2
    behaviour enabled. Confirm, and consider migrating to ``hub.KerasLayer``
    with a TF2 SavedModel.
    """

    def __init__(self, n_fine_tune_layers=10, pooling="first", **kwargs):
        # Initialise the Keras base class before touching any attribute. It
        # also sets ``self.trainable`` (True unless overridden via kwargs), so
        # no explicit assignment is needed here.
        super(BertLayer, self).__init__(**kwargs)
        self.n_fine_tune_layers = n_fine_tune_layers
        self.pooling = pooling
        self.output_size = 768  # hidden size of the H-768 module loaded in build()

    def _undefined_pooling_error(self):
        """Build the exception raised for an unsupported ``self.pooling`` value."""
        return NameError(
            f"Undefined pooling type (must be either first or mean, but is {self.pooling})"
        )

    def build(self, input_shape):
        """Load the hub module and register which of its variables are trainable."""
        # Fail before the (expensive) module load if the configuration is invalid.
        if self.pooling not in ("first", "mean"):
            raise self._undefined_pooling_error()

        bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
        self.bert = hub.Module(
            bert_path,
            trainable=self.trainable,
            name="{}_module".format(self.name)
        )
        module_vars = self.bert.variables

        # Variables under "/cls/" are never fine-tuned; with mean pooling the
        # pooler is not used either, so its variables are excluded as well.
        if self.pooling == "first":
            candidates = [var for var in module_vars if "/cls/" not in var.name]
            trainable_layers = ["pooler/dense"]
        else:
            candidates = [
                var for var in module_vars
                if "/cls/" not in var.name and "/pooler/" not in var.name
            ]
            trainable_layers = []

        # Select how many encoder layers to fine tune, counting down from layer_11.
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{11 - i}")

        # Keep only the variables that belong to one of the selected layers.
        selected = [
            var for var in candidates
            if any(layer_tag in var.name for layer_tag in trainable_layers)
        ]

        for var in selected:
            self._trainable_weights.append(var)

        # Everything else is registered as non-trainable. Membership is decided
        # by object identity: it does not depend on how ``==`` is defined for
        # variables and avoids a quadratic list scan.
        selected_ids = {id(var) for var in selected}
        for var in module_vars:
            if id(var) not in selected_ids:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on ``[input_ids, input_mask, segment_ids]`` and pool the result."""
        if self.pooling not in ("first", "mean"):
            raise self._undefined_pooling_error()

        # The hub signature is fed int32 tensors, whatever dtype the Keras inputs have.
        input_ids, input_mask, segment_ids = [tf.cast(x, tf.int32) for x in inputs]
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        outputs = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)

        if self.pooling == "first":
            return outputs["pooled_output"]

        # Mean pooling: average the token vectors over axis 1, weighting each
        # position by the input mask. The epsilon guards against an all-zero mask.
        sequence_output = outputs["sequence_output"]
        mask = tf.cast(input_mask, tf.float32)
        masked_sum = tf.reduce_sum(sequence_output * tf.expand_dims(mask, axis=-1), axis=1)
        mask_total = tf.reduce_sum(mask, axis=1, keepdims=True)
        return masked_sum / (mask_total + 1e-10)

    def compute_output_shape(self, input_shape):
        """Return ``(batch, output_size)``.

        ``input_shape`` is a list of three shapes when the layer is called on
        its three inputs; in that case the batch dimension is the first entry
        of the first shape rather than the first shape itself.
        """
        first = input_shape[0]
        if isinstance(first, (list, tuple, tf.TensorShape)):
            batch_dim = first[0]
        else:
            batch_dim = first
        return (batch_dim, self.output_size)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(BertLayer, self).get_config()
        config.update({
            "n_fine_tune_layers": self.n_fine_tune_layers,
            "pooling": self.pooling,
        })
        return config


In [43]:
def fit_model(model, epochs=3, batch_size=256, loss='mse', optimizer='adam', metrics=None):
    """Compile ``model``, train it on the notebook's train split and plot the curves.

    Training uses the module-level ``X_train`` / ``y_train`` and validates on
    ``X_test`` / ``y_test``.

    Args:
        model: a Keras model exposing ``compile`` and ``fit``.
        epochs: number of training epochs.
        batch_size: mini-batch size passed to ``model.fit``.
        loss: loss passed to ``model.compile``.
        optimizer: optimizer passed to ``model.compile``.
        metrics: metrics passed to ``model.compile``. ``None`` (the default)
            means ``['mae', RootMeanSquaredError('rmse')]``.

    Returns:
        ``(model, r)`` where ``r`` is the object returned by ``model.fit``.
    """
    if metrics is None:
        # Build the default per call: a metric object created once in the
        # signature would be one stateful instance shared by every model that
        # is ever passed to this function.
        metrics = ['mae', RootMeanSquaredError('rmse')]
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    r = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))
    plot(r)
    return model, r

def plot(r):
    """Plot every curve recorded in ``r.history`` against the epoch number.

    All recorded series are drawn (loss, metrics and their ``val_`` versions),
    so the function keeps working when ``fit_model`` is called with metrics
    other than the default ``mae`` / ``rmse``. Validation curves are dashed so
    that each one can be told apart from its training counterpart.

    Args:
        r: object with a ``history`` dict mapping a series name to a list of
            per-epoch values (as returned by ``model.fit``).
    """
    history = r.history
    if not history:
        # Nothing was recorded, so there is nothing to draw.
        return

    length = len(next(iter(history.values())))
    plt.xlabel('# epochs')
    # Epochs are stored 0-based; label the ticks 1..length.
    plt.xticks(np.arange(length), np.arange(1, length + 1))
    for name, values in history.items():
        linestyle = '--' if name.startswith('val_') else '-'
        plt.plot(values, linestyle=linestyle, label=name)
    plt.legend()


def create_model(max_seq_length=150, n_fine_tune_layers=3):
    """Build the BERT-based regression model for the review star rating.

    The model has three inputs of length ``max_seq_length`` (``input_ids``,
    ``input_masks``, ``segment_ids``) that feed a ``BertLayer``, followed by a
    small dense head.

    Args:
        max_seq_length: length of each of the three input sequences.
        n_fine_tune_layers: forwarded to ``BertLayer``.

    Returns:
        An uncompiled Keras ``Model``; its summary is printed as a side effect.
    """
    in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_output = BertLayer(n_fine_tune_layers=n_fine_tune_layers)(bert_inputs)

    # bert_output is a tensor, not a layer, so it cannot be added to a
    # Sequential model; the head is wired with the functional API instead.
    # BertLayer already returns one vector per example, so no pooling over a
    # sequence axis is applied here.
    hidden = Dense(16, activation='relu')(bert_output)
    hidden = Dropout(0.2)(hidden)
    # Single linear unit: fit_model trains with 'mse' against the scalar
    # review_stars target, i.e. this is a regression head.
    prediction = Dense(1)(hidden)

    model = Model(inputs=bert_inputs, outputs=prediction)
    model.summary(line_length=105)
    return model

# Build the model and train it for five epochs, keeping both the fitted model
# and the object returned by model.fit.
m, r = fit_model(model=create_model(), epochs=5)

RuntimeError: variable_scope bert_layer_14_module/ was unused but the corresponding name_scope was already taken.