In [18]:
%run config.ipynb
import os
import random
import math
import tensorflow as tf
import tensorflow_hub as hub

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

# Load the required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks

from keras_bert.bert import get_model
from keras_bert.loader import load_trained_model_from_checkpoint

from transformers import BertForSequenceClassification
from transformers import BertTokenizer

import tensorflow as tf
print("Num GPUs Available: ", tf.config.experimental.list_physical_devices('GPU'))
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, Dropout, Conv1D, GlobalMaxPool1D
from tensorflow.keras.models import Model, clone_model, Sequential
from tensorflow.keras.metrics import RootMeanSquaredError

from keras import backend as K

Num GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
df = pd.read_csv("data/yelp_academic_dataset_sample005_filter.csv")
df.head(1)

Unnamed: 0,review_id,user_id,business_id,review_stars,review_text,review_text_after_cleaning,user_review_count,user_elite,user_friends,user_fans,user_average_stars,user_total_compliments,business_name,business_categories,business_stars,to_recommend,num_user_friends
0,yNB39szX3M8mTEzTtsgoCw,Y1iCYGvLf4ifPoXlKLGq-w,o2Qh4SiGYJ7BK4hP7dfkrw,5,This is an amazing indian Bistro!!I If I do sa...,amaz bistroi say myself never cuisin glad expe...,1,0,,0,5.0,0,Saffron Indian Bistro,"Restaurants, Indian",4.5,True,0


In [3]:
X = df["review_text_after_cleaning"]
y = df["review_stars"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
bert_folder = "data/uncased_L-2_H-128_A-2"
tf.io.gfile.listdir(bert_folder)

['bert_config.json',
 'bert_model.ckpt.data-00000-of-00001',
 'bert_model.ckpt.index',
 'vocab.txt']

In [6]:
# Set up tokenizer to generate Tensorflow dataset
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=os.path.join(bert_folder, "vocab.txt"),
    do_lower_case=True)
print("Vocab size:", len(tokenizer.vocab))

Vocab size: 30522


In [7]:
config_file = os.path.join(bert_folder, 'bert_config.json')
checkpoint_file = os.path.join(bert_folder, 'bert_model.ckpt')
model = load_trained_model_from_checkpoint(config_file, checkpoint_file, training=True, seq_len=150)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 150)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 150)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 150, 128), ( 3906816     Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 150, 128)     256         Input-Segment[0][0]              
_______________________________________________________________________________________

In [8]:
def tokenize_review(text):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
tokenized_reviews = X_train.apply(tokenize_review)

In [9]:
reviews_with_len = [[review, y[i], len(review)] for i, review in enumerate(tokenized_reviews)]
reviews_with_len[0]

[[6187,
  2271,
  2868,
  9530,
  15900,
  2072,
  7661,
  3805,
  25540,
  8490,
  9530,
  14028,
  2296,
  2239,
  4297,
  7630,
  2094,
  2033,
  2187,
  2868],
 5,
 20]

In [10]:
random.shuffle(reviews_with_len)
reviews_with_len.sort(key=lambda x: x[2])

In [11]:
sorted_reviews_labels = [(review_lab[0], review_lab[1]) for review_lab in reviews_with_len]
sorted_reviews_labels

[([8257], 3),
 ([3435], 5),
 ([2067], 1),
 ([6187], 2),
 ([1058], 5),
 ([24970], 5),
 ([7929], 3),
 ([2485], 5),
 ([14123], 2),
 ([2307], 4),
 ([5404], 5),
 ([2237], 4),
 ([2293], 3),
 ([2204], 5),
 ([8288], 5),
 ([2205], 1),
 ([24970], 2),
 ([2732], 4),
 ([2769], 4),
 ([3199], 2),
 ([3095], 5),
 ([1047], 5),
 ([3819], 5),
 ([3435], 2),
 ([2560], 5),
 ([2173], 1),
 ([2485], 5),
 ([4900], 4),
 ([24970], 5),
 ([2182], 1),
 ([2514], 5),
 ([2168], 5),
 ([4497], 5),
 ([2485, 2204], 4),
 ([22448, 15916], 5),
 ([3976, 2833], 2),
 ([2307, 2051], 4),
 ([2732, 3524], 4),
 ([14262, 7903], 5),
 ([2767, 3669], 3),
 ([2739, 4149], 3),
 ([2204, 2833], 4),
 ([7610, 2330], 2),
 ([3565, 2204], 3),
 ([5932, 10497], 1),
 ([26568, 2546], 5),
 ([2191, 11132], 1),
 ([13173, 6559], 3),
 ([12183, 4761], 3),
 ([2272, 2067], 4),
 ([10250, 10050], 3),
 ([23566, 2360], 5),
 ([2388, 20572], 5),
 ([8529, 2053], 5),
 ([4190, 4183], 4),
 ([2307, 28305], 4),
 ([15180, 25426], 5),
 ([4550, 4355], 4),
 ([2106, 10036], 2)

In [12]:
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_reviews_labels, output_types=(tf.int32, tf.int32))
BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
next(iter(batched_dataset))

(<tf.Tensor: shape=(32, 1), dtype=int32, numpy=
 array([[ 8257],
        [ 3435],
        [ 2067],
        [ 6187],
        [ 1058],
        [24970],
        [ 7929],
        [ 2485],
        [14123],
        [ 2307],
        [ 5404],
        [ 2237],
        [ 2293],
        [ 2204],
        [ 8288],
        [ 2205],
        [24970],
        [ 2732],
        [ 2769],
        [ 3199],
        [ 3095],
        [ 1047],
        [ 3819],
        [ 3435],
        [ 2560],
        [ 2173],
        [ 2485],
        [ 4900],
        [24970],
        [ 2182],
        [ 2514],
        [ 2168]])>,
 <tf.Tensor: shape=(32,), dtype=int32, numpy=
 array([3, 5, 1, 2, 5, 5, 3, 5, 2, 4, 5, 4, 3, 5, 5, 1, 2, 4, 4, 2, 5, 5,
        5, 2, 5, 1, 5, 4, 5, 1, 5, 5])>)

In [13]:
TOTAL_BATCHES = math.ceil(len(sorted_reviews_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 10
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

In [19]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = GlobalMaxPool1D()
        
        self.dense_1 = Dense(units=dnn_units, activation="relu")
        self.dropout = Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [15]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 2
DROPOUT_RATE = 0.2
NB_EPOCHS = 5

In [20]:
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

In [21]:
text_model.compile(loss="mse", optimizer="adam", metrics=['mae', RootMeanSquaredError('rmse')])

In [None]:
text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/5
   6138/Unknown - 341s 56ms/step - loss: 9.6158 - mae: 2.7597 - rmse: 3.1009

### Create a Bert Embedding Layer

In [12]:
class BertLayer(tf.keras.layers.Layer):
    def __init__(self, n_fine_tune_layers=10, **kwargs):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.pooling = "mean"
        self.trainable = True
        self.output_size = 768
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
#         config_file = os.path.join(bert_folder, 'bert_config.json')
#         checkpoint_file = os.path.join(bert_folder, 'bert_model.ckpt')
#         self.bert = load_trained_model_from_checkpoint(config_file, checkpoint_file, training=True, seq_len=150)
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
        self.bert = hub.Module(
            bert_path,
            trainable=self.trainable,
            name="{}_module".format(self.name)
        )
        trainable_vars = self.bert.variables
    
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
            trainable_layers = ["pooler/dense"]
        
        elif self.pooling == "mean":
            trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name and not "/pooler/" in var.name]
            trainable_layers = []
        else:
            raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")

        # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [var for var in trainable_vars if any([l in var.name for l in trainable_layers])]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)
        
        # Add non-trainable weights
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)
        
        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)["pooled_output"]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)["sequence_output"]

            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")

        return pooled

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)


In [13]:
def fit_model(model, epochs=3, batch_size=256, loss='mse', optimizer='adam', metrics=['mae', RootMeanSquaredError('rmse')]):
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    r = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))
    plot(r)
    return model, r

def plot(r):
    plt.xlabel('# epochs')
    length = len(r.history[next(iter(r.history))])
    plt.xticks(np.arange(length), np.arange(1, length+1))
    plt.plot(r.history['loss'], label='loss')
    plt.plot(r.history['val_loss'], label='val_loss')
    plt.plot(r.history['mae'], label='mae')
    plt.plot(r.history['val_mae'], 'm', label='val_mae')
    plt.plot(r.history['rmse'], label='rmse')
    plt.plot(r.history['val_rmse'], 'm', label='val_rmse')
    plt.legend()


def create_model():
    D = 100
                    
    # Build model
    in_id = tf.keras.layers.Input(shape=(150,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(150,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(150,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]
    
    bert_output = BertLayer(n_fine_tune_layers=3)(bert_inputs)

    model = Sequential()
    model.add(bert_output)
    model.add(Dense(1))
    model.summary(105)
    return model

m, r = fit_model(create_model(), epochs=5)

RuntimeError: variable_scope bert_layer_2_module/ was unused but the corresponding name_scope was already taken.

# Reference

https://stackabuse.com/text-classification-with-bert-tokenizer-and-tf-2-0-in-python/