In [1]:
%run config.ipynb
import os
import random
import math
import tensorflow as tf
import tensorflow_hub as hub

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

# Load the required submodules
from official.nlp.bert import 

from keras_bert.bert import get_model
from keras_bert.loader import load_trained_model_from_checkpoint

from transformers import BertForSequenceClassification
from transformers import BertTokenizer

import tensorflow as tf
print("Num GPUs Available: ", tf.config.experimental.list_physical_devices('GPU'))
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, Dropout, Conv1D, GlobalMaxPool1D, GlobalAvgPool1D
from tensorflow.keras.models import Model, clone_model, Sequential
from tensorflow.keras.metrics import RootMeanSquaredError

from keras import backend as K

Num GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
df = pd.read_csv("data/yelp_academic_dataset_sample005_filter.csv")
df.head(1)

Unnamed: 0,review_id,user_id,business_id,review_stars,review_text,review_text_after_cleaning,user_review_count,user_elite,user_friends,user_fans,user_average_stars,user_total_compliments,business_name,business_categories,business_stars,to_recommend,num_user_friends
0,yNB39szX3M8mTEzTtsgoCw,Y1iCYGvLf4ifPoXlKLGq-w,o2Qh4SiGYJ7BK4hP7dfkrw,5,This is an amazing indian Bistro!!I If I do sa...,amaz bistroi say myself never cuisin glad expe...,1,0,,0,5.0,0,Saffron Indian Bistro,"Restaurants, Indian",4.5,True,0


In [3]:
X = df["review_text_after_cleaning"]
y = df["review_stars"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
bert_folder = "data/uncased_L-2_H-128_A-2"
tf.io.gfile.listdir(bert_folder)

['bert_config.json',
 'bert_model.ckpt.data-00000-of-00001',
 'bert_model.ckpt.index',
 'vocab.txt']

In [6]:
# Set up tokenizer to generate Tensorflow dataset
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=os.path.join(bert_folder, "vocab.txt"),
    do_lower_case=True)
print("Vocab size:", len(tokenizer.vocab))

Vocab size: 30522


In [7]:
config_file = os.path.join(bert_folder, 'bert_config.json')
checkpoint_file = os.path.join(bert_folder, 'bert_model.ckpt')
model = load_trained_model_from_checkpoint(config_file, checkpoint_file, training=True, seq_len=150)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 150)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 150)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 150, 128), ( 3906816     Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 150, 128)     256         Input-Segment[0][0]              
_______________________________________________________________________________________

In [8]:
def tokenize_review(text):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
tokenized_reviews = X_train.apply(tokenize_review)

In [9]:
reviews_with_len = [[review, y[i], len(review)] for i, review in enumerate(tokenized_reviews)]
reviews_with_len[0]

[[6187,
  2271,
  2868,
  9530,
  15900,
  2072,
  7661,
  3805,
  25540,
  8490,
  9530,
  14028,
  2296,
  2239,
  4297,
  7630,
  2094,
  2033,
  2187,
  2868],
 5,
 20]

In [10]:
random.shuffle(reviews_with_len)
reviews_with_len.sort(key=lambda x: x[2])

In [11]:
sorted_reviews_labels = [(review_lab[0], review_lab[1]) for review_lab in reviews_with_len]
sorted_reviews_labels

[([2182], 1),
 ([2293], 3),
 ([4497], 5),
 ([2204], 5),
 ([2307], 4),
 ([2485], 5),
 ([2237], 4),
 ([24970], 5),
 ([2067], 1),
 ([24970], 2),
 ([1058], 5),
 ([2732], 4),
 ([3435], 5),
 ([2485], 5),
 ([4900], 4),
 ([3435], 2),
 ([3095], 5),
 ([2205], 1),
 ([2514], 5),
 ([7929], 3),
 ([2168], 5),
 ([14123], 2),
 ([2769], 4),
 ([3199], 2),
 ([8288], 5),
 ([5404], 5),
 ([1047], 5),
 ([3819], 5),
 ([8257], 3),
 ([24970], 5),
 ([2560], 5),
 ([6187], 2),
 ([2173], 1),
 ([14262, 7903], 5),
 ([7563, 17728], 4),
 ([5932, 10497], 1),
 ([2502, 4664], 5),
 ([7610, 2330], 2),
 ([15536, 8873], 4),
 ([2191, 11132], 1),
 ([2106, 10036], 2),
 ([17935, 3775], 2),
 ([2485, 2204], 4),
 ([13173, 6559], 3),
 ([22448, 15916], 5),
 ([2739, 4149], 3),
 ([4190, 4183], 4),
 ([3976, 2092], 2),
 ([2564, 4060], 5),
 ([2522, 9236], 4),
 ([15180, 25426], 5),
 ([17268, 4664], 5),
 ([4550, 4355], 4),
 ([4550, 4067], 4),
 ([1037, 4239], 5),
 ([9805, 2213], 4),
 ([3976, 2833], 2),
 ([2175, 2237], 5),
 ([15180, 25426], 5),

In [39]:
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_reviews_labels, output_types=(tf.int32, tf.float32))
BATCH_SIZE = 256
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
next(iter(batched_dataset))

(<tf.Tensor: shape=(256, 4), dtype=int32, numpy=
 array([[ 2182,     0,     0,     0],
        [ 2293,     0,     0,     0],
        [ 4497,     0,     0,     0],
        ...,
        [ 2293,  4748,  2953,  2173],
        [ 3435,  2767,  3669,  2709],
        [ 2247,  2395, 14980,  7140]])>,
 <tf.Tensor: shape=(256,), dtype=float32, numpy=
 array([1., 3., 5., 5., 4., 5., 4., 5., 1., 2., 5., 4., 5., 5., 4., 2., 5.,
        1., 5., 3., 5., 2., 4., 2., 5., 5., 5., 5., 3., 5., 5., 2., 1., 5.,
        4., 1., 5., 2., 4., 1., 2., 2., 4., 3., 5., 3., 4., 2., 5., 4., 5.,
        5., 4., 4., 5., 4., 2., 5., 5., 5., 3., 5., 4., 4., 5., 5., 4., 4.,
        3., 4., 1., 4., 5., 3., 5., 3., 5., 4., 2., 4., 5., 4., 5., 3., 5.,
        5., 3., 4., 5., 4., 4., 4., 1., 2., 5., 1., 5., 4., 1., 2., 4., 5.,
        4., 5., 5., 5., 5., 2., 5., 5., 2., 4., 4., 5., 1., 5., 1., 4., 3.,
        5., 4., 4., 4., 4., 2., 5., 4., 4., 5., 3., 5., 5., 4., 5., 4., 5.,
        2., 5., 4., 3., 5., 3., 5., 5., 5., 3., 5.

In [40]:
TOTAL_BATCHES = math.ceil(len(sorted_reviews_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 16
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

In [49]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self, vocabulary_size, embedding_dimensions=150, dnn_units=128,
                 dropout_rate=0.2, training=False,  name="bert_model"):
        
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = Embedding(vocabulary_size, embedding_dimensions)
        self.rnn_layer1 = Bidirectional(LSTM(50, dropout=dropout_rate, return_sequences=True))
        self.pooling = GlobalAveragePooling1D()
        self.last_dense = Dense(1)
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x = self.rnn_layer1(x)
        x = self.pooling(x)
        x = self.last_dense(x)
        
        return x

In [50]:
bert_model = TEXT_MODEL(vocabulary_size=len(tokenizer.vocab))
bert_model.compile(loss="mse", optimizer="adam", metrics=['mae', RootMeanSquaredError('rmse')])
bert_model.fit(train_data, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x244f15c8c40>

In [29]:
bert_model.evaluate(test_data)



[2.9042723178863525, 1.5361770391464233, 1.6715201139450073]

In [None]:
# Set up tokenizer to generate Tensorflow dataset
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=os.path.join(bert_folder, "vocab.txt"),
    do_lower_case=True)
print("Vocab size:", len(tokenizer.vocab))

In [None]:
model = load_trained_model_from_checkpoint(config_file, checkpoint_file, training=True, seq_len=150)
model.summary()

In [None]:
def create_model():
    D = 100
    num_words = len(tokenizer.vocab) + 1
    model = Sequential()
    model.add(Input(shape=(T,)))
    model.add(Embedding(num_words, D))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(5, activation='relu'))
    model.summary(105)
    return model

m, r = fit_model(create_model())

In [None]:
model.compile(loss="mse", optimizer="adam", metrics=['mae', RootMeanSquaredError('rmse')])
model.fit(tokenized_reviews, y_train)

# Reference

https://stackabuse.com/text-classification-with-bert-tokenizer-and-tf-2-0-in-python/