In [3]:
import re
import string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import STOPWORDS

import tensorflow as tf
from keras.optimizer import adam
from keras.losses import CategoricalCrossentropy
from keras.metrics import BinaryAccuracy
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import transformers
from transformers import BertTokenizer, TFBertModel

  _dtype_to_storage = {data_type(0).dtype: data_type for data_type in _storages}


In [4]:
rest_train = pd.read_csv('./Datasets/yelp_review_polarity_csv/fixed_train.csv')
rest_test = pd.read_csv('./Datasets/yelp_review_polarity_csv/fixed_test.csv')

In [None]:
def average_word_length(x):
    x = x.split()
    return np.mean([len(i) for i in x])
for df in [rest_train, rest_test]:
    df['word count'] = df['review'].apply(lambda x: len(x.split()))
    df['character count'] = df['review'].apply(lambda x: len(x))
    df['average word length'] = df['review'].apply(average_word_length)
    df['unique word count'] = df['review'].apply(lambda x: len(set(x.split())))
    df['stopword count'] = df['review'].apply(lambda x: len([i for i in x.lower().split() if i in STOPWORDS]))
    df['stopword ratio'] = df['stopword count'] / df['word count']
    df['url count'] = df['review'].apply(lambda x: len([i for i in x.lower().split() if 'http' in i or 'https' in i]))

meta_train = StandardScaler().fit_transform(rest_train.iloc[:, 2:])
meta_test = StandardScaler().fit_transform(rest_test.iloc[:, 1:])

In [None]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

In [6]:
rest_train['review']=rest_train['review'].apply(lambda x : remove_URL(x))
rest_train['review']=rest_train['review'].apply(lambda x : remove_html(x))
rest_train['review']=rest_train['review'].apply(lambda x : remove_emoji(x))
rest_train['review']=rest_train['review'].apply(lambda x : remove_punct(x))

In [7]:
model_name = "bert-base-uncased"
BATCH_SIZE = 16
EPOCHS = 10
USE_META = True
ADD_DENSE = False
DENSE_DIM = 64
ADD_DROPOUT = True
DROPOUT = .2
TRAIN_BASE = True

In [9]:
bert_base = TFBertModel.from_pretrained(model_name)
TOKENIZER = BertTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


NameError: name 'BertTokenizer' is not defined

In [None]:
def bert_encode(data,maximum_len) :
    input_ids = []
    attention_masks = []
    for i in range(len(data.review)):
        encoded = TOKENIZER.encode_plus(data.review[i],
                                        add_special_tokens=True,
                                        max_length=maximum_len,
                                        pad_to_max_length=True,
                                        return_attention_mask=True)
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)
def build_model(model_layer, learning_rate, use_meta = USE_META, add_dense = ADD_DENSE,
               dense_dim = DENSE_DIM, add_dropout = ADD_DROPOUT, dropout = DROPOUT):  
    # Inputs
    input_ids = tf.keras.Input(shape=(60,),dtype='int32')
    attention_masks = tf.keras.Input(shape=(60,),dtype='int32')
    meta_input = tf.keras.Input(shape = (meta_train.shape[1], ))
    
    # BERT layer
    transformer_layer = model_layer([input_ids,attention_masks])
    
    #choose only last hidden-state
    output = transformer_layer[1]
    
    # Meta data
    if use_meta:
        output = tf.keras.layers.Concatenate()([output, meta_input])
    
    # Dense relu layer
    if add_dense:
        print("Training with additional dense layer...")
        output = tf.keras.layers.Dense(dense_dim,activation='relu')(output)
    
    # Dropout
    if add_dropout:
        print("Training with dropout...")
        output = tf.keras.layers.Dropout(dropout)(output)
    
    # Final node for binary classification
    output = tf.keras.layers.Dense(1,activation='sigmoid')(output)
    
    # Assemble and compile
    if use_meta:
        print("Training with meta-data...")
        model = tf.keras.models.Model(inputs = [input_ids,attention_masks, meta_input],outputs = output)
    else:
        print("Training without meta-data...")
        model = tf.keras.models.Model(inputs = [input_ids,attention_masks],outputs = output)
    model.compile(tf.keras.optimizers.Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
def plot_learning_curves(history): 
    fig, ax = plt.subplots(1, 2, figsize = (20, 10))

    ax[0].plot(history.history['accuracy'], color = '#171820')
    ax[0].plot(history.history['val_accuracy'], color = '#fdc029')

    ax[1].plot(history.history['loss'], color='#171820')
    ax[1].plot(history.history['val_loss'], color = '#fdc029')

    ax[0].legend(['train', 'validation'], loc = 'upper left')
    ax[1].legend(['train', 'validation'], loc = 'upper left')

    fig.suptitle("Model Learning Curves", fontsize=14)

    ax[0].set_ylabel('Accuracy')
    ax[0].set_xlabel('Epoch')
    ax[1].set_ylabel('Loss')
    ax[1].set_xlabel('Epoch')

    return plt.show()

In [None]:
if TRAIN_BASE:
    #get our inputs
    print('Encoding Tweets...')
    train_input_ids,train_attention_masks = bert_encode(rest_train,60)
    test_input_ids,test_attention_masks = bert_encode(rest_test,60)
    print('Tweets encoded')
    print('')

    #debugging step
    print('Train length:', len(train_input_ids))
    print('Test length:', len(test_input_ids))

In [None]:
BERT_base = build_model(bert_base, learning_rate = 1e-5)
checkpoint = tf.keras.callbacks.ModelCheckpoint('base_model.h5',
                                                monitor='val_loss', 
                                                save_best_only = True,
                                                save_weights_only = True)

In [None]:
if TRAIN_BASE:
    if USE_META:
        history = BERT_base.fit([train_input_ids,train_attention_masks, meta_train], 
                                train.target, 
                                validation_split = .2, 
                                epochs = EPOCHS, 
                                callbacks = [checkpoint], 
                                batch_size = BATCH_SIZE)
    
    else:
        history = BERT_base.fit([train_input_ids,train_attention_masks], 
                                train.target, 
                                validation_split = .2, 
                                epochs = EPOCHS, 
                                callbacks = [checkpoint], 
                                batch_size = BATCH_SIZE)

In [None]:
plot_learning_curves(history)