In [None]:
# Check if there is GPU or not
!nvidia-smi
# Install tensorflow 2.3.0
!pip install -q tensorflow==2.3.0
# Clone the TensorFlow models Repo
!git clone --depth 1 -b v2.3.0 https://github.com/tensorflow/models.git
!pip install -Uqr models/official/requirements.txt
# Imports
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split

# The models repo is cloned into ./models; adding it to sys.path makes the
# `official` package importable.
sys.path.append('models')
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization

# keras imports
# `tf` is only a local alias, not an importable module name, so the import
# statements must spell out `tensorflow`.
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
# Load the Quora Insincere Questions dataset from a zipped CSV.
DATA_URL = 'https://archive.org/download/fine-tune-bert-tensorflow-train.csv/train.csv.zip'
df = pd.read_csv(DATA_URL, compression='zip')
df.head()
# Histogram of the `target` column: sincere (0) vs insincere (1) questions.
df['target'].plot.hist(title='Sincere (0) vs Insincere (1) distribution')


In [None]:
# Split into small train and validation subsets.
# A fixed seed makes both splits select the same rows on every run.
SEED = 42

# Training set: a stratified 1% sample of the full DataFrame.
train_df, remaining = train_test_split(df, train_size=0.01,
                                       stratify=df.target.values,
                                       random_state=SEED)
# Validation set: a stratified 0.1% sample of the rows left over.
valid_df, _ = train_test_split(remaining, train_size=0.001,
                               stratify=remaining.target.values,
                               random_state=SEED)
# A bare expression in the middle of a cell is not displayed, so print it.
print(train_df.shape, valid_df.shape)

# Shorthand names for the tf.data helpers used in the rest of the notebook.
# `tf` is only a local alias and `Dataset` is a class, so neither can be used
# as a module path in a `from ... import ...` statement; bind the attributes
# directly instead.
from_tensor_slices = tf.data.Dataset.from_tensor_slices
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Convert the DataFrames into datasets of (question_text, target) pairs,
# built under the CPU device scope.
with tf.device('/cpu:0'):
    train_data = from_tensor_slices((train_df.question_text.values,
                                     train_df.target.values))
    valid_data = from_tensor_slices((valid_df.question_text.values,
                                     valid_df.target.values))

# Peek at the first two raw (text, label) examples.
for text, label in train_data.take(2):
    print(text)
    print(label)
	
label_list = [0, 1]     # label categories
max_seq_length = 128    # maximum length of input sequences
train_batch_size = 32

# Load the BERT layer and build the tokenizer that goes with it.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
    trainable=True,
)
# The vocabulary path and the lower-casing flag are read from the loaded
# hub object and passed to the tokenizer.
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

# Example: word-piece tokenize a sentence, then convert the tokens to ids.
example_tokens = tokenizer.wordpiece_tokenizer.tokenize('how are you?')
tokenizer.convert_tokens_to_ids(example_tokens)

# BERT needs every row as input features (token ids, input mask,
# input type ids) together with its label; this helper does the conversion.

def convert_to_bert_feature(text, label, label_list=label_list,
                            max_seq_length=max_seq_length, tokenizer=tokenizer):
    """Encode a single (text, label) pair as BERT classifier features.

    Args:
        text: Object exposing `.numpy()`, holding the question text.
        label: Object exposing `.numpy()`, holding the example's label.
        label_list: Label categories forwarded to `convert_single_example`.
        max_seq_length: Sequence length forwarded to `convert_single_example`.
        tokenizer: Tokenizer forwarded to `convert_single_example`.

    Returns:
        Tuple `(input_ids, input_mask, segment_ids, label_id)` read from the
        feature object returned by `classifier_data_lib.convert_single_example`.
    """
    single_example = classifier_data_lib.InputExample(
        guid=None,
        text_a=text.numpy(),
        text_b=None,
        label=label.numpy())
    encoded = classifier_data_lib.convert_single_example(
        0, single_example, label_list, max_seq_length, tokenizer)

    return (encoded.input_ids, encoded.input_mask, encoded.segment_ids,
            encoded.label_id)

# Wrap the Python conversion function so that it can be used with the
# tf.data `map` method.
def to_bert_feature_map(text, label):
    """Map one (text, label) element to `(features, label_id)`.

    Runs `convert_to_bert_feature` through `tf.py_function` and packs the
    three sequence outputs into a dict keyed by 'input_word_ids',
    'input_mask' and 'input_type_ids'.

    Returns:
        Tuple `(features, label_id)`: `features` is the dict described above
        and `label_id` is the scalar int32 label tensor.
    """
    input_ids, input_mask, segment_ids, label_id = tf.py_function(
        convert_to_bert_feature,
        inp=[text, label],
        Tout=[tf.int32] * 4)

    # py_function does not set the shape of the tensors it returns,
    # so the static shapes are declared here.
    for sequence_tensor in (input_ids, input_mask, segment_ids):
        sequence_tensor.set_shape([max_seq_length])
    label_id.set_shape([])

    features = dict(input_word_ids=input_ids,
                    input_mask=input_mask,
                    input_type_ids=segment_ids)
    return features, label_id
# Build the batched input pipelines under the CPU device scope.
with tf.device('/cpu:0'):
    # train: encode each example, shuffle, batch and prefetch.
    # NOTE(review): a `.cache()` step after map() was commented out in the
    # original; confirm whether it should stay disabled.
    train_data = (train_data.map(to_bert_feature_map,
                                 num_parallel_calls=AUTOTUNE)
                  .shuffle(1000)
                  # use the notebook's batch-size constant instead of a literal 32
                  .batch(train_batch_size, drop_remainder=True)
                  .prefetch(AUTOTUNE))

    # valid: same encoding and batch size, but no shuffling.
    valid_data = (valid_data.map(to_bert_feature_map,
                                 num_parallel_calls=AUTOTUNE)
                  .batch(train_batch_size, drop_remainder=True)
                  .prefetch(AUTOTUNE))

# Show the element structure of the train and validation datasets.
print("train data format", train_data.element_spec)
print("validation data format", valid_data.element_spec)


In [None]:
# Keras model definition
def fine_tuned_model():
    """Build the classifier: BERT layer -> Dropout(0.4) -> Dense(1, sigmoid).

    The model takes a dict of three int32 inputs of length `max_seq_length`
    ('input_word_ids', 'input_mask', 'input_type_ids') and outputs the value
    of a single sigmoid unit.

    Returns:
        The (uncompiled) Keras `Model`.
    """
    feature_names = ("input_word_ids", "input_mask", "input_type_ids")
    model_inputs = {
        name: Input(shape=(max_seq_length,), dtype=tf.int32, name=name)
        for name in feature_names
    }

    # bert_layer is called with the three inputs as a list and returns two
    # outputs; only the pooled one feeds the classification head.
    pooled_output, _ = bert_layer([model_inputs[name] for name in feature_names])

    dropped = Dropout(0.4)(pooled_output)
    probability = Dense(1, activation="sigmoid", name="output")(dropped)

    return Model(inputs=model_inputs, outputs=probability)

# Build and compile the model
model = fine_tuned_model()
model.compile(
    optimizer=Adam(learning_rate=2e-5),
    loss=BinaryCrossentropy(),
    metrics=[BinaryAccuracy()],
)
model.summary()

# Plot the model architecture
plot_model(model=model, show_shapes=True)

# Train the model on the train pipeline, validating on the validation pipeline
epochs = 4
fit_options = dict(validation_data=valid_data, epochs=epochs, verbose=1)
history = model.fit(train_data, **fit_options)
# Plot a training metric next to its validation counterpart
def plot_graphs(history, metric):
    """Plot `metric` and 'val_' + `metric` from `history.history` per epoch.

    Args:
        history: Object whose `.history` maps metric names to sequences of
            values (here, the return value of `model.fit`).
        metric: Name of the training metric; its validation series is looked
            up under 'val_' + metric.
    """
    val_metric = 'val_' + metric
    plt.plot(history.history[metric])
    plt.plot(history.history[val_metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
    plt.show()


plot_graphs(history, 'binary_accuracy')


In [None]:
# Sanity-check the fine-tuned model on a few hand-written questions
test_eg = ['what is the current marketprice of petroleum?',
           'who is Oswald?', 'why are you here idiot ?']
# The labels are placeholders (all 0) so the examples fit the (text, label)
# structure expected by to_bert_feature_map.
test_data = from_tensor_slices((test_eg, [0] * len(test_eg)))
# Wrap the test data into BERT format. The mapping function defined in this
# notebook is `to_bert_feature_map`; `to_feature_map_bert` does not exist.
test_data = test_data.map(to_bert_feature_map).batch(1)
preds = model.predict(test_data)
print(preds)
# The model ends in a single sigmoid unit, so flatten the predictions to one
# score per example and threshold at 0.5.
['Insincere' if pred >= 0.5 else 'Sincere' for pred in preds.ravel()]
