In [None]:
import json
import re
import sys
import pandas as pd
import numpy as np

## Preprocessing

In [None]:
import load_data

In [None]:
# `import load_data` binds the *module*, and a module object is not callable, so the
# loader has to be reached as an attribute of that module.
# NOTE(review): assumes the module exposes a function that is also named `load_data` — confirm.
train_df = load_data.load_data(data_file='../data/train.data.jsonl',
                               label_file='../data/train.label.json',
                               perform_stemming=False)
dev_df = load_data.load_data(data_file='../data/dev.data.jsonl',
                             label_file='../data/dev.label.json',
                             perform_stemming=False)
# No label file is passed for the test split.
test_df = load_data.load_data(data_file='../data/test.data.jsonl',
                              label_file=None,
                              perform_stemming=False)

## Text-only BERT with TensorFlow

In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optmizer

import matplotlib.pyplot as plt

# Set TensorFlow's logger to the ERROR level.
tf.get_logger().setLevel('ERROR')

In [None]:
# Shared run settings: mini-batch size, RNG seed, and the two class label strings.
batch_size, seed = 4, 42
class_names = ["non-rumour", "rumour"]

In [None]:
def _as_batches(tensors):
    """Slice `tensors` along the first axis into a tf.data.Dataset and batch it by `batch_size`."""
    return tf.data.Dataset.from_tensor_slices(tensors).batch(batch_size)


# Train and validation pipelines yield (text, label) pairs.
train_ds = _as_batches((train_df.text, train_df.label.values))
val_ds = _as_batches((dev_df.text, dev_df.label.values))

#combined_ds = tf.data.Dataset.from_tensor_slices((combined_df.text, combined_df.label.values))
#combined_ds = combined_ds.batch(batch_size)

# Only the text column is sliced for the test pipeline.
test_ds = _as_batches(test_df.text)

In [None]:
# Encoder to fine-tune; must be a key of both lookup tables below.
bert_model_name = 'talkheads_ggelu_bert_en_base'

# Encoder name -> TF Hub handle of the encoder model.
map_name_to_handle = dict([
    ('bert_en_uncased_L-12_H-768_A-12',
     'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'),
    ('talkheads_ggelu_bert_en_large',
     'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_large/1'),
    ('bert_en_uncased_L-24_H-1024_A-16',
     'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4'),
    ('talkheads_ggelu_bert_en_base',
     'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1'),
])

# Encoder name -> TF Hub handle of its preprocessing model. Every encoder listed
# above is paired with the same preprocessing handle.
map_model_to_preprocess = dict.fromkeys(
    map_name_to_handle,
    'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')

# Resolve the two handles for the chosen encoder.
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

In [None]:
# Wrap the selected TF Hub preprocessing handle as a Keras layer.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

## Experiments

In [None]:
# Push one training text through the preprocessing layer and inspect what comes out.
sample_text = train_df['text'][0]
text_test = [sample_text]
text_preprocessed = bert_preprocess_model(text_test)

word_ids = text_preprocessed["input_word_ids"]
input_mask = text_preprocessed["input_mask"]
type_ids = text_preprocessed["input_type_ids"]

# Only the first 12 positions of the single example are shown.
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {word_ids.shape}')
print(f'Word Ids   : {word_ids[0, :12]}')
print(f'Input Mask : {input_mask[0, :12]}')
print(f'Type Ids   : {type_ids[0, :12]}')

In [None]:
# Wrap the selected TF Hub encoder handle as a Keras layer (used for inspection below).
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
# Run the preprocessed sample through the encoder and look at both output heads.
bert_results = bert_model(text_preprocessed)
pooled_output = bert_results["pooled_output"]
sequence_output = bert_results["sequence_output"]

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{pooled_output.shape}')
print(f'Pooled Outputs Values:{pooled_output[0, :12]}')
print(f'Sequence Outputs Shape:{sequence_output.shape}')
print(f'Sequence Outputs Values:{sequence_output[0, :12]}')

## Training

In [None]:
def build_classifier_model():
    """Build a Keras model that maps raw text strings to one classification logit.

    Pipeline: string input -> tokenize -> bert_pack_inputs (seq_length=512)
    -> trainable BERT encoder -> pooled_output -> Dropout(0.1) -> Dense(1).

    Reads the module-level `tfhub_handle_preprocess` and `tfhub_handle_encoder`.

    Returns:
        tf.keras.Model: takes a batch of strings (input named 'text') and returns
        the un-activated output of the 'classifier' Dense layer, i.e. a logit;
        apply a sigmoid to obtain a probability.
    """
    # Load the preprocessing model so its tokenize / bert_pack_inputs
    # sub-functions can be wrapped as separate layers.
    preprocessor = hub.load(tfhub_handle_preprocess)

    text_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    tokenize = hub.KerasLayer(preprocessor.tokenize)
    tokenized_inputs = tokenize(text_inputs)

    # Pack input sequences for the Transformer encoder.
    seq_length = 512
    bert_pack_inputs = hub.KerasLayer(
        preprocessor.bert_pack_inputs,  # was the undefined name `preprocessing_layer`
        arguments=dict(seq_length=seq_length))

    # bert_pack_inputs takes a *list* of tokenized segments; there is a single
    # text segment here, so it is wrapped in a one-element list.
    encoder_inputs = bert_pack_inputs([tokenized_inputs])

    # trainable=True so the encoder weights are fine-tuned together with the head.
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)

    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    # No activation: the model emits a raw logit.
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    # was the undefined name `text_input`
    return tf.keras.Model(text_inputs, net)

In [None]:
# `seed` is declared in the configuration cell but was never applied anywhere.
# Setting TensorFlow's global seed before the model is built makes the randomly
# initialised classifier head (and dropout during training) repeatable across runs.
tf.random.set_seed(seed)

classifier_model = build_classifier_model()

# Smoke test: sigmoid of the untrained model's output for the single sample text.
bert_raw_result = classifier_model(tf.constant(text_test))
print(tf.sigmoid(bert_raw_result))

In [None]:
# Draw the classifier's layer graph.
tf.keras.utils.plot_model(classifier_model)

In [None]:
# The 'classifier' Dense layer has activation=None, so the model emits raw logits;
# from_logits=True tells the loss to treat its inputs that way.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# BinaryAccuracy applies no sigmoid: it compares predictions directly against
# `threshold`. The default 0.5 would be applied to logits here, so use 0.0
# (logit 0.0 == probability 0.5), matching round(sigmoid(.)) used at inference.
# The metric keeps its default name, so the history keys are unchanged.
metrics = [tf.metrics.BinaryAccuracy(threshold=0.0)]

In [None]:
# Training length: number of passes over train_ds and the batch count per pass.
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
#steps_per_epoch = tf.data.experimental.cardinality(combined_ds).numpy()


# The first 10% of all optimisation steps are warm-up steps.
num_train_steps = epochs * steps_per_epoch
num_warmup_steps = int(num_train_steps * 0.1)

init_lr = 3e-5
optimizer_config = dict(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)
optimizer = optimization.create_optimizer(**optimizer_config)

In [None]:
# Attach the optimizer, loss and metric list defined in the preceding cells.
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [None]:
print(f'Training model with {tfhub_handle_encoder}')

# Fine-tune on train_ds and evaluate on val_ds after each epoch; the returned
# History object is kept for the plots below.
fit_kwargs = dict(x=train_ds, validation_data=val_ds, epochs=epochs)
history = classifier_model.fit(**fit_kwargs)

#history = classifier_model.fit(x=combined_ds,
#                               epochs=epochs)

### Plot accuracy and loss over time

In [None]:
# Plot per-epoch training/validation loss (top panel) and accuracy (bottom panel).
history_dict = history.history
print(history_dict.keys())

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
# Use dedicated names here: rebinding the globals `loss` (the loss object) and
# `epochs` (the int epoch count) would break re-running the compile/fit cells.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

epoch_range = range(1, len(acc) + 1)
fig = plt.figure(figsize=(10, 6))

plt.subplot(2, 1, 1)
# 'r' is a solid red line
plt.plot(epoch_range, train_loss, 'r', label='Training loss')
# 'b' is a solid blue line
plt.plot(epoch_range, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
# plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(2, 1, 2)
plt.plot(epoch_range, acc, 'r', label='Training acc')
plt.plot(epoch_range, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

# tight_layout only has something to arrange once both subplots exist.
fig.tight_layout()

In [None]:
# Save the trained model under ./pure_bert/, without the optimizer state.
classifier_model.save("./pure_bert/pure_bert_v32", include_optimizer=False)

## Inference

In [None]:
# Predict on the test batches; the model outputs logits, so apply a sigmoid.
result = tf.sigmoid(classifier_model.predict(test_ds))

In [None]:
# Round the sigmoid outputs to the nearest integer and cast to int.
result = np.round(result).astype(int)

In [None]:
# Convert each row of `result` with `convert_prediction` and pair it with the test ids.
# NOTE(review): `convert_prediction` is not defined or imported in any cell visible
# here, so this raises NameError on a fresh kernel unless it comes from elsewhere — confirm.
# Each `pred` is presumably a length-1 array, given the model's single-unit output layer.
predicted_labels = [convert_prediction(pred) for pred in result]
output = pd.DataFrame({'id':test_df.id,'target':predicted_labels})
output

In [None]:
# Turn the output frame into an {id: target} mapping and dump it as JSON.
target_by_id = pd.Series(output.target.values, index=output.id)
submission = target_by_id.to_dict()

with open('test-output.json', 'w') as out_file:
    json.dump(submission, out_file)