In [16]:
import json
import re
import sys
import pandas as pd
import numpy as np

## Preprocessing

In [17]:
import dataloader

In [18]:
# Load the three splits through the project's dataloader, with stemming switched off.
# The test split is loaded with label_file=None.
train_df = dataloader.load_data(
    data_file='../data/train.data.jsonl',
    label_file='../data/train.label.json',
    perform_stemming=False,
)
dev_df = dataloader.load_data(
    data_file='../data/dev.data.jsonl',
    label_file='../data/dev.label.json',
    perform_stemming=False,
)
test_df = dataloader.load_data(
    data_file='../data/test.data.jsonl',
    label_file=None,
    perform_stemming=False,
)

In [19]:
# Stack the train and dev splits into a single frame with a fresh 0..n-1 index.
# pd.concat is used instead of DataFrame.append, which is deprecated and was
# removed in pandas 2.0; the resulting frame is the same.
combined_df = pd.concat([train_df, dev_df], ignore_index=True)
combined_df

Unnamed: 0,id,text,retweet_count,favorite_count,question_mark,contains_url,number_urls,contains_media,statuses_count,listed_count,...,friends_count,contains_profile_background_image,reputation_score_1,reputation_score_2,favourites_count,verified,geo_enabled,has_description,length_description,label
0,552800070199148544,How to respond to the murderous attack on Char...,228,77,True,False,0,True,27923,185,...,414,True,9.985542,0.908971,500,False,False,True,46,0
1,544388259359387648,"You can not condemn an entire race, nation or ...",352,252,False,False,0,False,745,2,...,222,True,0.704036,0.413158,428,False,False,True,115,0
2,552805970536333314,Attempts to extend blame for this to all Musli...,876,400,False,False,0,False,74137,1431,...,1658,True,21.040989,0.954630,6423,False,True,True,149,0
3,525071376084791297,"Rest in Peace, Cpl. Nathan Cirillo. Killed tod...",112,96,False,True,1,True,28103,418,...,1052,True,14.048433,0.933548,2140,True,True,True,157,1
4,498355319979143168,People DEBATING whether MikeBrown shoplifted o...,802,298,False,False,0,False,55920,65,...,914,True,2.114754,0.678947,25389,False,False,True,156,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5216,525025279803424768,The soldier shot dead in Wednesday is Ottawa a...,119,36,False,False,0,True,88483,306,...,2454,True,2.663951,0.727071,1903,True,True,True,101,1
5217,552784600502915072,Charlie Hebdo became well known for publishing...,202,41,False,False,0,False,15128,1657,...,2268,True,18.330101,0.948267,0,True,True,True,158,0
5218,499696525808001024,We got through. That is a sniper on top of a t...,432,55,False,True,1,False,61902,1627,...,521,True,54.639847,0.982027,1163,False,True,True,159,0
5219,580320612155060224,Last position of Germanwings flight 4U9525 at ...,3092,480,False,True,1,True,7991,2384,...,369,True,578.891892,0.998276,1131,True,True,True,146,1


## Text-only BERT with TensorFlow

In [20]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
# NOTE(review): `text` is never referenced by name in the visible cells; presumably the
# import is kept for its side effect of registering ops used by the preprocessing model — confirm.
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

# Only emit TensorFlow log records at ERROR level or above.
tf.get_logger().setLevel('ERROR')

In [39]:
# Shared settings for the cells below.
seed = 42                                # random seed
batch_size = 2                           # examples per batch for every tf.data pipeline
class_names = ["non-rumour", "rumour"]   # presumably indexed by label id (0, 1) — confirm

In [40]:
def _batched_dataset(texts, labels=None, shuffle=False):
    """Wrap texts (and optional labels) in a batched ``tf.data.Dataset``.

    Args:
        texts: Sequence of raw strings (here a DataFrame ``text`` column).
        labels: Optional sequence of labels aligned with ``texts``. When given,
            the dataset yields ``(text, label)`` pairs; otherwise only texts.
        shuffle: Shuffle the examples (seeded with the notebook-level ``seed``)
            before batching. Use this only for data that is trained on.

    Returns:
        A dataset batched with the notebook-level ``batch_size``.
    """
    tensors = texts if labels is None else (texts, labels)
    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    if shuffle:
        # Keras ignores fit(shuffle=...) for Dataset inputs, so shuffling must
        # happen here. The buffer spans the whole split for a uniform shuffle.
        dataset = dataset.shuffle(buffer_size=max(len(texts), 1), seed=seed)
    return dataset.batch(batch_size)


# Training pipelines are shuffled; without this, combined_ds would present all
# train rows followed by all dev rows in file order on every epoch.
train_ds = _batched_dataset(train_df.text, train_df.label.values, shuffle=True)
combined_ds = _batched_dataset(combined_df.text, combined_df.label.values, shuffle=True)

# Evaluation / inference pipelines keep their original order so that predictions
# stay aligned with the rows (and ids) of the source frames.
val_ds = _batched_dataset(dev_df.text, dev_df.label.values)
test_ds = _batched_dataset(test_df.text)

In [41]:
# Encoder to fine-tune; must be a key of the lookup tables below.
# NOTE: the outputs recorded in this notebook mention 'talkheads_ggelu_bert_en_large',
# so they were produced before this value was set to the base model — re-run to refresh.
bert_model_name = 'talkheads_ggelu_bert_en_base'

# Encoder name -> TF Hub handle of the encoder.
map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_uncased_L-24_H-1024_A-16':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/4',
    'talkheads_ggelu_bert_en_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
    'talkheads_ggelu_bert_en_large':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_large/1',
}

# Encoder name -> TF Hub handle of its preprocessing model.
# Every encoder listed above is paired with the same preprocessing handle.
map_model_to_preprocess = dict.fromkeys(
    map_name_to_handle,
    'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
)

tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]
tfhub_handle_encoder = map_name_to_handle[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_large/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [42]:
# Stand-alone preprocessing layer, used only by the exploratory cells below.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

## Experiments

In [43]:
# Run the preprocessing model on the first training tweet and inspect what it produces.
text_test = [train_df['text'][0]]
text_preprocessed = bert_preprocess_model(text_test)

# The result is a dict of tensors; print its keys and each of the three entries.
print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"]}')
print(f'Input Mask : {text_preprocessed["input_mask"]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"]}')

Keys       : ['input_type_ids', 'input_word_ids', 'input_mask']
Shape      : (1, 128)
Word Ids   : [[  101  2129  2000  6869  2000  1996 25303  2886  2006  4918  2002  2497
   3527  1029  2296  3780  1999  1996  2489  2088  2323  6140  2023  1012
   5181  3830  3087  2027  2079  2025  2066  2004  3424  1011  4100  2618
   1998  3049  2127  2008  2711  1013  2194  2003  2736  1012  2053  2028
   2515  1012 10047  7507 12190  2666  5369  2497  3527  4487  9284  2054
   7036  7486 11276  2000  2424 23979  2003  2019  2012 21735  5462  1999
   2037  2171  1010  2025  1037  2061 27027  9476  1012  2748  1010  2127
   2009  4150  6737  1012  2339 15301  2111  2040  2031  2498  2000  2079
   2007  2023  1029  2111  2024 15958 15807  2011  2107  9254  1012  1998
   4445  2572  1045   999  1045  2228  2023  2038  2210  2000  2079  2007
   5025  7486  1012  6289  1010  2017  2079   102]]
Input Mask : [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 

In [44]:
# Stand-alone encoder layer for the exploratory cell below.
# build_classifier_model creates its own encoder layer and does not use this one.
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [45]:
# Feed the preprocessed sample through the encoder and inspect its outputs.
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
# "pooled_output" is the entry the classifier head is built on; show its first 12 values.
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
# "sequence_output" is indexed [0, :12] here, i.e. the first 12 rows of the first example.
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_large/1
Pooled Outputs Shape:(1, 1024)
Pooled Outputs Values:[ 0.2358474   0.27138835  0.03635428 -0.35788828 -0.85647786 -0.16146618
 -0.11206891 -0.28466317  0.20091133  0.17825691  0.4247917   0.13633071]
Sequence Outputs Shape:(1, 128, 1024)
Sequence Outputs Values:[[-0.06378609  0.30163857  0.6241442  ...  0.1136073  -0.09491361
  -0.16502456]
 [-0.7809181  -0.5569376  -0.28361088 ... -0.8637838  -0.0697544
  -1.1009353 ]
 [-1.3398683   0.1587333   0.4289328  ... -0.7670979   0.4413195
  -0.46758702]
 ...
 [ 0.00456536 -0.41435242  0.37159753 ...  0.03094525 -0.39847714
  -0.0726072 ]
 [ 0.95155555 -0.27636686  0.99211895 ... -0.4553804   0.23888047
  -1.8958907 ]
 [ 0.9381537   0.5291955   0.76148033 ...  0.63327944 -0.35621405
   0.07271953]]


## Training

In [46]:
def build_classifier_model(seq_length=512, dropout_rate=0.1):
    """Build the text-only BERT classifier with a single-logit head.

    The model takes a batch of raw strings, tokenizes and packs them with the
    TF Hub preprocessing model selected by ``tfhub_handle_preprocess``, runs the
    trainable encoder selected by ``tfhub_handle_encoder``, and maps the
    encoder's ``pooled_output`` through dropout to one un-activated unit.

    Args:
        seq_length: Length the packed encoder inputs are padded/truncated to.
            Defaults to 512, the value previously hard-coded here. Smaller
            values reduce the size of the encoder inputs.
        dropout_rate: Dropout rate applied to the pooled output before the
            classifier layer. Defaults to 0.1, the previously hard-coded value.

    Returns:
        A ``tf.keras.Model`` mapping a string tensor named ``'text'`` to one
        raw logit per example (no sigmoid is applied).

    Raises:
        ValueError: If ``seq_length`` is not a positive integer.
    """
    if not isinstance(seq_length, int) or isinstance(seq_length, bool) or seq_length <= 0:
        raise ValueError(f'seq_length must be a positive integer, got {seq_length!r}')

    # Load the preprocessing model as an object so that its tokenize and
    # bert_pack_inputs pieces can be wired up separately.
    preprocessor = hub.load(tfhub_handle_preprocess)

    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    tokenize = hub.KerasLayer(preprocessor.tokenize, name='tokenizer')

    # bert_pack_inputs is given a list of tokenized segments; there is one segment here.
    tokenized_input = [tokenize(text_input)]

    # Pack the tokenized segment into fixed-length inputs for the Transformer encoder.
    bert_pack_inputs = hub.KerasLayer(
        preprocessor.bert_pack_inputs,
        arguments=dict(seq_length=seq_length), name='bert_pack_inputs')
    encoder_inputs = bert_pack_inputs(tokenized_input)

    # trainable=True so the encoder weights are fine-tuned together with the head.
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)

    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(dropout_rate)(net)
    # activation=None: the head emits logits, matching a from_logits=True loss.
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [47]:
# Disabled alternative to build_classifier_model. The whole definition is wrapped in a
# string literal, so nothing is defined here and the cell only echoes the string.
# This variant applies the preprocessing model as one KerasLayer instead of wiring up
# its tokenize / bert_pack_inputs pieces separately.
"""
def build_classifier_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)
"""

"\ndef build_classifier_model():\n    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')\n    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')\n    encoder_inputs = preprocessing_layer(text_input)\n    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')\n    outputs = encoder(encoder_inputs)\n    net = outputs['pooled_output']\n    net = tf.keras.layers.Dropout(0.1)(net)\n    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)\n    return tf.keras.Model(text_input, net)\n"

In [48]:
# Build the model and run one forward pass on the sample tweet before any fine-tuning.
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(text_test))
# The classifier layer has no activation, so apply a sigmoid to view the logit as a score in (0, 1).
print(tf.sigmoid(bert_raw_result))

tf.Tensor([[0.3457717]], shape=(1, 1), dtype=float32)


In [49]:
# Draw the layer graph. The recorded output shows this needs `pydot` and graphviz installed.
tf.keras.utils.plot_model(classifier_model)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [50]:
# from_logits=True because the model's final Dense layer has activation=None.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# Passed to compile() as the metrics list.
metrics = [tf.metrics.BinaryAccuracy()]

In [51]:
epochs = 5
init_lr = 3e-5

# The step count is taken from combined_ds (train + dev), the dataset the model is fitted on.
# Switch to train_ds here if fitting on the training split only.
steps_per_epoch = tf.data.experimental.cardinality(combined_ds).numpy()
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of all training steps.
num_warmup_steps = int(0.1 * num_train_steps)

optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

In [52]:
# Attach the loss, the metrics list and the optimizer defined in the cells above.
classifier_model.compile(
    loss=loss,
    metrics=metrics,
    optimizer=optimizer,
)

In [53]:
print(f'Training model with {tfhub_handle_encoder}')
# The string literal below is a disabled alternative (fit on train_ds with val_ds as
# validation data); as a bare string expression it has no effect.
"""
history = classifier_model.fit(x=train_ds,
                               validation_data=val_ds,
                               epochs=epochs)
"""
# Active run: fit on train + dev combined. No validation_data is passed here.
history = classifier_model.fit(x=combined_ds,
                               epochs=epochs)

Training model with https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_large/1
Epoch 1/5


ResourceExhaustedError: in user code:

    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:757 train_step
        self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:498 minimize
        return self.apply_gradients(grads_and_vars, name=name)
    /opt/conda/lib/python3.7/site-packages/official/nlp/optimization.py:181 apply_gradients
        experimental_aggregate_gradients=experimental_aggregate_gradients)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:604 apply_gradients
        self._create_all_weights(var_list)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:783 _create_all_weights
        self._create_slots(var_list)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/optimizer_v2/adam.py:129 _create_slots
        self.add_slot(var, 'v')
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:851 add_slot
        initial_value=initial_value)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/variables.py:262 __call__
        return cls._variable_v2_call(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/variables.py:256 _variable_v2_call
        shape=shape)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/variables.py:67 getter
        return captured_getter(captured_previous, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:3332 creator
        return next_creator(**kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/variables.py:67 getter
        return captured_getter(captured_previous, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:3332 creator
        return next_creator(**kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/variables.py:67 getter
        return captured_getter(captured_previous, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:3332 creator
        return next_creator(**kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/variables.py:67 getter
        return captured_getter(captured_previous, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py:714 variable_capturing_scope
        lifted_initializer_graph=lifted_initializer_graph, **kwds)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/variables.py:264 __call__
        return super(VariableMetaclass, cls).__call__(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/eager/def_function.py:227 __init__
        initial_value = initial_value()
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/initializers/initializers_v2.py:139 __call__
        return super(Zeros, self).__call__(shape, dtype=_get_dtype(dtype), **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/init_ops_v2.py:154 __call__
        return array_ops.zeros(shape, dtype)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/array_ops.py:2819 wrapped
        tensor = fun(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/array_ops.py:2880 zeros
        output = fill(shape, constant(zero, dtype=dtype), name=name)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/array_ops.py:239 fill
        result = gen_array_ops.fill(dims, value, name=name)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/gen_array_ops.py:3348 fill
        _ops.raise_from_not_ok_status(e, name)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:6862 raise_from_not_ok_status
        six.raise_from(core._status_to_exception(e.code, message), None)
    <string>:3 raise_from
        

    ResourceExhaustedError: OOM when allocating tensor with shape[30522,1024] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Fill]


### Plot accuracy and loss over time

In [None]:
history_dict = history.history
print(history_dict.keys())

# Dedicated names are used for the curves so that the `loss` object and the
# `epochs` count defined in earlier cells are not overwritten by this cell.
train_acc = history_dict['binary_accuracy']
train_loss = history_dict['loss']
# Validation entries are looked up with .get() because the active fit() call
# passes no validation_data, in which case these keys are absent.
val_acc = history_dict.get('val_binary_accuracy')
val_loss = history_dict.get('val_loss')

epoch_range = range(1, len(train_acc) + 1)
fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 6))

# 'r' draws a solid red line, 'b' a solid blue line.
loss_ax.plot(epoch_range, train_loss, 'r', label='Training loss')
if val_loss is not None:
    loss_ax.plot(epoch_range, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(epoch_range, train_acc, 'r', label='Training acc')
if val_acc is not None:
    acc_ax.plot(epoch_range, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

# Called after both axes are populated so the layout accounts for titles and labels.
fig.tight_layout()

In [None]:
# Save the fitted model without its optimizer state (include_optimizer=False).
classifier_model.save("./pure_bert/pure_bert_v34", include_optimizer=False)

## Inference

In [None]:
# The model outputs raw logits; squash them through a sigmoid to get scores in (0, 1).
result = tf.sigmoid(classifier_model.predict(test_ds))

In [None]:
# Round the sigmoid scores to the nearest integer and cast to int (0 or 1).
# NOTE: this rebinds `result`, replacing the scores from the previous cell.
result = np.round(result).astype(int)

In [None]:
# Map each rounded prediction through the project's dataloader.convert_prediction helper.
# NOTE(review): each `pred` is one row of `result`, presumably a length-1 array — confirm
# that convert_prediction expects that.
predicted_labels = [dataloader.convert_prediction(pred) for pred in result]
# Pair predictions with test ids; relies on test_ds preserving the row order of test_df.
output = pd.DataFrame({'id':test_df.id,'target':predicted_labels})
output

In [None]:
# Build an {id: target} mapping and write it out as JSON.
submission = pd.Series(output.target.values,index=output.id).to_dict()
with open('test-output.json', 'w') as f:
    json.dump(submission, f)