In [2]:
import sys

preprocessing_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Year 3\\COMP3200\\fake-news-profiling\\classifier\\preprocessing'
if preprocessing_path not in sys.path:
    sys.path.insert(1, preprocessing_path)

notif_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Year 3\\COMP3200\\fake-news-profiling\\classifier\\notifications'
if notif_path not in sys.path:
    sys.path.insert(1, notif_path)

In [3]:
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

import ipynb.fs.full.parse_datasets as datasets
import ipynb.fs.full.preprocessing as pp
import ipynb.fs.full.bert_fake_news_classifier as bclf
from ipynb.fs.full.notif_email import send_email

## Dataset

In [4]:
tweet_data, label_data = datasets.parse_dataset("datasets", "en")

In [5]:
# Preprocess the data
tweet_preprocessor = pp.TweetPreprocessor(
    preprocess_funcs = [
        pp.tag_indicators,
        pp.replace_xml_and_html,
        pp.replace_emojis,
        pp.remove_punctuation,
        pp.replace_tags,
        pp.remove_hashtag_chars,
        pp.replace_accented_chars,
        pp.tag_numbers,
        pp.remove_stopwords,
        pp.remove_extra_spacing,
    ])
tweet_preprocessor.preprocess(tweet_data)

# Individual dataset
tweet_data_individual = tweet_preprocessor.get_individual_tweets_dataset()

# Split the data
(tweet_train, label_train, 
 tweet_val, label_val, 
 tweet_test, label_test) = datasets.split_dataset(tweet_data_individual, label_data)

In [8]:
# Bert encoder and tokenizer
small_bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1"
bert_encoder_individual = hub.KerasLayer(
    small_bert_url, 
    trainable=True,
)
bert_input_size_individual = 128

individual_tokenizer = bclf.BertIndividualTweetTokenizer(bert_encoder_individual, bert_input_size_individual)

In [7]:
# Tokenizer data
tweet_individual_train = individual_tokenizer.tokenize_input(tweet_train)
label_individual_train = individual_tokenizer.tokenize_labels(label_train)
tweet_individual_val = individual_tokenizer.tokenize_input(tweet_val)
label_individual_val = individual_tokenizer.tokenize_labels(label_val)
tweet_individual_test = individual_tokenizer.tokenize_input(tweet_test)
label_individual_test = individual_tokenizer.tokenize_labels(label_test)

In [15]:
# Format data for joined BERT
tweet_individual_train_joined = tf.convert_to_tensor([[
    [tweet_individual_train['input_word_ids'][j*i],
    tweet_individual_train['input_mask'][j*i],
    tweet_individual_train['input_type_ids'][j*i],]
    for j in range(100)
] for i in range(int(len(tweet_individual_train['input_mask'])/100))])

label_individual_train_joined = tf.convert_to_tensor([
    label_individual_train.numpy()[i] 
    for i in range(0, len(tweet_individual_train['input_mask']), 100)
])

print(f"Data shape: {tweet_individual_train_joined.shape}, Label shape: {label_individual_train_joined.shape}")

Data shape: (210, 100, 3, 128), Label shape: (210,)


## Joined training model

In [16]:
# BERT model
bert_model = bclf.create_bert_model(bert_encoder_individual, bert_input_size_individual)
bert_optimizer = Adam(lr=1e-5)
bert_loss_fn = tf.keras.losses.BinaryCrossentropy()

In [17]:
# Classifier model
def create_clf_model():
    input_layer = Input(shape=(100,))
    dense_out = Dense(1, activation="sigmoid")(input_layer)
    return tf.keras.Model(inputs=input_layer, outputs=dense_out)

clf_model = create_clf_model()
clf_optimizer = Adam(lr=1e-5)
clf_loss_fn = tf.keras.losses.BinaryCrossentropy()
clf_train_acc_metric = tf.keras.metrics.BinaryAccuracy()
clf_val_acc_metric = tf.keras.metrics.BinaryAccuracy()

In [25]:
# Training dataset
train_dataset = tf.data.Dataset.from_tensor_slices((tweet_individual_train_joined, label_individual_train_joined))
train_dataset = train_dataset.batch(4) # users per batch
train_dataset

<BatchDataset shapes: ((None, 100, 3, 128), (None,)), types: (tf.int32, tf.int32)>

### Training
Used: https://www.tensorflow.org/guide/keras/writing_a_training_loop_from_scratch

In [26]:
def tweets_to_bert_batch(tweets_batch):
    res = {
        'input_word_ids': tweets_batch[:, 0],
        'input_mask': tweets_batch[:, 1],
        'input_type_ids': tweets_batch[:, 2],
    }
    return res

def to_tweet_batches(x_train, num_tweets, tweet_batch_size):
    return [
        tweets_to_bert_batch(x_train[i:i+tweet_batch_size]) 
        for i in range(0, num_tweets, tweet_batch_size)
    ]

def train_step(x_batch_train, y_batch_train, num_tweets=100, tweet_batch_size=20):
    with tf.GradientTape(watch_accessed_variables=False) as bert_tape, tf.GradientTape(watch_accessed_variables=False) as clf_tape:
        bert_tape.watch(bert_model.variables)
        bert_tape.watch(clf_model.variables)
        clf_tape.watch(clf_model.variables)
        
        clf_batch_logits = []

        # For each user in the batch
        for x_user_train in x_batch_train:

            # Predict tweet batches using BERT
            bert_user_logits = tf.convert_to_tensor([])

            for tweet_batch in to_tweet_batches(x_user_train, num_tweets, tweet_batch_size):
                bert_batch_logits = tf.reshape(
                    bert_model(tweet_batch, training=True), 
                    shape=(-1,),
                )
                bert_user_logits = tf.concat(
                    (bert_user_logits, bert_batch_logits), 
                    axis=0,
                )

            # Predict using Classifier
            clf_inputs = tf.reshape(bert_user_logits, shape=(1, -1))
            clf_logits = clf_model(clf_inputs, training=True)
            clf_batch_logits.append(clf_logits)

        # Take the loss of entire batch
        clf_batch_logits_concat = tf.concat((clf_batch_logits), axis=0)

        y_batch_train = tf.reshape(y_batch_train, shape=(-1, 1))
        clf_batch_loss = clf_loss_fn(y_batch_train, clf_batch_logits_concat)

    # Update gradients after batch
    bert_grads = bert_tape.gradient(clf_batch_loss, bert_model.trainable_weights)
    bert_optimizer.apply_gradients(zip(bert_grads, bert_model.trainable_weights))

    clf_grads = clf_tape.gradient(clf_batch_loss, clf_model.trainable_weights)
    clf_optimizer.apply_gradients(zip(clf_grads, clf_model.trainable_weights))

    # Update training metric
    clf_train_acc_metric.update_state(y_batch_train, clf_batch_logits_concat)
    
    # Return batch loss
    return clf_batch_loss

In [27]:
import time

total_loss = []

with tf.device('cpu:0'):
    epochs = 10
    for epoch in range(epochs):
        print(f"\nEpoch {epoch}/{epochs}")
        epoch_loss = []

        for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
            batch_loss = train_step(x_batch_train, y_batch_train)
            epoch_loss.append(batch_loss)
            print(f"> Batch {step}: loss={batch_loss}, accuracy={clf_train_acc_metric.result()}")
        
        print("Training acc over epoch: %.4f" % (float(clf_train_acc_metric.result()),))

        # Reset training metrics at the end of each epoch
        clf_train_acc_metric.reset_states()
        total_loss.append(epoch_loss)


Epoch 0/10




> Batch 0: loss=0.7599382996559143, accuracy=0.5




> Batch 1: loss=0.8582355380058289, accuracy=0.375




> Batch 2: loss=0.5944011211395264, accuracy=0.5




> Batch 3: loss=0.5949658155441284, accuracy=0.5625




> Batch 4: loss=0.5927650332450867, accuracy=0.6000000238418579
> Batch 5: loss=0.7356273531913757, accuracy=0.5833333134651184
> Batch 6: loss=0.5747919082641602, accuracy=0.6071428656578064
> Batch 7: loss=0.5927734971046448, accuracy=0.625
> Batch 8: loss=0.7303789258003235, accuracy=0.6111111044883728
> Batch 9: loss=0.7205784916877747, accuracy=0.6000000238418579
> Batch 10: loss=0.8328100442886353, accuracy=0.5681818127632141
> Batch 11: loss=0.7317829728126526, accuracy=0.5625
> Batch 12: loss=0.7422312498092651, accuracy=0.557692289352417
> Batch 13: loss=0.7663768529891968, accuracy=0.5535714030265808
> Batch 14: loss=0.6895115375518799, accuracy=0.550000011920929
> Batch 15: loss=0.7229762077331543, accuracy=0.546875
> Batch 16: loss=0.7445068955421448, accuracy=0.5441176295280457
> Batch 17: loss=0.693545401096344, accuracy=0.5416666865348816
> Batch 18: loss=0.528508186340332, accuracy=0.5657894611358643
> Batch 19: loss=0.8323354125022888, accuracy=0.550000011920929
> Batc

KeyboardInterrupt: 

In [None]:
plt.plot(np.arange(0, 90, 1), total_loss)