In [2]:
import os
import sys

# Root of the project's `classifier` directory. The default is the original
# checkout location; set FAKE_NEWS_CLASSIFIER_DIR to run the notebook from a
# different machine or directory without editing this cell.
CLASSIFIER_DIR = os.environ.get(
    'FAKE_NEWS_CLASSIFIER_DIR',
    'C:\\Users\\joshh\\Desktop\\Uni\\Soton Year 3\\COMP3200\\fake-news-profiling\\classifier',
)

preprocessing_path = os.path.join(CLASSIFIER_DIR, 'preprocessing')
notif_path = os.path.join(CLASSIFIER_DIR, 'notifications')

# Make both directories importable (presumably so the `ipynb.fs.full.*` imports
# in the next cell can find the project notebooks -- confirm). Each directory
# is inserted at index 1, so the one added last ends up ahead of the other;
# the membership check keeps re-running this cell from adding duplicates.
for module_dir in (preprocessing_path, notif_path):
    if module_dir not in sys.path:
        sys.path.insert(1, module_dir)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

import ipynb.fs.full.parse_datasets as datasets
import ipynb.fs.full.preprocessing as pp
import ipynb.fs.full.bert_fake_news_classifier as clf
from ipynb.fs.full.notif_email import send_email

## Parse the dataset

In [4]:
# Arguments for the project's dataset parser: the dataset location and what
# looks like a language code ("en") -- confirm against parse_datasets.
dataset_args = ("datasets", "en")
tweet_data, label_data = datasets.parse_dataset(*dataset_args)

## Preprocess the data

In [20]:
# Cleaning steps handed to the preprocessor, listed in the order they are
# passed (presumably also the order they are applied in -- confirm in the
# preprocessing notebook).
preprocessing_steps = [
    pp.tag_indicators,
    pp.replace_xml_and_html,
    pp.replace_emojis,
    pp.remove_punctuation,
    pp.replace_tags,
    pp.remove_hashtag_chars,
    pp.replace_accented_chars,
    pp.tag_numbers,
    pp.remove_stopwords,
    pp.remove_extra_spacing,
]

# Build the preprocessor and run it over the parsed tweets.
tweet_preprocessor = pp.TweetPreprocessor(preprocess_funcs=preprocessing_steps)
tweet_preprocessor.preprocess(tweet_data)

In [21]:
# Pull both views of the preprocessed data out of the preprocessor:
# the individual-tweets dataset first, then the tweet-feed dataset.
tweet_data_individual, tweet_data_feed = (
    tweet_preprocessor.get_individual_tweets_dataset(),
    tweet_preprocessor.get_tweet_feed_dataset(),
)

In [22]:
# Split the individual-tweet data and its labels; the six returned pieces are
# unpacked as (tweets, labels) for train, validation and test, in that order.
individual_splits = datasets.split_dataset(tweet_data_individual, label_data)
tweet_train, label_train, tweet_val, label_val, tweet_test, label_test = individual_splits

## BERT Individual Model

In [95]:
# Input size shared by the individual-tweet tokenizer and model
# (presumably the maximum token sequence length -- confirm in the classifier notebook).
bert_input_size_individual = 128

# Small BERT encoder from TF Hub; trainable=True so its weights are updated
# together with the classifier during training.
small_bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1"
bert_encoder_individual = hub.KerasLayer(small_bert_url, trainable=True)

In [96]:
# The tokenizer and the classifier are both built from the same encoder and input size.
individual_tokenizer = clf.BertIndividualTweetTokenizer(bert_encoder_individual, bert_input_size_individual)
bert_model_individual = clf.create_bert_model(bert_encoder_individual, bert_input_size_individual)

# `learning_rate` is the supported name for Adam's step size; the `lr` alias is
# deprecated and rejected by newer Keras releases.
bert_model_individual.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Training

In [97]:
# Checkpoint callback for the individual-tweet model: writes weights only
# (not the full model) to the path below and logs each save.
checkpoint_path_individual = "training/bert_training_individual_1/cp.ckpt"
bert_checkpoint_callback_individual = ModelCheckpoint(
    checkpoint_path_individual, save_weights_only=True, verbose=1,
)

In [98]:
# Tokenize every split with the individual-tweet tokenizer. Splits are
# processed train -> val -> test, inputs before labels within each split.
individual_encoded = [
    (individual_tokenizer.tokenize_input(tweets), individual_tokenizer.tokenize_labels(labels))
    for tweets, labels in (
        (tweet_train, label_train),
        (tweet_val, label_val),
        (tweet_test, label_test),
    )
]
(
    (tweet_individual_train, label_individual_train),
    (tweet_individual_val, label_individual_val),
    (tweet_individual_test, label_individual_test),
) = individual_encoded

#### Optimal hyperparameters

In [111]:
# Validation results, one dict per (batch_size, epochs) combination. The list
# is seeded with one combination whose metrics are already known (presumably
# from an earlier run), so that combination is not trained again.
results = [{'batch_size': 1, 'epochs': 10, 'loss': 1.2027349472045898, 'accuracy': 0.570888876914978}]
batch_sizes = [1, 8, 16, 24, 32, 40, 48, 56, 64]
epochs = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Combinations that already have an entry in `results`. Deriving this from the
# list (rather than hard-coding one pair) lets further known results be seeded
# above without touching the loop.
completed_runs = {(result['batch_size'], result['epochs']) for result in results}

# NOTE(review): every combination trains a new model from scratch, so the
# search is very long-running.
for batch_size in batch_sizes:
    for epoch in epochs:
        if (batch_size, epoch) in completed_runs:
            continue

        # Fit the model and then evaluate
        with tf.device('gpu:0'):
            # Fresh encoder and model per combination so no weights carry over
            # from the previous run.
            bert_encoder_individual = hub.KerasLayer(
                small_bert_url,
                trainable=True,
            )
            bert_model_individual = clf.create_bert_model(bert_encoder_individual, bert_input_size_individual)
            # `learning_rate` replaces the deprecated `lr` alias.
            bert_model_individual.compile(Adam(learning_rate=1e-5), 'binary_crossentropy', ['accuracy'])

            bert_model_individual.fit(
                x=tweet_individual_train,
                y=label_individual_train,
                batch_size=batch_size,
                epochs=epoch,
            )

            # With the compile call above, evaluate returns [loss, accuracy]
            # measured on the validation split.
            evaluated_results = bert_model_individual.evaluate(tweet_individual_val, label_individual_val)
            results.append({
                'batch_size': batch_size,
                'epochs': epoch,
                'loss': evaluated_results[0],
                'accuracy': evaluated_results[1]
            })
            print(results[-1])

KeyboardInterrupt: 

In [107]:
# Keep the grid-search entry with the highest validation accuracy under a name,
# because the notification cell below reads `best_results`; the bare expression
# on the last line still displays it as the cell output.
best_results = max(results, key=lambda result: result['accuracy'])
best_results

{'batch_size': 1,
 'epochs': 1,
 'loss': 0.7084867358207703,
 'accuracy': 0.5873333215713501}

In [None]:
# Build the notification text first, then send it. Expects `best_results` to be
# a grid-search entry with 'batch_size', 'epochs', 'loss' and 'accuracy' keys.
grid_search_report = f"""
    Grid Search finished.
    Best model: 
    > batch_size: {best_results['batch_size']}
    > epochs: {best_results['epochs']}
    > loss: {best_results['loss']}
    > accuracy: {best_results['accuracy']}
    """
send_email(grid_search_report)

In [None]:
# Plot the grid-search results: validation accuracy (left) and validation loss
# (right) against the number of epochs, with one line per batch size.
fig, ax = plt.subplots(ncols=2, figsize=(14, 6))

plotted_batch_sizes = sorted({result['batch_size'] for result in results})

for metric_ax, metric in zip(ax, ('accuracy', 'loss')):
    for plotted_batch_size in plotted_batch_sizes:
        # Runs for this batch size, ordered by epochs so the line reads left to right.
        runs = sorted(
            (result for result in results if result['batch_size'] == plotted_batch_size),
            key=lambda result: result['epochs'],
        )
        metric_ax.plot(
            [run['epochs'] for run in runs],
            [run[metric] for run in runs],
            marker='o',
            label=f"batch_size={plotted_batch_size}",
        )
    metric_ax.set(title=f"Validation {metric}", xlabel="epochs", ylabel=metric)
    metric_ax.legend()

fig.tight_layout()

#### Fitting with optimal parameters

In [None]:
# Hyper-parameters of the grid-search entry with the highest validation accuracy
# (entries of `results` carry 'batch_size', 'epochs', 'loss' and 'accuracy').
optimal_params = max(results, key=lambda result: result['accuracy'])

with tf.device('gpu:0'):
    # The grid-search loop rebinds bert_model_individual to the last candidate it
    # trained, so build a fresh encoder and model here instead of continuing
    # from that candidate's weights.
    bert_encoder_individual = hub.KerasLayer(small_bert_url, trainable=True)
    bert_model_individual = clf.create_bert_model(bert_encoder_individual, bert_input_size_individual)
    # `learning_rate` replaces the deprecated `lr` alias.
    bert_model_individual.compile(Adam(learning_rate=1e-5), 'binary_crossentropy', ['accuracy'])

    # Fit
    bert_model_individual.fit(
        x=tweet_individual_train,
        y=label_individual_train,
        batch_size=optimal_params['batch_size'],
        epochs=optimal_params['epochs'],
        callbacks=[bert_checkpoint_callback_individual],
        validation_data=(tweet_individual_val, label_individual_val),
    )

In [None]:
# Score the individual-tweet model on the held-out test split; the returned
# metrics are shown as the cell output.
bert_model_individual.evaluate(
    x=tweet_individual_test,
    y=label_individual_test,
)

## BERT Tweet Feed Model

In [None]:
# Input size shared by the tweet-feed tokenizer and model
# (presumably the maximum token sequence length -- confirm in the classifier notebook).
bert_input_size_feed = 500

# Wider (H-512) BERT encoder from TF Hub for the feed model; trainable=True so
# its weights are updated together with the classifier during training.
medium_bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1"
bert_encoder_feed = hub.KerasLayer(medium_bert_url, trainable=True)

In [None]:
# The tokenizer and the classifier are both built from the same encoder and input size.
feed_tokenizer = clf.BertTweetFeedTokenizer(bert_encoder_feed, bert_input_size_feed)
bert_model_feed = clf.create_bert_model(bert_encoder_feed, bert_input_size_feed)

# `learning_rate` is the supported name for Adam's step size; the `lr` alias is
# deprecated and rejected by newer Keras releases.
bert_model_feed.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Training

In [None]:
# Create a checkpoint for training
checkpoint_path_feed = "training/bert_training_feed_2/cp.ckpt"

bert_checkpoint_callback_feed = ModelCheckpoint(
    filepath=checkpoint_path_feed,
    save_weights_only=True,
    verbose=1
)

In [None]:
# Tokenize every split with the tweet-feed tokenizer. Splits are processed
# train -> val -> test, inputs before labels within each split.
# NOTE(review): these splits were produced from tweet_data_individual, and
# tweet_data_feed is never used in this notebook -- confirm the feed tokenizer
# is meant to receive the individual-tweet splits.
feed_encoded = [
    (feed_tokenizer.tokenize_input(tweets), feed_tokenizer.tokenize_labels(labels))
    for tweets, labels in (
        (tweet_train, label_train),
        (tweet_val, label_val),
        (tweet_test, label_test),
    )
]
(
    (tweet_feed_train, label_feed_train),
    (tweet_feed_val, label_feed_val),
    (tweet_feed_test, label_feed_test),
) = feed_encoded

In [None]:
# Training arguments for the tweet-feed model: fixed batch size and epoch count,
# the feed checkpoint callback, and the validation split passed to fit.
feed_fit_kwargs = dict(
    x=tweet_feed_train,
    y=label_feed_train,
    batch_size=10,
    epochs=5,
    callbacks=[bert_checkpoint_callback_feed],
    validation_data=(tweet_feed_val, label_feed_val),
)

# Fit on the first GPU.
with tf.device('gpu:0'):
    bert_model_feed.fit(**feed_fit_kwargs)