In [18]:
import os
import sys

# Directory containing the project's preprocessing notebooks (presumably
# needed so the `ipynb.fs.full.*` imports in the next cell can resolve them).
# The original author's checkout is kept as the default so existing behaviour
# is unchanged; on any other machine set FAKE_NEWS_PREPROCESSING_PATH to your
# own checkout instead of editing this cell.
preprocessing_path = os.environ.get(
    'FAKE_NEWS_PREPROCESSING_PATH',
    'C:\\Users\\joshh\\Desktop\\Uni\\Soton Uni - Yr 3\\COMP3200\\fake-news-profiling\\classifier\\preprocessing',
)

# Guarded so that re-running the cell never adds a duplicate entry; inserting
# at index 1 leaves sys.path[0] ahead of it.
if preprocessing_path not in sys.path:
    sys.path.insert(1, preprocessing_path)

In [19]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

import ipynb.fs.full.parse_datasets as datasets
from ipynb.fs.full.preprocessing import TweetPreprocessor
import ipynb.fs.full.bert_fake_news_classifier as clf

## Parse the dataset

In [20]:
# Load the tweets and their labels through the project's parse_datasets notebook.
# "datasets" and "en" are passed through as-is (presumably the data directory
# and a language code — confirm against parse_dataset's signature).
tweet_data, label_data = datasets.parse_dataset("datasets", "en")

## Download the BERT encoder

In [21]:
import tensorflow_hub as hub

# TF Hub handle of the BERT encoder that is passed to the tokenizers and
# models created later in this notebook.
bert_encoder_handle = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3"

# trainable=True leaves the encoder's weights trainable rather than frozen.
bert_encoder = hub.KerasLayer(bert_encoder_handle, trainable=True)

In [22]:
# Input size handed to both tokenizers and to create_bert_model further down
# (presumably the token sequence length — confirm in bert_fake_news_classifier).
bert_input_size = 500

## Preprocess the data

In [23]:
# Build the preprocessor and run it over the parsed tweets. The processed
# datasets are read back from this same object in the following cell.
tweet_preprocessor = TweetPreprocessor()
tweet_preprocessor.preprocess(tweet_data)

In [24]:
# Two views of the preprocessed tweets (presumably one per model variant).
# NOTE(review): only tweet_data_individual is consumed by the visible cells;
# tweet_data_feed is never passed to split_dataset or to a tokenizer — confirm
# whether the tweet-feed model is meant to be trained on it.
tweet_data_individual = tweet_preprocessor.get_individual_tweets_dataset()
tweet_data_feed = tweet_preprocessor.get_tweet_feed_dataset()

In [25]:
# Split the individual-tweet data and its labels. The six results are unpacked
# in the order: train tweets, train labels, validation tweets, validation
# labels, test tweets, test labels.
dataset_splits = datasets.split_dataset(tweet_data_individual, label_data)
tweet_train, label_train, tweet_val, label_val, tweet_test, label_test = dataset_splits

## BERT Individual Model

In [26]:
# Tokenizer and model for the individual-tweet variant; both are built from
# the shared encoder and the common input size.
individual_tokenizer = clf.BertIndividualTweetTokenizer(bert_encoder, bert_input_size)
bert_model = clf.create_bert_model(bert_encoder, bert_input_size)

# `learning_rate` replaces the deprecated `lr` alias, and compile() is called
# with keywords because its positional parameter order is not the same across
# Keras versions (the third positional slot is not always `metrics`).
bert_model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Training

In [27]:
# File the individual-tweet model's weights are checkpointed to during training.
checkpoint_path_individual = "training/bert_training_individual_1/cp.ckpt"

# Weights-only checkpointing; verbose=1 logs each save.
bert_checkpoint_callback = ModelCheckpoint(
    checkpoint_path_individual,
    verbose=1,
    save_weights_only=True,
)

In [28]:
# Tokenize every split for the individual-tweet model. Each (tweets, labels)
# pair is handled in turn — train, validation, test — with the input call
# before the label call, i.e. the same call order as spelling out all six
# assignments by hand.
individual_splits = (
    (tweet_train, label_train),
    (tweet_val, label_val),
    (tweet_test, label_test),
)
(
    (tweet_individual_train, label_individual_train),
    (tweet_individual_val, label_individual_val),
    (tweet_individual_test, label_individual_test),
) = [
    (
        individual_tokenizer.tokenize_input(tweets),
        individual_tokenizer.tokenize_labels(labels),
    )
    for tweets, labels in individual_splits
]

In [29]:
# Train the individual-tweet model, checkpointing through the callback and
# evaluating on the validation split. The options are collected first so they
# can be read (and tweaked) in one place.
individual_fit_options = dict(
    x=tweet_individual_train,
    y=label_individual_train,
    validation_data=(tweet_individual_val, label_individual_val),
    batch_size=20,
    epochs=5,
    callbacks=[bert_checkpoint_callback],
)
bert_model.fit(**individual_fit_options)

Epoch 1/5
   2/1050 [..............................] - ETA: 14:25:37 - loss: 0.6930 - accuracy: 0.5000

KeyboardInterrupt: 

## BERT Tweet Feed Model

In [None]:
# Tokenizer and model for the tweet-feed variant.
# NOTE(review): this rebinds `bert_model`, so the individual-tweet model is no
# longer reachable under that name. The same `bert_encoder` object is passed
# in again; if create_bert_model uses the layer directly, its weights are
# shared with the individual-tweet model — confirm this is intended.
feed_tokenizer = clf.BertTweetFeedTokenizer(bert_encoder, bert_input_size)
bert_model = clf.create_bert_model(bert_encoder, bert_input_size)

# `learning_rate` replaces the deprecated `lr` alias, and compile() is called
# with keywords because its positional parameter order is not the same across
# Keras versions (the third positional slot is not always `metrics`).
bert_model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Training

In [None]:
# File the tweet-feed model's weights are checkpointed to during training.
checkpoint_path_feed = "training/bert_training_feed_2/cp.ckpt"

# Weights-only checkpointing; verbose=1 logs each save.
bert_checkpoint_callback = ModelCheckpoint(
    checkpoint_path_feed,
    verbose=1,
    save_weights_only=True,
)

In [None]:
# Tokenize every split for the tweet-feed model. Each (tweets, labels) pair is
# handled in turn — train, validation, test — with the input call before the
# label call, i.e. the same call order as spelling out all six assignments.
# NOTE(review): tweet_train / tweet_val / tweet_test were produced by splitting
# tweet_data_individual; tweet_data_feed is not used anywhere in the visible
# cells — confirm that the feed tokenizer is meant to receive these splits.
feed_splits = (
    (tweet_train, label_train),
    (tweet_val, label_val),
    (tweet_test, label_test),
)
(
    (tweet_feed_train, label_feed_train),
    (tweet_feed_val, label_feed_val),
    (tweet_feed_test, label_feed_test),
) = [
    (
        feed_tokenizer.tokenize_input(tweets),
        feed_tokenizer.tokenize_labels(labels),
    )
    for tweets, labels in feed_splits
]

In [None]:
# Train the tweet-feed model, checkpointing through the callback and
# evaluating on the validation split. The options are collected first so they
# can be read (and tweaked) in one place.
feed_fit_options = dict(
    x=tweet_feed_train,
    y=label_feed_train,
    validation_data=(tweet_feed_val, label_feed_val),
    batch_size=20,
    epochs=5,
    callbacks=[bert_checkpoint_callback],
)
bert_model.fit(**feed_fit_options)