In [1]:
import sys

# Directories holding the project's preprocessing and notification notebooks.
# NOTE(review): these are absolute, machine-specific paths.
preprocessing_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Year 3\\COMP3200\\fake-news-profiling\\classifier\\preprocessing'
notif_path = 'C:\\Users\\joshh\\Desktop\\Uni\\Soton Year 3\\COMP3200\\fake-news-profiling\\classifier\\notifications'

# Register each directory once, at position 1 so that entry 0 of sys.path is
# left untouched. Re-running the cell does not add duplicates.
for module_dir in (preprocessing_path, notif_path):
    if module_dir not in sys.path:
        sys.path.insert(1, module_dir)

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from official.nlp import optimization

import ipynb.fs.full.parse_datasets as datasets
import ipynb.fs.full.preprocessing as processing
import ipynb.fs.full.bert_fake_news_classifier as bclf
from ipynb.fs.full.notif_email import send_email

# Load Dataset

In [6]:
# Load data
# tweet_data, label_data = datasets.parse_dataset("datasets", "en")

# Split the data
# (tweet_train, label_train, 
#  tweet_val, label_val, 
#  tweet_test, label_test) = datasets.split_dataset(
#     tweet_data, label_data, test_size=0.10, val_size=0.10)

# Save split data
# np.save(
#     "datasets/en_split_data.npy", 
#     np.asarray([tweet_train, label_train, tweet_val, label_val, tweet_test, label_test]), 
#     allow_pickle=True
# )

In [3]:
# Load the saved dataset split
def load_data(path="datasets/en_split_data.npy"):
    """Load a previously saved dataset split from a ``.npy`` file.

    Args:
        path: Location of the ``.npy`` file. Defaults to the English split
            used throughout this notebook.

    Returns:
        The array stored in the file, as returned by ``np.load``. The caller
        in this notebook unpacks it into six items (train/val/test tweets and
        labels).
    """
    # SECURITY: allow_pickle=True unpickles arbitrary objects, which can execute
    # code. Only load files that were produced by this project.
    return np.load(path, allow_pickle=True)

In [4]:
# Unpack the saved split: tweets and labels for the train, validation and test sets.
tweet_train, label_train, tweet_val, label_val, tweet_test, label_test = load_data()

In [5]:
# Preprocess dataset: the transformers are passed in the order listed here.
preprocessing_steps = [
    processing.tag_indicators,
    processing.replace_xml_and_html,
    processing.replace_emojis,
    processing.remove_punctuation,
    processing.replace_tags,
    processing.remove_hashtag_chars,
    processing.replace_accented_chars,
    processing.tag_numbers,
    processing.remove_stopwords,
    processing.remove_extra_spacing,
]
tweet_preprocessor = processing.BertTweetFeedDataPreprocessor(transformers=preprocessing_steps)

# Run the same preprocessor over the train, validation and test tweets (in that order).
tweet_train_processed, tweet_val_processed, tweet_test_processed = (
    tweet_preprocessor.transform(tweet_split)
    for tweet_split in (tweet_train, tweet_val, tweet_test)
)

# BERT Individual Model
* Each tweet is given the same label as its author, and BERT is then trained on this dataset of individual tweets.

## Investigating model performance
* Investigated model performance by varying:
    * Batch size, epochs and learning rate (as suggested in the BERT paper)
* Used callbacks to log loss and accuracy to TensorBoard
* Used the BERT model with hidden size of 128, but also investigated using the one with size of 256

### BERT model with 128 input size
* Evaluate classifying individual tweets using the BERT model with a hidden size of 128, 12 hidden layers, and 2 attention heads: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1
* Investigate using the following combinations:
    * Batch sizes of: [8, 16, 24, 32, 40]
    * Epochs of: [10]
    * Learning rates of: [5e-5, 3e-5, 2e-5]
    * Optimizers: [Adam, Adam with weight decay (AdamW)]

In [None]:
# Hyperparameter sweep: individual-tweet model on the BERT encoder with hidden size 128.
bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1"
bert_size = 128
model_path = "training/bert_individual/initial_eval/"

# Every (batch size, epochs, learning rate, optimizer) combination.
# The optimizer varies fastest and the batch size slowest.
models = [
    (batch_size, epochs, lr, optimizer_name)
    for batch_size in [8, 16, 24, 32, 40]
    for epochs in [10]
    for lr in [5e-5, 3e-5, 2e-5]
    for optimizer_name in ['adam', 'adamw']
]

for batch_size, epochs, lr, optimizer_name in models:
    model_name = f"bert128-batch_size{batch_size}-epochs{epochs}-lr{lr}-optimizer{optimizer_name}"
    # Each run gets its own checkpoint file and log directory under model_path.
    checkpoint_path = f"{model_path}{model_name}/cp.ckpt"
    log_dir = f"{model_path}logs/{model_name}"

    with tf.device('/gpu:0'):
        model_handler = bclf.BertModelEvalHandler(
            bert_url, bert_size, bclf.BertIndividualTweetTokenizer, bclf.dense_bert_model)

        train_history = model_handler.train_bert(
            tweet_train_processed, label_train,
            batch_size, epochs,
            tweet_val_processed, label_val,
            optimizer_name, lr,
            checkpoint_path, log_dir,
        )

        # Report the finished run both in the cell output and by email.
        text = f"Finished training {model_name}, training history was:\n{train_history.history}"
        print(text)
        send_email(text)

### BERT model with 256 input size
* Evaluate classifying individual tweets using the BERT model with a hidden size of 256, 12 hidden layers, and 4 attention heads: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1
* Investigate using the following combinations:
    * Batch sizes of: [8, 16, 24, 32]
    * Epochs of: [10]
    * Learning rates of: [5e-5, 3e-5, 2e-5]
    * Optimizers: [Adam, Adam with weight decay (AdamW)]

In [None]:
# Hyperparameter sweep: individual-tweet model on the BERT encoder with hidden size 256.
bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1"
bert_size = 256
models = [(batch_size, epochs, lr, optimizer_name)
         for batch_size in [16, 24, 32, 8]
         for epochs in [10]
         for lr in [5e-5, 3e-5, 2e-5]
         for optimizer_name in ['adam', 'adamw']]

model_path = "training/bert_individual/initial_eval/"

for batch_size, epochs, lr, optimizer_name in models:
    # Batch sizes of 32 and above are trained on the CPU
    # (presumably to stay within GPU memory - TODO confirm).
    with tf.device(('/gpu:0' if batch_size < 32 else '/cpu:0')):
        model_name = f"bert256-batch_size{batch_size}-epochs{epochs}-lr{lr}-optimizer{optimizer_name}"
        model_handler = bclf.BertModelEvalHandler(
            bert_url, bert_size, bclf.BertIndividualTweetTokenizer, bclf.dense_bert_model)

        train_history = model_handler.train_bert(
            tweet_train_processed,
            label_train,
            batch_size,
            epochs,
            tweet_val_processed,
            label_val,
            optimizer_name,
            lr,
            model_path + model_name + "/cp.ckpt",
            model_path + "logs/" + model_name,
        )

        text = f"Finished training {model_name}, training history was:\n{train_history.history}"
        print(text)
        send_email(text)

        # Append one line per trained model: the run description followed by
        # the per-epoch metrics, with groups separated by ", ,".
        with open(model_path+"train_history.txt", "a") as file:
            name = f"I_{bert_size}_{batch_size}_{str(lr)[0]}_{'A' if optimizer_name == 'adam' else 'AW'}"
            # str.join only accepts strings, so every number is converted first.
            # Joining the raw ints/floats raised TypeError after the first model
            # had trained, which aborted the rest of the sweep.
            history_columns = [
                ",".join(str(value) for value in train_history.history[metric])
                for metric in ('loss', 'val_loss', 'binary_accuracy', 'val_binary_accuracy')
            ]
            file_text = ", ,".join([
                ",".join([str(bert_size), str(batch_size), str(lr), optimizer_name, name]),
                *history_columns,
            ]) + "\n"  # newline so that each model gets its own row
            file.write(file_text)

### Findings:
* As noted by the original BERT paper, fine-tuning takes very few epochs, so from about 6 epochs upwards, both loss and accuracy seemed to plateau.
* Found that validation loss using AdamW was generally higher than using Adam, however accuracy was also higher

# BERT Feed Model
* An author's tweets are concatenated into one string. The string is then split into chunks, and each chunk is given the same label as its author. The BERT model is then trained on these tweet chunks.

## Investigating model performance
* Investigated model performance by varying batch size, epochs and learning rate, as before.
* Used BERT models with hidden sizes of 128 and 256.

### BERT model with 128 input size
* Evaluate classifying tweet-feed chunks using the BERT model with a hidden size of 128, 12 hidden layers, and 2 attention heads: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1
* Investigate using the following combinations:
    * Batch sizes of: [8, 16, 24, 32, 40]
    * Epochs of: [10]
    * Learning rates of: [5e-5, 3e-5, 2e-5]
    * Optimizers: [Adam, Adam with weight decay (AdamW)]

In [7]:
# Hyperparameter sweep: tweet-feed model on the BERT encoder with hidden size 128.
bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1"
bert_size = 128
model_path = "training/bert_feed/initial_eval/"

# Every (batch size, epochs, learning rate, optimizer) combination.
# The optimizer varies fastest and the batch size slowest.
models = [
    (batch_size, epochs, lr, optimizer_name)
    for batch_size in [8, 16, 24, 32, 40]
    for epochs in [10]
    for lr in [5e-5, 3e-5, 2e-5]
    for optimizer_name in ['adam', 'adamw']
]


def join_hist_array(array):
    """Render a sequence of metric values as one comma-separated string."""
    return ",".join(map(str, array))


for batch_size, epochs, lr, optimizer_name in models:
    with tf.device('/gpu:0'):
        model_name = f"bert128-batch_size{batch_size}-epochs{epochs}-lr{lr}-optimizer{optimizer_name}"
        model_handler = bclf.BertModelEvalHandler(
            bert_url, bert_size, bclf.BertTweetFeedTokenizer, bclf.dense_bert_model)

        train_history = model_handler.train_bert(
            tweet_train_processed, label_train,
            batch_size, epochs,
            tweet_val_processed, label_val,
            optimizer_name, lr,
            f"{model_path}{model_name}/cp.ckpt",
            f"{model_path}logs/{model_name}",
        )

        # Report the finished run both in the cell output and by email.
        text = f"Finished training {model_name}, training history was:\n{train_history.history}"
        print(text)
        send_email(text)

        # Append one row per model: the run description followed by the
        # per-epoch metrics, with groups separated by ", ,".
        with open(model_path+"train_history.txt", "a") as file:
            optimizer_tag = 'A' if optimizer_name == 'adam' else 'AW'
            name = f"F_{bert_size}_{batch_size}_{str(lr)[0]}_{optimizer_tag}"
            row_groups = [",".join([str(bert_size), str(batch_size), str(lr), optimizer_name, name])]
            for metric in ('loss', 'val_loss', 'binary_accuracy', 'val_binary_accuracy'):
                row_groups.append(join_hist_array(train_history.history[metric]))
            file_text = ", ,".join(row_groups) + "\n"
            file.write(file_text)

### BERT model with 256 input size
* Evaluate classifying tweet-feed chunks using the BERT model with a hidden size of 256, 12 hidden layers, and 4 attention heads: https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1
* Investigate using the following combinations:
    * Batch sizes of: [8, 16, 24, 32]
    * Epochs of: [10]
    * Learning rates of: [5e-5, 3e-5, 2e-5]
    * Optimizers: [Adam, Adam with weight decay (AdamW)]

In [8]:
# Hyperparameter sweep: tweet-feed model on the BERT encoder with hidden size 256.
bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1"
bert_size = 256
models = [(batch_size, epochs, lr, optimizer_name)
         for batch_size in [8, 16, 24, 32]
         for epochs in [10]
         for lr in [5e-5, 3e-5, 2e-5]
         for optimizer_name in ['adam', 'adamw']]

model_path = "training/bert_feed/initial_eval/"


for batch_size, epochs, lr, optimizer_name in models:
    # Batch sizes of 32 and above are trained on the CPU
    # (presumably to stay within GPU memory - TODO confirm).
    with tf.device(('/gpu:0' if batch_size < 32 else '/cpu:0')):
        model_name = f"bert256-batch_size{batch_size}-epochs{epochs}-lr{lr}-optimizer{optimizer_name}"
        model_handler = bclf.BertModelEvalHandler(
            bert_url, bert_size, bclf.BertTweetFeedTokenizer, bclf.dense_bert_model)

        train_history = model_handler.train_bert(
            tweet_train_processed,
            label_train,
            batch_size,
            epochs,
            tweet_val_processed,
            label_val,
            optimizer_name,
            lr,
            model_path + model_name + "/cp.ckpt",
            model_path + "logs/" + model_name,
        )

        text = f"Finished training {model_name}, training history was:\n{train_history.history}"
        print(text)
        send_email(text)

        # Append one line per trained model: the run description followed by
        # the per-epoch metrics, with groups separated by ", ,".
        with open(model_path+"train_history.txt", "a") as file:
            name = f"F_{bert_size}_{batch_size}_{str(lr)[0]}_{'A' if optimizer_name == 'adam' else 'AW'}"
            # str.join only accepts strings, so every number is converted first.
            # Joining the raw ints/floats raised TypeError after the first model
            # had trained, which aborted the rest of the sweep.
            history_columns = [
                ",".join(str(value) for value in train_history.history[metric])
                for metric in ('loss', 'val_loss', 'binary_accuracy', 'val_binary_accuracy')
            ]
            file_text = ", ,".join([
                ",".join([str(bert_size), str(batch_size), str(lr), optimizer_name, name]),
                *history_columns,
            ]) + "\n"  # newline so that each model gets its own row
            file.write(file_text)

# Preprocessing functions
* I selected the top performing BERT models: 
    * I_128_40_5_AW, I_256_8_5_AW, F_128_8_2_AW, F_128_16_2_AW, F_128_24_2_AW, F_128_32_3_AW, F_128_32_5_AW, F_128_40_5_AW, F_256_24_5_AW, F_256_64_2_AW, F_256_64_5_AW, F_256_80_5_AW
    * Naming scheme: {model type (I = individual, F = feed)}_{bert size}_{batch size}_{learning rate (leading digit)}_{optimizer (A = Adam, AW = Adam with weight decay)}


* Train using different preprocessing functions on the input data:
    * No preprocessing functions (raw data)
    * Remove emojis, remove accented chars, replace tags
    * Remove emojis, remove accented chars, replace tags, remove punctuation, tag numbers
    * Remove emojis, remove accented chars, replace tags, remove punctuation, tag numbers, remove stopwords
    * Embed emojis, remove accented chars, replace tags, remove punctuation, tag numbers
    * Embed emojis, remove accented chars, replace tags, remove punctuation, tag numbers, remove stopwords

In [21]:
def preprocess_funcs_search(processing_funcs, model_params, X_train, y_train, X_val, y_val, save_path, epochs=16):
    """Train every model configuration on every preprocessing pipeline.

    For each list of preprocessing functions, the train and validation inputs
    are transformed once. Each entry of ``model_params`` is then trained on the
    transformed data and one line of per-epoch metrics is appended to
    ``save_path + "train_history.txt"``. An email is sent after each
    preprocessing pipeline has been processed.

    Args:
        processing_funcs: Iterable of lists of preprocessing functions. Each
            list is passed as ``transformers`` to
            ``processing.BertTweetFeedDataPreprocessor``.
        model_params: Iterable of dicts with the keys ``"model_type"``,
            ``"bert_size"``, ``"learning_rate"``, ``"optimizer"``,
            ``"batch_size"``, ``"tokenizer"`` and ``"bert_url"``.
        X_train, y_train: Training inputs and labels.
        X_val, y_val: Validation inputs and labels.
        save_path: Path prefix for checkpoints, logs and the history file. It
            is concatenated directly, so it should end with a path separator.
        epochs: Number of epochs passed to ``train_bert`` (default 16).
    """
    history_file = save_path + "train_history.txt"
    metric_keys = ('loss', 'val_loss', 'binary_accuracy', 'val_binary_accuracy')

    for funcs in processing_funcs:
        preprocessor = processing.BertTweetFeedDataPreprocessor(transformers=funcs)
        X_train_processed = preprocessor.transform(X_train)
        X_val_processed = preprocessor.transform(X_val)

        # The header ends with a newline so that the first model row does not
        # run onto the same line.
        with open(history_file, "a") as file:
            file.write(f"\n-----Preprocessing funcs are: {funcs}\n")

        for params in model_params:
            bert_size = params["bert_size"]
            lr = params["learning_rate"]
            optimizer_name = params["optimizer"]
            batch_size = params["batch_size"]
            # The batch size is part of the name: without it, configurations that
            # differ only in batch size shared one checkpoint and log directory.
            # NOTE(review): the name does not identify the preprocessing pipeline,
            # so each pipeline still reuses the same checkpoint/log paths.
            name = (
                f"{params['model_type']}_{bert_size}_{batch_size}_{str(lr)[0]}_"
                f"{'A' if optimizer_name == 'adam' else 'AW'}"
            )

            model_handler = bclf.BertModelEvalHandler(
                params["bert_url"], bert_size, params["tokenizer"], bclf.dense_bert_model)

            # Batch sizes of 32 and above are trained on the CPU
            # (presumably to stay within GPU memory - TODO confirm).
            with tf.device(('/gpu:0' if batch_size < 32 else '/cpu:0')):
                train_history = model_handler.train_bert(
                    X_train_processed,
                    y_train,
                    batch_size,
                    epochs,
                    X_val_processed,
                    y_val,
                    optimizer_name,
                    lr,
                    save_path + name + "/cp.ckpt",
                    save_path + "logs/" + name,
                )

            # str.join only accepts strings; joining the raw ints/floats raised
            # TypeError after the first model had finished training.
            history_columns = [
                ",".join(str(value) for value in train_history.history[key])
                for key in metric_keys
            ]
            file_text = ", ,".join([
                ",".join([str(bert_size), str(batch_size), str(lr), optimizer_name, name]),
                *history_columns,
            ]) + "\n"
            with open(history_file, "a") as file:
                file.write(file_text)

        send_email("Finished training")

In [22]:
# Steps shared by every non-empty preprocessing pipeline:
# remove HTML, replace emojis, replace accented chars.
html_emoji_accent_steps = [
    processing.replace_xml_and_html,
    processing.replace_emojis,
    processing.replace_accented_chars,
]
# Additional steps: remove punctuation, replace tags, strip hashtag chars, tag numbers.
punctuation_tag_number_steps = [
    processing.remove_punctuation,
    processing.replace_tags,
    processing.remove_hashtag_chars,
    processing.tag_numbers,
]

bert_urls_by_size = {
    128: "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1",
    256: "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1",
}
tokenizers_by_model_type = {
    "I": bclf.BertIndividualTweetTokenizer,
    "F": bclf.BertTweetFeedTokenizer,
}

# (model type, bert size, batch size, learning rate); every model uses AdamW.
selected_models = [
    ("I", 128, 40, 5e-5),  # I_128_40_5_AW
    ("I", 256, 8, 5e-5),   # I_256_8_5_AW
    ("F", 128, 8, 2e-5),   # F_128_8_2_AW
    ("F", 128, 16, 2e-5),  # F_128_16_2_AW
    ("F", 128, 24, 2e-5),  # F_128_24_2_AW
    ("F", 128, 32, 3e-5),  # F_128_32_3_AW
    ("F", 128, 32, 5e-5),  # F_128_32_5_AW
    ("F", 128, 40, 5e-5),  # F_128_40_5_AW
    ("F", 256, 24, 5e-5),  # F_256_24_5_AW
    ("F", 256, 64, 2e-5),  # F_256_64_2_AW
    ("F", 256, 64, 5e-5),  # F_256_64_5_AW
    ("F", 256, 80, 5e-5),  # F_256_80_5_AW
]

preprocess_funcs_search(
    processing_funcs=[
        # No preprocessing functions (raw data)
        [],
        # Remove HTML, replace emojis, remove accented chars, replace tags
        html_emoji_accent_steps
        + [processing.replace_tags, processing.remove_extra_spacing],
        # ... plus remove punctuation, tag numbers
        html_emoji_accent_steps
        + punctuation_tag_number_steps
        + [processing.remove_extra_spacing],
        # ... plus remove stopwords
        html_emoji_accent_steps
        + punctuation_tag_number_steps
        + [processing.remove_stopwords, processing.remove_extra_spacing],
    ],
    model_params=[
        {
            "model_type": kind,
            "bert_size": size,
            "learning_rate": rate,
            "optimizer": "adamw",
            "batch_size": batch,
            "tokenizer": tokenizers_by_model_type[kind],
            "bert_url": bert_urls_by_size[size],
        }
        for kind, size, batch, rate in selected_models
    ],
    X_train=tweet_train,
    y_train=label_train,
    X_val=tweet_val,
    y_val=label_val,
    save_path="training/preprocessing/initial_eval/",
)

Epoch 1/16
Instructions for updating:
use `tf.profiler.experimental.stop` instead.


Instructions for updating:
use `tf.profiler.experimental.stop` instead.


 87/600 [===>..........................] - ETA: 11:33 - loss: 0.7472 - binary_accuracy: 0.5095

KeyboardInterrupt: 

### Final training + classification
* Picked the best-performing models from the initial evaluation and trained them
* Using these models, I added a final (separate) classifier over their predictions. Below I investigate the performance of different classifiers

Top 5 models:
* batch_size=32, epochs=1, learning_rate=5e-5, bert_size=128, optimizer=Adam
* batch_size=40, epochs=3, learning_rate=2e-5, bert_size=128, optimizer=Adam
* batch_size=32, epochs=5, learning_rate=3e-5, bert_size=128, optimizer=AdamW
* batch_size=16, epochs=3, learning_rate=3e-5, bert_size=128, optimizer=AdamW
* batch_size=16, epochs=7, learning_rate=3e-5, bert_size=256, optimizer=AdamW

In [50]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss

In [51]:
# Calculate user predictions from individual tweet data
def calculate_user_predictions_from_individual_tweets(model, tweets, labels, tweets_per_user=100):
    """Group per-tweet model predictions by user.

    The inputs are assumed to be ordered so that each consecutive run of
    ``tweets_per_user`` rows belongs to one user; the first label of each run
    is taken as that user's label.

    Args:
        model: Object with a ``predict`` method. It is called once per user
            with a dict holding that user's slice of each input.
        tweets: Dict with the keys ``'input_word_ids'``, ``'input_mask'`` and
            ``'input_type_ids'``; each value must support ``len`` and slicing.
        labels: Indexable sequence of per-tweet labels. Items exposing a
            ``.numpy()`` method are converted with it; other items are used
            as they are.
        tweets_per_user: Number of consecutive rows per user (default 100).

    Returns:
        ``(predictions, user_labels)``: an array with one flattened prediction
        vector per user, and an array with one label per user.
    """
    input_keys = ('input_word_ids', 'input_mask', 'input_type_ids')

    # One dict of model inputs per user, sliced out of the flat tweet arrays.
    user_tweets = [
        {key: tweets[key][start:start + tweets_per_user] for key in input_keys}
        for start in range(0, len(tweets['input_word_ids']), tweets_per_user)
    ]

    # First label of every user block.
    first_labels = (labels[start] for start in range(0, len(labels), tweets_per_user))
    user_labels = np.asarray([
        label.numpy() if hasattr(label, "numpy") else label
        for label in first_labels
    ])

    # zip stops at the shorter sequence, so only users that have both a label
    # and tweets are predicted.
    all_predictions = [
        model.predict(user_tweet).flatten()
        for _, user_tweet in zip(user_labels, user_tweets)
    ]

    return np.asarray(all_predictions), user_labels

In [66]:
def train_clf_individual_models(X_train, y_train, X_val, y_val, bert_model_name):
    """Grid-search several classifiers on raw and row-sorted features.

    Four estimators (logistic regression, SVC, random forest and gradient
    boosting) are tuned with ``GridSearchCV`` (refit on accuracy). Each one is
    fitted twice: on the features as given, and on the features sorted along
    axis 1. Each fit is scored on the validation set.

    Args:
        X_train, X_val: 2-D feature arrays (``np.sort`` is applied along axis 1).
        y_train, y_val: Labels for the training and validation features.
        bert_model_name: Label written into the first column of each result row.

    Returns:
        ``(result, rows)``. ``result`` maps an estimator name to a dict with
        the keys ``'unsorted-params'`` and ``'sorted'``, each holding the best
        parameters, accuracy, F1 and log loss. ``rows`` is ``np.asarray`` of
        the per-estimator lists of
        ``[bert_model_name, estimator, variant, loss, accuracy, f1]`` rows.
    """
    # (variant label, training features, validation features), in evaluation order.
    feature_variants = (
        ("Unsorted", X_train, X_val),
        ("Sorted", np.sort(X_train, axis=1), np.sort(X_val, axis=1)),
    )

    def fit_evaluate(estimator, param_grid, estimator_name):
        """Tune one estimator on both feature variants and score it on validation data."""
        search = GridSearchCV(
            estimator,
            param_grid,
            scoring=['accuracy', 'f1'],
            refit='accuracy',
        )

        rows = []
        summaries = []
        for variant, train_features, val_features in feature_variants:
            search.fit(train_features, y_train)
            predicted = search.predict(val_features)
            accuracy = accuracy_score(y_val, predicted)
            f1 = f1_score(y_val, predicted)
            # The log loss is computed from the hard predictions of `predict`.
            loss = log_loss(y_val, predicted)
            rows.append(
                np.asarray([bert_model_name, estimator_name, variant, loss, accuracy, f1]))
            summaries.append({
                'params': search.best_params_,
                'accuracy': accuracy,
                'f1': f1,
                'log_loss': loss,
            })

        unsorted_summary, sorted_summary = summaries
        return {'unsorted-params': unsorted_summary, 'sorted': sorted_summary}, rows

    # (key in the result dict, name used in the rows, estimator class, parameter grid)
    searches = (
        ('LogisticRegression', 'LogisticRegression', LogisticRegression, {
            'penalty': ['l1', 'l2'],
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            'max_iter': [100, 200],
        }),
        ('SVC', 'SVC', SVC, {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'degree': [2, 3, 4, 5],
        }),
        ('RandomForestClassifier', 'RandomForest', RandomForestClassifier, {
            'n_estimators': [50, 100, 150, 200],
            'criterion': ['gini', 'entropy'],
        }),
        ('GradientBoostingClassifier', 'GradientBoosting', GradientBoostingClassifier, {
            'loss': ['deviance', 'exponential'],
            'learning_rate': [0.2, 0.1, 0.01, 0.001, 0.0001],
            'n_estimators': [100, 150, 200],
        }),
    )

    result = {}
    df_result = []
    for result_key, row_name, estimator_cls, grid in searches:
        data, df = fit_evaluate(estimator_cls(), grid, row_name)
        df_result.append(df)
        result[result_key] = data

    return result, np.asarray(df_result)

In [67]:
def train_bert_individual_model(batch_size, epochs, learning_rate, bert_size, optimizer_name):
    """Load a fine-tuned small-BERT checkpoint and fit final classifiers on its predictions.

    Fine-tuning itself is disabled in this version: the model is built and its
    weights are restored from the checkpoint identified by the hyperparameters.
    The restored model is then used to produce features for the train and
    validation splits, on which the final classifiers are trained and evaluated.

    Reads the notebook globals ``tweet_train_processed``, ``label_train``,
    ``tweet_val_processed`` and ``label_val``.

    Args:
        batch_size: Batch size the checkpoint was trained with (used in its name).
        epochs: Number of epochs the checkpoint was trained for (used in its name).
        learning_rate: Learning rate the checkpoint was trained with (used in its name).
        bert_size: 128 or 256; selects the encoder handle and is passed to the
            tokenizer and model builder.
        optimizer_name: Not used while training is disabled; kept so existing
            callers that pass it keep working.

    Returns:
        Tuple ``(best_clfs_data, best_clfs_df, name)``: the two values returned by
        ``train_clf_individual_models`` and the model name used for the checkpoint.

    Raises:
        ValueError: If ``bert_size`` has no known encoder handle.
    """
    # Fail fast with a clear message instead of hitting an unbound `url` below.
    encoder_urls = {
        128: "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1",
        256: "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1",
    }
    if bert_size not in encoder_urls:
        raise ValueError(
            f"Unsupported bert_size {bert_size!r}; expected one of {sorted(encoder_urls)}")
    url = encoder_urls[bert_size]

    # Load encoder and tokenizer
    encoder = hub.KerasLayer(url, trainable=True)
    tokenizer = bclf.BertIndividualTweetTokenizer(encoder, bert_size)

    tweet_individual_train = tokenizer.tokenize_input(tweet_train_processed)
    label_individual_train = tokenizer.tokenize_labels(label_train)
    tweet_individual_val = tokenizer.tokenize_input(tweet_val_processed)
    label_individual_val = tokenizer.tokenize_labels(label_val)

    # Location of the previously saved weights for this hyperparameter combination
    name = f"batch_size{batch_size}-epochs{epochs}-lr{learning_rate}-size{bert_size}"
    path = "training/bert_individual/loss-testing-full-model-1/"
    checkpoint_path = path + name + "/cp.ckpt"

    # BERT Model (not compiled or fitted here; weights come from the checkpoint)
    bert_model = bclf.create_bert_model(encoder, bert_size)

    print("Loading weights for model:", name)
    bert_model.load_weights(checkpoint_path)
    print("Successfully loaded weights")

    # Format data for final classifiers
    X_train, y_train = calculate_user_predictions_from_individual_tweets(
        bert_model,
        tweet_individual_train,
        label_individual_train,
    )
    X_val, y_val = calculate_user_predictions_from_individual_tweets(
        bert_model,
        tweet_individual_val,
        label_individual_val,
    )

    # Train and evaluate final classifiers, then report the outcome by print and email
    best_clfs_data, best_clfs_df = train_clf_individual_models(X_train, y_train, X_val, y_val, name)
    text = f"Best classifiers for BERT model {name}:\n{best_clfs_data}"
    print(text)
    send_email(text)
    return best_clfs_data, best_clfs_df, name

In [68]:
# Hyperparameter combinations to evaluate, one tuple per fine-tuned BERT model:
# (batch_size, epochs, learning_rate, bert_size, optimizer_name).
# Each model keeps only its best checkpoint over the 10 epochs.
models = [
    (32, 10, 5e-5, 128, 'adam'),
    (40, 10, 2e-5, 128, 'adam'),
    (32, 10, 3e-5, 128, 'adamw'),
    (16, 10, 3e-5, 128, 'adamw'),
    (16, 10, 3e-5, 256, 'adamw'),
]

# Classifier results keyed by BERT model name, and the raw result rows per model
models_data, models_df = {}, []

for model in models:
    data, df, model_name = train_bert_individual_model(*model)
    models_data[model_name] = data
    models_df.append(df)

Loading weights for model: batch_size32-epochs10-lr5e-05-size128
Successfully loaded weights


Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_tr

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_tr



Best classifiers for BERT model batch_size32-epochs10-lr5e-05-size128:
{'LogisticRegression': {'unsorted-params': {'params': {'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}, 'accuracy': 0.7666666666666667, 'f1': 0.7741935483870969, 'log_loss': 8.059154438469852}, 'sorted': {'params': {'max_iter': 200, 'penalty': 'l1', 'solver': 'liblinear'}, 'accuracy': 0.8333333333333334, 'f1': 0.8275862068965518, 'log_loss': 5.75651603898046}}, 'SVC': {'unsorted-params': {'params': {'degree': 2, 'kernel': 'rbf'}, 'accuracy': 0.8, 'f1': 0.8000000000000002, 'log_loss': 6.907835238725156}, 'sorted': {'params': {'degree': 2, 'kernel': 'linear'}, 'accuracy': 0.8, 'f1': 0.8000000000000002, 'log_loss': 6.907835238725156}}, 'RandomForestClassifier': {'unsorted-params': {'params': {'criterion': 'gini', 'n_estimators': 50}, 'accuracy': 0.7333333333333333, 'f1': 0.7500000000000001, 'log_loss': 9.210473638214548}, 'sorted': {'params': {'criterion': 'entropy', 'n_estimators': 200}, 'accuracy': 0.8, 'f1'

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_tr

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_tr



Best classifiers for BERT model batch_size40-epochs10-lr2e-05-size128:
{'LogisticRegression': {'unsorted-params': {'params': {'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}, 'accuracy': 0.6666666666666666, 'f1': 0.6875, 'log_loss': 11.513085384456266}, 'sorted': {'params': {'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}, 'accuracy': 0.7666666666666667, 'f1': 0.7407407407407408, 'log_loss': 8.059101131974506}}, 'SVC': {'unsorted-params': {'params': {'degree': 2, 'kernel': 'rbf'}, 'accuracy': 0.7666666666666667, 'f1': 0.7741935483870969, 'log_loss': 8.059154438469852}, 'sorted': {'params': {'degree': 3, 'kernel': 'poly'}, 'accuracy': 0.8, 'f1': 0.7692307692307692, 'log_loss': 6.90778193222981}}, 'RandomForestClassifier': {'unsorted-params': {'params': {'criterion': 'gini', 'n_estimators': 150}, 'accuracy': 0.7, 'f1': 0.7096774193548386, 'log_loss': 10.36176618471157}, 'sorted': {'params': {'criterion': 'gini', 'n_estimators': 50}, 'accuracy': 0.8, 'f1': 0.800000000000







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































Loading weights for model: batch_size32-epochs10-lr3e-05-size128
Successfully loaded weights


Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_tr

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_tr



Best classifiers for BERT model batch_size32-epochs10-lr3e-05-size128:
{'LogisticRegression': {'unsorted-params': {'params': {'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}, 'accuracy': 0.7666666666666667, 'f1': 0.7741935483870969, 'log_loss': 8.059154438469852}, 'sorted': {'params': {'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}, 'accuracy': 0.7666666666666667, 'f1': 0.7586206896551724, 'log_loss': 8.059127785222179}}, 'SVC': {'unsorted-params': {'params': {'degree': 2, 'kernel': 'linear'}, 'accuracy': 0.7666666666666667, 'f1': 0.7741935483870969, 'log_loss': 8.059154438469852}, 'sorted': {'params': {'degree': 2, 'kernel': 'poly'}, 'accuracy': 0.7666666666666667, 'f1': 0.7586206896551724, 'log_loss': 8.059127785222179}}, 'RandomForestClassifier': {'unsorted-params': {'params': {'criterion': 'gini', 'n_estimators': 150}, 'accuracy': 0.7666666666666667, 'f1': 0.7586206896551724, 'log_loss': 8.059127785222179}, 'sorted': {'params': {'criterion': 'entropy', 'n_estimat

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_tr

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_tr



Best classifiers for BERT model batch_size16-epochs10-lr3e-05-size128:
{'LogisticRegression': {'unsorted-params': {'params': {'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}, 'accuracy': 0.7666666666666667, 'f1': 0.7586206896551724, 'log_loss': 8.059127785222179}, 'sorted': {'params': {'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}, 'accuracy': 0.7333333333333333, 'f1': 0.7142857142857142, 'log_loss': 9.210420331719202}}, 'SVC': {'unsorted-params': {'params': {'degree': 2, 'kernel': 'rbf'}, 'accuracy': 0.8333333333333334, 'f1': 0.8275862068965518, 'log_loss': 5.75651603898046}, 'sorted': {'params': {'degree': 2, 'kernel': 'linear'}, 'accuracy': 0.7333333333333333, 'f1': 0.7142857142857142, 'log_loss': 9.210420331719202}}, 'RandomForestClassifier': {'unsorted-params': {'params': {'criterion': 'entropy', 'n_estimators': 200}, 'accuracy': 0.8, 'f1': 0.7857142857142856, 'log_loss': 6.907808585477483}, 'sorted': {'params': {'criterion': 'gini', 'n_estimators': 100}, 'accuracy'

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_tr

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_tr

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\users\joshh\desktop\uni\soton year 3\comp3200\fake-news-profiling\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_tr

Best classifiers for BERT model batch_size16-epochs10-lr3e-05-size256:
{'LogisticRegression': {'unsorted-params': {'params': {'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}, 'accuracy': 0.7333333333333333, 'f1': 0.7333333333333333, 'log_loss': 9.210446984966874}, 'sorted': {'params': {'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}, 'accuracy': 0.7666666666666667, 'f1': 0.7407407407407408, 'log_loss': 8.059101131974506}}, 'SVC': {'unsorted-params': {'params': {'degree': 2, 'kernel': 'rbf'}, 'accuracy': 0.7333333333333333, 'f1': 0.7142857142857142, 'log_loss': 9.210420331719202}, 'sorted': {'params': {'degree': 2, 'kernel': 'sigmoid'}, 'accuracy': 0.7333333333333333, 'f1': 0.7142857142857142, 'log_loss': 9.210420331719202}}, 'RandomForestClassifier': {'unsorted-params': {'params': {'criterion': 'entropy', 'n_estimators': 150}, 'accuracy': 0.7333333333333333, 'f1': 0.7142857142857142, 'log_loss': 9.210420331719202}, 'sorted': {'params': {'criterion': 'entropy', 'n_estimator

In [82]:
# Flatten the three nesting levels (BERT model -> classifier -> result row)
# into a single array with one result row per entry.
# NOTE: this rebinds models_df, so the cell is meant to be run once per sweep.
models_df = np.asarray([
    result_row
    for per_bert_model in models_df
    for per_classifier in per_bert_model
    for result_row in per_classifier
])

In [101]:
# Tabulate every (BERT model, classifier, data variant) result and make the
# metric columns numeric so they sort by value.
df = pd.DataFrame(
    models_df,
    columns=("BERT model", "Classifier", "Classifier data", "Log loss", "Accuracy", "F1"),
).astype({"Log loss": "float32", "Accuracy": "float32", "F1": "float32"})

# Best rows first: lowest log loss, then highest accuracy, then highest F1
df.sort_values(by=["Log loss", "Accuracy", "F1"], ascending=[True, False, False])

Unnamed: 0,BERT model,Classifier,Classifier data,Log loss,Accuracy,F1
1,batch_size32-epochs10-lr5e-05-size128,LogisticRegression,Sorted,5.756516,0.833333,0.827586
26,batch_size16-epochs10-lr3e-05-size128,SVC,Unsorted,5.756516,0.833333,0.827586
37,batch_size16-epochs10-lr3e-05-size256,RandomForest,Sorted,5.756516,0.833333,0.827586
11,batch_size40-epochs10-lr2e-05-size128,SVC,Sorted,6.907782,0.8,0.769231
28,batch_size16-epochs10-lr3e-05-size128,RandomForest,Unsorted,6.907809,0.8,0.785714
2,batch_size32-epochs10-lr5e-05-size128,SVC,Unsorted,6.907835,0.8,0.8
3,batch_size32-epochs10-lr5e-05-size128,SVC,Sorted,6.907835,0.8,0.8
5,batch_size32-epochs10-lr5e-05-size128,RandomForest,Sorted,6.907835,0.8,0.8
13,batch_size40-epochs10-lr2e-05-size128,RandomForest,Sorted,6.907835,0.8,0.8
15,batch_size40-epochs10-lr2e-05-size128,GradientBoosting,Sorted,6.907862,0.8,0.8125


### Investigating BERT output classifiers
* Investigate using other classifiers on top of BERT's pooled output.
    * Previously a single Dense layer was used; now investigate using an RNN, a Transformer decoder, and a multilayer perceptron.

In [139]:
def train_bert_individual_model(
    url, batch_size, epochs, learning_rate, bert_size, optimizer_name, bert_model_func):
    """Fine-tune a BERT encoder with a pluggable classifier head on individual tweets.

    Reads the notebook globals ``tweet_train_processed``, ``label_train``,
    ``tweet_val_processed`` and ``label_val``. Writes TensorBoard logs and the
    best weights (``save_best_only``) under
    ``training/bert_individual/loss-testing-full-model-1/``.

    Args:
        url: TF Hub handle of the BERT encoder to load.
        batch_size: Batch size used for fitting.
        epochs: Number of epochs to train for.
        learning_rate: Learning rate for ``'adam'``, or the initial learning rate
            of the warm-up/decay schedule for ``'adamw'``.
        bert_size: Passed to the tokenizer and to ``bert_model_func``.
        optimizer_name: ``'adam'`` or ``'adamw'``.
        bert_model_func: Callable ``(encoder, bert_size)`` returning the Keras
            model to train.

    Returns:
        The fitted Keras model.

    Raises:
        ValueError: If ``optimizer_name`` is not ``'adam'`` or ``'adamw'``.
    """
    # Validate before loading the encoder so a typo fails immediately rather
    # than as an unbound `optimizer` after the tokenization work.
    if optimizer_name not in ('adam', 'adamw'):
        raise ValueError(
            f"Unsupported optimizer_name {optimizer_name!r}; expected 'adam' or 'adamw'")

    # Load encoder and tokenizer
    encoder = hub.KerasLayer(url, trainable=True)
    tokenizer = bclf.BertIndividualTweetTokenizer(encoder, bert_size)

    tweet_individual_train = tokenizer.tokenize_input(tweet_train_processed)
    label_individual_train = tokenizer.tokenize_labels(label_train)
    tweet_individual_val = tokenizer.tokenize_input(tweet_val_processed)
    label_individual_val = tokenizer.tokenize_labels(label_val)

    # Callbacks (checkpoint, tensorboard)
    # NOTE(review): `name` omits url, optimizer_name and bert_model_func, so runs
    # that differ only in those write to the same log/checkpoint directory.
    name = f"batch_size{batch_size}-epochs{epochs}-lr{learning_rate}-size{bert_size}"
    path = "training/bert_individual/loss-testing-full-model-1/"
    log_dir = path + "logs/" + name
    tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

    checkpoint_path = path + name + "/cp.ckpt"
    checkpoint = ModelCheckpoint(
        filepath=checkpoint_path,
        save_weights_only=True,
        save_best_only=True,
        verbose=1,
    )

    # Optimizer
    if optimizer_name == 'adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate)
    else:  # 'adamw'
        # Whole number of batches per epoch (ceil division), so the schedule
        # receives integer step counts covering the final partial batch too.
        num_examples = len(tweet_individual_train['input_word_ids'])
        steps_per_epoch = -(-num_examples // batch_size)
        total_training_steps = epochs * steps_per_epoch
        warmup_steps = int(0.1 * total_training_steps)  # warm up over the first 10% of steps
        optimizer = optimization.create_optimizer(
            init_lr=learning_rate,
            num_train_steps=total_training_steps,
            num_warmup_steps=warmup_steps,
            optimizer_type='adamw'
        )

    # BERT Model
    bert_model = bert_model_func(encoder, bert_size)
    bert_model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=tf.metrics.BinaryAccuracy(),
    )

    # Train BERT
    bert_model.fit(
        x=tweet_individual_train,
        y=label_individual_train,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[checkpoint, tensorboard],
        validation_data=(tweet_individual_val, label_individual_val),
    )

    return bert_model

In [141]:
# Hyperparameters are passed by keyword so each value is bound to the intended
# parameter of train_bert_individual_model.
# NOTE: requires `bert_model_lstm` to be defined in bert_fake_news_classifier;
# the last recorded run of this cell failed with AttributeError because it was missing.
bert_model_with_lstm = train_bert_individual_model(
    url="https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1",
    batch_size=32,
    epochs=10,
    learning_rate=5e-05,
    bert_size=128,
    optimizer_name='adam',
    bert_model_func=bclf.bert_model_lstm,
)

AttributeError: module 'ipynb.fs.full.bert_fake_news_classifier' has no attribute 'bert_model_lstm'

In [14]:
# Small BERT encoder handle: 12 layers, hidden size 256, 4 attention heads.
# (In the small BERT collection the hidden size 256 variants are published with A-4.)
small_bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1"
bert_encoder_individual = hub.KerasLayer(
    small_bert_url,
    trainable=True,
)
# Size passed to the individual-tweet tokenizer and to the model builder
bert_input_size_individual = 256

KeyboardInterrupt: 

In [None]:
# Tokenizer that turns each user's tweet feed into per-tweet BERT inputs
individual_tokenizer = bclf.BertIndividualTweetTokenizer(
    bert_encoder_individual,
    bert_input_size_individual,
)

# Tokenize inputs and labels for each split (train, validation, test)
tweet_individual_train, label_individual_train = (
    individual_tokenizer.tokenize_input(tweet_train),
    individual_tokenizer.tokenize_labels(label_train),
)
tweet_individual_val, label_individual_val = (
    individual_tokenizer.tokenize_input(tweet_val),
    individual_tokenizer.tokenize_labels(label_val),
)
tweet_individual_test, label_individual_test = (
    individual_tokenizer.tokenize_input(tweet_test),
    individual_tokenizer.tokenize_labels(label_test),
)

### Finding optimal hyperparameters (batch_size, epochs)

In [13]:
# Results already recorded for batch sizes 1, 8 and 32, hard-coded so that only
# the remaining batch size has to be trained in this run.
results = [{'batch_size': 1, 'epochs': 10, 'loss': 1.2027349472045898, 'accuracy': 0.570888876914978}, {'batch_size': 8, 'epochs': 10, 'loss': 0.8909910321235657, 'accuracy': 0.5973333120346069}, {'batch_size': 8, 'epochs': 50, 'loss': 2.1527915000915527, 'accuracy': 0.5951111316680908}, {'batch_size': 8, 'epochs': 100, 'loss': 2.6716349124908447, 'accuracy': 0.5737777948379517}, {'batch_size': 32, 'epochs': 10, 'loss': 0.8014382719993591, 'accuracy': 0.5855555534362793}, {'batch_size': 32, 'epochs': 50, 'loss': 1.791918396949768, 'accuracy': 0.5933333039283752}, {'batch_size': 32, 'epochs': 100, 'loss': 2.2307729721069336, 'accuracy': 0.5973333120346069}]
batch_sizes = [64]
epochs = [10, 50, 100]

for batch_size in batch_sizes:
    for epoch in epochs:

        # Fit the model and then evaluate
        with tf.device('gpu:0'):
            # A new encoder and model are built for every configuration
            bert_encoder_individual_test = hub.KerasLayer(
                small_bert_url,
                trainable=True,
            )
            # `bclf` is the alias the classifier module is imported under
            bert_model_individual_test = bclf.create_bert_model(bert_encoder_individual_test, bert_input_size_individual)
            # `learning_rate` replaces the deprecated `lr` keyword
            bert_model_individual_test.compile(Adam(learning_rate=1e-5), 'binary_crossentropy', ['accuracy'])

            bert_model_individual_test.fit(
                x=tweet_individual_train,
                y=label_individual_train,
                batch_size=batch_size,
                epochs=epoch,
            )

            # evaluate() returns [loss, accuracy] for the metrics compiled above
            evaluated_results = bert_model_individual_test.evaluate(tweet_individual_val, label_individual_val)
            results.append({
                'batch_size': batch_size,
                'epochs': epoch,
                'loss': evaluated_results[0],
                'accuracy': evaluated_results[1]
            })
            print(results[-1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
{'batch_size': 64, 'epochs': 10, 'loss': 0.725799024105072, 'accuracy': 0.6437777876853943}
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
{'batch_size': 64, 'epochs': 50, 'loss': 1.4171788692474365, 'accuracy': 0.6006666421890259}
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch

Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
{'batch_size': 64, 'epochs': 100, 'loss': 2.0028185844421387, 'accuracy': 0.6006666421890259}


In [21]:
# Pick the configuration with the highest accuracy
# (on ties, max() keeps the earliest entry in `results`).
best_result = max(
    results,
    key=lambda candidate: candidate['accuracy'],
)
best_result

{'batch_size': 64,
 'epochs': 10,
 'loss': 0.725799024105072,
 'accuracy': 0.6437777876853943}

In [15]:
# Email a summary of the grid search: the best configuration (by accuracy)
# followed by the full list of recorded results.
send_email(
    f"""
    Grid Search finished.
    Best model: 
    > batch_size: {best_result['batch_size']}
    > epochs: {best_result['epochs']}
    > loss: {best_result['loss']}
    > accuracy: {best_result['accuracy']}
    
    All results: {results}
    
    Now training model.
    """)

In [None]:
# Hard-coded copy of the grid-search results (loss and accuracy for each
# batch_size/epochs combination), so later cells can use them without
# re-running the search.
results = [{'batch_size': 1,
  'epochs': 10,
  'loss': 1.2027349472045898,
  'accuracy': 0.570888876914978},
 {'batch_size': 8,
  'epochs': 10,
  'loss': 0.8909910321235657,
  'accuracy': 0.5973333120346069},
 {'batch_size': 8,
  'epochs': 50,
  'loss': 2.1527915000915527,
  'accuracy': 0.5951111316680908},
 {'batch_size': 8,
  'epochs': 100,
  'loss': 2.6716349124908447,
  'accuracy': 0.5737777948379517},
 {'batch_size': 32,
  'epochs': 10,
  'loss': 0.8014382719993591,
  'accuracy': 0.5855555534362793},
 {'batch_size': 32,
  'epochs': 50,
  'loss': 1.791918396949768,
  'accuracy': 0.5933333039283752},
 {'batch_size': 32,
  'epochs': 100,
  'loss': 2.2307729721069336,
  'accuracy': 0.5973333120346069},
 {'batch_size': 64,
  'epochs': 10,
  'loss': 0.725799024105072,
  'accuracy': 0.6437777876853943},
 {'batch_size': 64,
  'epochs': 50,
  'loss': 1.4171788692474365,
  'accuracy': 0.6006666421890259},
 {'batch_size': 64,
  'epochs': 100,
  'loss': 2.0028185844421387,
  'accuracy': 0.6006666421890259}]

#### Fitting with optimal parameters

In [16]:
# Train the individual-tweet model with the batch size and epoch count of the
# best grid-search result. `bert_model_individual` and
# `bert_checkpoint_callback_individual` are defined outside this cell.
with tf.device('gpu:0'):
    # Fit
    bert_model_individual.fit(
        x=tweet_individual_train, 
        y=label_individual_train, 
        batch_size=best_result['batch_size'], 
        epochs=best_result['epochs'], 
        callbacks=[bert_checkpoint_callback_individual],
        validation_data=(tweet_individual_val, label_individual_val),
    )

Epoch 1/10
Epoch 00001: saving model to training/bert_training_individual_1\cp.ckpt
Epoch 2/10
Epoch 00002: saving model to training/bert_training_individual_1\cp.ckpt
Epoch 3/10
Epoch 00003: saving model to training/bert_training_individual_1\cp.ckpt
Epoch 4/10
Epoch 00004: saving model to training/bert_training_individual_1\cp.ckpt
Epoch 5/10
Epoch 00005: saving model to training/bert_training_individual_1\cp.ckpt
Epoch 6/10
Epoch 00006: saving model to training/bert_training_individual_1\cp.ckpt
Epoch 7/10
Epoch 00007: saving model to training/bert_training_individual_1\cp.ckpt
Epoch 8/10
Epoch 00008: saving model to training/bert_training_individual_1\cp.ckpt
Epoch 9/10
Epoch 00009: saving model to training/bert_training_individual_1\cp.ckpt
Epoch 10/10
Epoch 00010: saving model to training/bert_training_individual_1\cp.ckpt


In [17]:
# Evaluate the fitted model on the held-out test split; the email cell below
# reads eval_result[0] as the loss and eval_result[1] as the accuracy.
eval_result = bert_model_individual.evaluate(tweet_individual_test, label_individual_test)



In [18]:
# Email the test-set loss and accuracy of the fitted individual-tweet model.
send_email(
    f"""
    Individual model fit finished.
    > loss: {eval_result[0]}
    > accuracy: {eval_result[1]}
    """)

### Evaluating for each user (rather than each tweet)

In [12]:
def is_true_positive(label, prediction):
    """Return True when the label is 1 and the prediction is 1."""
    return label == 1 and prediction == 1


def is_false_positive(label, prediction):
    """Return True when the label is 0 but the prediction is 1."""
    return label == 0 and prediction == 1


def is_false_negative(label, prediction):
    """Return True when the label is 1 but the prediction is 0."""
    return label == 1 and prediction == 0


def is_true_negative(label, prediction):
    """Return True when the label is 0 and the prediction is 0."""
    return label == 0 and prediction == 0


def evaluate_model(predictions, labels):
    """Compute confusion-matrix counts and summary metrics for binary predictions.

    Args:
        predictions: iterable of predicted classes (0 or 1), one per example.
        labels: iterable of true classes (0 or 1), paired positionally with
            `predictions` (pairs are formed with `zip`, so any surplus items
            in the longer iterable are ignored).

    Returns:
        dict with the keys 'true_positives', 'false_positives',
        'false_negatives', 'true_negatives', 'accuracy', 'precision',
        'recall' and 'f1'. A metric whose denominator would be zero is NaN;
        this includes 'accuracy' when there is nothing to score.
    """
    tp = 0
    fp = 0
    fn = 0
    tn = 0
    for prediction, label in zip(predictions, labels):
        if is_true_positive(label, prediction):
            tp += 1
        elif is_false_positive(label, prediction):
            fp += 1
        elif is_false_negative(label, prediction):
            fn += 1
        elif is_true_negative(label, prediction):
            tn += 1
        else:
            # Not a 0/1 pair: report it and leave it out of every count.
            print("Error:", label, prediction)

    nan = float("NaN")
    total = tp + tn + fp + fn
    # Guard the division: empty input (or only unclassifiable pairs) used to
    # raise ZeroDivisionError here.
    accuracy = (tp + tn) / total if total > 0 else nan
    precision = tp / (tp + fp) if (tp + fp) > 0 else nan
    recall = tp / (tp + fn) if (tp + fn) > 0 else nan
    # NaN compares False against 0, so an undefined precision/recall also
    # yields an undefined f1.
    f1 = 2 * (precision * recall) / (precision + recall) if precision > 0 and recall > 0 else nan

    return {
        'true_positives': tp,
        'false_positives': fp,
        'false_negatives': fn,
        'true_negatives': tn,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [98]:
def create_test_model(trainable=True):
    """Build and compile a fresh BERT classifier for individual tweets.

    Args:
        trainable: forwarded to `hub.KerasLayer`; pass False when the model is
            only used for inference on loaded weights.

    Returns:
        Tuple `(encoder_layer, compiled_model)`.
    """
    bert_encoder_individual_test = hub.KerasLayer(
        small_bert_url,
        trainable=trainable,
    )
    bert_model_individual_test = bclf.create_bert_model(
        bert_encoder_individual_test,
        bert_input_size_individual,
    )
    # `learning_rate` is the supported keyword (`lr` is a deprecated alias) and
    # matches how bert_loss_testing configures Adam.
    bert_model_individual_test.compile(
        Adam(learning_rate=1e-5),
        'binary_crossentropy',
        ['accuracy'],
    )
    return bert_encoder_individual_test, bert_model_individual_test

In [95]:
# Train and save best individual models
# Train and save best individual models.
# Each (batch_size, epochs) pair gets its own freshly initialised model and its
# own checkpoint directory.
pairs = [(8, 10), (32, 100), (64, 10)]

for batch_size, epoch in pairs:
    with tf.device('gpu:0'):
        # Weights-only checkpoint, rewritten at the end of every epoch
        checkpoint = ModelCheckpoint(
            filepath=f"training/bert_individual/batch{batch_size}-epoch{epoch}-2/cp.ckpt",
            save_weights_only=True,
            verbose=1
        )

        # Build the model through the shared factory so the architecture and
        # optimiser settings cannot drift from the ones used when the saved
        # weights are loaded again later.
        bert_encoder_individual_test, bert_model_individual_test = create_test_model(trainable=True)

        bert_model_individual_test.fit(
            x=tweet_individual_train,
            y=label_individual_train,
            batch_size=batch_size,
            epochs=epoch,
            callbacks=[checkpoint],
        )

Epoch 1/10




Epoch 00001: saving model to training/bert_individual/batch8-epoch10-2\cp.ckpt
Epoch 2/10
Epoch 00002: saving model to training/bert_individual/batch8-epoch10-2\cp.ckpt
Epoch 3/10
Epoch 00003: saving model to training/bert_individual/batch8-epoch10-2\cp.ckpt
Epoch 4/10
Epoch 00004: saving model to training/bert_individual/batch8-epoch10-2\cp.ckpt
Epoch 5/10
Epoch 00005: saving model to training/bert_individual/batch8-epoch10-2\cp.ckpt
Epoch 6/10
Epoch 00006: saving model to training/bert_individual/batch8-epoch10-2\cp.ckpt
Epoch 7/10
Epoch 00007: saving model to training/bert_individual/batch8-epoch10-2\cp.ckpt
Epoch 8/10
Epoch 00008: saving model to training/bert_individual/batch8-epoch10-2\cp.ckpt
Epoch 9/10
Epoch 00009: saving model to training/bert_individual/batch8-epoch10-2\cp.ckpt
Epoch 10/10
Epoch 00010: saving model to training/bert_individual/batch8-epoch10-2\cp.ckpt
Epoch 1/100
Epoch 00001: saving model to training/bert_individual/batch32-epoch100-2\cp.ckpt
Epoch 2/100
Epoch

Epoch 00051: saving model to training/bert_individual/batch32-epoch100-2\cp.ckpt
Epoch 52/100
Epoch 00052: saving model to training/bert_individual/batch32-epoch100-2\cp.ckpt
Epoch 53/100
Epoch 00053: saving model to training/bert_individual/batch32-epoch100-2\cp.ckpt
Epoch 54/100
Epoch 00054: saving model to training/bert_individual/batch32-epoch100-2\cp.ckpt
Epoch 55/100
Epoch 00055: saving model to training/bert_individual/batch32-epoch100-2\cp.ckpt
Epoch 56/100
Epoch 00056: saving model to training/bert_individual/batch32-epoch100-2\cp.ckpt
Epoch 57/100
Epoch 00057: saving model to training/bert_individual/batch32-epoch100-2\cp.ckpt
Epoch 58/100
Epoch 00058: saving model to training/bert_individual/batch32-epoch100-2\cp.ckpt
Epoch 59/100
Epoch 00059: saving model to training/bert_individual/batch32-epoch100-2\cp.ckpt
Epoch 60/100
Epoch 00060: saving model to training/bert_individual/batch32-epoch100-2\cp.ckpt
Epoch 61/100
Epoch 00061: saving model to training/bert_individual/batch3

In [154]:
# Notify that the training cell above has completed
send_email("Finished training")

#### Classifying BERT predictions

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [96]:
def train_classifier(clf, X_train, y_train, X_val, y_val, batch_size, epochs):
    """Fit `clf` on the training data and record its validation predictions.

    `batch_size` and `epochs` are not used for fitting; they are copied into
    the result so each entry can be traced back to its BERT configuration.

    Returns:
        dict with keys 'batch_size', 'epochs', 'predictions_val' (output of
        `clf.predict(X_val)`) and 'labels_val' (`y_val` unchanged).
    """
    clf.fit(X_train, y_train)
    summary = dict(batch_size=batch_size, epochs=epochs)
    summary['predictions_val'] = clf.predict(X_val)
    summary['labels_val'] = y_val
    return summary

In [99]:
# (batch_size, epochs) of the saved BERT checkpoints to compare
pairs = [(8, 10), (32, 100), (64, 10)]
# One list of train_classifier results per final classifier variant
predictions = {
    name: []
    for name in ('logistic_regression', 'logistic_regression_sorted', 'svm', 'svm_sorted')
}

for batch_size, epoch in pairs:
    # Rebuild the network (frozen encoder) and restore the saved weights
    print(f"Loading weights for batch_size:{batch_size}, epochs: {epoch}")
    encoder, model = create_test_model(trainable=False)
    checkpoint_file = f"training/bert_individual/batch{batch_size}-epoch{epoch}-2/cp.ckpt"
    model.load_weights(checkpoint_file).expect_partial()

    # BERT outputs for the training and validation splits; these are the
    # features the final classifiers are fitted on
    X_train, y_train = calculate_user_predictions_from_individual_tweets(
        model, tweet_individual_train, label_individual_train
    )
    X_val, y_val = calculate_user_predictions_from_individual_tweets(
        model, tweet_individual_val, label_individual_val
    )

    # Row-wise sorted copies for the "*_sorted" variants
    X_train_sorted = np.sort(X_train, axis=1)
    X_val_sorted = np.sort(X_val, axis=1)

    # (results key, progress message, classifier factory, train features, val features)
    runs = (
        ('logistic_regression', "Training LR", LogisticRegression, X_train, X_val),
        ('logistic_regression_sorted', "Training LR sorted", LogisticRegression,
         X_train_sorted, X_val_sorted),
        ('svm', "Training SVM", lambda: SVC(probability=True), X_train, X_val),
        ('svm_sorted', "Training SVM sorted", lambda: SVC(probability=True),
         X_train_sorted, X_val_sorted),
    )
    for result_key, message, make_clf, features_train, features_val in runs:
        print(message)
        outcome = train_classifier(
            make_clf(), features_train, y_train, features_val, y_val, batch_size, epoch
        )
        predictions[result_key].append(outcome)

Loading weights for batch_size:8, epochs: 10
Training LR
Training LR sorted
Training SVM
Training SVM sorted
Loading weights for batch_size:32, epochs: 100
Training LR
Training LR sorted
Training SVM
Training SVM sorted
Loading weights for batch_size:64, epochs: 10
Training LR
Training LR sorted
Training SVM
Training SVM sorted


In [47]:
# Email the complete predictions dict, then display it as the cell output
send_email(f"Finished training, predictions:\n{predictions}")
predictions

{'logistic_regression': [{'batch_size': 8,
   'epochs': 10,
   'predictions_val': array([0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
          1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
          1]),
   'labels_val': array([0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
          1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
          1])},
  {'batch_size': 32,
   'epochs': 100,
   'predictions_val': array([0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
          1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
          1]),
   'labels_val': array([0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
          1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
          1])},
  {'batch_size': 64,
   'epochs': 10,
   'predictions_val': array([0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
          1, 1, 0, 0, 1, 0, 1, 1

In [100]:
import pandas as pd

# Metrics copied from evaluate_model's result, in column order
metric_names = ['accuracy', 'precision', 'recall', 'f1']

# One row per (final classifier, BERT configuration) combination
results = []
for key, val in predictions.items():
    for clf_info in val:
        eval_scores = evaluate_model(clf_info['predictions_val'], clf_info['labels_val'])
        results.append(
            [key, clf_info['batch_size'], clf_info['epochs']]
            + [eval_scores[metric] for metric in metric_names]
        )

df = pd.DataFrame(
    results,
    columns=['final classifier', 'batch_size', 'epochs'] + metric_names,
)

In [101]:
# Best configurations first: order by accuracy, ties broken by f1 (both descending)
df.sort_values(by=['accuracy', 'f1'], ascending=False)

Unnamed: 0,final classifier,batch_size,epochs,accuracy,precision,recall,f1
1,logistic_regression,32,100,0.777778,0.75,0.875,0.807692
9,svm_sorted,8,10,0.755556,0.782609,0.75,0.765957
7,svm,32,100,0.733333,0.772727,0.708333,0.73913
4,logistic_regression_sorted,32,100,0.711111,0.72,0.75,0.734694
2,logistic_regression,64,10,0.711111,0.73913,0.708333,0.723404
10,svm_sorted,32,100,0.711111,0.73913,0.708333,0.723404
0,logistic_regression,8,10,0.711111,0.761905,0.666667,0.711111
3,logistic_regression_sorted,8,10,0.711111,0.761905,0.666667,0.711111
8,svm,64,10,0.711111,0.789474,0.625,0.697674
5,logistic_regression_sorted,64,10,0.711111,0.823529,0.583333,0.682927


In [None]:
# Notify that the cells above have completed
send_email("Finished")

#### Predict the probability of a user being a fake news spreader

In [31]:
# Load the BERT model
# Load the BERT model
# NOTE(review): this reads ".../batch32-epoch100/cp.ckpt", whereas the training
# loop earlier in this notebook writes to ".../batch32-epoch100-2/cp.ckpt" -
# confirm which checkpoint is intended.
print(f"Loading weights for batch_size: 32, epochs: 100")
encoder, model = create_test_model()
model.load_weights(
    f"training/bert_individual/batch32-epoch100/cp.ckpt"
).expect_partial()

# Predict training and validation set data
X_train, y_train = calculate_user_predictions_from_individual_tweets(
    model, 
    tweet_individual_train, 
    label_individual_train,
)
X_val, y_val = calculate_user_predictions_from_individual_tweets(
    model, 
    tweet_individual_val, 
    label_individual_val,
)

# Train Logistic Regression model
print("Training LR")
log_reg_clf = LogisticRegression()
log_reg_clf.fit(X_train, y_train)

# Train SVM
# probability=True is needed because the next cell calls svm_clf.predict_proba
print("Training SVM")
svm_clf = SVC(probability=True)
svm_clf.fit(X_train, y_train)

# Train Logistic Regression when training data sorted
# (each row is sorted independently; validation rows get the same treatment)
print("Training LR sorted")
X_train_sorted = np.sort(X_train, axis=1)
X_val_sorted = np.sort(X_val, axis=1)
log_reg_sorted_clf = LogisticRegression()
log_reg_sorted_clf.fit(X_train_sorted, y_train)

Loading weights for batch_size: 32, epochs: 100
























Training LR
Training SVM
Training LR sorted


LogisticRegression()

In [32]:
# Show class probabilities and the hard prediction of each fitted classifier
# for the first validation user, followed by that user's true label.
first_user_reports = (
    ("LR", log_reg_clf, X_val),
    ("LR sorted", log_reg_sorted_clf, X_val_sorted),
    ("SVM", svm_clf, X_val),
)
for tag, fitted_clf, features in first_user_reports:
    first_user = [features[0]]
    print(f"{tag} User 1 predict probability:", fitted_clf.predict_proba(first_user))
    print(f"{tag} User 1 prediction:", fitted_clf.predict(first_user))
print("User 1 label:", y_val[0])

LR User 1 predict probability: [[0.52118894 0.47881106]]
LR User 1 prediction: [0]
LR sorted User 1 predict probability: [[0.74938058 0.25061942]]
LR sorted User 1 prediction: [0]
SVM User 1 predict probability: [[0.792287 0.207713]]
SVM User 1 prediction: [0]
User 1 label: 0


### Training BERT individual + Logistic Regression sorted
BERT Model:
* BERT L-12, Input 128
* Individual tweets
* batch_size 32, epochs 100

Logistic Regression Model:
* Predict the training set using BERT and sort the predictions within each datapoint (row) - these sorted rows are the LR training data
* Logistic Regression

In [84]:
# Load the above BERT model
# NOTE(review): the code in this cell is commented out, yet the cells below
# read `bert_individual_model`. On a fresh kernel that name has to be defined
# first (for example by re-enabling this block) or those cells will fail.
# bert_individual_model = bclf.create_bert_model(
#     bert_individual_encoder, 
#     bert_individual_size,
# )
# bert_individual_model.load_weights(
#     f"training/bert_individual/batch32-epoch100/cp.ckpt"
# ).expect_partial()

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1398e39eac8>

In [90]:
# `load` is imported alongside `dump` but is not used in this cell
from joblib import dump, load

# Train Logistic Regression classifier
# Features are the BERT outputs for the training split, sorted within each row.
X_train, y_train = calculate_user_predictions_from_individual_tweets(
    bert_individual_model, 
    tweet_individual_train, 
    label_individual_train,
)
X_train_sorted = np.sort(X_train, axis=1)

clf = LogisticRegression()
clf.fit(X_train_sorted, y_train)
# Persist the fitted classifier. `model_path` is not defined in this cell
# (presumably set earlier); the file name is appended by plain string
# concatenation, so it must already end with a path separator.
dump(clf, model_path + "logistic_regressor.joblib") 

['training/bert_individual/best-batch_size32-epochs-100-2/logistic_regressor.joblib']

In [92]:
# Evaluate the model
# Evaluate the model
# Validation and test features are built exactly like the training features:
# BERT outputs via calculate_user_predictions_from_individual_tweets, then
# sorted within each row.
X_val, y_val = calculate_user_predictions_from_individual_tweets(
    bert_individual_model, 
    tweet_individual_val, 
    label_individual_val,
)
X_val_sorted = np.sort(X_val, axis=1)

X_test, y_test = calculate_user_predictions_from_individual_tweets(
    bert_individual_model, 
    tweet_individual_test, 
    label_individual_test,
)
X_test_sorted = np.sort(X_test, axis=1)

# Hard class predictions from the logistic regression fitted in the previous cell
pred_val = clf.predict(X_val_sorted)
pred_test = clf.predict(X_test_sorted)

# Same report is emailed and printed
result = f"Validation eval:\n{evaluate_model(pred_val, y_val)}\nTest eval:\n{evaluate_model(pred_test, y_test)}"
send_email(result)
print(result)

Validation eval:
{'true_positives': 19, 'false_positives': 8, 'false_negatives': 5, 'true_negatives': 13, 'accuracy': 0.7111111111111111, 'precision': 0.7037037037037037, 'recall': 0.7916666666666666, 'f1': 0.7450980392156864}
Test eval:
{'true_positives': 14, 'false_positives': 12, 'false_negatives': 4, 'true_negatives': 15, 'accuracy': 0.6444444444444445, 'precision': 0.5384615384615384, 'recall': 0.7777777777777778, 'f1': 0.6363636363636364}


## BERT Tweet Feed Model

In [11]:
# TF-Hub handle of the encoder used for the tweet-feed model
# (the URL identifies it as small_bert, uncased English, L-12 / H-512 / A-8).
medium_bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1"
bert_encoder_feed = hub.KerasLayer(
    medium_bert_url, 
    trainable=True,
)

# Input size handed to BertTweetFeedTokenizer and to bert_loss_testing below
bert_input_size_feed = 512

### Training

In [12]:
# Tokenizer built from the feed encoder and the feed input size
feed_tokenizer = bclf.BertTweetFeedTokenizer(bert_encoder_feed, bert_input_size_feed)

# Convert every split (train / validation / test) of tweets and labels with
# the same tokenizer instance
tweet_feed_train = feed_tokenizer.tokenize_input(tweet_train)
label_feed_train = feed_tokenizer.tokenize_labels(label_train)
tweet_feed_val = feed_tokenizer.tokenize_input(tweet_val)
label_feed_val = feed_tokenizer.tokenize_labels(label_val)
tweet_feed_test = feed_tokenizer.tokenize_input(tweet_test)
label_feed_test = feed_tokenizer.tokenize_labels(label_test)

In [1]:
def bert_loss_testing(model_path, bert_size, bert_encoder_url, X_train, y_train, X_val, y_val,
                      batch_sizes=(16,), epoch_counts=(10,),
                      learning_rates=(5e-5, 3e-5, 2e-5, 1e-5)):
    """Train one fresh BERT model per hyper-parameter combination, logging each run.

    For every (batch size, epochs, learning rate) combination a new encoder
    and model are created, compiled with Adam and binary cross-entropy, and
    fitted with validation data. Each run writes TensorBoard logs and a
    weights-only checkpoint under `model_path`.

    Args:
        model_path: directory prefix for logs and checkpoints. Sub-paths are
            appended by string concatenation, so it must end with a path
            separator.
        bert_size: input size passed to `bclf.create_bert_model`.
        bert_encoder_url: handle passed to `hub.KerasLayer`.
        X_train, y_train: training inputs and labels for `fit`.
        X_val, y_val: validation inputs and labels for `fit`.
        batch_sizes: batch sizes to try (default: only 16).
        epoch_counts: epoch counts to try (default: only 10).
        learning_rates: Adam learning rates to try.

    Returns:
        None. Results are available through the TensorBoard logs and the
        saved checkpoints.
    """
    # Full grid; the learning rate varies fastest
    setup = [(b, e, lr)
             for b in batch_sizes
             for e in epoch_counts
             for lr in learning_rates]

    for batch_size, epochs, learning_rate in setup:
        # TensorBoard callback for logging loss
        model_name = f"batch_size{batch_size}-epochs{epochs}-lr{learning_rate}"
        log_dir = model_path + "logs/" + model_name
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

        # Checkpoint to save model (weights only, one directory per run)
        checkpoint_path = model_path + model_name + "/cp.ckpt"
        checkpoint_callback = ModelCheckpoint(
            filepath=checkpoint_path,
            save_weights_only=True,
            verbose=1,
        )

        # BERT model: a new encoder per run so no weights carry over between
        # hyper-parameter settings
        bert_encoder = hub.KerasLayer(
            bert_encoder_url,
            trainable=True,
        )
        bert_model = bclf.create_bert_model(
            bert_encoder,
            bert_size,
        )
        bert_model.compile(
            Adam(learning_rate=learning_rate),
            'binary_crossentropy',
            ['accuracy'],
        )

        # Train BERT
        bert_model.fit(
            x=X_train,
            y=y_train,
            batch_size=batch_size,
            epochs=epochs,
            callbacks=[checkpoint_callback, tensorboard_callback],
            validation_data=(X_val, y_val),
        )

In [2]:
# Run the learning-rate comparison for the tweet-feed model on the CPU device.
# NOTE: the recorded output below is "NameError: name 'tf' is not defined",
# i.e. this cell was last executed before the imports cell - run the imports
# and the feed encoder/tokenisation cells first.
with tf.device("/cpu:0"):
    bert_loss_testing(
        "training/bert_feed/loss-testing/", 
        bert_input_size_feed,    
        medium_bert_url, 
        tweet_feed_train, 
        label_feed_train, 
        tweet_feed_val, 
        label_feed_val,
    )

NameError: name 'tf' is not defined