In [1]:
import pickle
import numpy as np
import sys
sys.path.append('../ml/BERT')
from Config import Config
from Vectorisation import Vectorisation
import masking
import BERT

import tensorflow as tf
from sklearn.model_selection import KFold
from itertools import product




In [2]:
# Search space for the one-at-a-time hyperparameter sweep: each key is a
# keyword argument accepted by Config, each value the list of candidates to
# try.  Axes that are currently disabled are kept here for reference:
#   EMBED_DIM:  [32, 64, 128, 256]
#   NUM_HEAD:   [2, 4, 8]
#   FF_DIM:     [32, 64, 128, 256]
#   NUM_LAYERS: [1, 2, 4]
#   EPOCH:      [10, 20, 30]
hyperparameters = dict(
    LR=[1e-4, 1e-3, 1e-2],
)

# Number of cross-validation folds used to score every candidate value.
num_folds = 10

In [3]:
# Load the pickled dataset.
# NOTE(review): pickle.load executes arbitrary code embedded in the file, so
# this must only ever be pointed at a trusted, locally produced pickle.
with open("../../data/ml4science_data.pkl", "rb") as pickle_file:
    full_data = pickle.load(pickle_file)

# Pull the raw "sequence" field out of every record, in index order.
# Indexed access is kept on purpose: the container type of
# full_data["sequences"] is not visible here (list vs. int-keyed mapping).
sequence_records = full_data["sequences"]
data_list = [sequence_records[idx]["sequence"] for idx in range(len(sequence_records))]

In [4]:
# Shared K-fold splitter: shuffling with a fixed random_state makes every
# hyperparameter value see exactly the same train/test partitions.
kf = KFold(
    n_splits=num_folds,
    shuffle=True,
    random_state=42,
)

In [5]:
# One-at-a-time hyperparameter sweep with K-fold cross-validation.
#
# For every hyperparameter name in `hyperparameters`, each candidate value is
# used to build a Config; a fresh masked-language model is then trained and
# scored on every fold produced by `kf`.  The mean accuracy over the folds is
# recorded per candidate, and the best candidate is printed at the end.

# Epoch count used when the Config object exposes no EPOCH attribute (this is
# the value the training call previously hard-coded).
DEFAULT_EPOCHS = 20

for name, candidates in hyperparameters.items():
    print(f"Testing {name}")
    accuracy_list = []  # mean cross-validated accuracy, one entry per candidate

    for value in candidates:
        print(f"\n\nTesting {name} with value {value}")

        # Build the config from the hyperparameter name directly.  Unlike a
        # fixed list of cases, an unsupported name now fails loudly inside
        # Config instead of silently reusing a stale `config` from a previous
        # iteration (or raising NameError on the first one).
        config = Config(**{name: value})

        # Honour the configured epoch count so that sweeping EPOCH actually
        # changes training; fall back to the previous fixed value otherwise.
        # NOTE(review): assumes Config stores the EPOCH keyword under the same
        # attribute name, as it does for BATCH_SIZE / TOKEN_DICT - confirm.
        epochs = getattr(config, "EPOCH", DEFAULT_EPOCHS)

        vectorisation = Vectorisation(config)
        accuracies = []  # per-fold accuracy for this candidate

        for fold, (train_index, test_index) in enumerate(kf.split(data_list)):
            print("\n\n=========================================")
            print(f"In fold {fold + 1}\n")

            train_data = [data_list[idx] for idx in train_index]
            test_data = [data_list[idx] for idx in test_index]

            train_data_encoded = vectorisation.encode(train_data)
            test_data_encoded = vectorisation.encode(test_data)

            # Same seed for both splits so masking is reproducible across runs.
            x_masked_train, y_masked_train, sample_weights_train = masking.mask_input_and_labels(train_data_encoded, config.TOKEN_DICT, seed=32)
            x_masked_test, y_masked_test, sample_weights_test = masking.mask_input_and_labels(test_data_encoded, config.TOKEN_DICT, seed=32)

            mlm_ds_train = tf.data.Dataset.from_tensor_slices((x_masked_train, y_masked_train, sample_weights_train))
            mlm_ds_train = mlm_ds_train.shuffle(1000).batch(config.BATCH_SIZE)
            mlm_ds_test = tf.data.Dataset.from_tensor_slices((x_masked_test, y_masked_test, sample_weights_test))
            mlm_ds_test = mlm_ds_test.shuffle(1000).batch(config.BATCH_SIZE)

            # A new model is created for every fold of every candidate; reset
            # Keras' global state first so memory held for earlier models can
            # be released instead of accumulating over the whole sweep.
            tf.keras.backend.clear_session()
            bert_masked_model = BERT.create_masked_language_bert_model(config)
            bert_masked_model.fit(mlm_ds_train, epochs=epochs, validation_data=mlm_ds_test)

            predictions = bert_masked_model.predict(x_masked_test)
            predictions_max = np.argmax(predictions, axis=2)

            # Accuracy is measured only at positions whose input token is
            # [MASK]; the mask is computed once and reused.
            masked_positions = x_masked_test == config.TOKEN_DICT['[MASK]']
            accuracy = np.sum((predictions_max == y_masked_test) * masked_positions) / np.sum(masked_positions)
            accuracies.append(accuracy)

        accuracy_list.append(np.mean(accuracies))

    print(f"The best value for {name} is {candidates[np.argmax(accuracy_list)]} with an accuracy of {np.max(accuracy_list)}")


Testing LR


Testing LR with value 0.0001


In fold 1


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In fold 2

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In fold 3

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In fold 4

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20

KeyboardInterrupt: 