In [1]:
import pickle
import numpy as np

# This adds the BERT path to the python path, needed for the imports inside BERT modules
import sys
sys.path.append('../ml/BERT') 

from Config import Config
from Vectorisation import Vectorisation
import masking as masking
import BERT as BERT

import tensorflow as tf
from sklearn.model_selection import KFold
from itertools import product




In [2]:
# Search space: maps a Config keyword to the candidate values to try for it.
# Entries that are commented out are excluded from the current run.
hyperparameters = dict(
    # EMBED_DIM=[32, 64, 128, 256],
    # NUM_HEAD=[2, 4, 8],
    # FF_DIM=[32, 64, 128, 256],
    # NUM_LAYERS=[1, 2, 4],
    LR=[0.0001, 0.001, 0.01],
    # EPOCH=[10, 20, 30],
)

# Number of cross-validation folds (passed to KFold as n_splits).
num_folds = 10

In [3]:
# Read the pickled dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a file from a trusted source.
with open("../../data/ml4science_data.pkl", mode="rb") as fp:
    full_data = pickle.load(fp)

# Pull the "sequence" field out of every entry of full_data["sequences"],
# in index order 0..len-1.
data_list = [
    full_data["sequences"][idx]["sequence"]
    for idx in range(len(full_data["sequences"]))
]

# Filled in by the search cell: hyperparameter name -> best values and accuracies.
best_hyperparameters = dict()

In [4]:
# Shuffled K-fold splitter shared by every hyperparameter value; the fixed
# random_state keeps the fold assignment the same on every call to split().
kf = KFold(
    n_splits=num_folds,
    random_state=42,
    shuffle=True,
)

In [5]:
for hyperparameter in hyperparameters.items():
    accuracy_seq_list = []
    accuracy_mask_list = []

    for value in hyperparameter[1]:
        print(f"\n\nTesting {hyperparameter[0]} with value {value}")

        match hyperparameter[0]:
            case "EMBED_DIM":
                config = Config(EMBED_DIM=value)
            case "NUM_HEAD":
                config = Config(NUM_HEAD=value)
            case "FF_DIM":
                config = Config(FF_DIM=value)
            case "NUM_LAYERS":
                config = Config(NUM_LAYERS=value)
            case "LR":
                config = Config(LR=value)
            case "EPOCH":
                config = Config(EPOCH=value)

        vectorisation = Vectorisation(config)
        accuracies_seq = []
        accuracies_mask = []

        for i, (train_index, test_index) in enumerate(kf.split(data_list)):
            print("\n\n=========================================")
            print(f"In fold {i + 1}\n")

            train_data = [data_list[i] for i in train_index]
            test_data = [data_list[i] for i in test_index]

            seps_train = vectorisation.sep_from_seq(train_data)
            seps_test = vectorisation.sep_from_seq(test_data)

            train_data_encoded = vectorisation.encode(train_data, seps_train)
            test_data_encoded = vectorisation.encode(test_data, seps_test)

            x_masked_train, y_masked_train, sample_weights_train = masking.mask_input_and_labels(train_data_encoded, config.TOKEN_DICT, seed=32)
            x_masked_test, y_masked_test, sample_weights_test = masking.mask_input_and_labels(test_data_encoded, config.TOKEN_DICT, seed=32)

            mlm_ds_train = tf.data.Dataset.from_tensor_slices((x_masked_train, y_masked_train, sample_weights_train))
            mlm_ds_train = mlm_ds_train.shuffle(1000).batch(config.BATCH_SIZE)
            mlm_ds_test = tf.data.Dataset.from_tensor_slices((x_masked_test, y_masked_test, sample_weights_test))
            mlm_ds_test = mlm_ds_test.shuffle(1000).batch(config.BATCH_SIZE)

            bert_masked_model = BERT.create_masked_language_bert_model(config)
            bert_masked_model.fit(mlm_ds_train, epochs=config.bert.epoch)

            predictions = bert_masked_model.predict(x_masked_test)
            predictions_max = np.argmax(predictions, axis=2)

            print("Predictions: ", predictions)
            print("Predictions max: ", predictions_max)

            accuracy_seq = np.sum((predictions_max == y_masked_test) * (y_masked_test != 0)) / np.sum(y_masked_test != 0)
            accuracies_seq.append(accuracy_seq)
            where_equal = (predictions_max == y_masked_test)
            where_masked = (x_masked_test == config.TOKEN_DICT['[MASK]'])
            print("Where equal: ", where_equal)
            print("Where masked: ", where_masked)
            accuracy_mask = np.sum(where_equal * where_masked) / np.sum(where_masked)
            accuracies_mask.append(accuracy_mask)

        accuracy_seq_list.append(np.mean(accuracies_seq))
        accuracy_mask_list.append(np.mean(accuracies_mask))

    best_value_seq = hyperparameter[1][np.argmax(accuracy_seq_list)]
    best_accuracy_seq = np.max(accuracy_seq_list)
    best_value_mask = hyperparameter[1][np.argmax(accuracy_mask_list)]
    best_accuracy_mask = np.max(accuracy_mask_list)
    best_hyperparameters[hyperparameter[0]] = {"value_seq": best_value_seq, 
                                                "accuracy_seq": best_accuracy_seq, 
                                                "all_accuracies_seq": accuracy_seq_list, 
                                                "value_mask": best_value_mask, 
                                                "accuracy_mask": best_accuracy_mask, 
                                                "all_accuracies_mask": accuracy_mask_list}

print(f"The best hyperparameters and their values are:")
for key, value in best_hyperparameters.items():
    print("For the sequence")
    print(f"{key}: {value['value_seq']} with an accuracy of {value['accuracy_seq']} and all accuracies: {value['all_accuracies_seq']}")
    print("For the mask")
    print(f"{key}: {value['value_mask']} with an accuracy of {value['accuracy_mask']}, and all accuracies: {value['all_accuracies_mask']}")



Testing LR with value 0.0001


In fold 1


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Predictions:  [[[0.0024542  0.00077884 0.00472155 ... 0.00126099 0.02739791 0.726801  ]
  [0.00929288 0.01509577 0.01511447 ... 0.03010885 0.13646008 0.25156644]
  [0.01004651 0.01519384 0.02108739 ... 0.03220861 0.12192103 0.22669448]
  ...
  [0.01081514 0.01597601 0.02393954 ... 0.00565434 0.01583618 0.06979881]
  [0.01351568 0.00876976 0.02863104 ... 0.00592915 0.0130367  0.05819433]
  [0.01490683 0.00672174 0.03014431 ... 0.00819041 0.01203134 0.05799625]]

 [[0.00244857 0.0007744  0.00473749 ... 0.00125639 0.0275134  0.7279529 ]
  [0.00926534 0.01500619 0.01509888 ... 0.03001172 0.13684691 0.25266665]
  [0.01002599 0.0150897  0.02106877 ... 0.03208173 0.12235204 0.22797628]
  ...
  [0.01080402 0.01591119 0.02402068 ... 0.00564597 0.01592094 0.07029166]
  [0.01349989 0.00873455 0.02872166 ... 0.00591905 0.01310517 0.05861259]
  