## Load and Preprocess data

In [1]:
import TE_lib as te
import os
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# select one of the two lines of code below

#df, tok = te.prep_tvt_from_func_1()  # compute data from original files
df, tok = te.read_tvt()               # read data from file

In [3]:
df

Unnamed: 0,origin,chunk,set,tokens,token_ids,attention_masks,labels
0,100.fasta,1,test,"[mcl00578, mcl00294, mcl01096, mcl01000, mcl01...","[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,100.fasta,2,test,"[mcl01236, mcl00376, mcl04030, mcl00368, mcl01...","[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,100.fasta,3,test,"[mcl00784, mcl01263, mcl00374, mcl01238, mcl00...","[111, 112, 113, 114, 115, 116, 117, 118, 119, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,100.fasta,4,test,"[mcl03193, mcl02304, mcl00809, mcl01966, mcl02...","[160, 161, 162, 163, 164, 165, 166, 167, 168, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,100.fasta,5,test,"[mcl02582, mcl03806, mcl03733, mcl03779, mcl03...","[210, 211, 212, 213, 214, 215, 216, 217, 218, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
230222,9998.fasta,85,training,"[mcl01916, mcl00745, mcl06673, mcl06098, mcl00...","[3154, 3153, 6115, 6116, 3150, 3145, 3144, 314...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
230223,9998.fasta,86,training,"[mcl04304, mcl04602, mcl05925, mcl12035, mcl12...","[6139, 6140, 3116, 6141, 6142, 6143, 6144, 614...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
230224,9999.fasta,1,training,"[mcl09252, mcl09004, mcl11223, mcl09170, mcl02...","[6181, 6182, 6183, 6184, 3445, 3444, 6185, 618...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
230225,9999.fasta,2,training,"[mcl07933, mcl08867, mcl07944, mcl08857, mcl07...","[6227, 6228, 6229, 6230, 6231, 6232, 6233, 623...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [4]:
print("number of unique tokens in the data: ", len(tok.keys()))

number of unique tokens in the data:  32634


In [5]:
print("length of each chunk in the data: ", len(df["token_ids"].iloc[0]))

length of each chunk in the data:  150


## Build masked training and validation data

In [6]:
mask_token_id = 1 # to be checked if the model requires another pre-defined token-id

def get_masked_input_and_labels(encoded_texts):
    # 15% BERT masking
    inp_mask = np.random.rand(*encoded_texts.shape) < 0.15
    # Do not mask special tokens
    inp_mask[encoded_texts <= 11] = False
    # Set targets to -1 by default, it means ignore
    #labels = -1 * np.ones(encoded_texts.shape, dtype=int)
    # Set labels for masked tokens
    #labels[inp_mask] = encoded_texts[inp_mask]

    # Prepare input
    encoded_texts_masked = np.copy(encoded_texts)
    encoded_texts_masked[inp_mask] = mask_token_id
    
    # y_labels would be same as encoded_texts i.e input tokens
    y_labels = np.copy(encoded_texts)

    return encoded_texts_masked, y_labels


In [7]:
df_training = df[df["set"] == "training"]["token_ids"]
df_validation = df[df["set"] == "validation"]["token_ids"]

In [8]:
training_masked, training_labels = get_masked_input_and_labels(np.array([x for x in df_training.tolist()]))
validation_masked, validation_labels = get_masked_input_and_labels(np.array([x for x in df_validation.tolist()]))

In [9]:
training_attention_mask = np.array([x for x in df[df["set"] == "training"]["attention_masks"]])
validation_attention_mask = np.array([x for x in df[df["set"] == "validation"]["attention_masks"]])

In [10]:
training_masked[0]

array([ 440, 4846, 4847, 4848, 4849, 4850, 4851, 4852, 4853, 4854, 4855,
       4856, 4857, 4858, 4859, 4860,    1, 4862,    1, 4864,    1, 4866,
          1, 4868, 4869,    1, 4871, 4872, 4873, 4874,    1, 4876, 4877,
       4878, 4879, 4880, 4881, 4882, 4883, 4884,    1, 1210,    1, 4887,
       4888, 4889, 4890, 4891, 4892, 4893, 4894, 4895, 2785,    1,  407,
       4897, 4898,    1, 4900, 2943, 4901,    1, 4902,  373,  374, 4903,
       4904, 4905, 4906, 4907,    1,    1, 4910, 4911, 4912, 4913, 4914,
       4915, 4916, 4917, 4918, 4919, 4920,    1, 4922, 4923,    1, 4925,
          1, 4927,    1, 4929,    1, 4931, 4932, 2137, 4933, 4934, 1878,
       4935, 4936, 4937, 4938, 4939, 4940,    1, 4942, 4943, 4944, 4945,
       4946, 4947, 4948, 4949, 4950, 4951, 4952, 4953, 4954, 4955, 4956,
          1, 4958,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [11]:
training_labels[0]

array([ 440, 4846, 4847, 4848, 4849, 4850, 4851, 4852, 4853, 4854, 4855,
       4856, 4857, 4858, 4859, 4860, 4861, 4862, 4863, 4864, 4865, 4866,
       4867, 4868, 4869, 4870, 4871, 4872, 4873, 4874, 4875, 4876, 4877,
       4878, 4879, 4880, 4881, 4882, 4883, 4884, 4885, 1210, 4886, 4887,
       4888, 4889, 4890, 4891, 4892, 4893, 4894, 4895, 2785, 4896,  407,
       4897, 4898, 4899, 4900, 2943, 4901, 4637, 4902,  373,  374, 4903,
       4904, 4905, 4906, 4907, 4908, 4909, 4910, 4911, 4912, 4913, 4914,
       4915, 4916, 4917, 4918, 4919, 4920, 4921, 4922, 4923, 4924, 4925,
       4926, 4927, 4928, 4929, 4930, 4931, 4932, 2137, 4933, 4934, 1878,
       4935, 4936, 4937, 4938, 4939, 4940, 4941, 4942, 4943, 4944, 4945,
       4946, 4947, 4948, 4949, 4950, 4951, 4952, 4953, 4954, 4955, 4956,
       4957, 4958,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [12]:
training_attention_mask[0]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [13]:
validation_masked[0]

array([3551, 3552, 3553,    1, 5903, 5904, 5905, 5906, 3557, 3558, 3559,
       3560, 3561, 1940, 1904, 1903, 1902, 1901, 1900, 3912,  472,  473,
        474,  475,  476,  477,    1,  479, 3563, 3564, 3565, 3566, 3567,
       7347, 3574, 3575, 3576,  247, 5910, 5911, 5912, 3577, 6372, 3578,
       3579, 3580, 3581,    1, 2093,    1, 2091, 3582, 3583, 3584, 3585,
       3588, 5913, 6973, 5914, 5915, 5916, 6974,    1, 5918, 6975, 3589,
       5919, 3590, 4509, 4510, 4511, 4512, 4513, 4514,    1, 5921,    1,
       5922, 5923, 5924,    1, 2062, 5925,    1, 4616, 6370, 4618,    1,
       6978, 5929, 3594, 3595, 3596, 3597, 3598, 3599, 3600, 3601, 3602,
       3603, 3604,    1, 3606, 5930, 5931, 3608, 3609, 3610, 3611, 3612,
       3613, 3614, 3615, 3616, 3617, 3618, 3619, 3620, 3621, 3622, 5934,
       3623, 3625, 3624,    1,    1, 3627, 3628,    1, 3630, 3632, 5936,
       3633, 3634,    1, 3636,    1, 3638, 3639, 3640, 3641, 6979, 5937,
       5938, 3642, 3643,    1,    1, 3646, 3647])

In [14]:
validation_labels

array([[3551, 3552, 3553, ..., 3645, 3646, 3647],
       [2091, 3582, 3583, ..., 3701, 3702, 3704],
       [3604, 3605, 3606, ..., 3936, 3937, 3938],
       ...,
       [3208, 3207, 3206, ..., 3063, 3062, 6165],
       [3158, 5084, 5083, ..., 2999, 2998, 2997],
       [6138, 3123, 3122, ...,    0,    0,    0]])

## Initialize the model
    freshly initialize a DistilBERT model. We’ll use the same configuration for our model as for the distilbert-base-uncased model, so we load the pretrained configuration, make sure that the tokenizer size matches the model vocabulary size and max length of embeddings matches length of our chunks. 


In [15]:
from transformers import TFDistilBertForMaskedLM, AutoConfig

config = AutoConfig.from_pretrained(
    "distilbert-base-uncased",
    vocab_size=len(tok.keys())+10,   # +10 for the reserved special tokens
    max_position_embeddings=len(df["token_ids"].iloc[0])
)

## Load a new (not pretrained) model
Load a new model. Note that we don’t use the from_pretrained() function, since we’re actually initializing a model ourself:

In [16]:
model = TFDistilBertForMaskedLM(config)
model(model.dummy_inputs)  # Builds the model
model.summary()

Model: "tf_distil_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 67714560  
 nLayer)                                                         
                                                                 
 vocab_transform (Dense)     multiple                  590592    
                                                                 
 vocab_layer_norm (LayerNorm  multiple                 1536      
 alization)                                                      
                                                                 
 vocab_projector (TFDistilBe  multiple                 25219972  
 rtLMHead)                                                       
                                                                 
Total params: 68,339,332
Trainable params: 68,339,332
Non-trainable params: 0
__________________________

## Log in to huggingface

from huggingface_hub import notebook_login

notebook_login()

## Compile the model
configure the training hyperparameters and call compile() and fit(). We’ll use a learning rate schedule with some warmup to improve the stability of training:

In [17]:
# with tensorflow

from transformers import create_optimizer
import tensorflow as tf

num_train_steps = len(training_masked[0:100])   ## change this when training with the full data
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


### Convert the data dictionaries


    there are three possibilities you can use to gather all the input Tensors in the first positional argument :

    a single Tensor with input_ids only and nothing else: model(inputs_ids)
    a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: model([input_ids, attention_mask])
    a dictionary with one or several input Tensors associated to the input names given in the docstring: model({"input_ids": input_ids})


In [18]:
# data has been reduced to first 100 rows for training and 20 rows for validation.
# This should be changed to the full data as soon as we see the model.fit running without bugs

tf_train_dict = { 'input_ids': tf.convert_to_tensor(training_masked[0:100]),
           'attention_mask': tf.convert_to_tensor(training_attention_mask[0:100]),
           'labels': tf.convert_to_tensor(training_labels[0:100])
         }

In [19]:
tf_val_dict = { 'input_ids': tf.convert_to_tensor(validation_masked[0:20]),
           'attention_mask': tf.convert_to_tensor(validation_attention_mask[0:20]),
           'labels': tf.convert_to_tensor(validation_labels[0:20])
         }

## Train the model with reduced data

In [20]:
short_training_masked = training_masked[0: 10]
short_training_attention_mask = training_attention_mask[0: 10]
short_training_labels = training_labels[0: 10]

In [21]:
mini_batch_size = 3
epochs = 2

In [31]:
#from transformers.keras_callbacks import PushToHubCallback
#callback = PushToHubCallback(output_dir="codeparrot-ds", tokenizer=tokenizer)

for epoch in range(0,epochs):
    print("start epoch ", epoch+1)
    for start in range(0, len(short_training_masked), mini_batch_size):
        end = min(start + mini_batch_size, len(short_training_masked)) 
        mini_batch = { 'input_ids': tf.convert_to_tensor(short_training_masked[start: end]),
                      'attention_mask': tf.convert_to_tensor(short_training_attention_mask[start: end]),
                      'labels': tf.convert_to_tensor(short_training_labels[start: end])
         }

        model.fit(mini_batch, validation_data=tf_val_dict, epochs=1, batch_size = mini_batch_size, verbose=1) #, callbacks=[callback])

print("--------------------------- end ---------------------------------")

start epoch  1
start epoch  2
--------------------------- end ---------------------------------


## Train the model with full data

In [32]:
mini_batch_size = 32
epochs = 1

In [33]:
#from transformers.keras_callbacks import PushToHubCallback
#callback = PushToHubCallback(output_dir="codeparrot-ds", tokenizer=tokenizer)

for epoch in range(0,epochs):
    print("start epoch ", epoch+1)
    for start in range(0, len(training_masked), mini_batch_size):
        end = min(start + mini_batch_size, len(training_masked)) 
        mini_batch = { 'input_ids': tf.convert_to_tensor(training_masked[start: end]),
                      'attention_mask': tf.convert_to_tensor(training_attention_mask[start: end]),
                      'labels': tf.convert_to_tensor(training_labels[start: end])
         }

        model.fit(mini_batch, validation_data=tf_val_dict, epochs=1, batch_size = mini_batch_size, verbose=1) #, callbacks=[callback])

print("--------------------------- end ---------------------------------")

start epoch  1


KeyboardInterrupt: 

## Get an output from the newly pretrained model

In [23]:
def get_prediction(i, predictions):
    probs = tf.nn.softmax(predictions[0, i])
    result = tf.math.top_k(probs, k=3)
    pred = [result.indices.numpy(), result.values.numpy()]

    return pred

#### Define input: Select a single chunk input from data

In [24]:
test_element = 101
input = validation_masked[test_element]
target = validation_labels[test_element]

#### Predict with the pretrained model

In [34]:
outputs = model(input)

#### Evaluate the result

In [26]:
input

array([   1, 2302, 2301, 2300,    1, 2298, 2297, 5136, 3086, 5137, 5138,
       5139, 5140, 6538, 5142, 6539, 5143, 7115, 5144, 2296, 2295,    1,
       5145, 2289, 2288, 2287, 5146,    1,    1, 2284, 2283, 2279, 2278,
       2277, 2276,    1,    1, 2273, 2272, 2271, 2270, 2269, 2268, 2267,
       2266, 2265,    1,    1, 7113,    1, 7111, 7110, 6990, 4986,    1,
       7109, 5147, 5148, 2263, 2262, 2261,    1, 2259, 5149, 2258, 2257,
       2256, 2255,    1, 2253,    1, 2251, 2246, 5150,    1, 5152, 5153,
       5154, 2212, 2211,    1, 2209, 2208, 2207,    1,    1, 7108, 2205,
       2204, 2203, 2202, 2201, 2200, 2199, 5156, 7107, 5157, 2197, 2196,
       2195,    1, 2193, 2192, 2191, 2190, 2189,    1, 2186,    1,    1,
       2183, 2179,    1, 2177, 2176, 2175,    1, 2173,    1, 2170,    1,
       2168, 2167, 2166, 2165, 2164, 2163, 2162, 5159, 2160, 2159, 2158,
       2157, 2156, 2155, 2154, 2151, 2150, 7105,    1, 7103,    1, 7101,
       7100, 7099,    1, 7097, 7096, 2148, 2147])

In [27]:
target

array([2307, 2302, 2301, 2300, 2299, 2298, 2297, 5136, 3086, 5137, 5138,
       5139, 5140, 6538, 5142, 6539, 5143, 7115, 5144, 2296, 2295, 2291,
       5145, 2289, 2288, 2287, 5146, 2286, 2285, 2284, 2283, 2279, 2278,
       2277, 2276, 2275, 2274, 2273, 2272, 2271, 2270, 2269, 2268, 2267,
       2266, 2265, 2264, 7114, 7113, 7112, 7111, 7110, 6990, 4986, 4985,
       7109, 5147, 5148, 2263, 2262, 2261, 2260, 2259, 5149, 2258, 2257,
       2256, 2255, 2254, 2253, 2252, 2251, 2246, 5150, 5151, 5152, 5153,
       5154, 2212, 2211, 2210, 2209, 2208, 2207, 2206, 5155, 7108, 2205,
       2204, 2203, 2202, 2201, 2200, 2199, 5156, 7107, 5157, 2197, 2196,
       2195, 2194, 2193, 2192, 2191, 2190, 2189, 5158, 2186, 2187, 2184,
       2183, 2179, 2178, 2177, 2176, 2175, 2174, 2173, 2171, 2170, 2169,
       2168, 2167, 2166, 2165, 2164, 2163, 2162, 5159, 2160, 2159, 2158,
       2157, 2156, 2155, 2154, 2151, 2150, 7105, 7104, 7103, 7102, 7101,
       7100, 7099, 7098, 7097, 7096, 2148, 2147])

In [35]:
i = 0
for x in input:
    if x == 1:
        print("[mask] at position: ", i, 
              "\n\tprediction is:", get_prediction(i, outputs[0])[0],
              "\n\tweight is:", get_prediction(i, outputs[0])[1],
              "\n\ttarget was: ", target[i])
    i = i+1
            

[mask] at position:  0 
	prediction is: [27660  8948 25219] 
	weight is: [0.00019314 0.00018957 0.00017921] 
	target was:  2307
[mask] at position:  4 
	prediction is: [10654 19123 25007] 
	weight is: [0.00019097 0.00016114 0.00015968] 
	target was:  2299
[mask] at position:  21 
	prediction is: [27660    20 30564] 
	weight is: [0.00018934 0.00018866 0.00016391] 
	target was:  2291
[mask] at position:  27 
	prediction is: [26252 23175 16251] 
	weight is: [0.00019577 0.00018884 0.0001659 ] 
	target was:  2286
[mask] at position:  28 
	prediction is: [ 1441  9074 21146] 
	weight is: [0.00019922 0.00019419 0.00017677] 
	target was:  2285
[mask] at position:  35 
	prediction is: [ 5532  6394 16893] 
	weight is: [0.00017323 0.00016954 0.00015445] 
	target was:  2275
[mask] at position:  36 
	prediction is: [24433 17428 29763] 
	weight is: [0.00021104 0.00018698 0.0001784 ] 
	target was:  2274
[mask] at position:  46 
	prediction is: [ 2910 14889 12568] 
	weight is: [0.00022252 0.00019489 0.