# TEdetection: NER_application_V1

## Load and Preprocess data

In [1]:
import transformers
print(transformers.__version__)

4.19.2


In [2]:
import os
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import shelve
from datetime import datetime
import matplotlib.pyplot as plt

In [3]:
# Read the preprocessed data inside a context manager so the shelf is
# reliably closed once both objects have been loaded.
with shelve.open(os.path.join("..", "data", "prep_trainvalidationtest_1")) as results:
    df = results["chunks"]
    tokenize_table = results["tokenize_table"]

<bound method Shelf.close of <shelve.DbfilenameShelf object at 0x0000026781DF1F40>>

In [4]:
# Preview the first two rows to check the columns loaded from the shelf.
df.head(2)

Unnamed: 0,origin,chunk,set,tokens,token_ids,attention_masks,labels
0,100.fasta,1,test,"[mcl00578, mcl00294, mcl01096, mcl01000, mcl01...","[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,100.fasta,2,test,"[mcl01236, mcl00376, mcl04030, mcl00368, mcl01...","[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Extract training and validation data for the NER task

In [5]:
# Select the rows of each split once, then turn every list-valued column
# into a NumPy array.
training_rows = df[df["set"] == "training"]
df_training_tkn = training_rows["token_ids"]
training_tokens = np.array(df_training_tkn.tolist())
df_training_lbs = training_rows["labels"]
training_labels = np.array(df_training_lbs.tolist())
df_training_att = training_rows["attention_masks"]
training_attention_masks = np.array(df_training_att.tolist())

validation_rows = df[df["set"] == "validation"]
df_validation_tkn = validation_rows["token_ids"]
validation_tokens = np.array(df_validation_tkn.tolist())
df_validation_lbs = validation_rows["labels"]
validation_labels = np.array(df_validation_lbs.tolist())
df_validation_att = validation_rows["attention_masks"]
validation_attention_masks = np.array(df_validation_att.tolist())

## Check the prepared data


*   df                        : dataframe with all input data (training, validation, test)

*   training_tokens           : NumPy array with training tokens
*   training_labels           : NumPy array with training labels (named entities)
*   training_attention_masks  : NumPy array with training attention masks

*   validation_tokens           : NumPy array with validation tokens
*   validation_labels           : NumPy array with validation labels (named entities)
*   validation_attention_masks  : NumPy array with validation attention masks




In [6]:
# Token ids at positions 44-61 of validation chunk 110.
validation_tokens[110][44:62]

array([1788, 1787, 1786, 1785, 1784, 1783, 1782, 1780, 1779, 1778, 1777,
       1776, 5238, 1775, 1774, 1773, 1772, 1771])

In [7]:
# Labels for the same positions (44-61) of validation chunk 110.
validation_labels[110][44:62]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [8]:
# Attention-mask values for the same positions (44-61) of validation chunk 110.
validation_attention_masks[110][44:62]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

## Load the pretrained model

In [9]:
from transformers import TFAutoModelForTokenClassification
  
# No tokenizer is loaded: the cells above take ready-made token ids from `df`.
# Load the token-classification checkpoint used for all predictions below.
model = TFAutoModelForTokenClassification.from_pretrained("FritzOS/TEdetection_distiBERT_NER_V2")

All model checkpoint layers were used when initializing TFDistilBertForTokenClassification.

All the layers of TFDistilBertForTokenClassification were initialized from the model checkpoint at FritzOS/TEdetection_distiBERT_NER_V2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForTokenClassification for predictions without further training.


In [10]:
# Print the layer/parameter overview of the loaded model.
model.summary()

Model: "tf_distil_bert_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 67715328  
 nLayer)                                                         
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 67,716,866
Trainable params: 67,716,866
Non-trainable params: 0
_________________________________________________________________


## Get an output from this NER model

In [11]:
# NOTE(review): presumably the number of tokens per chunk; no function in the
# visible cells reads this constant — confirm it is still needed.
chunk_len = 150
# Distance (in tokens) between the start positions of consecutive chunks;
# merge_chunks advances by this amount per chunk.
chunk_offset = 50

In [12]:
def prediction_metrics(prediction, target):
    """Count confusion-matrix entries for a binary prediction.

    Both arguments are NumPy arrays; any nonzero value counts as positive.

    Returns:
        Tuple ``(p, n, c_p, f_n, f_p, c_n)``: number of positive targets,
        number of negative targets, correct positives, false negatives,
        false positives and correct negatives.
    """
    predicted_pos = prediction.astype(bool)
    actual_pos = target.astype(bool)
    predicted_neg = np.logical_not(predicted_pos)
    actual_neg = np.logical_not(actual_pos)

    correct_positives = (predicted_pos & actual_pos).sum()
    false_negatives = (predicted_neg & actual_pos).sum()
    false_positives = (predicted_pos & actual_neg).sum()
    correct_negatives = (predicted_neg & actual_neg).sum()

    total_positives = actual_pos.sum()
    total_negatives = actual_neg.sum()
    return (
        total_positives,
        total_negatives,
        correct_positives,
        false_negatives,
        false_positives,
        correct_negatives,
    )

In [13]:
def merge_chunks(sequence_chunks, offset=None):
    """Merge overlapping, equally long chunks back into one binary sequence.

    Chunk ``k`` is placed at position ``k * offset``. A position of the
    result is 1 if it is nonzero in at least one chunk covering it, else 0.

    Args:
        sequence_chunks: 2-D array-like of shape (number of chunks, chunk length).
        offset: distance between the starts of consecutive chunks. Defaults
            to the notebook-level ``chunk_offset``.

    Returns:
        1-D integer array of length ``chunk length + (chunks - 1) * offset``;
        an empty array when there are no chunks.
    """
    if offset is None:
        offset = chunk_offset

    chunks = np.asarray(sequence_chunks)
    n_chunks, chunk_length = chunks.shape
    if n_chunks == 0:
        return np.zeros(0, dtype=int)

    merged = np.zeros(chunk_length + (n_chunks - 1) * offset, dtype=bool)
    for k, chunk in enumerate(chunks):
        start = k * offset
        # OR the chunk into its window so overlaps keep any positive.
        merged[start:start + chunk_length] |= chunk.astype(bool)

    return merged.astype(int)

In [14]:
def get_prediction_element(i, predictions):
    """Return the index of the most probable class for one token.

    Args:
        i: position of the token within the sequence.
        predictions: score tensor indexed as ``predictions[0, i]``, i.e. the
            first sequence of the batch is used.

    Returns:
        The class index with the highest softmax probability.
    """
    token_probs = tf.nn.softmax(predictions[0, i])
    # k=2 is kept as-is; only the top-ranked index is returned.
    ranked = tf.math.top_k(token_probs, k=2)
    return ranked.indices.numpy()[0]

In [18]:
def get_prediction_chunk(input_chunk):
    """Predict one class index per token of a single chunk.

    Args:
        input_chunk: 1-D sequence of token ids for one chunk (any length the
            model accepts; the result is sized to match).

    Returns:
        Integer NumPy array with one predicted class index per input token.
    """
    outputs = model(input_chunk)
    # outputs[0] is indexed as [sequence, token, class]; take the first
    # sequence, the same one get_prediction_element reads.
    # NOTE(review): assumes the model yields exactly one sequence here — confirm.
    token_scores = outputs[0][0]
    # Softmax is monotonic, so the argmax of the raw scores is the most
    # probable class; one vectorised call replaces a per-token loop.
    predicted = tf.argmax(token_scores, axis=-1).numpy()
    return predicted[:len(input_chunk)].astype(int)

In [16]:
def get_prediction_sequence(input_sequence_chunks):
    """Predict every chunk of a sequence and merge the results.

    Args:
        input_sequence_chunks: token-id chunks of one sequence, one chunk per row.

    Returns:
        The merged per-position prediction produced by ``merge_chunks``.
    """
    predicted_chunks = np.zeros(np.shape(input_sequence_chunks))
    for row, token_ids in enumerate(input_sequence_chunks):
        predicted_chunks[row] = get_prediction_chunk(token_ids)
    return merge_chunks(predicted_chunks)
    

### Predict TEs in a single chunk

In [19]:
test_element = 1100  # index into the validation arrays; other indices tried: 1300, 110
input_chunk = validation_tokens[test_element]
target = validation_labels[test_element]

# Print model output and ground truth one above the other for visual comparison.
print("prediction:", list(get_prediction_chunk(input_chunk)))
print("target    :",list(target))

prediction: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]
target    : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]


### Predict TEs in a full sequence

In [20]:
# List the distinct origin files that belong to the validation split.
df[df["set"] == "validation"]["origin"].unique()

array(['10161.fasta', '10164.fasta', '10183.fasta', '10187.fasta',
       '10227.fasta', '10267.fasta', '10284.fasta', '10332.fasta',
       '10340.fasta', '10421.fasta', '10424.fasta', '10477.fasta',
       '10505.fasta', '10530.fasta', '10544.fasta', '10608.fasta',
       '10618.fasta', '10619.fasta', '10628.fasta', '10713.fasta',
       '10716.fasta', '10781.fasta', '10784.fasta', '1107.fasta',
       '1144.fasta', '1173.fasta', '1214.fasta', '1222.fasta',
       '1229.fasta', '1250.fasta', '1283.fasta', '1336.fasta',
       '1346.fasta', '1380.fasta', '1403.fasta', '1414.fasta',
       '1422.fasta', '1425.fasta', '1439.fasta', '145.fasta',
       '1479.fasta', '1492.fasta', '1502.fasta', '1514.fasta',
       '1540.fasta', '1549.fasta', '1552.fasta', '1567.fasta',
       '1575.fasta', '1587.fasta', '1588.fasta', '1647.fasta',
       '1656.fasta', '1667.fasta', '1681.fasta', '1691.fasta',
       '1706.fasta', '1770.fasta', '1781.fasta', '180.fasta',
       '1807.fasta', '1809.fasta',

In [21]:
# Origin file whose chunks are evaluated as one full sequence in the next cells.
test_origin = "9917.fasta"

In [22]:
# Preview the chunks that belong to the selected origin.
df[df["origin"] == test_origin].head()

Unnamed: 0,origin,chunk,set,tokens,token_ids,attention_masks,labels
231652,9917.fasta,1,validation,"[mcl02669, mcl10438, mcl19039, mcl13554, mcl07...","[3086, 6942, 8633, 7914, 6827, 8632, 2666, 863...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
231653,9917.fasta,2,validation,"[mcl07562, mcl06161, mcl02540, mcl07348, mcl13...","[7461, 8396, 3930, 6320, 6319, 6318, 6317, 630...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
231654,9917.fasta,3,validation,"[mcl08882, mcl09187, mcl00035, mcl08902, mcl08...","[6245, 6244, 786, 6243, 6242, 6241, 6240, 6239...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [23]:
# All chunks of the selected origin, filtered once and reused below.
origin_chunks = df[df["origin"] == test_origin]

# Unpadded sequence length: one offset per chunk before the last, plus the
# number of attended positions in the chunk whose number equals the chunk count.
count = origin_chunks["attention_masks"].count()
last_chunk_mask = origin_chunks[origin_chunks["chunk"] == count]["attention_masks"].iloc[0]
len_sequence = chunk_offset*(count-1)+last_chunk_mask.sum()

# Predict chunk by chunk, merge, and trim to the unpadded length.
token_chunks = np.array(origin_chunks["token_ids"].tolist())
predictions = get_prediction_sequence(token_chunks)[0:len_sequence]

# Merge and trim the ground-truth labels in the same way.
label_chunks = np.array(origin_chunks["labels"].tolist())
targets = merge_chunks(label_chunks)[0:len_sequence]

In [24]:
# Print the merged prediction and target for the whole sequence.
# NOTE(review): this dumps every position; consider printing a slice or summary.
print("Evaluation for genome sequence: ", test_origin)
print("prediction:", list(predictions))
print("target    :",list(targets))

Evaluation for genome sequence:  9917.fasta
prediction: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
target    : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [25]:
# Positives, negatives, correct positives, false negatives, false positives
# and correct negatives for the selected sequence.
p, n, c_p, f_n, f_p, c_n = prediction_metrics(predictions, targets)

In [26]:
# Report each count next to the total it is measured against.
for caption, hits, total in (
    ("correct positives", c_p, p),
    ("false   negatives", f_n, p),
    ("false   positives", f_p, n),
    ("correct negatives", c_n, n),
):
    print(caption, hits, "/", total)

correct positives 8 / 8
false   negatives 0 / 8
false   positives 3 / 224
correct negatives 221 / 224


### Predict validation set

In [None]:
# Running totals over all validation sequences, named as in prediction_metrics.
p = 0
n = 0
c_p = 0
f_n = 0
f_p = 0
c_n = 0

# Group the dataframe once instead of re-filtering it several times per origin.
chunks_by_origin = df.groupby("origin", sort=False)
validation_origins = df.loc[df["set"] == "validation", "origin"].unique()

# Loop-local names are used so the single-sequence results computed in the
# earlier cells (test_origin, predictions, targets) are not overwritten.
for origin in validation_origins:
    origin_chunks = chunks_by_origin.get_group(origin)

    # Unpadded length: one offset per chunk before the last, plus the attended
    # positions of the chunk whose number equals the chunk count.
    n_chunks = origin_chunks["attention_masks"].count()
    last_chunk_mask = origin_chunks.loc[origin_chunks["chunk"] == n_chunks, "attention_masks"].iloc[0]
    sequence_length = chunk_offset * (n_chunks - 1) + last_chunk_mask.sum()

    token_chunks = np.array(origin_chunks["token_ids"].tolist())
    origin_predictions = get_prediction_sequence(token_chunks)[0:sequence_length]

    label_chunks = np.array(origin_chunks["labels"].tolist())
    origin_targets = merge_chunks(label_chunks)[0:sequence_length]

    hp, hn, hc_p, hf_n, hf_p, hc_n = prediction_metrics(origin_predictions, origin_targets)

    p += hp
    n += hn
    c_p += hc_p
    f_n += hf_n
    f_p += hf_p
    c_n += hc_n

print("correct positives", c_p , "/", p)
print("false   negatives", f_n , "/", p)
print("false   positives", f_p , "/", n)
print("correct negatives", c_n , "/", n)

## Evaluate Model on Test-Set

In [None]:
from tqdm import tqdm

# NOTE(review): `path` is not assigned in any cell visible in this notebook;
# define it before running this cell (the first data cell reads from ../data).
# NOTE(review): only unpickle files from a trusted source.
df_test = pd.read_pickle(os.path.join(path,"test_set.pkl"))
# df_test = df[df["set"] == "test"]
threshold = 0.5   # minimum class-1 probability for a token to count as positive
batch_size = 128  # chunks per forward pass

all_token_ids = df_test["token_ids"].tolist()
all_attention_masks = df_test["attention_masks"].tolist()
nested_labels = df_test["labels"].tolist()

# Flat per-token lists over the whole test set, consumed by the metrics cell.
# NOTE(review): padded positions (attention mask 0) are scored like real tokens
# here, while the per-sequence evaluation above trims to the unpadded length —
# confirm this is intended.
all_predictions = []
all_labels = []

# Step through the data in strides of batch_size; the final batch may be
# smaller, so no test chunk is left out of the metrics.
for start in tqdm(range(0, len(all_token_ids), batch_size)):
    end = start + batch_size
    input_ids = tf.convert_to_tensor(all_token_ids[start:end])
    attention_mask = tf.convert_to_tensor(all_attention_masks[start:end])
    batched_labels = nested_labels[start:end]
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    probs = tf.nn.softmax(outputs["logits"])
    # Threshold the class-1 probability for the whole batch at once.
    batch_positive = (probs[:, :, 1] > threshold).numpy()
    for sample_positive, labels in zip(batch_positive, batched_labels):
        all_predictions.extend(sample_positive.astype(int).tolist())
        all_labels.extend(labels)

print("Percentage of positives:")
print(f"{sum(all_labels)/len(all_labels)*100}%")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
def compute_metrics(pred, labels):
    """Compute binary classification metrics for flat prediction/label sequences.

    Args:
        pred: predicted class per token.
        labels: true class per token, aligned with ``pred``.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``cm`` (confusion matrix) and ``cm_normalized`` (confusion matrix
        computed with ``normalize="true"``).
    """
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)
    # Build both matrices from the arguments, not from notebook globals, so
    # they always describe the same data as the scalar scores.
    cm = confusion_matrix(labels, pred)
    cm_n = confusion_matrix(labels, pred, normalize="true")

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1, "cm":cm, "cm_normalized":cm_n}

print(f"Using threshold {threshold}...\n")
metrics = compute_metrics(all_predictions, all_labels)

# Show the confusion matrix as a figure.
disp = ConfusionMatrixDisplay(metrics["cm"])
disp.plot()
plt.show()

# Print the scalar scores as percentages, then the raw confusion matrix.
report_lines = [
    f'\nAccuracy:   {metrics["accuracy"]*100:.2f}%',
    f'precision:  {metrics["precision"]*100:.2f}%',
    f'recall:     {metrics["recall"]*100:.2f}%',
    f'f1:         {metrics["f1"]*100:.2f}%',
    f'\nConfusion matrix:\n{metrics["cm"]}',
]
print("\n".join(report_lines))