# TEdetection: NER_predictions_from_files_V1

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import shelve

2022-06-03 11:16:30.262593: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-06-03 11:16:30.830021: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /gxfs_work1/gxfs_home_interim/sw/spack/spack0.16.0/usr/opt/spack/linux-rhel8-x86_64/gcc-10.2.0/miniconda3-4.9.2-eyiscf7bqaswgve4zxjovfoycbmzfy5g/lib
2022-06-03 11:16:30.830042: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## set parameters

In [None]:
# Input directory (one contig file per sequence) and the path the filter
# table is written to at the end of the notebook.
_test_files = ("..", "data", "test_input_files")
input_dir = os.path.join(*_test_files, "contigs")
output_file = os.path.join(*_test_files, "contig.filter")

# Chunking: window size and step between consecutive windows, in tokens.
chunk_len, chunk_offset = 150, 50

## get tokenize_table

In [None]:
# Load the token -> token_id lookup table from the shelf written during
# data preparation. The context manager closes the shelf even if the key
# lookup fails (a bare `results.close` only references the method and
# never calls it).
with shelve.open(os.path.join("..", "data", "prep_trainvalidationtest_1")) as results:
    tokenize_table = results["tokenize_table"]

## read data

In [None]:
def prep_chunks_from_files(
               input_dir,
               tokenize_table,
               chunk_len = 150,
               chunk_offset = 50):

    """-------------------------------------------------------------
    Read every file in "input_dir", tokenize it and cut it into
    overlapping, fixed-size chunks.

    Each file is read as a tab-separated table with the columns
    ("tokens", "strand"); its first line is skipped. Tokens are mapped
    to ids with "tokenize_table". The id sequence is padded with 0
    (token 'pad', attention mask 0) and split into windows of
    "chunk_len" tokens that start every "chunk_offset" tokens, so
    every chunk has exactly "chunk_len" entries.

    Parameters:

    "input_dir" (str)
        directory with the input files; sub-directories are skipped
    "tokenize_table" (mapping)
        token -> token_id lookup
    "chunk_len" (int)
        number of tokens per chunk
    "chunk_offset" (int)
        distance between the starts of two consecutive chunks

    Output:

    "df_chunked" (pandas DataFrame)
        one row per chunk with the columns
        "origin" (file name), "chunk" (1-based chunk number within
        the file), "tokens", "token_ids" and "attention_masks"

    Raises:

    KeyError if a token is missing from "tokenize_table".
    -------------------------------------------------------------"""

    origin = []
    chunk = []
    tokens = []
    token_ids = []
    attention_masks = []

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for sequence in sorted(os.listdir(input_dir)):
        path = os.path.join(input_dir, sequence)
        if not os.path.isfile(path):
            # e.g. ".ipynb_checkpoints" directories created by Jupyter
            continue

        df_c = pd.read_csv(path,
                           sep = "\t",
                           names = ["tokens", "strand"],
                           skiprows = 1)
        seq_tokens = df_c["tokens"].to_list()
        seq_ids = [tokenize_table[t] for t in seq_tokens]
        l = len(seq_tokens)

        # Pad so that every window taken below is completely filled.
        padded_len = max(l + chunk_offset, chunk_len)
        raw_t = np.zeros(padded_len).astype(int)
        raw_t[0:l] = seq_ids
        raw_masks = np.zeros(padded_len).astype(int)
        raw_masks[0:l] = 1
        raw_tokens = seq_tokens + ["pad"] * (padded_len - l)

        # At least one chunk per file, even if it is shorter than chunk_len.
        window_starts = range(0, max(1, l + chunk_offset - chunk_len), chunk_offset)
        for chunk_n, i in enumerate(window_starts, start = 1):
            origin.append(sequence)
            chunk.append(chunk_n)
            token_ids.append(raw_t[i:i+chunk_len])
            attention_masks.append(raw_masks[i:i+chunk_len])
            tokens.append(raw_tokens[i:i+chunk_len])

    df_chunked = pd.DataFrame({"origin": origin,
                               "chunk": chunk,
                               "tokens": tokens,
                               "token_ids": token_ids,
                               "attention_masks" : attention_masks
                             })

    return df_chunked

In [5]:
# Build the chunk table for all files in input_dir with the notebook-level settings.
df = prep_chunks_from_files(input_dir, tokenize_table,
                            chunk_len=chunk_len, chunk_offset=chunk_offset)

In [6]:
# Display the chunk table (one row per chunk).
df

Unnamed: 0,origin,chunk,tokens,token_ids,attention_masks
0,10775.fasta,1,"[mcl0000X, mcl01971, mcl02107, mcl01276, mcl01...","[31, 2986, 2985, 2984, 2982, 2981, 2980, 2979,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,10775.fasta,2,"[mcl03972, mcl02583, mcl02524, mcl06470, mcl04...","[4984, 4985, 4986, 4987, 4988, 4989, 4990, 499...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,10775.fasta,3,"[mcl01932, mcl01345, mcl02291, mcl03225, mcl00...","[2923, 2922, 2921, 2920, 2918, 2917, 2916, 291...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,10775.fasta,4,"[mcl01333, mcl03580, mcl03596, mcl02544, mcl02...","[2875, 5004, 5005, 2874, 2873, 2872, 3815, 500...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,10775.fasta,5,"[mcl00194, mcl00306, mcl00812, mcl02106, mcl02...","[2837, 2836, 2834, 2833, 2832, 2831, 2830, 691...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...
82,5327.fasta,4,"[mcl20537, mcl20920, mcl00008, mcl00050, mcl21...","[12591, 12592, 1249, 3223, 12594, 12595, 19875...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
83,5327.fasta,5,"[mcl07809, mcl07830, mcl07917, mcl08918, mcl08...","[6238, 6239, 6240, 6241, 6242, 6243, 786, 6244...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
84,5327.fasta,6,"[mcl19310, mcl14030, mcl19643, mcl14008, mcl04...","[19898, 10716, 19899, 10695, 14440, 17792, 322...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
85,5327.fasta,7,"[mcl05427, mcl05987, mcl05502, mcl05175, mcl05...","[6281, 6282, 6283, 6284, 6285, 6297, 6298, 207...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


## extract token IDs and attention masks as arrays for the NER model

In [7]:
# Stack the per-chunk arrays of each column into one array per column.
df_tkn, df_att = df["token_ids"], df["attention_masks"]
tokens = np.array(df_tkn.tolist())
attention_masks = np.array(df_att.tolist())

## load the NER model

In [8]:
from transformers import TFAutoModelForTokenClassification

# Load the pretrained token-classification model used for the predictions below.
model = TFAutoModelForTokenClassification.from_pretrained(
    "FritzOS/TEdetection_distiBERT_NER_V2"
)

All model checkpoint layers were used when initializing TFDistilBertForTokenClassification.

All the layers of TFDistilBertForTokenClassification were initialized from the model checkpoint at FritzOS/TEdetection_distiBERT_NER_V2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForTokenClassification for predictions without further training.


In [9]:
# Print the layer and parameter overview of the loaded model.
model.summary()

Model: "tf_distil_bert_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 67715328  
 nLayer)                                                         
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 67,716,866
Trainable params: 67,716,866
Non-trainable params: 0
_________________________________________________________________


## Functions

In [10]:
def merge_chunks(sequence_chunks, offset=None):
    """Merge overlapping per-chunk predictions into one sequence.

    Chunk ``k`` (0-based) is placed at position ``k * offset`` of the
    output. A position is 1 in the result if any chunk covering it holds
    a non-zero value there, otherwise 0.

    Parameters
    ----------
    sequence_chunks : 2-D array-like
        One row per chunk; all rows have the same length.
    offset : int, optional
        Distance between the starts of consecutive chunks. Defaults to
        the notebook-level ``chunk_offset`` so existing calls keep working.

    Returns
    -------
    numpy.ndarray of int
        Sequence of length ``chunk_length + (n_chunks - 1) * offset``.
    """
    step = chunk_offset if offset is None else offset
    n = np.shape(sequence_chunks)[0]
    l = np.shape(sequence_chunks)[1]

    # Same length as n * l minus the (n - 1) overlaps of (l - step) each.
    flagged = np.zeros(l + (n - 1) * step, dtype=bool)

    p = 0
    for chunk in sequence_chunks:
        # Logical OR: one positive chunk is enough to flag a position.
        flagged[p:p+l] |= np.asarray(chunk).astype(bool)
        p = p + step

    return flagged.astype(int)

In [11]:
def get_prediction_element(i, predictions):
    """Return the index of the highest-scoring class at position ``i``.

    The scores at ``predictions[0, i]`` are turned into probabilities with
    a softmax; the index of the largest one is returned.
    # assumes predictions is indexed as [batch, position, class] -- TODO confirm
    """
    class_probs = tf.nn.softmax(predictions[0, i])
    top_two = tf.math.top_k(class_probs, k=2)
    ranked_classes = top_two.indices.numpy()
    return ranked_classes[0]

In [12]:
def get_prediction_chunk(input_chunk):
    """Predict one class index per position of a single chunk.

    Calls the notebook-level ``model`` on ``input_chunk`` and picks, for
    each position, the class chosen by ``get_prediction_element``.

    Parameters
    ----------
    input_chunk : 1-D array of token ids

    Returns
    -------
    numpy.ndarray of int with one entry per position of ``input_chunk``.
    """
    # NOTE(review): no attention mask is passed, so padded positions are
    # presumably treated like real tokens -- confirm this is intended.
    outputs = model(input_chunk)           # call the model to predict TE positions
    logits = outputs[0]

    # Size the result from the input instead of a fixed 150 so that any
    # chunk_len works.
    n_positions = np.shape(input_chunk)[0]
    pred_arr = np.zeros(n_positions)
    for i in range(n_positions):
        pred_arr[i] = get_prediction_element(i, logits)
    return pred_arr.astype(int)

In [13]:
def get_prediction_sequence(input_sequence_chunks):
    """Predict every chunk of one sequence and merge the results.

    Each row of ``input_sequence_chunks`` is passed to
    ``get_prediction_chunk``; the per-chunk predictions are then combined
    with ``merge_chunks`` and returned.
    """
    predicted_chunks = np.zeros(np.shape(input_sequence_chunks))
    for row, chunk_ids in enumerate(input_sequence_chunks):
        predicted_chunks[row] = get_prediction_chunk(chunk_ids)
    return merge_chunks(predicted_chunks)
    

## Predict all data

In [14]:
orgs = []
preds = []

for test_origin in df["origin"].unique():
    # All chunks that belong to this input file.
    rows = df[df["origin"] == test_origin]
    count = rows["attention_masks"].count()

    # Unpadded length: full steps before the last chunk plus the number of
    # unmasked positions in the last chunk.
    last_mask = rows[rows["chunk"] == count]["attention_masks"].iloc[0]
    len_sequence = chunk_offset * (count - 1) + last_mask.sum()

    c_arr = np.array(rows["token_ids"].to_numpy().tolist())
    predictions = get_prediction_sequence(c_arr)[:len_sequence]

    orgs.append(test_origin)
    preds.append(predictions)

df_predictions = pd.DataFrame({"origin": orgs, "predictions": preds})
df_predictions

Unnamed: 0,origin,predictions
0,10775.fasta,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,5106.fasta,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, ..."
2,5327.fasta,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ..."


## create the filter output

In [15]:
def _te_intervals(predictions):
    """Return 1-based, inclusive (start, end) pairs for each run of truthy values."""
    intervals = []
    run_start = None
    position = 0
    for position, flag in enumerate(predictions, start=1):
        if flag:
            if run_start is None:
                run_start = position
        elif run_start is not None:
            intervals.append((run_start, position - 1))
            run_start = None
    if run_start is not None:
        # The run reaches the end of the sequence; close it at the last position.
        intervals.append((run_start, position))
    return intervals


# One output row per contiguous run of positive predictions in a contig.
contig_ID = []
start = []
end = []
# Not named `len`: that would shadow the builtin for every cell of the kernel.
te_len = []

for _, row in df_predictions.iterrows():
    # Contig id = file name up to the first dot.
    c_ID = row["origin"].split(".")[0]
    for s, e in _te_intervals(row["predictions"]):
        contig_ID.append(c_ID)
        start.append(s)
        end.append(e)
        te_len.append(e - s + 1)

df_filter = pd.DataFrame({ "contig_ID": contig_ID,
                           "start": start,
                           "end": end,
                           "len": te_len
                        })

## save filter to file

In [16]:
# Write the interval table as tab-separated text, including the header row and the index column.
df_filter.to_csv(output_file, sep='\t', header=True, index=True)