# LLM Encoder models for Genomics

## Embedded text classification

In [None]:
def get_sentence_vector(text):
    """Embed ``text`` as the mean of the model's last-layer token vectors.

    Uses the notebook-level ``tokenizer`` and ``model`` (and ``torch``)
    globals, which are looked up when the function is called, so whichever
    objects those names are bound to at call time are the ones used.

    Args:
        text: Input handed to ``tokenizer(text, return_tensors="pt")``.

    Returns:
        ``outputs.last_hidden_state`` averaged over dim 1. The tensor is
        computed under ``torch.no_grad()`` and carries no autograd graph.
    """
    inputs = tokenizer(text, return_tensors="pt")
    # Inference only: do not record operations for backpropagation.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

In [None]:
import torch
from transformers import BertTokenizer, BertModel
# Tokenizer and encoder come from the same checkpoint.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Two similar sentences to compare in embedding space.
text1 = "King rules all the land"
text2 = "Queens rule all of the land"
vec1, vec2 = (get_sentence_vector(sentence) for sentence in (text1, text2))

# Cosine similarity between the two pooled sentence vectors.
cos_sim = torch.nn.functional.cosine_similarity(vec1, vec2)
print(cos_sim.item())


0.9007086157798767


In [None]:
# Shape of the pooled sentence embedding returned by get_sentence_vector(text1).
vec1.shape

torch.Size([1, 768])

In [None]:
from transformers import pipeline

# Hub checkpoint used for both the NER model and its tokenizer.
PRETRAINED = "raynardj/ner-gene-dna-rna-jnlpba-pubmed"
ner = pipeline(
    task="ner",
    model=PRETRAINED,
    tokenizer=PRETRAINED,
)
# Last expression of the cell, so the list of detected entities is displayed.
ner(
    "Proteins extraction is an important task ",
    aggregation_strategy="first",
)

Device set to use cuda:0


[]

In [None]:
import pandas as pd
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the NER checkpoint.
# NOTE(review): this rebinds the notebook-level name `tokenizer`, which
# get_sentence_vector() reads at call time. Calling that helper after this
# cell would tokenize with the NER tokenizer instead of the one loaded from
# "bert-base-uncased" - confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED)

def clean_output(outputs):
    """Print each pipeline result and return the input unchanged.

    The grouping of results into sub-groups by position that this helper
    was meant to perform is not implemented; it currently only echoes
    every item to stdout.

    Args:
        outputs: Iterable of results (one printed line per item).

    Returns:
        The very same ``outputs`` object that was passed in.
    """
    # TODO: group the results into sub-groups by position.
    for entity in outputs:
        print(entity)
    return outputs

def entity_table(pipeline, **pipeline_kw):
    """Wrap a pipeline so that its results come back as a DataFrame.

    Args:
        pipeline: Callable invoked as ``pipeline(text, **pipeline_kw)``.
        **pipeline_kw: Keyword arguments forwarded on every call.
            ``aggregation_strategy`` is set to ``"first"`` when the caller
            does not supply it.

    Returns:
        A function ``create_table(text)`` that returns
        ``pd.DataFrame(clean_output(pipeline(text, **pipeline_kw)))``.
    """
    pipeline_kw.setdefault("aggregation_strategy", "first")

    def create_table(text):
        raw_entities = pipeline(text, **pipeline_kw)
        return pd.DataFrame(clean_output(raw_entities))

    return create_table

In [None]:
# entity_table(ner) returns a function that runs the NER pipeline on a
# sentence and returns the detected entities as a DataFrame.
# Each token is classified as one of:
#   - the start of an important span
#   - the middle of an important span
#   - the end of an important span
# With an aggregation strategy set, the pipeline reports merged spans
# (entity_group, score, word, start, end), as the output below shows.
entity_table(ner)("It consists of 25 exons encoding a 1,278-amino acid glycoprotein that is composed of 13 transmembrane domains.")

{'entity_group': 'protein', 'score': np.float32(0.9554617), 'word': ' 1,278-amino acid glycoprotein', 'start': 35, 'end': 64}
{'entity_group': 'protein', 'score': np.float32(0.8704146), 'word': ' transmembrane domains.', 'start': 88, 'end': 110}


Unnamed: 0,entity_group,score,word,start,end
0,protein,0.955462,"1,278-amino acid glycoprotein",35,64
1,protein,0.870415,transmembrane domains.,88,110


## DNA BERT for Token Representation Learning

In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
# One Hub repository id shared by the model and its tokenizer.
DNA2VEC_CHECKPOINT = "roychowdhuryresearch/dna2vec"
# trust_remote_code=True executes Python files downloaded from that
# repository; the load log warns to review them and suggests pinning a
# revision.
hf_model = AutoModel.from_pretrained(DNA2VEC_CHECKPOINT, trust_remote_code=True)
hf_tokenizer = AutoTokenizer.from_pretrained(DNA2VEC_CHECKPOINT, trust_remote_code=True)

config.json:   0%|          | 0.00/510 [00:00<?, ?B/s]

configuration_dna2vec.py:   0%|          | 0.00/988 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/roychowdhuryresearch/dna2vec:
- configuration_dna2vec.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_dna2vec.py:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/roychowdhuryresearch/dna2vec:
- modeling_dna2vec.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/220M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/725k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
!pip install einops
import torch
from transformers import AutoTokenizer, AutoModel, BertModel

tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
# Load through AutoModel so that, with trust_remote_code=True, the model class
# shipped in the checkpoint repository is used. Loading this checkpoint with
# the stock BertModel produced the warning "Some weights of BertModel were not
# initialized from the model checkpoint ... and are newly initialized" for the
# position embeddings and encoder layers, i.e. the embeddings were computed
# from randomly initialised weights.
# NOTE(review): verify in your environment that no such warning remains.
model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)



tokenizer_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/168k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/468M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/468M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at zhihan1996/DNABERT-2-117M and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.attention.self.key.bias', 'encoder.layer.1.attention.self.key.weight', 'encoder.layer.1.attention.self.query.bias', 'encoder.layer.1.attention.self.query.weight', 'encoder.layer.1.attention.self.value.bias', 'encoder.layer.1.attention.self.value.weight', 'encoder.layer.1.intermediate.dense.b

In [None]:
# Decode token ids 0-99 to look at the beginning of the tokenizer vocabulary.
tokenizer.convert_ids_to_tokens(list(range(100)))

['[UNK]',
 '[CLS]',
 '[SEP]',
 '[PAD]',
 '[MASK]',
 'A',
 'C',
 'G',
 'T',
 'AA',
 'TT',
 'TG',
 'CA',
 'CC',
 'TA',
 'GG',
 'TC',
 'GA',
 'AAA',
 'GC',
 'TAA',
 'TTTT',
 'TCA',
 'TGA',
 'TTA',
 'GAA',
 'TCC',
 'CAA',
 'CTG',
 'CTT',
 'GTG',
 'GTT',
 'GCA',
 'GGA',
 'CCA',
 'GTA',
 'GCC',
 'CTA',
 'TAAA',
 'AAAA',
 'CTC',
 'GTC',
 'TGTG',
 'TATT',
 'CACA',
 'GAAA',
 'TATA',
 'TCTT',
 'TGTT',
 'CAAA',
 'GAGA',
 'CATT',
 'TGAA',
 'CAGG',
 'TCTG',
 'CAGA',
 'TCAA',
 'GGAA',
 'TAAAA',
 'CTGA',
 'GCTT',
 'GTGA',
 'GCTG',
 'CTCA',
 'CCTT',
 'CATG',
 'GCAA',
 'GTCA',
 'GTAA',
 'TTTTA',
 'TATG',
 'GAGG',
 'CGG',
 'GATT',
 'CCTG',
 'TCTC',
 'CCAA',
 'GTTA',
 'CTCC',
 'CTAA',
 'TACA',
 'CTTA',
 'TCCA',
 'GATG',
 'TTAA',
 'GAAAA',
 'TTTG',
 'GTTTT',
 'TCTA',
 'GCCA',
 'GTCC',
 'CTTTT',
 'GGGG',
 'CGA',
 'TTTA',
 'CCCA',
 'CAAAA',
 'TGGG',
 'TAGA',
 'TAGG']

In [None]:
import random

# Generate random DNA sequence
def generate_dna_sequence(length):
    """Return a random string of ``length`` characters drawn from 'ATGC'.

    Characters are sampled with replacement via ``random.choices``, so the
    result depends on the state of the global ``random`` generator.
    """
    bases = random.choices('ATGC', k=length)
    return ''.join(bases)

# Generate 50 reads of length 30 (seeded so that a re-run reproduces them)
random.seed(0)
dna_sequence_list = [generate_dna_sequence(30) for _ in range(50)]
print(dna_sequence_list)
#dna_sequence_list[0] = "A"

# Tokenize the batch once and reuse the encoding for both the key listing
# and the input ids.
encoded = hf_tokenizer(dna_sequence_list, return_tensors = 'pt', padding=True)
print(encoded.keys())
inputs = encoded["input_ids"]
print(inputs[0])
print(dna_sequence_list[0], inputs[0])
print(dna_sequence_list[1], inputs[1])
# Inference only, so no autograd graph is needed.
# NOTE(review): the shape recorded for `hidden_states` in the next cell is
# [12, 1020], not [50, sequence_length, 768]. Confirm what hf_model(inputs)
# returns: if it is a plain tensor, `[0]` selects only the first read.
with torch.no_grad():
    hidden_states = hf_model(inputs)[0]

['TCATCGCGTAGCACTCAGGTTTTCGTCTAG', 'CATCTGCTTATCGTTATCGTTGTACGACGC', 'CGGCCGTGGTTAGTCCCGCAGTAGGGCTTT', 'CTGAGTTCGAACAAAAGCCGGCCAATGGGT', 'TCGCAAGATGTATTGTGTAGTCAGCTTACA', 'AAGATTCTCTACCAGATGCCTCGCAGGCTG', 'GCCTGTCCGTTGCCAGTATCACTTGAGATG', 'TCGCCTAAATGTCATGTCCTACACCCGACC', 'CCATTCCGACCAAACCAGGTGGAAGTGACA', 'AGGTCGATCAGACAAGCTTCGCGCTGAGAG', 'CAAGCTACGCTTAGCGGAACAGCTTTCCCC', 'GTACAACCGCCCCCGAAGGATGTAGGCCAA', 'TCGTGACCAAGCACACAGGACACGAGAATT', 'AACGATGAAAGTTGTATCTCGTGGAACATA', 'CCAATCCGAGTGTTATTTCCGTAAACTATC', 'CCAGTATTACTTCCAATCCAAGGCCCCCTA', 'ATTCAGTTGAACCGAGCTTGTGTTGTGAGC', 'TAAGTAACCAGTGCTTTATCTCCTACAGCT', 'TTTTCTGTGTGAAATGCTTTTACCCAGAGG', 'TTCCGGCAGGCCCATAAATTATTTAGGCCC', 'AGTGAGAGGAGCTTGCTAGTCCGATGATGT', 'AGCCGTCAAGGCAAGGCTTCTAACGAGTTG', 'TACGGCCATGAGTAGCGGGGGGCGCGAAAG', 'TTAGATCCCGCATTTCATGCATCTCGGTAC', 'CATGCGGGGCAACTTTGATGCACGGCGTAA', 'GGGGTAGTCTACGTGCATGGCCTCGACCCT', 'GCGACTTAAAAGCGCGGGATAGAAGATAAT', 'CATAACCCTTAGATTACGAATTAGATCCAA', 'TTCTCATATCTGTTCCTACCCCGCTAATAC', 'TTGCCGCAACGA

In [None]:
# Shape of `hidden_states`, i.e. of hf_model(inputs)[0] from the cell above.
hidden_states.shape

torch.Size([12, 1020])

In [None]:
# Shape of the first row of `inputs` (the token ids of the first read,
# including any padding added by padding=True).
inputs[0].shape

torch.Size([12])

In [None]:
# Same check as before: shape of the current `hidden_states` tensor.
hidden_states.shape

torch.Size([12, 1020])

In [None]:
import random

# Generate random DNA sequence
def generate_dna_sequence(length):
    """Build a random DNA read of ``length`` bases over the alphabet 'ATGC'.

    Bases are drawn with replacement from the global ``random`` generator.
    """
    sampled_bases = random.choices('ATGC', k=length)
    return ''.join(sampled_bases)

# Generate 50 reads of length 30 (seeded so that a re-run reproduces them)
random.seed(0)
dna_sequence_list = [generate_dna_sequence(30) for _ in range(50)]
print(dna_sequence_list)
# Replace the first read with a single base; it is then far shorter than the
# other reads, so padding=True has to pad it. This happens after the print
# above, so the printed list still shows the original first read.
dna_sequence_list[0] = "A"

# Tokenize the batch once and reuse the encoding.
encoded = tokenizer(dna_sequence_list, return_tensors = 'pt', padding=True)
print(encoded.keys())
inputs = encoded["input_ids"]
print(dna_sequence_list[0], inputs[0])
print(dna_sequence_list[1], inputs[1])
# Pass the attention mask along with the padded ids so that padding positions
# are not treated as real tokens; run without autograd since this is
# inference only.
with torch.no_grad():
    hidden_states = model(inputs, attention_mask=encoded["attention_mask"])[0] # [50, sequence_length, 768]
print(hidden_states.shape)

['AATCGTGCCTTCTCGTATGGATGTAGGGCG', 'ATGTTCTCAGCGTTTCAACGTCGATTCAGA', 'CAAGAGAATAGCATTTGCTCCATCGATTGT', 'GGAAAGCGGATGACGCGGTACCCTCTCTAC', 'GTGCAGAGGTCCTGGGACAGAATCTGAGCG', 'ACCAGGCCTCTAAGTCGAGTGTCTTGTCCT', 'CGGACTCCCGTCCACTGTTGAAGTTACGTT', 'AAGGGACACATATGGTAGACCCGAAACCAC', 'GCGTCTTTGCTTCAGGTTTATGAAGAGGCT', 'AGTTCCTGCAAAAAAATGTTCTCTCAATTG', 'GATTCGGCATTTCAATTTCCTCGAGGAGAT', 'CGCATACTATGAAAGCATTTGACGAGGAGG', 'CCATGCATGCCACATTATAGTTTAAGACCG', 'TATTGAATCTCCGTGCGCATAGTTCTATCG', 'GACTCTATGTCAACTCGAAGCACGCCGTAA', 'AGTAATGCATTGCTCCCAGCTCTATTAACA', 'GCTTAAGTCTCTGATCAAAGTCGGATACGA', 'ACGTCGCCTTTTAGTGTGGCATTGCAAATA', 'GCCCGTGGTTAGCCCAATTTTCATAAATGT', 'CGCACTAAAATAGTTCAAATGGACAAGGCA', 'ATCCTTGGGCCAAATGGAAGCCGTCTCGCT', 'TTATTCCTCATTCATCTCCAATAAATGTAC', 'TTTAAGGCCAGACTCGCCTATACACTCTAT', 'ACTTGGCGAATAGCCACCGCTTCTGAGGCC', 'GGCGCTCGAGATATACACGGCTCTGTCTCA', 'AGCAGTGTCCTCGACAGCGGTCGGAAACTT', 'ACGCGGCTCATTTGCGTCCCCGCCACAAAC', 'TGTTCAGGGTGTCTGAGAAACGCAACGTAT', 'AGACCAGACCTAACAAGGAGGCTTATTTTG', 'CTGCGCTTCCTA

In [None]:
# Shape of `hidden_states` from the DNABERT-2 batch above; dim 0 is the read,
# dim 1 the token position, dim 2 the hidden size.
hidden_states.shape

torch.Size([50, 11, 768])

In [None]:
# embedding with mean pooling over real tokens only
# The batch was tokenized with padding=True, so shorter reads end in [PAD]
# ids; averaging over those positions would dilute their embeddings.
token_mask = (inputs != tokenizer.pad_token_id).unsqueeze(-1).to(hidden_states.dtype)
# clamp guards against division by zero for a row that is entirely padding
embedding_mean = (hidden_states * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)
print(embedding_mean.shape) # expect [50, 768]

torch.Size([50, 768])


# Woohoooo! Last challenge!
## Mystery Code: What does this do?

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def mystery_function(read, top_k=5):
    """Print the stored reads whose embeddings are most similar to ``read``.

    ``read`` is tokenized with the notebook-level ``tokenizer``, encoded with
    ``model``, and mean-pooled over dim 1. The pooled vector is compared by
    cosine similarity against every row of the global ``embedding_mean``, and
    the best matches are printed with the corresponding entry of
    ``dna_sequence_list``, most similar first.

    Args:
        read: Sequence string to look up.
        top_k: Maximum number of matches to print (default 5).

    Returns:
        None. Results are written to stdout.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    inputs = tokenizer([read], return_tensors = 'pt', padding=True)["input_ids"]
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        hidden_states = model(inputs)[0]
    read_representation = torch.mean(hidden_states, dim=1)
    similarities = cosine_similarity(
        read_representation.detach().numpy(),
        embedding_mean.detach().numpy(),
    )
    # Sort descending first and slice afterwards; slicing with [-top_k:]
    # would return every index when top_k == 0.
    top_indices = similarities.argsort()[0][::-1][:top_k]
    for index in top_indices:
        print(dna_sequence_list[index], "mystery score: ", similarities[0][index])


In [None]:
mystery_function('ACAGCTCTCCCC')

CGGCCGACGCCCCGGGGATATGACACAAGG mystery score:  0.98929393
ACCAGGCCTCTAAGTCGAGTGTCTTGTCCT mystery score:  0.9889093
TATTGAATCTCCGTGCGCATAGTTCTATCG mystery score:  0.9876513
TGTTCAGGGTGTCTGAGAAACGCAACGTAT mystery score:  0.98730135
AGTAATGCATTGCTCCCAGCTCTATTAACA mystery score:  0.9871589
