In [6]:
from transformers import PreTrainedTokenizer

In [None]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
import vcf 
from Bio import SeqIO
import tqdm


def making_tokens(seq, tokenizer=None):
    """Tokenize a DNA sequence and map each token's text to its character span.

    Parameters
    ----------
    seq : str
        Nucleotide sequence to tokenize.
    tokenizer : optional
        An already-loaded tokenizer exposing ``encode_plus`` and ``decode``.
        When omitted, 'AIRI-Institute/gena-lm-bert-base-t2t' is loaded via
        ``AutoTokenizer.from_pretrained`` on every call, which is slow inside
        a loop -- load it once and pass it in when possible.

    Returns
    -------
    dict
        ``{token_text: (start_char, end_char)}`` with offsets relative to
        ``seq``. Because the dict is keyed by token text, a token that occurs
        several times keeps only the span of its last occurrence.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained('AIRI-Institute/gena-lm-bert-base-t2t')

    # Encode the argument itself. The previous version read the notebook-global
    # `sequence` here and silently ignored `seq`.
    encoded_sequence = tokenizer.encode_plus(
        seq,
        return_offsets_mapping=True,
        truncation=True,
        max_length=512,
    )
    offsets = encoded_sequence["offset_mapping"]
    token_ids = encoded_sequence["input_ids"]

    tokens_coord = {}
    # The first and last positions are skipped -- presumably the special
    # tokens the tokenizer wraps around the input (TODO confirm).
    for i in range(1, len(token_ids) - 1):
        start_char, end_char = offsets[i][0], offsets[i][1]
        token = tokenizer.decode([token_ids[i]])
        tokens_coord[token] = (start_char, end_char)

    return tokens_coord



def changing_tokens_via_vcf(path_to_VCF, chrom, seq, seq_start, seq_end, dict_with_tokens_and_coord):
    """Return the percentage of variants that turn a token into an unseen one.

    For every VCF record in ``chrom:seq_start-seq_end`` that lies entirely
    inside exactly one token, the ALT allele is spliced into that token's text
    and the result is checked against the tokens of the sequence.

    Parameters
    ----------
    path_to_VCF : str
        Path to a tabix-indexed VCF file.
    chrom : str
        Chromosome name passed to ``Reader.fetch``.
    seq : str
        The tokenized sequence (not used by the computation).
    seq_start, seq_end : int
        Genomic coordinates of the sequence; ``seq_start`` is added to the
        token offsets to place tokens on the chromosome.
    dict_with_tokens_and_coord : dict
        ``{token_text: (start_char, end_char)}`` as produced by
        ``making_tokens``.

    Returns
    -------
    float
        ``100 * new_tokens / variants_inside_one_token``. Returns ``0.0`` when
        no variant falls inside a single token (previously this raised
        ``ZeroDivisionError``).
    """
    reader = vcf.Reader(filename=path_to_VCF)
    records = reader.fetch(chrom, start=seq_start, end=seq_end)

    token_texts = list(dict_with_tokens_and_coord.keys())
    token_spans = list(dict_with_tokens_and_coord.values())
    tokens_data = pd.DataFrame({
        'tokens': token_texts,
        'start': [span[0] + seq_start for span in token_spans],
        'end': [span[1] + seq_start for span in token_spans],
    })

    count_total_record_in_tokens = 0
    new_unique_tokens = 0
    for record in records:
        is_inside = (tokens_data['start'] <= record.start) & (tokens_data['end'] >= record.end)
        record_in_token = tokens_data[is_inside]
        if len(record_in_token) != 1:
            continue
        count_total_record_in_tokens += 1

        # Scalar access instead of int() on a one-element array, which NumPy
        # has deprecated.
        token_start = record_in_token['start'].iloc[0]
        token_text = record_in_token['tokens'].iloc[0]
        # NOTE(review): the +1 presumably compensates for seq_start being
        # 1-based (get_sequence_from_fasta slices from start-1) while
        # record.start is 0-based -- confirm.
        start_in_token = int(record.start - token_start + 1)
        end_in_token = int(record.end - token_start + 1)

        # NOTE(review): several ALT alleles are concatenated into one string,
        # as before; this is only meaningful for single-ALT records.
        record_str = ''.join(map(str, record.ALT))
        new_token = token_text[:start_in_token] + record_str + token_text[end_in_token:]
        if new_token not in dict_with_tokens_and_coord:
            new_unique_tokens += 1

    if count_total_record_in_tokens == 0:
        # No variant was scored, so there are no new tokens to report.
        return 0.0
    return new_unique_tokens * 100 / count_total_record_in_tokens

            
        

def get_sequence_from_fasta(fasta_file, chromosome, start, end):
    """Return the slice ``[start-1:end]`` of one chromosome from a FASTA file.

    The file is scanned record by record with ``SeqIO.parse`` until a record
    whose ``id`` equals ``chromosome`` is found, so every call re-reads the
    file from the beginning.

    Parameters
    ----------
    fasta_file : str
        Path to the FASTA file.
    chromosome : str
        Record id to look for.
    start, end : int
        Coordinates of the slice; ``start`` is treated as 1-based. Values of
        ``start`` below 1 are clamped to the first base.

    Returns
    -------
    The sliced ``record.seq``, or ``None`` when no record matches
    ``chromosome``.
    """
    # start <= 0 would give a negative slice index, which Python counts from
    # the END of the sequence and therefore returns the wrong (often empty) span.
    slice_start = max(start - 1, 0)
    for record in SeqIO.parse(fasta_file, "fasta"):
        if record.id == chromosome:
            return record.seq[slice_start:end]
    return None


# --- Experiment configuration -------------------------------------------------
SEED = 42            # fixed seed so the sampled windows are reproducible
WINDOW_SIZE = 6000   # offset added to each sampled start to get the window end
N_WINDOWS = 500      # number of random windows to sample

chrom_to_choose='chr1'
vcf_path='/beegfs/data/hpcws/ws1/popov-popov_gnomad/GNOMAD/gnomad/gnomad.genomes.v4.0.sites.'+chrom_to_choose+'.vcf.gz'
chromsizes_for_org=pd.read_csv('hg38.chrom.sizes',sep='\t',header=None)
len_of_chrom=chromsizes_for_org[chromsizes_for_org[0]==chrom_to_choose][1]
# Take the scalar explicitly; int() on a one-element Series is deprecated in pandas.
chrom_length = int(len_of_chrom.iloc[0])

rng = np.random.default_rng(SEED)
# Starts begin at 1 because get_sequence_from_fasta slices from start-1;
# a start of 0 would produce a negative slice index.
start_random_coordinates = rng.integers(1, chrom_length - WINDOW_SIZE, size=N_WINDOWS)
end_random_coordinates = start_random_coordinates + WINDOW_SIZE

# --- Per-window percentage of variants that create an unseen token ------------
prop_of_new_tokens=[]
for i in tqdm.tqdm(range(len(start_random_coordinates))):
    fetched = get_sequence_from_fasta('hg38.fa', chrom_to_choose, start_random_coordinates[i], end_random_coordinates[i])
    if fetched is None:
        raise ValueError(f"chromosome {chrom_to_choose!r} not found in hg38.fa")
    # Convert to a plain str before tokenizing. The hard-coded debug sequence
    # that used to overwrite this value has been removed so the real window is used.
    sequence = str(fetched).upper()
    dict_tokens=making_tokens(sequence)
    prop_of_new_tokens.append(changing_tokens_via_vcf(vcf_path,chrom_to_choose, sequence, start_random_coordinates[i], end_random_coordinates[i],dict_tokens))

data=pd.DataFrame({'prop_of_new_tokens':prop_of_new_tokens})
data.to_csv('prop_of_new_tokens.txt',sep='\t',index=False,header=None)



In [128]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
import vcf 
from Bio import SeqIO
import tqdm
from pysam import FastaFile
import time 

def making_tokens(seq, tokenizer=None):
    """Tokenize a DNA sequence and return token spans plus the vocabulary keys.

    Parameters
    ----------
    seq : str
        Nucleotide sequence to tokenize (no truncation is applied).
    tokenizer : optional
        An already-loaded tokenizer exposing ``encode_plus``, ``decode`` and
        ``vocab``. When omitted, 'AIRI-Institute/gena-lm-bert-base-t2t' is
        loaded via ``AutoTokenizer.from_pretrained`` on every call, which is
        slow inside a loop -- load it once and pass it in when possible.

    Returns
    -------
    tuple
        ``(tokens_coord, general_tokens_dict)`` where ``tokens_coord`` maps
        ``(start_char, end_char)`` offsets in ``seq`` to the decoded token text
        and ``general_tokens_dict`` is ``tokenizer.vocab.keys()``.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained('AIRI-Institute/gena-lm-bert-base-t2t')

    # Encode the argument itself. The previous version read the notebook-global
    # `sequence` here and silently ignored `seq`.
    encoded_sequence = tokenizer.encode_plus(seq, return_offsets_mapping=True)
    offsets = encoded_sequence["offset_mapping"]
    token_ids = encoded_sequence["input_ids"]
    general_tokens_dict = tokenizer.vocab.keys()

    tokens_coord = {}
    # The first and last positions are skipped -- presumably the special
    # tokens the tokenizer wraps around the input (TODO confirm).
    for i in range(1, len(token_ids) - 1):
        span = (offsets[i][0], offsets[i][1])
        tokens_coord[span] = tokenizer.decode([token_ids[i]])
    return tokens_coord, general_tokens_dict



def changing_tokens_via_vcf(path_to_VCF, chrom, seq, seq_start, seq_end, dict_with_tokens_and_coord,general_tokens_dict):
    """Measure the mean time needed to splice a VCF variant into its token(s).

    Each record in ``chrom:seq_start-seq_end`` is handled in one of two ways:

    * it lies inside exactly one token -- the ALT allele replaces the affected
      characters of that token;
    * it straddles a token boundary -- the left token is cut at the variant
      start and extended with the ALT allele, the right token is cut at the
      variant end. The elapsed time is halved because two tokens are edited.

    The edited token strings are only built for timing; they are not returned.

    Parameters
    ----------
    path_to_VCF : str
        Path to a tabix-indexed VCF file.
    chrom : str
        Chromosome name passed to ``Reader.fetch``.
    seq : str
        The tokenized sequence (not used by the computation).
    seq_start, seq_end : int
        Genomic coordinates of the sequence; ``seq_start`` is added to the
        token offsets to place tokens on the chromosome.
    dict_with_tokens_and_coord : dict
        ``{(start_char, end_char): token_text}`` as produced by
        ``making_tokens``.
    general_tokens_dict :
        Tokenizer vocabulary keys (accepted for interface compatibility; not
        used here).

    Returns
    -------
    float
        Mean seconds per edited token, or NaN when no record could be applied.
    """
    reader = vcf.Reader(filename=path_to_VCF)
    records = reader.fetch(chrom, start=seq_start, end=seq_end)

    token_spans = list(dict_with_tokens_and_coord.keys())
    tokens_data = pd.DataFrame({
        'tokens': list(dict_with_tokens_and_coord.values()),
        'start': [span[0] + seq_start for span in token_spans],
        'end': [span[1] + seq_start for span in token_spans],
    })

    count_total_record_in_tokens = 0  # tally of edited tokens; not returned
    af_list = []                      # AF of every applied record; not returned
    time_list = []
    for record in records:
        record_in_token = tokens_data[(tokens_data['start'] <= record.start) & (tokens_data['end'] >= record.end)]
        if len(record_in_token) == 1:
            # Variant is fully contained in one token.
            start_time = time.perf_counter()
            count_total_record_in_tokens += 1
            start_in_token = record.start - record_in_token['start'].values
            end_in_token = record.end - record_in_token['start'].values
            new_token = ''.join(record_in_token['tokens'].values)
            record_str = ''.join(map(str, record.ALT))
            new_token = new_token[:(start_in_token[0])] + record_str + new_token[(end_in_token[0]):]
            time_list.append(time.perf_counter() - start_time)
            # Append instead of overwriting the list with the latest value.
            af_list.append(record.INFO.get('AF'))
        else:
            # Token that starts at/before the variant and ends inside it.
            left_overlap = (
                (tokens_data['start'] <= record.start)
                & (tokens_data['end'] <= record.end)
                & (record.start < tokens_data['end'])
            )
            # Token that starts inside the variant and ends at/after it.
            right_overlap = (
                (tokens_data['start'] >= record.start)
                & (tokens_data['start'] - 1 <= record.end)
                & (record.end <= tokens_data['end'])
            )
            record_in_token = tokens_data[left_overlap | right_overlap].reset_index()
            # Rows 0 and 1 are read below, so at least two matches are needed.
            # The old `!= 1` test let zero matches through and raised KeyError.
            if len(record_in_token) >= 2:
                start_time = time.perf_counter()
                count_total_record_in_tokens += 2
                start_in_token = record.start - record_in_token.loc[0, 'start']
                new_token = record_in_token.loc[0, 'tokens']
                record_str = ''.join(map(str, record.ALT))
                new_token_0 = new_token[:int(start_in_token)] + record_str
                start_in_token = record.end - record_in_token.loc[1, 'start']
                new_token = record_in_token.loc[1, 'tokens']
                new_token_1 = new_token[int(start_in_token):]
                # Two tokens were edited, so report the per-token time.
                time_list.append((time.perf_counter() - start_time) / 2)
                af_list.append(record.INFO.get('AF'))

    if not time_list:
        # np.mean([]) would also give NaN but emits a RuntimeWarning.
        return np.nan
    return np.mean(time_list)
        
# --- Experiment configuration -------------------------------------------------
SEED = 42                 # fixed seed so the sampled windows are reproducible
REGION_START = 94987924   # bounds of the region covered by the VCF below
REGION_END = 95184532     # (same numbers as in the VCF file name)
WINDOW_SIZE = 6000        # length of each sampled window
N_WINDOWS = 100           # number of random windows to sample

genome='/data/aapopov/Projects/gnomad/gnomad_tokens_experiment/hg38.fa'
sequences_object = FastaFile(genome)
chrom_to_choose='chr14'
vcf_path='/data/aapopov/Projects/gnomad/gnomad_tokens_experiment/DICER1_chr14_94987924_95184532.vcf.gz'
# The chromosome length is not used below; it is still loaded so these
# notebook-level names stay defined.
chromsizes_for_org=pd.read_csv('hg38.chrom.sizes',sep='\t',header=None)
len_of_chrom=chromsizes_for_org[chromsizes_for_org[0]==chrom_to_choose][1]

rng = np.random.default_rng(SEED)
start_random_coordinates = rng.integers(REGION_START, REGION_END - WINDOW_SIZE, size=N_WINDOWS)
end_random_coordinates = start_random_coordinates + WINDOW_SIZE

# --- Per-window mean time to apply a variant to a token -----------------------
# NOTE: despite its name, this list holds mean timings (seconds per token);
# the name and output file are kept from the earlier experiment.
prop_of_new_tokens=[]
for i in tqdm.tqdm(range(len(start_random_coordinates))):
    sequence=sequences_object.fetch(chrom_to_choose, start_random_coordinates[i], end_random_coordinates[i]).upper()
    dict_tokens,general_tokens_dict=making_tokens(sequence)
    prop_of_new_tokens.append(changing_tokens_via_vcf(vcf_path,chrom_to_choose, sequence, start_random_coordinates[i], end_random_coordinates[i],dict_tokens,general_tokens_dict))

# The message reads "Mean time per token". nanmean ignores windows that had no
# applicable variant (NaN) so that a single empty window does not turn the
# overall mean into NaN.
print(f'Среднее время на один токен:{np.nanmean(prop_of_new_tokens)}')
data=pd.DataFrame({'prop_of_new_tokens':prop_of_new_tokens})
data.to_csv('prop_of_new_tokens.txt',sep='\t',index=False,header=None)



100%|█████████████████████████████████████████| 100/100 [02:23<00:00,  1.43s/it]

Среднее время на один токен:6.581866178536202e-05



