In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import pandas as pd
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import re
import math
import copy
import random
import time

In [6]:
# 预定义的变量
save_path = 'D:\\sorf_models\\model\\tests\\model_saves' # 储存模型结果的文件夹，以\\结尾

# 模型超参数
window_size = 51  # 每次输入的序列长度，为奇数,且至少为5
dropout = 0
batch_size = 32
num_layers = 2  # LSTM层数
hidden_dim = 128  # LSTM隐藏层大小
proj_size = 64  # LSTM映射层大小
weight_scale=0.02
TIS_max_num=40
sqrt_scale=7.47518547954589

j = 'all_10'

# writer
run_header="train_2_15"

In [7]:
# 读取和处理数据：
padding_size = int(window_size / 2)
dict = {'P': np.array([1, 0, 0, 0, 0]),
        'A': np.array([0, 1, 0, 0, 0]),
        'G': np.array([0, 0, 1, 0, 0]),
        'C': np.array([0, 0, 0, 1, 0]),
        'T': np.array([0, 0, 0, 0, 1])
        }

def read_fasta(input): #用def定义函数read_fasta()，并向函数传递参数用变量input接收
    with open(input,'r') as f: # 打开文件
        fasta = {} # 定义一个空的字典
        for line in f:
            line = line.strip() # 去除末尾换行符
            if line[0] == '>':
                header = line[1:]
                ind=True
            else:
                if ind:
                    sequence = line.upper()
                    if all([i in ['A','T','C','G'] for i in set(sequence)]):
                        fasta[header] = fasta.get(header,'') + sequence
                    else:
                        if header in fasta.keys():
                            del fasta[header]
                        ind=False
    return fasta

def is_validate(k,sig_ratio_cutoff=2):
    if vectors[iii][k-padding_size]==0:
        return 0
    else:
        background_pos=[i for i in set(range(k-3,k+4))-set(pos3) if i>=padding_size and i<=len(vectors[iii])+padding_size-1]
        if len(background_pos)==0:
            print('Error: background_pos len=0 for iii='+str(iii)+' and position='+str(k-padding_size))
        background_signal=np.mean(vectors[iii][[i-padding_size for i in background_pos]])
        if background_signal==0:
            return 1
        else:
            signal_ratio=vectors[iii][k-padding_size]/background_signal
            if signal_ratio>=sig_ratio_cutoff:
                return 1
            else:
                return 0
    
def form_target(k,left_diff,right_diff):
    if left_diff>=3:
        left_value=vectors[iii][k-padding_size-1]
    elif left_diff==1:
        left_value=0
    elif left_diff==0 and k==padding_size:
        left_value=0
    elif left_diff==0 and k>padding_size:
        left_value=vectors[iii][k-padding_size-1]
    else:
        left_value=0.5*vectors[iii][k-padding_size-1]
    if right_diff>=3 or right_diff==0:
        right_value=vectors[iii][k-padding_size+1]
    elif right_diff==1:
        right_value=0
    else:
        right_value=0.5*vectors[iii][k-padding_size+1]
    return left_value+vectors[iii][k-padding_size]+right_value

def ifleftequalzero(j):
    if j==0:
        return 0
    else:
        return pos[j]-pos[j-1]

def ifrightequalmax(j):
    if j==(len(pos)-1):
        return 0
    else:
        return pos[j+1]-pos[j]

In [8]:
class MyLSTM(nn.Module):
    def __init__(self, dropout, num_layers, hidden_dim, proj_size, input_dim=62):
        # 即表示每一个window（31个碱基）对应一个cell，再加上一个frame indicator
        super(MyLSTM, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, (5,3), padding=(2,1))
        self.conv2 = nn.Conv2d(16, 16, (5,3), padding=(2,1))
        self.fc0 = nn.Linear(16 * 5 * window_size, input_dim)
        self.rnn = nn.LSTM(input_size=input_dim+2, hidden_size=hidden_dim, num_layers=num_layers, dropout=dropout, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, int(hidden_dim/2))
        self.fc2 = nn.Linear(int(hidden_dim/2), 1)
        self.lrelu1 = nn.LeakyReLU(0.05)
        self.lrelu2 = nn.LeakyReLU(0.05)

    def forward(self, x, sizes, diss):
        output = F.relu(self.conv1(x))  # (sum_seq_len, 1, window_size, 5)
        output = F.relu(self.conv2(output))  # (sum_seq_len, 16, window_size, 5)
        output = F.relu(self.fc0(torch.flatten(output, 1))) # (sum_seq_len, 16*window_size*5)
        output = torch.cat((output, diss), 1)  # (sum_seq_len, 62)
        output = nn.utils.rnn.PackedSequence(data=output, batch_sizes=sizes)  # (sum_seq_len, 64)
        output, _ = self.rnn(output)  # (packed_len, 64)
        output = self.lrelu1(output.data)  # (sum_seq_len, 64)
        output = self.lrelu2(self.fc1(output))  # (sum_seq_len, 64)
        output = self.fc2(output).squeeze()  # (sum_seq_len, 32)
        output = nn.utils.rnn.PackedSequence(data=output, batch_sizes=sizes)  # (packed_len)
        output = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)  # (batch_size, max_seq_length)
        return output[0]

cuda_avail = torch.cuda.is_available()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(cuda_avail)

# 由于每个sample长度不一，需要自定义Dataloader的collate_fn:
def collate_fn_padd(batch):
    batch_target = [i[1] for i in batch]
    target_lengths = torch.tensor([t.shape[0] for t in batch_target])
    batch_seq = [i[0] for i in batch]
    batch_weight = [i[2] for i in batch]
    batch_dis = [i[3] for i in batch]
    batch_trans = [i[4] for i in batch]
    ## padd
    target_lengths, idx = target_lengths.sort(0, descending=True)
    batch_seq = [batch_seq[i] for i in idx]
    batch_target = [batch_target[i] for i in idx]
    batch_weight = [batch_weight[i] for i in idx]
    batch_dis = [batch_dis[i] for i in idx]
    batch_trans = [batch_trans[i] for i in idx]
    batch_seq = [t.type(torch.float).to(device) for t in batch_seq]
    batch_target = [t.type(torch.float).to(device) for t in batch_target]
    batch_weight = [t.type(torch.float).to(device) for t in batch_weight]
    batch_dis = [t.type(torch.float).to(device) for t in batch_dis]

    batch_seq = pad_sequence(batch_seq, batch_first=True)  # (batch_size, max_seq_length, window_size, 4)
    batch_seq = pack_padded_sequence(input=batch_seq, lengths=target_lengths, batch_first=True)
    # (sum_seq_length, window_size, 4)
    batch_weight = pad_sequence(batch_weight, batch_first=True)
    #batch_weight = pack_padded_sequence(input=batch_weight, lengths=target_lengths, batch_first=True).data
    # (sum_seq_length, 3)
    batch_target = pad_sequence(batch_target, batch_first=True)  # (batch_size, max_seq_length)
    #batch_target = pack_padded_sequence(input=batch_target, lengths=target_lengths, batch_first=True).data
    # (sum_seq_len)
    batch_dis = pad_sequence(batch_dis, batch_first=True)  # (batch_size, max_seq_length)
    batch_dis = pack_padded_sequence(input=batch_dis, lengths=target_lengths, batch_first=True).data
    return torch.unsqueeze(batch_seq.data, 1), batch_seq.batch_sizes, batch_target, batch_weight, batch_dis, batch_trans

True


In [9]:
#预测一条：
class testoneMyDataset(Dataset):
    def __init__(self, inputset, targetset, weightset, disset, transcriptset):
        self.inputset=inputset
        self.targetset=targetset
        self.weightset=weightset
        self.disset=disset
        self.transcriptset=transcriptset
        
    def __getitem__(self, idx):
        input = self.inputset[idx]
        label = self.targetset[idx]
        weights = self.weightset[idx]
        diss = self.disset[idx]
        transcriptt = self.transcriptset[idx]
        return input, label, weights, diss, transcriptt

    def __len__(self):
        return len(self.targetset)

In [10]:
# use_own=False
# sseq=sseqs[2].values[ss]
# test_o=sseqs[0].values[ss]
# test_o="ENST00000343053.6"
# sseq="ATTAGAGTCTGTGCTTCACTTCCGTTCCAGCCTCAGCGGCAGCTGGATCGCTCGACGGAGTGCCTCTGGTAGTTGGCCAAGACGCCGAATATCAAAATCTTCAGCGGCAGCTCCCACCAGGACTTATCCCAGAAAATTGCTGACCGCCTGGGCCTGGAGCTAGGCAAGGTGGTGACTAAGAAATTCAGCAACCAGGAGACCTGCGTGGAAATTGATGAGAGTGTGCGTGGAGAGGATGTCTACATCGTTCAGAGTGGTTGTGGCGAAATCAACGACAGTCTAATGGAGCTTTTGATCATGATTAATGCCTGCAAGATTGCTTCAGCTAGCCGAGTTACTGCAGTCATCCCATGCTTCCCTTATGCCCGACAGGATAAGAAGGATAAGAGCCGGTCCCCAATCTCTGCCAAGCTTGTTGCAAATATG"
# if use_own==False:
#     sseq=fa[test_o]
# my_padded_seq="P"*padding_size+sseq+"P"*padding_size

# pattern=re.compile(r'(?=(ATG|TTG|CTG|GTG|AAG|ACG|AGG|ATA|ATT|ATC))')
# it=re.finditer(pattern,my_padded_seq)
# pos3=[i.span()[0] for i in it][0:(TIS_max_num+3)]
# pos2=pos3[0:(TIS_max_num+1)]
# pos=pos2[0:TIS_max_num]
# if len(pos2)==len(pos):
#     pos2=pos2+[pos2[len(pos2)-1]+20]
# my_seq=torch.stack([torch.tensor([dict[j] for j in list(my_padded_seq[(p-padding_size):(p+padding_size+1)])], dtype=torch.long) for p in pos])
# my_target=torch.tensor([1 for j in range(len(pos))])
# my_weight=torch.tensor([1]*len(pos))
# pos1=[padding_size]+pos
# my_dis=torch.stack([torch.tensor([pos1[i+1]-pos1[i] for i in range(len(pos1)-1)]),torch.tensor([pos2[i+1]-pos2[i] for i in range(len(pos2)-1)])],1)
# my_dis=my_dis/sqrt_scale

# print(test_o)
# MyvalData = testoneMyDataset(inputset=[my_seq], targetset=[my_target], weightset=[my_weight], disset=[my_dis], transcriptset=[test_o])
# Myvalloader = torch.utils.data.DataLoader(MyvalData, batch_size=1, shuffle=False, num_workers=0, collate_fn=collate_fn_padd, drop_last=True)
# outputss=pd.DataFrame()
# for t_run in range(1,11):
#     writer_title='run_all_'+str(t_run)
#     state = torch.load(save_path + "\\"+run_header+"\\"+writer_title+".pkl")
#     Mymodel = MyLSTM(dropout=dropout, hidden_dim=hidden_dim, num_layers=num_layers, proj_size=proj_size)
#     Mymodel.to(device)
#     optimizer = optim.SGD(Mymodel.parameters(), lr=lr, momentum=momentum)
#     Mymodel.load_state_dict(state['state_dict'])
#     optimizer.load_state_dict(state['optimizer'])
#     epoch = state['epoch']
#     jj = state['jj']
#     converge_level = state['converge_level']
#     running_loss = state['running_loss']
#     inspect_loss = state['inspect_loss']
#     loss_ratio = state['loss_ratio']
#     sigma_index = state['sigma_index']
#     print(save_path + "\\"+run_header+"\\"+writer_title+".pkl loaded.")
#     with torch.no_grad():
#         iter0=iter(enumerate(Myvalloader, 0))
#         iii, batch=next(iter0)
#         inputs, sizes, labels, weights, diss, transcriptt = batch
#         outputs = Mymodel(x=inputs, sizes=sizes, diss=diss)
#         loss = criterion(outputs, labels, weights, torch.sum(sizes), loss_ratio)
#         outputss['run_'+str(t_run)]=np.array(outputs.cpu().flatten(0))

# outputss.to_csv('final_true_vs_predicts\\inspect_output.txt',sep='\t',index=False)
# np.savetxt('final_true_vs_predicts\\inspect.txt',np.array([1]*len(sseq)),header=test_o)
# #pos=[i[1] for i in find_TIS(test_o)]

# pattern=re.compile(r'(?=(ATG|TTG|CTG|GTG|AAG|ACG|AGG|ATA|ATT|ATC))')
# it=re.finditer(pattern,sseq)
# TIS=[(i.group(1),i.span()[0]) for i in it]
# pos=[i[1] for i in TIS[0:TIS_max_num]]

# np.savetxt('final_true_vs_predicts\\inspect_pos.txt',np.array(pos),header=test_o)
# ss+=1

In [11]:
##突变模拟
lnc_fa=read_fasta('all_lnc_new_plane.fa')
print(len(lnc_fa))

323960


In [12]:
t_run=1
writer_title='run_all_'+str(t_run)
state = torch.load(save_path + "\\"+run_header+"\\"+writer_title+".pkl")
Mymodel1 = MyLSTM(dropout=dropout, hidden_dim=hidden_dim, num_layers=num_layers, proj_size=proj_size)
Mymodel1.to(device)
Mymodel1.load_state_dict(state['state_dict'])
Mymodel1.eval()

t_run=2
writer_title='run_all_'+str(t_run)
state = torch.load(save_path + "\\"+run_header+"\\"+writer_title+".pkl")
Mymodel2 = MyLSTM(dropout=dropout, hidden_dim=hidden_dim, num_layers=num_layers, proj_size=proj_size)
Mymodel2.to(device)
Mymodel2.load_state_dict(state['state_dict'])
Mymodel2.eval()

t_run=3
writer_title='run_all_'+str(t_run)
state = torch.load(save_path + "\\"+run_header+"\\"+writer_title+".pkl")
Mymodel3 = MyLSTM(dropout=dropout, hidden_dim=hidden_dim, num_layers=num_layers, proj_size=proj_size)
Mymodel3.to(device)
Mymodel3.load_state_dict(state['state_dict'])
Mymodel3.eval()

t_run=4
writer_title='run_all_'+str(t_run)
state = torch.load(save_path + "\\"+run_header+"\\"+writer_title+".pkl")
Mymodel4 = MyLSTM(dropout=dropout, hidden_dim=hidden_dim, num_layers=num_layers, proj_size=proj_size)
Mymodel4.to(device)
Mymodel4.load_state_dict(state['state_dict'])
Mymodel4.eval()

t_run=5
writer_title='run_all_'+str(t_run)
state = torch.load(save_path + "\\"+run_header+"\\"+writer_title+".pkl")
Mymodel5 = MyLSTM(dropout=dropout, hidden_dim=hidden_dim, num_layers=num_layers, proj_size=proj_size)
Mymodel5.to(device)
Mymodel5.load_state_dict(state['state_dict'])
Mymodel5.eval()

t_run=6
writer_title='run_all_'+str(t_run)
state = torch.load(save_path + "\\"+run_header+"\\"+writer_title+".pkl")
Mymodel6 = MyLSTM(dropout=dropout, hidden_dim=hidden_dim, num_layers=num_layers, proj_size=proj_size)
Mymodel6.to(device)
Mymodel6.load_state_dict(state['state_dict'])
Mymodel6.eval()

t_run=7
writer_title='run_all_'+str(t_run)
state = torch.load(save_path + "\\"+run_header+"\\"+writer_title+".pkl")
Mymodel7 = MyLSTM(dropout=dropout, hidden_dim=hidden_dim, num_layers=num_layers, proj_size=proj_size)
Mymodel7.to(device)
Mymodel7.load_state_dict(state['state_dict'])
Mymodel7.eval()

t_run=8
writer_title='run_all_'+str(t_run)
state = torch.load(save_path + "\\"+run_header+"\\"+writer_title+".pkl")
Mymodel8 = MyLSTM(dropout=dropout, hidden_dim=hidden_dim, num_layers=num_layers, proj_size=proj_size)
Mymodel8.to(device)
Mymodel8.load_state_dict(state['state_dict'])
Mymodel8.eval()

t_run=9
writer_title='run_all_'+str(t_run)
state = torch.load(save_path + "\\"+run_header+"\\"+writer_title+".pkl")
Mymodel9 = MyLSTM(dropout=dropout, hidden_dim=hidden_dim, num_layers=num_layers, proj_size=proj_size)
Mymodel9.to(device)
Mymodel9.load_state_dict(state['state_dict'])
Mymodel9.eval()

t_run=10
writer_title='run_all_'+str(t_run)
state = torch.load(save_path + "\\"+run_header+"\\"+writer_title+".pkl")
Mymodel10 = MyLSTM(dropout=dropout, hidden_dim=hidden_dim, num_layers=num_layers, proj_size=proj_size)
Mymodel10.to(device)
Mymodel10.load_state_dict(state['state_dict'])
Mymodel10.eval()

MyLSTM(
  (conv1): Conv2d(1, 16, kernel_size=(5, 3), stride=(1, 1), padding=(2, 1))
  (conv2): Conv2d(16, 16, kernel_size=(5, 3), stride=(1, 1), padding=(2, 1))
  (fc0): Linear(in_features=4080, out_features=62, bias=True)
  (rnn): LSTM(64, 128, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (lrelu1): LeakyReLU(negative_slope=0.05)
  (lrelu2): LeakyReLU(negative_slope=0.05)
)

In [13]:
my_lnc=list(pd.read_csv('expression_profiles_HPA/max10955lnc.txt',header=None)[0].values)
my_lnc=list(set(lnc_fa.keys()) & set(my_lnc))
my_lnc[0:10]

['HSALNT0183798',
 'HSALNT0148586',
 'HSALNT0209455',
 'HSALNT0017544',
 'HSALNT0100282',
 'HSALNT0155210',
 'HSALNT0388907',
 'HSALNT0043156',
 'HSALNT0253846',
 'HSALNT0047803']

In [15]:
print(len(my_lnc))

10953


In [14]:
random.seed(121)
dic1={"A": 0,"T": 1,"G": 2,"C": 3}
ATGC=list('ATGC')
HIS={}
LNC=[]
smut1=[]
smut2=[]
def mutate(x,a,b):
    return x[:a]+ATGC[(dic1[x[a]]+b+1) % 4]+x[(a+1):]

t0 = time.time()
with torch.no_grad():
    for lnc in my_lnc:
        lnc_len=len(lnc_fa[lnc])
        mut1_num=min(250,lnc_len)
        his=[(-1,-1)]
        padded_lnc_list=["P"*padding_size+lnc_fa[lnc]+"P"*padding_size]

        for i in range(mut1_num):
            for j in range(3):
                his.append((i,j))
                padded_lnc_list.append("P"*padding_size+mutate(lnc_fa[lnc],i,j)+"P"*padding_size)
        my_seq = []
        my_target=[]
        my_weight=[]
        my_dis=[]
        for iii in range(len(padded_lnc_list)):
            ii=padded_lnc_list[iii]
            pattern=re.compile(r'(?=(ATG|TTG|CTG|GTG|AAG|ACG|AGG|ATA|ATT|ATC))')
            it=re.finditer(pattern,ii)
            pos3=[i.span()[0] for i in it][0:(TIS_max_num+3)]
            pos2=pos3[0:(TIS_max_num+1)]
            pos=pos2[0:TIS_max_num]
            if len(pos2)==len(pos):
                pos2=pos2+[pos2[len(pos2)-1]+20]
            my_seq.append(torch.stack([torch.tensor([dict[j] for j in list(ii[(p-padding_size):(p+padding_size+1)])], dtype=torch.long) for p in pos]))
            my_target.append(torch.tensor([1 for j in range(len(pos))]))
            my_weight.append(torch.tensor([1]*len(pos)))
            pos1=[padding_size]+pos
            my_dis.append(torch.stack([torch.tensor([pos1[i+1]-pos1[i] for i in range(len(pos1)-1)]),torch.tensor([pos2[i+1]-pos2[i] for i in range(len(pos2)-1)])],1))
        my_dis=[i/sqrt_scale for i in my_dis]
        MyvalData = testoneMyDataset(inputset=my_seq, targetset=my_target, weightset=my_weight, disset=my_dis, transcriptset=his)
        Myvalloader = torch.utils.data.DataLoader(MyvalData, batch_size=len(MyvalData), shuffle=False, num_workers=0, collate_fn=collate_fn_padd, drop_last=True)
        iter0=iter(enumerate(Myvalloader, 0))
        iii, batch=next(iter0)
        inputs, sizes, labels, weights, diss, transcriptt = batch
        outputs=torch.stack((Mymodel1(x=inputs, sizes=sizes, diss=diss),
                            Mymodel2(x=inputs, sizes=sizes, diss=diss),
                            Mymodel3(x=inputs, sizes=sizes, diss=diss),
                            Mymodel4(x=inputs, sizes=sizes, diss=diss),
                            Mymodel5(x=inputs, sizes=sizes, diss=diss),
                            Mymodel6(x=inputs, sizes=sizes, diss=diss),
                            Mymodel7(x=inputs, sizes=sizes, diss=diss),
                            Mymodel8(x=inputs, sizes=sizes, diss=diss),
                            Mymodel9(x=inputs, sizes=sizes, diss=diss),
                            Mymodel10(x=inputs, sizes=sizes, diss=diss)))

        outputs=torch.mean(outputs/torch.sum(torch.abs(outputs),2,keepdim=True),0,keepdim=False)
        max_peak,indices = torch.max(outputs,1,keepdim=False)
        outputs0=outputs[0][indices]
        outputs0[outputs0<0.01]=0.01
        outputs1=max_peak/outputs0
        outputs2=max_peak/max(outputs[0])
        
        max_peak=np.array(max_peak.cpu())
        outputs11=np.array(outputs1.cpu())
        outputs22=np.array(outputs2.cpu())
        HIS[lnc]=[(his[i][0],his[i][1],max_peak[i],outputs11[i],outputs22[i]) for i in range(len(his))]
        outputs1[outputs1<1]=1
        outputs2[outputs2<1]=1
        LNC.append(lnc)
        smut1.append(np.array(torch.mean(outputs1[1:]).cpu()))
        smut2.append(np.array(torch.mean(outputs2[1:]).cpu()))
t1 = time.time()
print("time used:",t1-t0,"seconds")
print("average time:",(t1-t0)/len(LNC),"seconds")
pd.DataFrame({"lncRNA": LNC,"self_foldchange": smut1,"overall_foldchange": smut2}).to_csv("lnc_res.txt",sep="\t")
f=open('lnc_mutation_result.txt','wb')
pickle.dump(HIS,f)
f.close()

time used: 52374.52177000046 seconds
average time: 4.7817512800146496 seconds


In [55]:
HIS['HSALNT0068489']

[(-1, -1, 0.30230683, 1.0, 1.0),
 (0, 0, 0.31989497, 1.0581797, 1.0581797),
 (0, 1, 0.32658347, 21.068188, 1.0803046),
 (0, 2, 0.32697716, 1.081607, 1.081607),
 (1, 0, 0.30928198, 1.0230731, 1.0230731),
 (1, 1, 0.31292292, 20.186934, 1.0351169),
 (1, 2, 0.32571325, 21.012049, 1.0774261),
 (2, 0, 0.30073124, 0.9947881, 0.9947881),
 (2, 1, 0.28486934, 0.94231856, 0.94231856),
 (2, 2, 0.30588508, 19.732916, 1.0118365),
 (3, 0, 0.26456642, 0.8751586, 0.8751586),
 (3, 1, 0.29339495, 0.97052044, 0.97052044),
 (3, 2, 0.3261601, 1.0789042, 1.0789042),
 (4, 0, 0.30475062, 1.0080838, 1.0080838),
 (4, 1, 0.26919207, 0.8904598, 0.8904598),
 (4, 2, 0.27387664, 0.90595585, 0.90595585),
 (5, 0, 0.3223824, 1.0664079, 1.0664079),
 (5, 1, 0.28005254, 2.7420347, 0.9263851),
 (5, 2, 0.2653809, 2.5983822, 0.8778528),
 (6, 0, 0.2879587, 0.9525379, 0.9525379),
 (6, 1, 0.23265074, 2.2779167, 0.7695848),
 (6, 2, 0.25319597, 0.8375463, 0.8375463),
 (7, 0, 0.25420165, 25.420166, 0.840873),
 (7, 1, 0.32389924, 1.

In [None]:
# print(list(lnc_fa.keys())[0:3])
# random.seed(121)
# dic1={"A": 0,"T": 1,"G": 2,"C": 3}
# ATGC=list('ATGC')
# HIS={}
# LNC=[]
# smut=[]
# dmut=[]
# def mutate(x,a,b):
#     return x[:a]+ATGC[(dic1[x[a]]+b+1) % 4]+x[(a+1):]

# for lnc in list(lnc_fa.keys())[0:3]:
#     lnc_len=len(lnc_fa[lnc])
#     mut1_num=min(250,lnc_len)
#     his=[set([(-1,-1),(-1,-1)])]
#     padded_lnc_list=["P"*padding_size+lnc_fa[lnc]+"P"*padding_size]
    
#     for i in range(mut1_num):
#         for j in range(3):
#             his.append(set([(i,j),(-1,-1)]))
#             padded_lnc_list.append("P"*padding_size+mutate(lnc_fa[lnc],i,j)+"P"*padding_size)
#             for k in range(50):
#                 m=list(set(list(range(mut1_num)))-set([i]))[random.randint(0,mut1_num-2)]
#                 n=random.randint(0,2)
#                 while set([(i,j),(m,n)]) in his:
#                     m=list(set(list(range(mut1_num)))-set([i]))[random.randint(0,mut1_num-2)]
#                     n=random.randint(0,2)
#                 his.append(set([(i,j),(m,n)]))
#                 padded_lnc_list.append("P"*padding_size+mutate(mutate(lnc_fa[lnc],i,j),m,n)+"P"*padding_size)
#     my_seq = []
#     my_target=[]
#     my_weight=[]
#     my_dis=[]
#     for iii in range(len(padded_lnc_list)):
#         ii=padded_lnc_list[iii]
#         pattern=re.compile(r'(?=(ATG|TTG|CTG|GTG|AAG|ACG|AGG|ATA|ATT|ATC))')
#         it=re.finditer(pattern,ii)
#         pos3=[i.span()[0] for i in it][0:(TIS_max_num+3)]
#         pos2=pos3[0:(TIS_max_num+1)]
#         pos=pos2[0:TIS_max_num]
#         if len(pos2)==len(pos):
#             pos2=pos2+[pos2[len(pos2)-1]+20]
#         my_seq.append(torch.stack([torch.tensor([dict[j] for j in list(ii[(p-padding_size):(p+padding_size+1)])], dtype=torch.long) for p in pos]))
#         my_target.append(torch.tensor([1 for j in range(len(pos))]))
#         my_weight.append(torch.tensor([1]*len(pos)))
#         pos1=[padding_size]+pos
#         my_dis.append(torch.stack([torch.tensor([pos1[i+1]-pos1[i] for i in range(len(pos1)-1)]),torch.tensor([pos2[i+1]-pos2[i] for i in range(len(pos2)-1)])],1))
#     my_dis=[i/sqrt_scale for i in my_dis]
#     MyvalData = testoneMyDataset(inputset=my_seq, targetset=my_target, weightset=my_weight, disset=my_dis, transcriptset=his)
#     Myvalloader = torch.utils.data.DataLoader(MyvalData, batch_size=len(MyvalData), shuffle=False, num_workers=0, collate_fn=collate_fn_padd, drop_last=True)
#     iter0=iter(enumerate(Myvalloader, 0))
#     iii, batch=next(iter0)
#     inputs, sizes, labels, weights, diss, transcriptt = batch
#     o1=Mymodel1(x=inputs, sizes=sizes, diss=diss)
#     o2=Mymodel2(x=inputs, sizes=sizes, diss=diss)
#     o3=Mymodel3(x=inputs, sizes=sizes, diss=diss)
#     o4=Mymodel4(x=inputs, sizes=sizes, diss=diss)
#     o5=Mymodel5(x=inputs, sizes=sizes, diss=diss)
#     o6=Mymodel6(x=inputs, sizes=sizes, diss=diss)
#     o7=Mymodel7(x=inputs, sizes=sizes, diss=diss)
#     o8=Mymodel8(x=inputs, sizes=sizes, diss=diss)
#     o9=Mymodel9(x=inputs, sizes=sizes, diss=diss)
#     o10=Mymodel10(x=inputs, sizes=sizes, diss=diss)
#     outputs=torch.stack((o1,o2,o3,o4,o5,o6,o7,o8,o9,o10))
#     del o1,o2,o3,o4,o5,o6,o7,o8,o9,o10
    
#     outputs=torch.mean(outputs/torch.sum(torch.abs(outputs),2,keepdim=True),0,keepdim=False)
#     max_peak,indices = torch.max(outputs,1,keepdim=False)
#     outputs=max_peak/outputs[0][indices]
    
#     LNC.append(lnc)
#     smut.append(np.array(torch.mean(outputs[list(range(1,len(outputs),51))]).cpu()))
#     dmut.append(np.array(torch.mean(outputs[list(set(range(len(outputs)))-set([0]+list(range(1,len(outputs),51))))]).cpu()))
    
#     max_peak=np.array(max_peak.cpu())
#     outputs=np.array(outputs.cpu())
#     HIS[lnc]=[(list(i)[0][0],list(i)[0][1],list(i)[1][0],list(i)[1][1],outputs[i],max_peak[i]) for i in range(len(his))]
#     #pd.DataFrame([[list(i)[0][0],list(i)[0][1],list(i)[1][0],list(i)[1][1]] for i in his]).to_csv("test.txt",sep="\t")
# pd.DataFrame({"lncRNA": LNC,"smut": smut,"dmut": dmut}).to_csv("test.txt",sep="\t")