In [1]:
# !pip install pandas
# !pip install watermark
# !pip install seaborn
# !pip install biopython
# !pip install sklearn
import os
import re 
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np 
import watermark
import random 
import math
from tqdm import tqdm
import matplotlib.pyplot as plt
from multiprocessing import Pool
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = 'all'

In [2]:
# Window geometry used when slicing a splice-site signal out of a sequence:
# `left` bases are taken before the site position and `right` bases from it on,
# so every signal window is `signal_num` bases long.
left, right = 3, 9
signal_num = left + right

bases = "ACGT"
# Alphabet used for integer encoding; "z" is the placeholder that replaces
# any character other than a/c/g/t.
lower_bases = "acgtz"
# Matches any single character that is not one of the four lower-case bases.
pattern = re.compile("[^acgt]")

# Folder that receives every file written by this notebook.
# Alternative targets are kept below for quick switching.
# output_name = f"output/WAM(-{left}+{right})"
output_name = "output/pomegranate+WAM"
# output_name = "output/DWAM"
# output_name = "output/pomegranate"

## Data Cleaning
- The training set contains 462 files and 2,831 introns. Every file has the suffix "TXT", and the sequences are written in lowercase letters. The first line of each file looks like "LOCUS AB000381 35863 bp DNA PRI 14-MAY-1997", with the fields separated by runs of whitespace.
- The test set contains 570 files and 2,071 introns. The file suffixes are either "TXT" or "txt", and the sequences are written in capital letters. The first line of each file starts with something like ">>ACU08131".

In [3]:
def mkdir(dirs):
    '''
    Function: Create the directory `dirs` (including missing parents) if it does not exist yet
    Parameter: dirs - directory path to create
    Output: None
    Attention: calling it again for an existing directory is a no-op. A
    FileExistsError is raised when `dirs` exists but is not a directory.
    '''
    # exist_ok=True replaces the separate exists() check, which could race with
    # another process creating the directory between the check and makedirs().
    os.makedirs(dirs, exist_ok=True)
# Make sure the output folder exists before any cell writes into it.
mkdir(output_name)

In [4]:
def loadFile(file_dir):
    '''
    Function: List the sequence files of a dataset folder (Training Set or Testing Set)
    Parameter: file_dir - folder to scan (sub-folders are not traversed)
    Output: file_path - list of "<file_dir>/<file_name>" strings, sorted by file name
    Attention: only entries whose extension is "txt" (any letter case) are kept;
    names with another extension or without any extension are skipped.
    '''
    file_path = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the file
    # order (and therefore every derived output) reproducible across machines.
    all_file = tqdm(sorted(os.listdir(file_dir)), desc=f'LOADING {file_dir}')
    for file_name in all_file:
        # splitext() looks at the last extension only, so "a.v2.txt" is kept
        # and a name without any dot yields "" instead of raising.
        suffix = os.path.splitext(file_name)[1].lower()
        if suffix != ".txt":
            continue
        file_path.append(f"{file_dir}/{file_name}")
    # Report the files actually collected, not the directory entries scanned.
    all_file.write(f"{len(file_path)} Files Loading Finished!")
    all_file.close()
    print()
    return file_path
# Collect the "txt" file paths of the training and the testing folder.
train_file_path =loadFile('Training and testing datasets/Training Set')
test_file_path =loadFile('Training and testing datasets/Testing Set')

LOADING Training and testing datasets/Training Set: 100%|████████████████████████| 462/462 [00:00<00:00, 231762.76it/s]
LOADING Training and testing datasets/Testing Set: 100%|█████████████████████████| 570/570 [00:00<00:00, 552392.16it/s]

462 Files Loading Finished!

570 Files Loading Finished!






In [5]:
def _site_window(seq, start, end):
    '''
    Return seq[start:end] in lower case, or None when the window is unusable.
    A window is unusable when it runs off either end of `seq` or contains a
    character other than a/c/g/t.
    '''
    if start < 0 or end > len(seq):
        # A negative start would silently wrap around and an oversized end
        # would silently truncate, so such sites are dropped instead.
        return None
    window = seq[start:end].lower()
    if pattern.search(window):
        return None
    return window


def extract_donor_signal(file_path,dataset):
    '''
    Function: Read every file, extract the donor/acceptor signal windows and save summary files
    Parameter: file_path - list of file paths (train_file_path | test_file_path)
               dataset   - "train" or "test"; selects how the first line is parsed
    Output: all_donor_signal, all_acceptor_signal, seq_list, donor_positions, acceptor_positions
            (+ donor_file, the file path of every donor signal, when dataset == "test")
            - all_*_signal: flat lists of signal strings over all files
            - seq_list: one lower-cased sequence string per file
            - *_positions: one list per file with the positions of the kept sites
    Attention: only sites whose window contains no unknown base and lies fully
    inside the sequence are kept. Writes three files into `output_name`.
    Raises: ValueError when dataset is neither "train" nor "test".
    '''
    if dataset not in ("train", "test"):
        raise ValueError(f'dataset must be "train" or "test", got {dataset!r}')
    print(f'Extract {dataset} Set donor signals'.center(50, '*'))
    donor_positions = []      # per file: positions of the kept donor sites
    acceptor_positions = []   # per file: positions of the kept acceptor sites
    donor_signals = []        # per file: kept donor signal strings
    acceptor_signals = []     # per file: kept acceptor signal strings
    all_donor_signal = []     # flat list over all files
    all_acceptor_signal = []  # flat list over all files
    length_list = []
    seq_list = []
    exons = []
    locus = []
    donor_file = []           # file path of every entry in all_donor_signal
    files = tqdm(file_path, desc=f'{dataset} Progressing：')
    for file in files:
        # The context manager closes the handle even if parsing fails.
        with open(file) as f:
            first_line = f.readline()
            second_line = f.readline()
            # the remaining lines hold the sequence itself
            seq = ''.join(line.strip() for line in f)

        # first line: gene locus
        if dataset == "test":
            # Headers start with one or more ">" characters (e.g. ">>ACU08131");
            # strip all of them so the locus does not keep a leading ">".
            locus.append(first_line.strip().lstrip(">").strip())
        else:
            locus.append(first_line.split()[1])

        # second line: exon ranges "start..end"; an intron starts right after
        # an exon end (donor) and ends right before the next exon start (acceptor)
        exon_positions_list = re.findall(r'(\d+)\.\.(\d+)',second_line)
        donor_positions_list = [int(pos_set[1])+1 for pos_set in exon_positions_list[:-1]]
        acceptor_positions_list = [int(pos_set[0])-1 for pos_set in exon_positions_list[1:]]
        exons.append(exon_positions_list)

        seq_list.append(seq.lower())
        length_list.append(len(seq))

        # donor site signals: `left` bases before the 1-based position, `right` from it on
        donor_signal = []
        poses = []
        for pos in donor_positions_list:
            signal_range = _site_window(seq, pos-1-left, pos-1+right)
            if signal_range is None:
                continue
            donor_signal.append(signal_range)
            all_donor_signal.append(signal_range)
            donor_file.append(file)
            poses.append(pos)
        donor_signals.append(donor_signal)
        donor_positions.append(poses)

        # acceptor site signals: `right` bases up to the position, `left` after it
        acceptor_signal = []
        poses = []
        for pos in acceptor_positions_list:
            signal_range = _site_window(seq, pos-right, pos+left)
            if signal_range is None:
                continue
            poses.append(pos)
            acceptor_signal.append(signal_range)
            all_acceptor_signal.append(signal_range)
        acceptor_signals.append(acceptor_signal)
        # Store the filtered positions so they line up with acceptor_signals,
        # exactly as done for the donor sites above.
        acceptor_positions.append(poses)

    # save info to file
    df_set_info = pd.DataFrame({
        'Path': file_path,
        'Locus': locus,
        "Length": length_list,
        "Exon Num": [len(file_exons) for file_exons in exons],
        "Exon Location": exons,
        "Donor Site": donor_positions,
        "Acceptor Site": acceptor_positions,
        "Donor signals": donor_signals,
        "Acceptor signals": acceptor_signals,
    })
    df_set_info.to_csv(f'{output_name}/{dataset.capitalize()}_set_info(non-seq).csv',index=None)
    np.savetxt(f'{output_name}/{dataset.capitalize()}_seq_list.txt',seq_list,delimiter = ',',fmt='%s')
    np.savetxt(f'{output_name}/{dataset.capitalize()}_donor_signal_str.txt',all_donor_signal,delimiter = ',',fmt='%s')
    print(f"Extract Dataset {dataset.capitalize()} info Finished!")
    if dataset=="train":
        return all_donor_signal,all_acceptor_signal,seq_list,donor_positions,acceptor_positions
    return all_donor_signal,all_acceptor_signal,seq_list,donor_positions,acceptor_positions,donor_file

# Extract the signals of both datasets. The "test" call returns one extra
# value: the file path belonging to every donor signal.
train_donor_signal_all_str,train_acceptor_signal_all_str,train_seq_list,\
train_donor_positions,train_acceptor_positions=extract_donor_signal(train_file_path,dataset="train")
test_donor_signal_all_str,test_acceptor_signal_all_str,test_seq_list,\
test_donor_positions,test_acceptor_positions,test_donor_filepath=extract_donor_signal(test_file_path,dataset="test")

train Progressing：:  34%|███████████████████▌                                     | 159/462 [00:00<00:00, 1577.93it/s]

*********Extract train Set donor signals**********


train Progressing：: 100%|█████████████████████████████████████████████████████████| 462/462 [00:00<00:00, 1781.55it/s]
test Progressing：: 100%|██████████████████████████████████████████████████████████| 570/570 [00:00<00:00, 3572.72it/s]

Extract Dataset Train info Finished!
**********Extract test Set donor signals**********





Extract Dataset Test info Finished!


In [6]:
def signal_to_csv(signal_str,mode,folder="Train"):
    """
    Split every signal string into one base per column, save the table as
    '<output_name>/<folder>_<mode>_signal.csv' and return it.

    signal_str: iterable of equally long signal strings
    mode:       "acceptor" labels the columns -right+1 .. left; any other
                value labels them -left .. right-1
    folder:     prefix of the csv file name
    output:     the DataFrame that was written
    """
    offsets = range(1 - right, left + 1) if mode == "acceptor" else range(-left, right)
    rows = [list(signal) for signal in signal_str]

    signal_df = pd.DataFrame(data=rows, columns=list(offsets), index=None)
    signal_df.to_csv(f'{output_name}/{folder}_{mode}_signal.csv', index=None)
    return signal_df
def save_str_list(signal_str,filename="signal_str"):
    """
    Write the strings of `signal_str`, one per line, to
    '<output_name>/<filename>.txt' and print a confirmation message.
    """
    target = f'{output_name}/{filename}.txt'
    np.savetxt(target, signal_str, fmt='%s', delimiter=',')
    print(f'save {filename} successful!')

# Save the donor signals of both datasets as one-base-per-column csv tables.
train_signal = signal_to_csv(train_donor_signal_all_str,folder="Train",mode="donor")
test_signal = signal_to_csv(test_donor_signal_all_str,folder="Test",mode="donor")

In [7]:
def count_each_char(str_list):
    """
    Count how often every character occurs over all strings of `str_list`.

    str_list: iterable of strings (an empty iterable gives an empty dict)
    output:   plain dict {character: count}, keys in first-seen order
    """
    # Local import keeps this cell self-contained.
    from collections import Counter

    # A Counter replaces the hand-written tally, which also shadowed the
    # builtin name `dict`.
    counts = Counter()
    for seq in tqdm(str_list, desc='Counting  Base Distribution:'):
        counts.update(seq)
    # Return a plain dict so printing and downstream use stay unchanged.
    return dict(counts)
# Character distribution of the full sequences and of the donor signals,
# for the training set (1, 2) and the testing set (3, 4).
base_dis_1 = count_each_char(train_seq_list)
base_dis_2 = count_each_char(train_donor_signal_all_str)
base_dis_3 = count_each_char(test_seq_list)
base_dis_4 = count_each_char(test_donor_signal_all_str)
for heading, distribution in (
    ("Distribution of all bases in training set\n", base_dis_1),
    ("Distribution of donor site bases in training  set\n", base_dis_2),
    ("Distribution of all bases in testing set\n", base_dis_3),
    ("Distribution of donor site  bases in testing set\n", base_dis_4),
):
    print(heading, distribution)

Counting  Base Distribution:: 100%|█████████████████████████████████████████████████| 462/462 [00:01<00:00, 451.50it/s]
Counting  Base Distribution:: 100%|████████████████████████████████████████████| 2380/2380 [00:00<00:00, 340837.32it/s]
Counting  Base Distribution:: 100%|████████████████████████████████████████████████| 570/570 [00:00<00:00, 1099.00it/s]
Counting  Base Distribution:: 100%|████████████████████████████████████████████| 2079/2079 [00:00<00:00, 417103.13it/s]

Distribution of all bases in training set
 {'g': 1306260, 'c': 1282733, 'a': 1436966, 't': 1498203, 'n': 682, 'k': 28, 's': 27, 'r': 15, 'y': 26, 'w': 14, 'm': 16, 'v': 2, 'b': 4}
Distribution of donor site bases in training  set
 {'a': 7301, 'g': 11043, 't': 6245, 'c': 3971}
Distribution of all bases in testing set
 {'c': 689433, 't': 762900, 'g': 702343, 'a': 736600, 'n': 862, 'b': 1, 'r': 5, 'y': 3, 'k': 1, 's': 1}
Distribution of donor site  bases in testing set
 {'g': 9414, 't': 5642, 'a': 6506, 'c': 3386}





In [8]:
def sub_unknown(my_string):
    """
    Normalise a DNA sequence string for integer encoding.

    The string is converted to lower case and every character other than
    a/c/g/t is replaced by the placeholder 'z' (the last symbol of
    `lower_bases`).
    like: 'CATNG' -> 'catzg'

    my_string: DNA sequence string
    output:    normalised string of the same length
    """
    # Lower-case first; otherwise upper-case bases would not match `pattern`
    # and would all be turned into the unknown placeholder.
    return pattern.sub('z', my_string.lower())


def process_to_int(donor):
    """
    Encode one signal string as a list of integers: unknown characters are
    first replaced via `sub_unknown`, then every character is looked up in
    `char_to_int`.
    """
    cleaned = sub_unknown(donor)
    return list(map(char_to_int.__getitem__, cleaned))

def code_all_seq(all_str):
    """
    Integer-encode every string of `all_str` with `process_to_int` (showing a
    progress bar) and return the encodings stacked in one numpy array.
    """
    progress = tqdm(all_str, desc='code_all_signal:')
    return np.array([process_to_int(signal) for signal in progress])

# Lookup table: each symbol of `lower_bases` maps to its index in that string.
char_to_int = dict(zip(lower_bases, range(len(lower_bases))))
train_coded = code_all_seq(train_donor_signal_all_str)

code_all_signal:: 100%|████████████████████████████████████████████████████████| 2380/2380 [00:00<00:00, 216971.91it/s]


In [9]:

def create_pseudoDonor(file_path,seqs_DNA, donor_locations,dataset,ran_num=0):
    '''
    Function: Collect pseudo donor signals, i.e. windows of `signal_num` bases that
              carry 'gt' at the donor offset but do not start at a true donor window
    Parameter: file_path       - file paths, aligned with seqs_DNA
               seqs_DNA        - one lower-case sequence string per file
               donor_locations - per file, the positions of the true donor sites
               dataset         - "train" or "test"; selects the return shape
               ran_num         - 0 keeps every pseudo signal; otherwise at most
                                 ran_num signals are randomly sampled per file
    Output: "train": nonDonors
            "test" : nonDonors, nonDonor_file_path, nonDonor_positions
            (the three lists are aligned element by element)
    Attention: windows containing a character other than a/c/g/t are skipped.
    Raises: ValueError when dataset is neither "train" nor "test".
    '''
    if dataset not in ("train", "test"):
        raise ValueError(f'dataset must be "train" or "test", got {dataset!r}')
    nonDonors = []
    nonDonor_file_path = []
    nonDonor_positions = []
    file_num = tqdm(range(len(donor_locations)), desc='Creating Pseudo Donor Signal Sequence:')
    for i in file_num:
        file_seq_DNA = seqs_DNA[i]
        # A set gives O(1) membership tests inside the per-base loop below.
        donor_signals_start = {pos-1-left for pos in donor_locations[i]}
        # (signal, position) pairs of this file; kept together so that sampling
        # cannot separate a signal from its position.
        candidates = []
        for index in range(len(file_seq_DNA)-signal_num+1):
            if file_seq_DNA[index+left:index+left+2] != 'gt' or index in donor_signals_start:
                continue
            nonDonor = file_seq_DNA[index:index + signal_num]
            if pattern.search(nonDonor):
                # This used to be `break`, which was wrong: the scan stopped at
                # the first window that contained an abnormal base.
                continue
            candidates.append((nonDonor, index+1+left))
        if ran_num:
            # Never ask for more samples than there are candidates in this file.
            candidates = random.sample(candidates, min(ran_num, len(candidates)))
        for nonDonor, position in candidates:
            nonDonors.append(nonDonor)
            nonDonor_file_path.append(file_path[i])
            nonDonor_positions.append(position)
    print('Created Pseudo Donor Signal Sequence successful!')
    if dataset == "train":
        return nonDonors
    return nonDonors,nonDonor_file_path,nonDonor_positions

# Generate the pseudo donor sites of the training set, print how many were
# found and save them as a text file.
train_pseudoDonor_list=create_pseudoDonor(train_file_path,train_seq_list, train_donor_positions,dataset = "train")
train_pseudoDonor_len= len(train_pseudoDonor_list) 
print(train_pseudoDonor_len) 
save_str_list(train_pseudoDonor_list,"Train_pseudoDonor_signal_str")


Creating Pseudo Donor Signal Sequence:: 100%|███████████████████████████████████████| 462/462 [00:02<00:00, 205.62it/s]


Created Pseudo Donor Signal Sequence successful!
283482
save Train_pseudoDonor_signal_str successful!


In [10]:
# Generate the pseudo donor sites of the testing set together with the file
# path and position of each one, then print how many were found.
(
    test_pseudoDonor_list,
    test_pseudoDonor_filepath,
    test_pseudoDonor_positions,
) = create_pseudoDonor(
    test_file_path,
    test_seq_list,
    test_donor_positions,
    dataset="test",
)
test_pseudoDonor_len = len(test_pseudoDonor_list)
print(test_pseudoDonor_len)

Creating Pseudo Donor Signal Sequence:: 100%|███████████████████████████████████████| 570/570 [00:01<00:00, 472.66it/s]

Created Pseudo Donor Signal Sequence successful!
149126





In [11]:
# Randomly subsample the training-set pseudo donors.
# random_len == 0 disables sampling and keeps the full list.
import random
random.seed(123123)
random_len = 0
train_pseudoDonor_list_part = (
    random.sample(train_pseudoDonor_list, random_len)
    if random_len
    else train_pseudoDonor_list
)

In [12]:
# Build the training feature table: integer-encoded true donors (label 1)
# stacked on top of the integer-encoded pseudo donors (label 0).
train_donor_features = code_all_seq(train_donor_signal_all_str)
train_pseudoDonor_features = code_all_seq(train_pseudoDonor_list_part)
train_features = np.vstack([train_donor_features, train_pseudoDonor_features])
train_labels = np.array(
    [1] * len(train_donor_signal_all_str) + [0] * len(train_pseudoDonor_list_part)
)

# Column names are the window offsets -left .. right-1 as strings.
po_str = [str(offset) for offset in range(-left, right)]
train_features_df = pd.DataFrame(data=train_features, columns=po_str, index=None)
train_features_df["Label"] = train_labels
# train_donor_df = train_features_df[train_df["Label"] == 1]
# train_pseudo_df = train_features_df[train_df["Label"] == 0]
train_features_df.to_csv(f"{output_name}/Train_features.csv", index=None)

code_all_signal:: 100%|████████████████████████████████████████████████████████| 2380/2380 [00:00<00:00, 148858.39it/s]
code_all_signal:: 100%|████████████████████████████████████████████████████| 283482/283482 [00:00<00:00, 329598.29it/s]


In [13]:
# Build the per-site info table of the testing set: true donors (label 1)
# followed by pseudo donors (label 0), with file, position and signal of each.
import operator  # no longer used in this cell; kept in case later cells rely on the name
from functools import reduce  # no longer used in this cell; kept in case later cells rely on the name
# Flatten the per-file position lists into one flat list. A comprehension is
# linear in the number of positions and also works for an empty list, whereas
# reduce(operator.add, ...) re-copies the growing list on every step and
# raises TypeError when there are no files.
test_donor_positions_1d = [
    pos for file_positions in test_donor_positions for pos in file_positions
]
test_labels = np.array([1]*len(test_donor_signal_all_str) + [0]*test_pseudoDonor_len)
test_file_df = pd.DataFrame({
    "Filename": test_donor_filepath + test_pseudoDonor_filepath,
    "Donor Site": test_donor_positions_1d + test_pseudoDonor_positions,
    "Signal": test_donor_signal_all_str + test_pseudoDonor_list,
    "label": test_labels,
})
test_file_df.to_csv(f"{output_name}/Test_predict.csv",index=None)
test_file_df

Unnamed: 0,Filename,Donor Site,Signal,label
0,Training and testing datasets/Testing Set/ACU0...,642,ggggtgagccca,1
1,Training and testing datasets/Testing Set/ACU0...,1363,gtggtaagagac,1
2,Training and testing datasets/Testing Set/ACU0...,2029,taggtgagtgtg,1
3,Training and testing datasets/Testing Set/ACU0...,2803,gcggtaggtact,1
4,Training and testing datasets/Testing Set/ACU0...,3798,caggtaattttc,1
...,...,...,...,...
151200,Training and testing datasets/Testing Set/ZEFB...,2123,taagttaaatca,0
151201,Training and testing datasets/Testing Set/ZEFB...,2145,atagtggcctac,0
151202,Training and testing datasets/Testing Set/ZEFB...,2159,tgagtttctgtt,0
151203,Training and testing datasets/Testing Set/ZEFB...,2165,tctgttatgtgg,0


In [14]:
# Integer-encode every test signal (true and pseudo donors, in table order)
# and save the encoded windows with the same offset columns as the training set.
test_features = code_all_seq(test_file_df['Signal'])
test_features_df = pd.DataFrame(data=test_features, columns=po_str, index=None)
test_features_df.to_csv(f"{output_name}/Test_features.csv", index=None)

code_all_signal:: 100%|████████████████████████████████████████████████████| 151205/151205 [00:00<00:00, 251471.87it/s]


In [15]:
# def create_commonSignal(seqs_DNA, donor_locations, acceptor_locations):

#     nonDonors = []
#     file_num = tqdm(range(len(donor_locations)), desc='Creating Non Donor Signal Sequence:')
#     for i in file_num:
#         # 每个文件循环
#         file_seq_DNA = seqs_DNA[i]
#         num = len(donor_locations[i])  
#         length = len(file_seq_DNA)
#         donor_signals_start=[pos-1-left for pos in donor_locations[i]]
#         acceptor_signals_start=[pos-right for pos in acceptor_locations[i] ]
#         signals_start=sorted(donor_signals_start+acceptor_signals_start)
#         for index in range(length-signal_num+1):
#             if index not in signals_start:
#                 nonDonor = file_seq_DNA[index:index + signal_num]
#                 no_known =pattern.search(nonDonor)
#                 if no_known:
#                     continue
#                 else:
#                     yield nonDonor

#     print('Created Common Signal Sequence successful!')
# common_list=create_commonSignal(train_seq_list, train_donor_positions, train_acceptor_positions)
# common_list=list(common_list)
# common_array=code_all_seq(common_list)

In [16]:
# import winsound
# duration = 1000  # millisecond
# freq = 440  # Hz
# winsound.Beep(freq, duration)

In [17]:
# from notify_run import Notify

# n = Notify()
# n.register()

In [18]:
# n.write_config()

In [19]:
# Send a message through the third-party notify_run package to announce that
# preprocessing for the current `output_name` has finished.
# NOTE(review): presumably this needs a channel that was registered beforehand
# (see the commented-out n.register() / n.write_config() cells) — confirm.
from notify_run import Notify
n = Notify()
n.send(f"Finished {output_name} preprocesing~!")