In [12]:
from pathlib import Path
import pandas as pd
from Bio import SeqIO
import warnings
import numpy as np
# Suppress every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries used below).
warnings.filterwarnings(action='ignore')
# Make sure the folder that receives the encoded arrays exists.
Path('one_hot_data').mkdir(parents=True, exist_ok=True)

In [7]:
#定义函数
def read_fasta(fname):
    """Read a FASTA file into a two-column DataFrame.

    Parameters
    ----------
    fname : str or path-like
        Path of the FASTA file to parse.

    Returns
    -------
    pandas.DataFrame
        One row per FASTA record with the columns ``"Id"`` (record
        identifier) and ``"Sequence"`` (record sequence as a plain string).
    """
    # Plain "r" mode: the legacy "U" flag was removed in Python 3.11, where
    # it raises ValueError; universal-newline handling is already the
    # default for text mode.
    with open(fname, "r") as f:
        # str(record.seq) is the public way to obtain the sequence text;
        # the private ``_data`` attribute is an implementation detail whose
        # type is not guaranteed.
        records = [(record.id, str(record.seq)) for record in SeqIO.parse(f, "fasta")]
    seq_df = pd.DataFrame(data=records, columns=["Id", "Sequence"])
    return seq_df

def process_(sequence,d):
    """Encode each sequence residue by residue using a lookup table.

    Parameters
    ----------
    sequence : iterable of str
        Sequences to encode; each one is walked character by character.
    d : mapping
        Maps a residue character to its encoding (for example a one-hot
        list).

    Returns
    -------
    numpy.ndarray
        Array assembled from one per-sequence array of looked-up encodings.

    Raises
    ------
    KeyError
        If a residue is not a key of ``d``.
    """
    encoded = [
        np.array([d[symbol] for symbol in chain])
        for chain in sequence
    ]
    return np.array(encoded)

In [4]:
# One-hot lookup table: each of the 20 residue letters gets a length-20
# vector with a single 1 at the letter's position in the string below.
BPF = {
    residue: [1 if column == row else 0 for column in range(20)]
    for row, residue in enumerate('ACDEFGHIKLMNPQRSTVWY')
}
# 'X' is the padding symbol and is encoded as an all-zero vector.
BPF['X'] = [0] * 20

In [5]:
# Load the raw sequences and build the labelled train/test tables.
# The positive and negative FASTA files of each split are read separately.
train_sets = {
    lab: read_fasta('data/Process_data/{:s}_train.txt'.format(lab))
    for lab in ['Pos','Neg']
}
test_sets = {
    lab: read_fasta('data/Process_data/{:s}_test.txt'.format(lab))
    for lab in ['Pos','Neg']
}
# Attach the class labels: 1 for positive samples, 0 for negative samples.
train_sets['Pos'].loc[:,'Label'] = 1
train_sets['Neg'].loc[:,'Label'] = 0
test_sets['Pos'].loc[:,'Label'] = 1
test_sets['Neg'].loc[:,'Label'] = 0

# Stack positives on top of negatives and renumber the rows from 0.
# ignore_index expects a bool; the previous string value only worked
# because any non-empty string is truthy.
all_train = pd.concat([train_sets['Pos'],train_sets['Neg']],axis=0,ignore_index=True)
all_test = pd.concat([test_sets['Pos'],test_sets['Neg']],axis=0,ignore_index=True)
# The first two columns (Id, Sequence) are the inputs; Label is the target.
X_train = all_train.iloc[:,0:2]
X_test = all_test.iloc[:,0:2]
y_train = all_train['Label']
y_test = all_test['Label']
# to_csv does not create missing parent folders, so make sure they exist
# before writing.
Path('data/Process_data/train').mkdir(parents=True, exist_ok=True)
Path('data/Process_data/test').mkdir(parents=True, exist_ok=True)
X_train.to_csv('data/Process_data/train/X_train.csv',index = False)
X_test.to_csv('data/Process_data/test/X_test.csv',index = False)
y_train.to_csv('data/Process_data/train/y_train.csv',index = False)
y_test.to_csv('data/Process_data/test/y_test.csv',index = False)

In [14]:
# Encode the training sequences first, then the test sequences, with the
# BPF lookup table.
X_train_BPF, X_test_BPF = (
    process_(frame['Sequence'], BPF) for frame in (X_train, X_test)
)
# Store both encoded arrays in one .npz archive under the keys
# 'X_train' and 'X_test'.
np.savez('one_hot_data/BPF.npz', X_train=X_train_BPF, X_test=X_test_BPF)