In [315]:
import tensorflow as tf 
import numpy as np
from Bio import SeqIO
import json
from sklearn.model_selection import train_test_split
import feature as fe 
import copy 

In [181]:
def countFeatures(data):
    import feature 
    """
    input: 输入BCEs或者非BCEs的数据集 pd.DataFrame类型
    output:输出包含AAC、DPC、CTD和AAI的 pd.DataFrame类型
    
    """
    #计算AAC
    data['AAC']= data['Description'].apply(fe.CalculateAAComposition)
    #计算DPC
    data['DPC']= data['Description'].apply(fe.CalculateDipeptideComposition)
    #计算CTD
    data['CTD']= data['Description'].apply(fe.CalculateCTD)
    #计算AAI
    data['AAI']= data['Description'].apply(fe.CalculateAAIndex)
    return data

In [182]:
#读取 词分类器
with open('tfsModel/RNNamino_acids_toknizer.json','r') as f:
    toknizer =tf.keras.preprocessing.text.tokenizer_from_json(json.load(f))

In [420]:
def applynumpyArray1D(arr,func):
    """
    
    """
    array = copy.deepcopy(arr)  
    for i in range(array.shape[0]):
        array[i] = func(array[i])
    return array

def normize(array):
    """
    input: np.array shape = (n,) 一维向量 
    output: 对 输入的数据进行归一化，即 (a_i - a.mean())/a.std  若全为0，则返回原array。
    
    """
    if (array == 0).all:
        normizedarray = array
    else:
        normizedarray = (array-array.mean())/array.std()
    return normizedarray

def twoDArraynormize(twoDArray):
    """
    input: np.array, shape = (n,m)  n为样本个数，m为特征个数，  对每一个样本的同一个特征进行归一化处理即 (m_i - m's mean)/m'std
    output: np.array, shape = (n,m) n为样本个数，m为特征个数。 归一化后的结果 

    """
    for i in range(twoDArray.shape[1]):
        array = twoDArray[:,i]
        twoDArray[:,i] = normize(array)
    return twoDArray
    
    

#词向量化
def amino_acids2numvector(sequenceArray,toknizer):
    """
    input:
        sequence:(n,) np.array类型，多肽序列
        toknizer: 氨基酸toknizer。    
    output:（n,200) 数字化后的多肽序列
    """
    sequenceArray = applynumpyArray1D(sequenceArray,lambda x: ' '.join(x))
    sequenceArray = toknizer.texts_to_sequences(sequenceArray)
    #选取最常长度为20
    sequenceArray = tf.keras.preprocessing.sequence.pad_sequences(sequenceArray,truncating='post',padding='post',maxlen=20)
    return sequenceArray


#AAC Matrix
def aminoacids2AACMatrix(array):
    """
    
    """
    tmp = copy.deepcopy(array)
    tmp = applynumpyArray1D(tmp,fe.CalculateAAComposition)
    AACMatrix = applynumpyArray1D(tmp,lambda x: [i for i in x.values()])#输出是（n,)的shape
    #转化为np.array，shape为（n,20）
    AACMatrix = np.array(AACMatrix.tolist())
    return AACMatrix

#DPC Matrix
#DPC Matrix
def aminoacids2DPCMatrix(array):
    """
    
    """
    tmp = copy.deepcopy(array)
    tmp = applynumpyArray1D(tmp,fe.CalculateDipeptideComposition)
    DPCMatrix = applynumpyArray1D(tmp,lambda x: [i for i in x.values()])#输出是（n,)的shape
    #转化为np.array，shape为（n,20）
    DPCMatrix = np.array(DPCMatrix.tolist())
    return DPCMatrix

#CTD Matrix
def aminoacids2CTDMatrix(array):
    """
    
    """
    tmp = copy.deepcopy(array)
    tmp = applynumpyArray1D(tmp,fe.CalculateCTD)
    CTDMatrix = applynumpyArray1D(tmp,lambda x: [i for i in x.values()])#输出是（n,)的shape
    #转化为np.array，shape为（n,20）
    CTDMatrix = np.array(CTDMatrix.tolist())
    return CTDMatrix
#AAI Matrix
def aminoacids2AAIMatrix(array):
    """
    
    """
    tmp = copy.deepcopy(array)
    tmp = applynumpyArray1D(tmp,fe.CalculateAAIndex)
    AAIMatrix = applynumpyArray1D(tmp,lambda x: [i for i in x.values()])#输出是（n,)的shape
    #转化为np.array，shape为（n,20）
    AAIMatrix = np.array(AAIMatrix.tolist())
    return AAIMatrix





def generateFeaturesData(sequenceData):
    """
    input:
        sequence:(n,) np.array类型，包含n条多肽序列的数组
    output:[AAC,DPC,CTD,AAI,SeqVec]的一个列表
    """
    data = copy.deepcopy(sequenceData)
    AAC = aminoacids2AACMatrix(data)
    AAC = twoDArraynormize(AAC)
    
    DPC = aminoacids2DPCMatrix(data)
    DPC = twoDArraynormize(DPC)
    
    CTD = aminoacids2CTDMatrix(data)
    CTD = twoDArraynormize(CTD)
    
    AAI = aminoacids2AAIMatrix(data)
    AAI = twoDArraynormize(AAI)
    
    SeqVec = amino_acids2numvector(data,toknizer)
    return [AAC,DPC,CTD,AAI,SeqVec]

In [421]:
#IBCE-EL的数据集
B_positive = np.array([str(seq.seq) for seq in list(SeqIO.parse('B-positive.txt','fasta'))]).astype('object')
B_negative = np.array([str(seq.seq) for seq in list(SeqIO.parse('B-positive.txt','fasta'))]).astype('object')
B_positiveFeatures = generateFeaturesData(B_positive)
B_negativeFeatures = generateFeaturesData(B_negative)

In [425]:
#牛奶过敏数据集
milkAllergyBCEs = pd.read_csv('milk allergy linear BCEs bovine dataset.csv')['Description'].values
milkAllergynonBCEs = pd.read_csv('milk allergy linear non-BCEs experiece confirmed.csv')['Description'].values
milkAllergyBCEsFeatures = generateFeaturesData(milkAllergyBCEs)
milkAllergynonBCEsFeatures = generateFeaturesData(milkAllergynonBCEs)
milkAllergyTargets = np.concatenate([np.ones((milkAllergyBCEsFeatures[0].shape[0],)),np.zeros((milkAllergynonBCEsFeatures[0].shape[0],))])



In [214]:
#x_train, x_test, y_train, y_test = train_test_split( x_data, y_data, test_size=0.1)
#MilkTrainDataset  = tf.data.Dataset.from_tensor_slices((x_train,y_train)).shuffle(1000).batch(100)
#MilktestDataset  = tf.data.Dataset.from_tensor_slices((x_test,y_test)).shuffle(1000).batch(100)

array([ 4,  1,  6, 17,  8, 18,  1,  7, 14, 12,  6, 11, 10,  1,  2,  2, 10,
       16,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [423]:
#读取模型
AACModel = tf.keras.models.load_model('tfsModel/AACModel.h5')
DPCModel = tf.keras.models.load_model('tfsModel/DPCModel.h5')
CTDModel = tf.keras.models.load_model('tfsModel/CTDModel.h5')
AAIModel = tf.keras.models.load_model('tfsModel/AAIModel.h5')
RNNModel = tf.keras.models.load_model('tfsModel/RNNModel.h5')

In [426]:
AACModel.evaluate(x=np.concatenate([milkAllergyBCEsFeatures[0],milkAllergynonBCEsFeatures[0]]),
                 y = milkAllergyTargets,batch_size=16)

DPCModel.evaluate(x=np.concatenate([milkAllergyBCEsFeatures[1],milkAllergynonBCEsFeatures[1]]),
                 y = milkAllergyTargets,batch_size=16)

CTDModel.evaluate(x=np.concatenate([milkAllergyBCEsFeatures[2],milkAllergynonBCEsFeatures[2]]),
                 y = milkAllergyTargets,batch_size=16)

AAIModel.evaluate(x=np.concatenate([milkAllergyBCEsFeatures[3],milkAllergynonBCEsFeatures[3]]),
                 y = milkAllergyTargets,batch_size=16)



[0.5608154535293579,
 0.8318965435028076,
 0.24362005293369293,
 0.8882278203964233]

In [432]:
AACProb =AACModel.predict(x=np.concatenate([milkAllergyBCEsFeatures[2],milkAllergynonBCEsFeatures[2]]),)
AACProb =AACModel.predict()
CTDProb =CTDModel.predict(x=np.concatenate([milkAllergyBCEsFeatures[2],milkAllergynonBCEsFeatures[2]]),)
AACProb =AACModel.predict

In [429]:
x=np.concatenate([milkAllergyBCEsFeatures[2],milkAllergynonBCEsFeatures[2]]),



0.923