In [1]:
import torch
import torch.nn as nn
class ResnetBlock(nn.Module):
    
    def __init__(self, channel_size):
        super(ResnetBlock, self).__init__()

        self.channel_size = channel_size
        self.maxpool = nn.Sequential(
            nn.ConstantPad1d(padding=(0, 1), value=0), # 使用常量来填充输入张量的边界，padding - 填充的数量，value - 要填充的值
            nn.MaxPool1d(kernel_size=3, stride=2)
        )
        self.conv = nn.Sequential( #一个1/2池化的残差块
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Conv1d(self.channel_size, self.channel_size, kernel_size=3, padding=1),

            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Conv1d(self.channel_size, self.channel_size, kernel_size=3, padding=1),
        )

    def forward(self, x):
        x_shortcut = self.maxpool(x)
        x = self.conv(x_shortcut)
        x = x + x_shortcut
        return x
class DPCNN(nn.Module):
    
    def __init__(self, max_features, word_embedding_dimension, max_sentence_length, num_classes):
        super(DPCNN, self).__init__()
        self.max_features = max_features #  词典的大小尺寸
        self.embed_size = word_embedding_dimension #  嵌入向量的维度，即用多少维来表示一个符号，300
        self.maxlen = max_sentence_length # 训练样本中最大的词数 65
        self.num_classes = num_classes # 分类数
        self.channel_size = 250

        # 存储固定大小的词典的嵌入向量的查找表，给一个编号，嵌入层就能返回这个编号对应的嵌入向量，嵌入向量反映了各个编号代表的符号之间的语义关系
        # 定义了一个(num_embeddings, embedding_dim )的二维embdding，对于NLP来说，相当于是len(voca)个词，每个词的词向量维数是word_embedding_dimension
        self.embedding = nn.Embedding(self.max_features, self.embed_size)
        
        # 从给定均值和标准差的正态分布N(mean, std)，填充输入的张量或变量(每个词embedding的向量为正态分布)
        torch.nn.init.normal_(self.embedding.weight.data, mean=0, std=0.01)
        
        # 是否需要在计算中保留对应的梯度信息，默认False，否。FAIR团队已经在底层帮我们实现了这部分功能。
        # 只要某一个输入需要相关梯度值，则输出也需要保存相关梯度信息，这样就保证了这个输入的梯度回传。
        # 当然这些部分会加大内存消耗，全连接层卷积层等结构的参数都是默认需要梯度的，所以我们需要改为True
        self.embedding.weight.requires_grad = True

        
        # 一、region embedding
        # 将普通的word embedding转换成能够覆盖一个或多个词的region embedding
        self.region_embedding = nn.Sequential(
            # 限制输出词向量维度为250，卷积核大小3，padding=1
            nn.Conv1d(self.embed_size, self.channel_size, kernel_size=3, padding=1),
            # BatchNorm在深度神经网络训练过程中使得每一层神经网络的输入保持相同分布
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        # 二、经过两层等长卷积，为特征抽取提供更宽的感受眼，提高embdding的丰富性
        self.conv_block = nn.Sequential(
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Conv1d(self.channel_size, self.channel_size, kernel_size=3, padding=1), # 在卷一次
            nn.BatchNorm1d(num_features=self.channel_size),
            nn.ReLU(),
            nn.Conv1d(self.channel_size, self.channel_size, kernel_size=3, padding=1), # 卷第二次
        )

        #  三、Downsampling（1/2池化）
        self.seq_len = self.maxlen
        resnet_block_list = []
        while (self.seq_len > 2):
            resnet_block_list.append(ResnetBlock(self.channel_size))
            self.seq_len = self.seq_len // 2  # 二分之一池化，减少词向量维度，提高每个神经元包含的语义

        # 四、构建pre-activation，近路连接
        self.resnet_layer = nn.Sequential(*resnet_block_list)

        # 五、全连接层
        self.fc = nn.Sequential(
            nn.Linear(self.channel_size * self.seq_len, self.num_classes),
            nn.BatchNorm1d(self.num_classes),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(self.num_classes, self.num_classes)
        )

    def forward(self, description_word_seq):
        x = self.embedding(description_word_seq)
        x = x.permute(0, 2, 1) # 交换维度：[batch_size, embedding_dim, max_len]
        x = self.region_embedding(x)
        x = self.conv_block(x)
        x = self.resnet_layer(x) # 
        x = x.permute(0, 2, 1)
        x = x.contiguous().view(x.size(0), -1)
        output = self.fc(x)
        return {'output': output}

    def predict(self, description_word_seq):
        """
        :param description_word_seq: torch.LongTensor
        :返回 predict: dict of torch.LongTensor
        """
        output = self(description_word_seq)
        _, predict = output['output'].max(dim=1)
        
        return {'predict': predict}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
x = torch.randn((2,3,4))

In [4]:
x

tensor([[[ 2.0663,  0.5835,  0.4816,  1.1625],
         [ 1.0210,  0.1008, -0.2442, -0.8603],
         [ 0.6592, -0.1576,  0.3684,  0.9353]],

        [[ 1.8719,  0.1059, -0.5340,  0.4053],
         [ 1.1096, -1.4473, -1.5438, -0.5399],
         [ 2.0299, -1.4080, -0.4999, -0.0527]]])

In [6]:
y = torch.randn((2,3,5))

In [9]:
z = torch.cat((x,y),dim=2)

In [10]:
z.shape

torch.Size([2, 3, 9])

In [11]:
z2 = z.permute(0,2,1)

In [12]:
z2.shape

torch.Size([2, 9, 3])

In [14]:
blosum62 = {
    'A': [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],  # A
    'R': [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],  # R
    'N': [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],  # N
    'D': [-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],  # D
    'C': [0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],  # C
    'Q': [-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],  # Q
    'E': [-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],  # E
    'G': [0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],  # G
    'H': [-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],  # H
    'I': [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],  # I
    'L': [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],  # L
    'K': [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2],  # K
    'M': [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1],  # M
    'F': [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1],  # F
    'P': [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2],  # P
    'S': [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],  # S
    'T': [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0],  # T
    'W': [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3],  # W
    'Y': [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1],  # Y
    'V': [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],  # V
    '-': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # -
}
len(blosum62['A'])

20

In [8]:
import pandas as pd
df = pd.read_excel('../data/LLPS/DeePhase/pnas.2019053118.sd04.xlsx')

In [10]:
df.head

<bound method NDFrame.head of          Dataset S4. Subset of PDB* used for training.
0    MAAIPPDSWQPPNVYLETSMGIIVLELYWKHAPKTCKNFAELARRG...
1    MMRDIGLRVQPPAEKCDDPKCPWHGNLKIHGRVFEGIVVSDKPRKT...
2    MEIIHLSEIDSTNDYAKELAKEGKRNFIVLADKQNNGKGRWGRVWY...
3    MIEQNEKASIGIIGGSGLYDPGIFSESKEIKVYTPYGQPSDFITIG...
4    MSKATYKERAATHPSPVAAKLFNIMHEKQTNLCASLDVRTTKELLE...
..                                                 ...
132  MVKYQYEFPLDKAGKAGAVKPYRGGKNDFVTPVSNLSGVAEILTNA...
133  MLLEAIFHEAKGSYAYPISETQLRVRLRAKKGDVVRCEVLYADRYA...
134  ASEDGGRGPYVQADLAYAYEHITHDYPEPTAPNKNKISTVSDYFRN...
135  MKYDLIIIGSGSVGAAAGYYATRAGLNVLMTDAHMPPHQHGSHHGD...
136  QAVQPVDFRHHHFSDMEIFLRRYANEYPSITRLYSVGKSVELRELY...

[137 rows x 1 columns]>

In [11]:
df.rename(columns={'Dataset S4. Subset of PDB* used for training.':'seq'},inplace=True)

In [12]:
df.head()

Unnamed: 0,seq
0,MAAIPPDSWQPPNVYLETSMGIIVLELYWKHAPKTCKNFAELARRG...
1,MMRDIGLRVQPPAEKCDDPKCPWHGNLKIHGRVFEGIVVSDKPRKT...
2,MEIIHLSEIDSTNDYAKELAKEGKRNFIVLADKQNNGKGRWGRVWY...
3,MIEQNEKASIGIIGGSGLYDPGIFSESKEIKVYTPYGQPSDFITIG...
4,MSKATYKERAATHPSPVAAKLFNIMHEKQTNLCASLDVRTTKELLE...


In [14]:
len(df)

137

In [16]:
[1]*len(df)+[0]*100

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [17]:
    df1 = pd.read_excel('../data/LLPS/DeePhase/pnas.2019053118.sd01.xlsx')
    df4 = pd.read_excel('../data/LLPS/DeePhase/pnas.2019053118.sd04.xlsx')
    
    df1.rename(columns={'Dataset S1. LLPS+ (high LLPS-propensity sequences).':'seq'},inplace=True)
    df4.rename(columns={'Dataset S4. Subset of PDB* used for training.':'seq'},inplace=True)
    
    df1_labels = [1]*len(df1)
    df4_labels = [0]*len(df4)

    df = pd.DataFrame({'seq':list(df1['seq'])+list(df4['seq']),'label':df1_labels+df4_labels})

In [18]:
df

Unnamed: 0,seq,label
0,SMGNAVPGMNPAMGMNMGGMMGFPMGGPSASPNPMMNGFAAGSMGM...,1
1,MENSQLCKLFIGGLNVQTSESGLRGHFEAFGTLTDCVVVVNPQTKR...,1
2,MGRPEFNRGGGGGGFRGGRGGDRGGSRGGFGGGGRGGYGGGDRGSF...,1
3,ASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTDT...,1
4,MPLNVSFTNRNYDLDYDSVQPYFYCDEEENFYQQQQQSELQPPAPS...,1
...,...,...
269,MVKYQYEFPLDKAGKAGAVKPYRGGKNDFVTPVSNLSGVAEILTNA...,0
270,MLLEAIFHEAKGSYAYPISETQLRVRLRAKKGDVVRCEVLYADRYA...,0
271,ASEDGGRGPYVQADLAYAYEHITHDYPEPTAPNKNKISTVSDYFRN...,0
272,MKYDLIIIGSGSVGAAAGYYATRAGLNVLMTDAHMPPHQHGSHHGD...,0
