## 本节练习的重点在于以下几点
- 搞清楚```nn.embedding()```和`nn.Linear()`以及`nn.embedding.from_pretrained()`的区别
- 尝试将预训练模型用`nn.embedding()`和`nn.embedding.from_pretrained()`加载
- 将氨基酸自定义的物理化学性质作为特征向量加载

### 下面是李沐老师d2l中的示例代码，加载Glove词向量

In [1]:
import os
import torch
class TokenEmbedding:
    """GloVe嵌入"""
    def __init__(self, embedding_name):
        """Defined in :numref:`sec_synonyms`"""
        self.idx_to_token, self.idx_to_vec = self._load_embedding(
            embedding_name)
        self.unknown_idx = 0
        self.token_to_idx = {token: idx for idx, token in
                             enumerate(self.idx_to_token)}

    def _load_embedding(self, embedding_name):
        idx_to_token, idx_to_vec = ['<unk>'], []
        data_dir = d2l.download_extract(embedding_name)
        # GloVe网站：https://nlp.stanford.edu/projects/glove/
        # fastText网站：https://fasttext.cc/
        with open(os.path.join(data_dir, 'vec.txt'), 'r') as f:
            for line in f:
                elems = line.rstrip().split(' ')
                token, elems = elems[0], [float(elem) for elem in elems[1:]]
                # 跳过标题信息，例如fastText中的首行
                if len(elems) > 1:
                    idx_to_token.append(token)
                    idx_to_vec.append(elems)
        idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec
        return idx_to_token, torch.tensor(idx_to_vec)

    def __getitem__(self, tokens):
        indices = [self.token_to_idx.get(token, self.unknown_idx)
                   for token in tokens]
        vecs = self.idx_to_vec[torch.tensor(indices)]
        return vecs

    def __len__(self):
        return len(self.idx_to_token)

  from .autonotebook import tqdm as notebook_tqdm


### 下面我希望借鉴上述的方法，将ilearn 中的特征工程加载词向量

In [2]:
import pandas as pd

In [36]:
AAindx = pd.read_csv('./PAAC.txt',sep='\t',index_col=False)
AAindx.head()


Unnamed: 0,#,A,R,N,D,C,Q,E,G,H,...,L,K,M,F,P,S,T,W,Y,V
0,Hydrophobicity,0.62,-2.53,-0.78,-0.9,0.29,-0.85,-0.74,0.48,-0.4,...,1.06,-1.5,0.64,1.19,0.12,-0.18,-0.05,0.81,0.26,1.08
1,Hydrophilicity,-0.5,3.0,0.2,3.0,-1.0,0.2,3.0,0.0,-0.5,...,-1.8,3.0,-1.3,-2.5,0.0,0.3,-0.4,-3.4,-2.3,-1.5
2,SideChainMass,15.0,101.0,58.0,59.0,47.0,72.0,73.0,1.0,82.0,...,57.0,73.0,75.0,91.0,42.0,31.0,45.0,130.0,107.0,43.0


In [20]:
AAindx.columns
idx_to_token = ['<unk>']
idx_to_vec = []
idx_to_token = idx_to_token+list(AAindx.columns)[1:]

idx_to_token


['<unk>',
 'A',
 'R',
 'N',
 'D',
 'C',
 'Q',
 'E',
 'G',
 'H',
 'I',
 'L',
 'K',
 'M',
 'F',
 'P',
 'S',
 'T',
 'W',
 'Y',
 'V']

In [25]:
for aa in list(AAindx.columns)[1:]:
    print(aa)
    idx_to_vec.append(list(AAindx[aa]))
idx_to_vec



A
R
N
D
C
Q
E
G
H
I
L
K
M
F
P
S
T
W
Y
V


[[4.35,
  0.61,
  1.18,
  1.56,
  1.0,
  0.77,
  0.37,
  0.357,
  52.6,
  16.0,
  44.0,
  7.3,
  3.9,
  -0.2,
  0.691,
  8.249,
  4.349,
  6.5,
  0.486,
  0.288,
  0.52,
  0.046,
  -0.368,
  0.71,
  -0.118,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  91.5,
  115.0,
  25.0,
  0.38,
  0.2,
  0.66,
  1.42,
  0.83,
  0.74,
  1.29,
  1.2,
  0.7,
  0.52,
  0.86,
  0.75,
  0.67,
  0.74,
  0.06,
  0.076,
  0.035,
  0.058,
  0.64,
  -0.45,
  -0.08,
  0.36,
  0.17,
  0.02,
  0.75,
  1.33,
  1.0,
  0.6,
  2.5,
  8.6,
  100.0,
  1.56,
  1.26,
  0.25,
  0.67,
  0.0,
  0.0,
  89.09,
  297.0,
  1.8,
  9.69,
  2.34,
  0.31,
  1.28,
  0.53,
  1.0,
  2.87,
  1.52,
  2.04,
  7.3,
  -0.01,
  0.0,
  0.0,
  0.0,
  0.0,
  4.76,
  1.08,
  1.0,
  1.0,
  1.2,
  1.0,
  0.28,
  1.29,
  1.13,
  1.55,
  1.19,
  0.84,
  0.86,
  0.91,
  0.91,
  0.8,
  1.1,
  0.93,
  0.75,
  88.3,
  0.0,
  8.1,
  31.0,
  0.1,
  1.0,
  -0.5,
  29.22,
  30.88,
  154.33,
  1.53,
  0.86,
  0.78,
  1.09,
  0.35,
  1.09,
  1.34,
  0.47,
  2

In [26]:
idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec
idx_to_vec

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [38]:
import os
import torch
class TokenEmbedding:
    """特征嵌入"""
    def __init__(self, embedding_name = 'PAAC'):
        """Defined in :numref:`sec_synonyms`"""
        self.idx_to_token, self.idx_to_vec = self._load_embedding(
            embedding_name)
        self.unknown_idx = 0
        self.token_to_idx = {token: idx for idx, token in
                             enumerate(self.idx_to_token)}

    def _load_embedding(self, embedding_name):
        PAAC = pd.read_csv(embedding_name+'.txt',sep = '\t',index_col=False)
        idx_to_token, idx_to_vec = ['<unk>'], []    
        idx_to_token = idx_to_token+list(PAAC.columns)[1:]
        for aa in list(PAAC.columns)[1:]:
            idx_to_vec.append(list(PAAC[aa]))
        idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec
        
        return idx_to_token, torch.tensor(idx_to_vec)
    def __getitem__(self, tokens):
        indices = [self.token_to_idx.get(token, self.unknown_idx)
                   for token in tokens]
        vecs = self.idx_to_vec[torch.tensor(indices)]
        return vecs

    def __len__(self):
        return len(self.idx_to_token)

In [39]:
embeds = TokenEmbedding('AAindex')

In [40]:
embeds.idx_to_vec.shape

torch.Size([21, 531])

### 接下来尝试集成多种特征提取方法


In [41]:
zscale = {
    'A': [0.24, -2.32, 0.60, -0.14, 1.30],  # A
    'C': [0.84, -1.67, 3.71, 0.18, -2.65],  # C
    'D': [3.98, 0.93, 1.93, -2.46, 0.75],  # D
    'E': [3.11, 0.26, -0.11, -0.34, -0.25],  # E
    'F': [-4.22, 1.94, 1.06, 0.54, -0.62],  # F
    'G': [2.05, -4.06, 0.36, -0.82, -0.38],  # G
    'H': [2.47, 1.95, 0.26, 3.90, 0.09],  # H
    'I': [-3.89, -1.73, -1.71, -0.84, 0.26],  # I
    'K': [2.29, 0.89, -2.49, 1.49, 0.31],  # K
    'L': [-4.28, -1.30, -1.49, -0.72, 0.84],  # L
    'M': [-2.85, -0.22, 0.47, 1.94, -0.98],  # M
    'N': [3.05, 1.62, 1.04, -1.15, 1.61],  # N
    'P': [-1.66, 0.27, 1.84, 0.70, 2.00],  # P
    'Q': [1.75, 0.50, -1.44, -1.34, 0.66],  # Q
    'R': [3.52, 2.50, -3.50, 1.99, -0.17],  # R
    'S': [2.39, -1.07, 1.15, -1.39, 0.67],  # S
    'T': [0.75, -2.18, -1.12, -1.46, -0.40],  # T
    'V': [-2.59, -2.64, -1.54, -0.85, -0.02],  # V
    'W': [-4.36, 3.94, 0.59, 3.44, -1.59],  # W
    'Y': [-2.54, 2.44, 0.43, 0.04, -1.47],  # Y
    '-': [0.00, 0.00, 0.00, 0.00, 0.00],  # -
}
blosum62 = {
    'A': [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],  # A
    'R': [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],  # R
    'N': [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],  # N
    'D': [-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],  # D
    'C': [0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],  # C
    'Q': [-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],  # Q
    'E': [-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],  # E
    'G': [0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],  # G
    'H': [-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],  # H
    'I': [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],  # I
    'L': [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],  # L
    'K': [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2],  # K
    'M': [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1],  # M
    'F': [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1],  # F
    'P': [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2],  # P
    'S': [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],  # S
    'T': [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0],  # T
    'W': [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3],  # W
    'Y': [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1],  # Y
    'V': [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],  # V
    '-': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # -
}



In [42]:
Zscale_embed = pd.DataFrame.from_dict(zscale)

In [43]:
Zscale_embed

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,N,P,Q,R,S,T,V,W,Y,-
0,0.24,0.84,3.98,3.11,-4.22,2.05,2.47,-3.89,2.29,-4.28,...,3.05,-1.66,1.75,3.52,2.39,0.75,-2.59,-4.36,-2.54,0.0
1,-2.32,-1.67,0.93,0.26,1.94,-4.06,1.95,-1.73,0.89,-1.3,...,1.62,0.27,0.5,2.5,-1.07,-2.18,-2.64,3.94,2.44,0.0
2,0.6,3.71,1.93,-0.11,1.06,0.36,0.26,-1.71,-2.49,-1.49,...,1.04,1.84,-1.44,-3.5,1.15,-1.12,-1.54,0.59,0.43,0.0
3,-0.14,0.18,-2.46,-0.34,0.54,-0.82,3.9,-0.84,1.49,-0.72,...,-1.15,0.7,-1.34,1.99,-1.39,-1.46,-0.85,3.44,0.04,0.0
4,1.3,-2.65,0.75,-0.25,-0.62,-0.38,0.09,0.26,0.31,0.84,...,1.61,2.0,0.66,-0.17,0.67,-0.4,-0.02,-1.59,-1.47,0.0


In [44]:
blosum62_embed = pd.DataFrame.from_dict(blosum62)

In [45]:
blosum62_embed

Unnamed: 0,A,R,N,D,C,Q,E,G,H,I,...,K,M,F,P,S,T,W,Y,V,-
0,4,-1,-2,-2,0,-1,-1,0,-2,-1,...,-1,-1,-2,-1,1,0,-3,-2,0,0
1,-1,5,0,-2,-3,1,0,-2,0,-3,...,2,-1,-3,-2,-1,-1,-3,-2,-3,0
2,-2,0,6,1,-3,0,0,0,1,-3,...,0,-2,-3,-2,1,0,-4,-2,-3,0
3,-2,-2,1,6,-3,0,2,-1,-1,-3,...,-1,-3,-3,-1,0,-1,-4,-3,-3,0
4,0,-3,-3,-3,9,-3,-4,-3,-3,-1,...,-3,-1,-2,-3,-1,-1,-2,-2,-1,0
5,-1,1,0,0,-3,5,2,-2,0,-3,...,1,0,-3,-1,0,-1,-2,-1,-2,0
6,-1,0,0,2,-4,2,5,-2,0,-3,...,1,-2,-3,-1,0,-1,-3,-2,-2,0
7,0,-2,0,-1,-3,-2,-2,6,-2,-4,...,-2,-3,-3,-2,0,-2,-2,-3,-3,0
8,-2,0,1,-1,-3,0,0,-2,8,-3,...,-1,-2,-1,-2,-1,-2,-2,2,-3,0
9,-1,-3,-3,-3,-1,-3,-3,-4,-3,4,...,-3,1,0,-3,-2,-1,-3,-1,3,0


In [46]:
class TokenEmbeddingDict:
    """特征嵌入"""
    def __init__(self, embedding_name):
        """Defined in :numref:`sec_synonyms`"""
        self.idx_to_token, self.idx_to_vec = self._load_embedding(
            embedding_name)
        self.unknown_idx = 0
        self.token_to_idx = {token: idx for idx, token in
                             enumerate(self.idx_to_token)}

    def _load_embedding(self, embedding_name):
        PAAC = pd.DataFrame.from_dict(embedding_name)
        idx_to_token, idx_to_vec = ['<unk>'], []    
        idx_to_token = idx_to_token+list(PAAC.columns)[1:]
        for aa in list(PAAC.columns)[1:]:
            idx_to_vec.append(list(PAAC[aa]))
        idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec
        
        return idx_to_token, torch.tensor(idx_to_vec)
    def __getitem__(self, tokens):
        indices = [self.token_to_idx.get(token, self.unknown_idx)
                   for token in tokens]
        vecs = self.idx_to_vec[torch.tensor(indices)]
        return vecs

    def __len__(self):
        return len(self.idx_to_token)

In [51]:
bl62 = TokenEmbeddingDict(zscale)

In [52]:
bl62.idx_to_vec.shape

torch.Size([21, 5])