### 1.Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import torch
import os
import configparser
import json

In [2]:
config_path = "../config/main.conf"
conf = configparser.ConfigParser()
conf.read(config_path)

model_conf = configparser.ConfigParser()
model_conf.read(conf['path']['model'])

['../config/model.conf']

In [3]:
data_partitions_dirpath = conf['path']['data_part']
print('Available dataset partitions: ', os.listdir(data_partitions_dirpath))

Available dataset partitions:  ['dev', 'train', 'preprocess', 'test', 'download.sh']


In [4]:
%%time
def read_all_shards(partition='dev', data_dir=data_partitions_dirpath):
    shards = []
    for fn in os.listdir(os.path.join(data_dir, partition)):
        with open(os.path.join(data_dir, partition, fn)) as f:
            shards.append(pd.read_csv(f, index_col=None))
    return pd.concat(shards)

test = read_all_shards('test')
dev = read_all_shards('dev')
train = read_all_shards('train')

partitions = {'test': test, 'dev': dev, 'train': train}
for name, df in partitions.items():
    print('Dataset partition "%s" has %d sequences' % (name, len(df)))

Dataset partition "test" has 126171 sequences
Dataset partition "dev" has 126171 sequences
Dataset partition "train" has 1086741 sequences
CPU times: user 8.33 s, sys: 1.5 s, total: 9.84 s
Wall time: 11.1 s


In [5]:
test.head()

Unnamed: 0,family_id,sequence_name,family_accession,aligned_sequence,sequence
0,DUF4194,C7MGY1_BRAFD/37-194,PF13835.6,VHLLQGPFLDGRRD...GA.......RYAQLL..RDRTAIEARLAD...,VHLLQGPFLDGRRDGARYAQLLRDRTAIEARLADLFLELIVDDDAQ...
1,Clathrin_propel,Q7SHV2_NEUCR/257-292,PF01394.20,PPEA.SNDFPVALQVSQKYGIIYL.......ITKYGFIHLYDLE,PPEASNDFPVALQVSQKYGIIYLITKYGFIHLYDLE
2,Leu_Phe_trans,K7RWT2_ACIA4/30-205,PF03588.14,...VLAALHEGVFPMPIDGDEVPEPLR.GGMGW.....WSPQL......,VLAALHEGVFPMPIDGDEVPEPLRGGMGWWSPQLRARMPLERIRVP...
3,tRNA_anti-codon,EX7L_BACSU/29-104,PF01336.25,IWIK.GELSNVK...............IHT.RGHIYFT.....LKD...,IWIKGELSNVKIHTRGHIYFTLKDENARMQSVMFARQSERLPFKPE...
4,CSS-motif,PDED_ECOLI/42-242,PF12792.7,NQQRVVQFANHAVE.ELDKVLLPLQA.G...SEVLLP.LIGLPCS....,NQQRVVQFANHAVEELDKVLLPLQAGSEVLLPLIGLPCSVAHLPLR...


In [6]:
train.head()

Unnamed: 0,family_id,sequence_name,family_accession,aligned_sequence,sequence
0,EKR,R6QAS0_9FIRM/627-685,PF10371.9,.EEKKLVIPTNRPEMKDFVKNILHPIDHLHGDDLPVSKFV..DRAD...,EEKKLVIPTNRPEMKDFVKNILHPIDHLHGDDLPVSKFVDRADGVY...
1,DUF4042,B3MYB3_DROAN/365-542,PF13251.6,KVRISALHLLGSLAKNLPRRFLYGYWHILFPSG.......EHGATN...,KVRISALHLLGSLAKNLPRRFLYGYWHILFPSGEHGATNSHLLLLG...
2,Reg_prop,Q8EJN6_SHEON/296-320,PF07494.11,AQANMETLK..AILF...DKSG.LMWVGGSG,AQANMETLKAILFDKSGLMWVGGSG
3,DUF3880,M1WYU2_PSEP2/257-334,PF12996.7,WFVDNPHLILHHYTHPGTDNTAIFTYDAGNL.EPLRRKGFANTY.Y...,WFVDNPHLILHHYTHPGTDNTAIFTYDAGNLEPLRRKGFANTYYLP...
4,UPRTase,B6GYG1_PENRW/502-699,PF14681.6,AT.DRPAAKLLMTPMRDASI.SGSALRKVHGRVGFYLATELCT.QI...,ATDRPAAKLLMTPMRDASISGSALRKVHGRVGFYLATELCTQIMGL...


#### Prepare for train and test data

In [7]:
vocab = None
vocab_path = conf['path']['vocab']
with open(vocab_path, 'r') as of:
    vocab = json.load(of)
vocab

{'<PAD>': 0,
 '.': 1,
 'M': 2,
 'H': 3,
 'B': 4,
 'W': 5,
 'R': 6,
 'U': 7,
 'I': 8,
 'O': 9,
 'L': 10,
 'T': 11,
 'D': 12,
 'F': 13,
 'X': 14,
 'Q': 15,
 'K': 16,
 'N': 17,
 'A': 18,
 'E': 19,
 'Y': 20,
 'V': 21,
 'Z': 22,
 'S': 23,
 'P': 24,
 'C': 25,
 'G': 26}

#### Prepare for X

In [8]:
from scipy import sparse
import json

In [9]:
def to_sparse(x):
    """ converts dense tensor x to sparse format """
    x_typename = torch.typename(x).split('.')[-1]
    sparse_tensortype = getattr(torch.sparse, x_typename)

    indices = torch.nonzero(x)
    if len(indices.shape) == 0:  # if all elements are zeros
        return sparse_tensortype(*x.shape)
    indices = indices.t()
    values = x[tuple(indices[i] for i in range(indices.shape[0]))]
    return sparse_tensortype(indices, values, x.size())

In [10]:
def make_one_hot(Xs, vocab, max_len):
    n_vocab = len(vocab)
    for idx, X in enumerate(Xs):
        if idx % 10000 == 0:
            print("Current dealing with data piece no: %s" % (idx))
        tensor = torch.zeros(max_len, n_vocab)
        for chidx, ch in enumerate(X[:max_len]):
            tensor[chidx][vocab[ch]] = 1
        yield to_sparse(tensor)

In [11]:
%%time
from sklearn.model_selection import train_test_split
SAMPLE_RATE = 200

fams = np.array(train["family_id"].value_counts().index)[::SAMPLE_RATE]
partition = train[train["family_id"].isin(fams)]
max_len = int(model_conf['Preprocess']['MaxLen'])
X = partition['aligned_sequence'].values
y = partition['family_id'].values
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.25, random_state=41)
fam_vocab = {fam: idx for idx, fam in enumerate(fams)}

CPU times: user 546 ms, sys: 164 ms, total: 709 ms
Wall time: 708 ms


In [12]:
#%%time
#X_train = make_one_hot(X_train_raw, vocab, max_len)
#X_test = make_one_hot(X_test_raw, vocab, max_len)

In [13]:
%%time
y_train = [y for y in make_one_hot(np.expand_dims(y_train_raw, axis=1), fam_vocab, 1)]
y_test = [y for y in make_one_hot(np.expand_dims(y_test_raw, axis=1), fam_vocab, 1)]

Current dealing with data piece no: 0
Current dealing with data piece no: 0
CPU times: user 2.07 s, sys: 50.6 ms, total: 2.13 s
Wall time: 1.06 s


In [14]:
%%time
path = {'x_train': '../data/preprocess/x_train',
       'x_test': '../data/preprocess/x_test',
       'y_train': '../data/preprocess/y_train',
       'y_test': '../data/preprocess/y_test'}

def store_sparse(x, path):
    if os.path.isfile(path):
        return
    if not os.path.isdir(path):
        os.mkdir(path)
        
    torch.save(x.coalesce().values(), path + '/value.pt')
    torch.save(x.coalesce().indices(), path + '/indices.pt')
    torch.save(x.coalesce().size(), path + '/size.pt')

#store_sparse(X_train, path['x_train'])
#store_sparse(X_test, path['x_test'])
#store_sparse(X_train, path['y_train'])
#store_sparse(X_train, path['y_test'])

CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 7.63 µs


### Train the Model

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x2aab2eb915b0>

In [16]:
def prepare_sequence(seq, vocab, padding):
    res = ['<PAD>'] * padding
    res[:min(padding, len(seq))] = seq[:min(padding, len(seq))]
    # use 0 for padding
    idxs = [vocab[w] for w in res]
    return torch.tensor(idxs, dtype=torch.long)

def one_hot(label, num_class):
    ones = torch.sparse.torch.eye(num_class)
    return ones.index_select(0, label)

In [17]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, batch_size, padding_idx=0):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.dropout_layer = nn.Dropout(p=0.2)
        self.batch_size = batch_size
        self.padding_idx = padding_idx
        self.softmax = nn.LogSoftmax()
    
    def forward(self, X):
        batch_size, seq_len = X.size()
        embeds = self.word_embeddings(X)
        lstm_out, _ = self.lstm(embeds.view(batch_size, seq_len, -1))
        lstm_out = lstm_out.view(batch_size, seq_len, -1)
        # current we just take the last hidden state of the LSTM, later will modify to attention layer
        # we do not want to take the state for padding
        last_state = lstm_out.mean(1)
        tag_space = self.hidden2tag(last_state)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [None]:
EMBEDDING_DIM = 30
HIDDEN_DIM = 20
BATCH_SIZE = 100
PADDING_SIZE = 300

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(vocab), len(fams), BATCH_SIZE)
#loss_function = nn.MSELoss()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

batched_training_data = []

# separate data to each batch
for idx in range(len(X_train_raw) // BATCH_SIZE + 1):
    X_batch = X_train_raw[BATCH_SIZE * idx:BATCH_SIZE * (idx + 1)]
    y_data = [y.to_dense() for y in 
                  y_train[BATCH_SIZE * idx:BATCH_SIZE * (idx + 1)]]
    if not y_data:
        continue
    y_batch = torch.Tensor(len(X_batch), *y_data[0].size())
    
    torch.cat(y_data, out=y_batch)
    batched_training_data.append((X_batch, y_batch))

for epoch in range(200):
    print("epoch: %d" % epoch)
    model.zero_grad()
    idx = 0
    for batch, target in batched_training_data:
        
        sentence_batch = [prepare_sequence(sentence, vocab, PADDING_SIZE)
                           for sentence in batch]
        sentence_in = torch.stack(sentence_batch)
        tag_scores = model(sentence_in)
        
        #softmax_target = model.softmax(target)
        
        labels = torch.max(target, 1)[1]
        loss = loss_function(tag_scores, labels.long())
        if idx % 100 == 0:
            print("Sample target:", labels)
            print("Sample outcome:", torch.max(tag_scores, 1)[1])
            print("batch no: %d" % idx)
            print(loss)
        loss.backward()
        optimizer.step()
        idx += 1
        

epoch: 0
Sample target: tensor([ 0,  6, 13, 18,  1, 11,  0,  0,  0,  4,  0,  0,  1, 21, 15, 22, 21,  0,
         9,  0,  3, 11,  0,  0,  2, 13, 34,  2, 15,  0,  0,  0,  1,  0, 20,  0,
         0,  0,  0,  1,  0,  0,  0,  1,  8, 21,  0,  0, 26,  2,  0, 13,  1, 64,
         0,  0,  0,  3,  0, 19,  0, 11,  1,  1,  0,  3, 11, 20,  0,  0,  0,  5,
         0,  0,  0, 14,  2,  5, 30,  0, 16,  5, 31, 30, 22, 10, 25, 15, 11,  1,
        20,  3,  0, 13, 58,  2,  1, 23,  0,  8])
Sample outcome: tensor([50, 72, 50, 50, 50, 50, 50, 50, 50, 72, 50, 50, 50, 50, 72, 50, 72, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 72, 50, 72, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 72, 50, 50, 72, 50, 50, 50, 38, 38,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 38, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 72, 50, 72, 50, 72, 72, 50, 50, 72, 72, 50, 38,
        50, 50, 50, 50, 72, 50, 50, 72, 50, 50])
batch no: 0
tensor(4.6174, grad_fn=<NllLossBackward>)
epoch: 1

In [None]:
X_test = [prepare_sequence(sentence, vocab, PADDING_SIZE)
                           for sentence in X_test_raw]
X_test = torch.stack(X_test)
score_pred = model(X_test)
y_pred = np.array(torch.max(score_pred, 1)[1].tolist())

In [None]:
y_test = np.array([fam_vocab[fam] for fam in y_test_raw])

In [None]:
acc = sum(y_test == y_pred) / len(y_test)
acc