### 1.Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import torch
import os
import configparser
import json

In [2]:
config_path = "../config/main.conf"
conf = configparser.ConfigParser()
conf.read(config_path)

prepro_conf = configparser.ConfigParser()
prepro_conf.read(conf['path']['preprocessing'])

['../config/proprocess.conf']

In [3]:
data_partitions_dirpath = conf['path']['data_part']
print('Available dataset partitions: ', os.listdir(data_partitions_dirpath))

Available dataset partitions:  ['dev', '.ipynb_checkpoints', 'train', 'preprocess', 'test', 'download.sh']


In [4]:
%%time
def read_all_shards(partition='dev', data_dir=data_partitions_dirpath):
    shards = []
    for fn in os.listdir(os.path.join(data_dir, partition)):
        with open(os.path.join(data_dir, partition, fn)) as f:
            shards.append(pd.read_csv(f, index_col=None))
    return pd.concat(shards)

test = read_all_shards('test')
dev = read_all_shards('dev')
train = read_all_shards('train')

partitions = {'test': test, 'dev': dev, 'train': train}
for name, df in partitions.items():
    print('Dataset partition "%s" has %d sequences' % (name, len(df)))

Dataset partition "test" has 126171 sequences
Dataset partition "dev" has 126171 sequences
Dataset partition "train" has 1086741 sequences
CPU times: user 6.85 s, sys: 1.16 s, total: 8.01 s
Wall time: 8.19 s


In [5]:
test.head()

Unnamed: 0,family_id,sequence_name,family_accession,aligned_sequence,sequence
0,DUF4194,C7MGY1_BRAFD/37-194,PF13835.6,VHLLQGPFLDGRRD...GA.......RYAQLL..RDRTAIEARLAD...,VHLLQGPFLDGRRDGARYAQLLRDRTAIEARLADLFLELIVDDDAQ...
1,Clathrin_propel,Q7SHV2_NEUCR/257-292,PF01394.20,PPEA.SNDFPVALQVSQKYGIIYL.......ITKYGFIHLYDLE,PPEASNDFPVALQVSQKYGIIYLITKYGFIHLYDLE
2,Leu_Phe_trans,K7RWT2_ACIA4/30-205,PF03588.14,...VLAALHEGVFPMPIDGDEVPEPLR.GGMGW.....WSPQL......,VLAALHEGVFPMPIDGDEVPEPLRGGMGWWSPQLRARMPLERIRVP...
3,tRNA_anti-codon,EX7L_BACSU/29-104,PF01336.25,IWIK.GELSNVK...............IHT.RGHIYFT.....LKD...,IWIKGELSNVKIHTRGHIYFTLKDENARMQSVMFARQSERLPFKPE...
4,CSS-motif,PDED_ECOLI/42-242,PF12792.7,NQQRVVQFANHAVE.ELDKVLLPLQA.G...SEVLLP.LIGLPCS....,NQQRVVQFANHAVEELDKVLLPLQAGSEVLLPLIGLPCSVAHLPLR...


In [6]:
train.head()

Unnamed: 0,family_id,sequence_name,family_accession,aligned_sequence,sequence
0,EKR,R6QAS0_9FIRM/627-685,PF10371.9,.EEKKLVIPTNRPEMKDFVKNILHPIDHLHGDDLPVSKFV..DRAD...,EEKKLVIPTNRPEMKDFVKNILHPIDHLHGDDLPVSKFVDRADGVY...
1,DUF4042,B3MYB3_DROAN/365-542,PF13251.6,KVRISALHLLGSLAKNLPRRFLYGYWHILFPSG.......EHGATN...,KVRISALHLLGSLAKNLPRRFLYGYWHILFPSGEHGATNSHLLLLG...
2,Reg_prop,Q8EJN6_SHEON/296-320,PF07494.11,AQANMETLK..AILF...DKSG.LMWVGGSG,AQANMETLKAILFDKSGLMWVGGSG
3,DUF3880,M1WYU2_PSEP2/257-334,PF12996.7,WFVDNPHLILHHYTHPGTDNTAIFTYDAGNL.EPLRRKGFANTY.Y...,WFVDNPHLILHHYTHPGTDNTAIFTYDAGNLEPLRRKGFANTYYLP...
4,UPRTase,B6GYG1_PENRW/502-699,PF14681.6,AT.DRPAAKLLMTPMRDASI.SGSALRKVHGRVGFYLATELCT.QI...,ATDRPAAKLLMTPMRDASISGSALRKVHGRVGFYLATELCTQIMGL...


#### Prepare for train and test data

In [7]:
vocab = None
vocab_path = conf['path']['vocab']
with open(vocab_path, 'r') as of:
    vocab = json.load(of)
vocab

{'.': 0,
 'M': 1,
 'H': 2,
 'B': 3,
 'W': 4,
 'R': 5,
 'U': 6,
 'I': 7,
 'O': 8,
 'L': 9,
 'T': 10,
 'D': 11,
 'F': 12,
 'X': 13,
 'Q': 14,
 'K': 15,
 'N': 16,
 'A': 17,
 'E': 18,
 'Y': 19,
 'V': 20,
 'Z': 21,
 'S': 22,
 'P': 23,
 'C': 24,
 'G': 25}

#### Prepare for X

In [8]:
from scipy import sparse
import json

In [9]:
def one_hot_encoding(X, vocab, max_len):
    row_index = np.arange((max_len))
    num = 0
    X_res = None
    for x in X:
        num += 1
        if (num % 10000 == 0):
            print("No %d in X part" % num)
        col_index = [vocab.get(ch, 0) for ch in x]
        xlen = len(col_index)
        data = np.array([1] * min(max_len, xlen) + [0] * max(max_len - xlen, 0))
        # chop the indices
        col_index = col_index[:max_len] + [0] * max(max_len - xlen, 0)
        new_sparse = sparse.csc_matrix((data, (row_index, col_index)), shape=(max_len, 26))
        if X_res is None:
            X_res = new_sparse
        else:
            X_res = sparse.vstack((X_res, new_sparse))
    return X_res

def parse_and_save_dense_mat(X, path, to_sparse=True):
    if to_sparse:
        sparse.save_npz(path, X)
    else:
        with open(path, 'w') as of:
            for x in X:
                x_ = x.toarray().tolist()
                of.write(json.dumps(x_) + '\n')

In [10]:
def make_one_hot(Xs, vocab, max_len):
    n_vocab = len(vocab)
    tensor = torch.zeros(len(Xs), max_len, n_vocab)
    for idx, X in enumerate(Xs):
        for chidx, ch in enumerate(X[:max_len]):
            tensor[idx][chidx][vocab[ch]] = 1
    return tensor

In [33]:
%%time
fams = np.array(train["family_id"].value_counts().index)[::20]
partition = train[train["family_id"].isin(train_fams)]
max_len = int(prepro_conf['OneHot']['MaxLen'])
X = partition['aligned_sequence'].values
y = partition['family_id'].values
from sklearn.model_selection import train_test_split
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.25, random_state=41)
all_fam = {fam: idx for idx, fam in enumerate(list(set(y)))}

359
(18507,)
CPU times: user 259 ms, sys: 8.1 ms, total: 267 ms
Wall time: 265 ms


In [34]:
%%time
max_len = 3000
X_train = make_one_hot(X_train_raw, vocab, max_len)
X_test = make_one_hot(X_test_raw, vocab, max_len)
#X_train = one_hot_encoding(X_train_raw, vocab, max_len)
#X_test = one_hot_encoding(X_test_raw, vocab, max_len)

CPU times: user 1min 20s, sys: 1.71 s, total: 1min 22s
Wall time: 1min 21s


In [35]:
%%time
max_len = 1
y_train = make_one_hot(np.expand_dims(y_train_raw, axis=1), all_fam, max_len)
y_test = make_one_hot(np.expand_dims(y_test_raw, axis=1), all_fam, max_len)

CPU times: user 611 ms, sys: 27.9 ms, total: 639 ms
Wall time: 350 ms


In [37]:
%%time
path = {'x_train': '../data/preprocess/x_train.pt',
       'x_test': '../data/preprocess/x_test.pt',
       'y_train': '../data/preprocess/y_train.pt',
       'y_test': '../data/preprocess/y_test.pt'}

torch.save(X_train, path['x_train'])
torch.save(X_test, path['x_test'])
torch.save(X_train, path['y_train'])
torch.save(X_train, path['y_test'])

CPU times: user 223 ms, sys: 6.55 s, total: 6.77 s
Wall time: 32.4 s
