### 1.Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import configparser
import json

In [3]:
config_path = "../config/main.conf"
conf = configparser.ConfigParser()
conf.read(config_path)

['../config/proprocess.conf']

In [4]:
data_partitions_dirpath = conf['path']['data_part']
print('Available dataset partitions: ', os.listdir(data_partitions_dirpath))

Available dataset partitions:  ['dev', 'train', 'preprocess', 'test', 'download.sh']


In [5]:
%%time
def read_all_shards(partition='dev', data_dir=data_partitions_dirpath):
    shards = []
    for fn in os.listdir(os.path.join(data_dir, partition)):
        with open(os.path.join(data_dir, partition, fn)) as f:
            shards.append(pd.read_csv(f, index_col=None))
    return pd.concat(shards)

test = read_all_shards('test')
dev = read_all_shards('dev')
train = read_all_shards('train')

partitions = {'test': test, 'dev': dev, 'train': train}
for name, df in partitions.items():
    print('Dataset partition "%s" has %d sequences' % (name, len(df)))

Dataset partition "test" has 126171 sequences
Dataset partition "dev" has 126171 sequences
Dataset partition "train" has 1086741 sequences
CPU times: user 7.65 s, sys: 1.02 s, total: 8.67 s
Wall time: 10.2 s


In [6]:
test.head()

Unnamed: 0,family_id,sequence_name,family_accession,aligned_sequence,sequence
0,DUF4194,C7MGY1_BRAFD/37-194,PF13835.6,VHLLQGPFLDGRRD...GA.......RYAQLL..RDRTAIEARLAD...,VHLLQGPFLDGRRDGARYAQLLRDRTAIEARLADLFLELIVDDDAQ...
1,Clathrin_propel,Q7SHV2_NEUCR/257-292,PF01394.20,PPEA.SNDFPVALQVSQKYGIIYL.......ITKYGFIHLYDLE,PPEASNDFPVALQVSQKYGIIYLITKYGFIHLYDLE
2,Leu_Phe_trans,K7RWT2_ACIA4/30-205,PF03588.14,...VLAALHEGVFPMPIDGDEVPEPLR.GGMGW.....WSPQL......,VLAALHEGVFPMPIDGDEVPEPLRGGMGWWSPQLRARMPLERIRVP...
3,tRNA_anti-codon,EX7L_BACSU/29-104,PF01336.25,IWIK.GELSNVK...............IHT.RGHIYFT.....LKD...,IWIKGELSNVKIHTRGHIYFTLKDENARMQSVMFARQSERLPFKPE...
4,CSS-motif,PDED_ECOLI/42-242,PF12792.7,NQQRVVQFANHAVE.ELDKVLLPLQA.G...SEVLLP.LIGLPCS....,NQQRVVQFANHAVEELDKVLLPLQAGSEVLLPLIGLPCSVAHLPLR...


In [7]:
train.head()

Unnamed: 0,family_id,sequence_name,family_accession,aligned_sequence,sequence
0,EKR,R6QAS0_9FIRM/627-685,PF10371.9,.EEKKLVIPTNRPEMKDFVKNILHPIDHLHGDDLPVSKFV..DRAD...,EEKKLVIPTNRPEMKDFVKNILHPIDHLHGDDLPVSKFVDRADGVY...
1,DUF4042,B3MYB3_DROAN/365-542,PF13251.6,KVRISALHLLGSLAKNLPRRFLYGYWHILFPSG.......EHGATN...,KVRISALHLLGSLAKNLPRRFLYGYWHILFPSGEHGATNSHLLLLG...
2,Reg_prop,Q8EJN6_SHEON/296-320,PF07494.11,AQANMETLK..AILF...DKSG.LMWVGGSG,AQANMETLKAILFDKSGLMWVGGSG
3,DUF3880,M1WYU2_PSEP2/257-334,PF12996.7,WFVDNPHLILHHYTHPGTDNTAIFTYDAGNL.EPLRRKGFANTY.Y...,WFVDNPHLILHHYTHPGTDNTAIFTYDAGNLEPLRRKGFANTYYLP...
4,UPRTase,B6GYG1_PENRW/502-699,PF14681.6,AT.DRPAAKLLMTPMRDASI.SGSALRKVHGRVGFYLATELCT.QI...,ATDRPAAKLLMTPMRDASISGSALRKVHGRVGFYLATELCTQIMGL...


#### Prepare for train and test data

In [8]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.utils import to_categorical

Using TensorFlow backend.


In [11]:
vocab = None
vocab_path = conf['path']['vocab']
with open(vocab_path, 'r') as of:
    vocab = json.load(of)
vocab

#### Prepare for X

In [13]:
from scipy import sparse
import json

In [14]:
def one_hot_encoding(X, vocab, max_len):
    row_index = np.arange((max_len))
    num = 0
    X_res = None
    for x in X:
        num += 1
        if (num % 100000 == 0):
            print("No %d in X part" % num)
        col_index = [vocab.get(ch, 0) for ch in x]
        xlen = len(col_index)
        data = np.array([1] * min(max_len, xlen) + [0] * max(max_len - xlen, 0))
        # chop the indices
        col_index = col_index[:max_len] + [0] * max(max_len - xlen, 0)
        new_sparse = sparse.coo_matrix((data, (row_index, col_index)), shape=(max_len, 26))
        if X_res is None:
            X_res = new_sparse
        else:
            sparse.hstack((X_res, new_sparse))
    return X_res

def parse_and_save_dense_mat(X, path, to_sparse=True):
    if to_sparse:
        sparse.save_npz(path, X)
    else:
        with open(path, 'w') as of:
            for x in X:
                x_ = x.toarray().tolist()
                of.write(json.dumps(x_) + '\n')

In [15]:
%%time

max_len = prepro_conf['OneHot']['MaxLen']
X_train_raw = train['aligned_sequence'].values
X_test_raw = test['aligned_sequence'].values
X_train = one_hot_encoding(X_train_raw, vocab)
X_test = one_hot_encoding(X_test_raw, vocab)

No 100000 in X part
No 200000 in X part
No 300000 in X part
No 400000 in X part
No 500000 in X part
No 600000 in X part
No 700000 in X part
No 800000 in X part
No 900000 in X part
No 1000000 in X part
No 100000 in X part


In [16]:
y_train_raw, y_test_raw = train['family_id'].values, test['family_id'].values
all_fam = {fam: idx for idx, fam in enumerate(list(set(y_train_raw)))}
y_train_raw = np.array([all_fam.get(fam, len(all_fam)) for fam in y_train_raw])
y_test_raw = np.array([all_fam.get(fam, len(all_fam)) for fam in y_test_raw])

In [17]:
y_train = one_hot_encoding([y_train_raw], all_fam, max_len)
y_test = one_hot_encoding([y_test_raw], all_fam, max_len)

In [18]:
%%time
path = {'x_train': '../data/preprocess/x_train.npz',
       'x_test': '../data/preprocess/x_test.npz',
       'y_train': '../data/preprocess/y_train.npz',
       'y_test': '../data/preprocess/y_test.npz'}

parse_and_save_dense_mat(X_train, path['x_train'])
parse_and_save_dense_mat(X_test, path['x_test'])
parse_and_save_dense_mat(y_train, path['y_train'])
parse_and_save_dense_mat(y_test, path['y_test'])

CPU times: user 11.9 ms, sys: 0 ns, total: 11.9 ms
Wall time: 3.29 s
