### 1.Data Preprocessing

In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os

In [10]:
data_partitions_dirpath = '../data'
print('Available dataset partitions: ', os.listdir(data_partitions_dirpath))

Available dataset partitions:  ['dev', 'download.sh', 'test', 'train']


In [11]:
def read_all_shards(partition='dev', data_dir=data_partitions_dirpath):
    shards = []
    for fn in os.listdir(os.path.join(data_dir, partition)):
        with open(os.path.join(data_dir, partition, fn)) as f:
            shards.append(pd.read_csv(f, index_col=None))
    return pd.concat(shards)

test = read_all_shards('test')
dev = read_all_shards('dev')
train = read_all_shards('train')

partitions = {'test': test, 'dev': dev, 'train': train}
for name, df in partitions.items():
    print('Dataset partition "%s" has %d sequences' % (name, len(df)))

Dataset partition "test" has 126171 sequences
Dataset partition "dev" has 126171 sequences
Dataset partition "train" has 1086741 sequences


In [12]:
test

Unnamed: 0,family_id,sequence_name,family_accession,aligned_sequence,sequence
0,GNAT_acetyltran,R6RQF6_9CLOT/17-251,PF12746.7,AFLFSGR..REVMAD....ACLQGMM..GCVYG..........TAG...,AFLFSGRREVMADACLQGMMGCVYGTAGGMDSAAAVLGDFCFLAGK...
1,MoaC,W5NKR5_LEPOC/505-640,PF01967.21,MVDVGGK.PVSRRTAAASATVLLG.EK..........AFWLV.......,MVDVGGKPVSRRTAAASATVLLGEKAFWLVKENQLAKGDALAVAQI...
2,Methyltransf_25,C0QLU8_DESAH/50-147,PF13649.6,VLDVACGT.C...D..VA...ME..AR.NQ.......T....G......,VLDVACGTCDVAMEARNQTGDAAFIIGTDFSPGMLTLGLQKLKKNR...
3,EMG1,T1G7Q2_HELRO/22-222,PF03587.14,VVLERASLESVKV..G.................KEYQLLN....CD...,VVLERASLESVKVGKEYQLLNCDRHKGIAKKFKRDISTCRPDITHQ...
4,Glyco_hydro_30C,C6VRM9_DYAFD/453-540,PF17189.4,GAVRVDVSGGLGTD...............AMVVSSYLN..TDKSLV...,GAVRVDVSGGLGTDAMVVSSYLNTDKSLVTVIVNADNQDRDISLAI...
...,...,...,...,...,...
12466,SCP2_2,Q73XI4_MYCPA/309-409,PF13530.6,VPEVLG....ARGYA...ADT....................DIVLD...,VPEVLGARGYAADTDIVLDVTDPLGLAGGRFQLQTRDGAGKCTPHD...
12467,F_bP_aldolase,Q5SHF7_THET8/3-304,PF01116.20,VTGLEILRKARAEGY.GVGAFNT.........NNMEFTQAILEAAE...,VTGLEILRKARAEGYGVGAFNTNNMEFTQAILEAAEEMKSPVILAL...
12468,MinE,MINE_CLOPE/14-83,PF03776.14,.KQVAKDR.....LKVILI.HDR......G..EL..SD.EVLDKIR...,KQVAKDRLKVILIHDRGELSDEVLDKIRLEILDVLSKYVEIENEDV...
12469,Glyco_hydro_47,Q1K8A8_NEUCR/113-607,PF01532.20,AFLRCWSSYRT.....KAWMSDELEPVNGGR.............KD...,AFLRCWSSYRTKAWMSDELEPVNGGRKDTFGGWGATLVDSLDTLWI...


In [13]:
train

Unnamed: 0,family_id,sequence_name,family_accession,aligned_sequence,sequence
0,GMC_oxred_C,A4WZS5_RHOS5/416-539,PF05199.13,PHPE.SRIRLST.RRDAHGMP.....IP.RIESRLGP............,PHPESRIRLSTRRDAHGMPIPRIESRLGPDAFARLRFMARTCRAIL...
1,DUF2887,K9QI92_9NOSO/3-203,PF11103.8,RDSIYYQIFKRFPALIFEL..VD.NRPPQAQNYRFESVEVKETAFR...,RDSIYYQIFKRFPALIFELVDNRPPQAQNYRFESVEVKETAFRIDG...
2,zf-IS66,Q92LC9_RHIME/32-75,PF13005.7,.TCCPDCGG.E..LRLVGED.AS....EILDMI.AAQMKVIEVARL...,TCCPDCGGELRLVGEDASEILDMIAAQMKVIEVARLKKSCRCCE
3,Asp_decarbox,X2GQZ4_9BACI/1-115,PF02261.16,MLRMMMNSKIHRATVTEADLNYVGSITIDEDILDAVGMLPNEKVHI...,MLRMMMNSKIHRATVTEADLNYVGSITIDEDILDAVGMLPNEKVHI...
4,Filamin,A7SQM3_NEMVE/342-439,PF00630.19,TACPKQ.CTA....RGLG.............LK.AAPVT.QPT..R...,TACPKQCTARGLGLKAAPVTQPTRFVVILNDCHGQPLGRSEGELEV...
...,...,...,...,...,...
13510,DUF4276,B1Y1D3_LEPCP/11-206,PF14103.6,VEEPSMEAFLHALLPRLMPAR..RTF..EIHP.................,VEEPSMEAFLHALLPRLMPARRTFEIHPFQGKDDLMAKLEARLRAY...
13511,Lipoprotein_3,Y645_MYCPN/25-108,PF00938.17,AATQVISSLSSAQKYFESSQG.ELNKKNVIKILKEGYESDANKAVH...,AATQVISSLSSAQKYFESSQGELNKKNVIKILKEGYESDANKAVHA...
13512,Ribosomal_S6,I4DCK0_DESAJ/3-92,PF01250.17,AYEILY...I...I...R...P.D..MDEE..ATN......A.......,AYEILYIIRPDMDEEATNALVDRLSGLAASNGGQNVTVDKWGKRRL...
13513,Sterile,K7J1J9_NASVI/384-476,PF03015.19,LDYVPALVADLLAV...LH.GNA.......P.......DSWA.......,LDYVPALVADLLAVLHGNAPDSWALLRESMGDMCRLHRFSSGNWRI...


#### Prepare for train and test data

In [14]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.utils import to_categorical

In [15]:
vocab = {'.': 0, 'M': 1, 'H': 2, 'B': 3, 'W': 4, 'R': 5, 'U': 6, 
         'I': 7, 'O': 8, 'L': 9, 'T': 10, 'D': 11, 'F': 12, 'X': 13, 
         'Q': 14, 'K': 15, 'N': 16, 'A': 17, 'E': 18, 'Y': 19, 
         'V': 20, 'Z': 21, 'S': 22, 'P': 23, 'C': 24, 'G': 25}

#### Prepare for X

In [None]:
%%time
X_train_raw, X_test_raw = train['aligned_sequence'].values, test['aligned_sequence'].values
X_train, X_test = [], []

for idx, x in enumerate(X_train_raw):
    if idx % 10000 == 0:
        print("No %d in training part" % idx)
    X_train.append([to_categorical(vocab.get(ch, len(vocab)), len(vocab)) for ch in x])
for idx, x in enumerate(X_test_raw):
    if idx % 10000 == 0:
        print("No %d in testing part" % idx)
    X_test.append([to_categorical(vocab.get(ch, len(vocab)), len(vocab)) for ch in x])
X_train = np.matrix(X_train)
X_test = np.matrix(X_test)

No 0 in training part
No 10000 in training part
No 20000 in training part
No 30000 in training part
No 40000 in training part
No 50000 in training part
No 60000 in training part
No 70000 in training part
No 80000 in training part


In [21]:
y_train_raw, y_test_raw = train['family_id'].values, test['family_id'].values

In [22]:
all_fam = {fam: idx for idx, fam in enumerate(list(set(y_train_raw)))}

In [23]:
y_train = np.array([all_fam.get(fam, len(all_fam)) for fam in y_train_raw])
y_test = np.array([all_fam.get(fam, len(all_fam)) for fam in y_test_raw])

In [24]:
y_train

array([ 5683, 17625,  4670, ...,  5133, 16801,  2827])

In [25]:
y_train = np.array([to_categorical(y, num_classes=len(all_fam) for y in y_train)])
y_test = np.array([to_categorical(y, num_classes=len(all_fam) for y in y_test)])

In [25]:
outdir = '../data/preprocess'
np.save(outdir + 'X_train', X_train)
np.save(outdir + 'X_train', X_test)
np.save(outdir + 'X_train', y_train)
np.save(outdir + 'X_train', y_test)