In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
data_partitions_dirpath = '../data'

In [9]:
def read_all_shards(partition='dev', data_dir=data_partitions_dirpath):
    shards = []
    for fn in os.listdir(os.path.join(data_dir, partition)):
        with open(os.path.join(data_dir, partition, fn)) as f:
            shards.append(pd.read_csv(f, index_col=None))
    return pd.concat(shards)

test = read_all_shards('test')
dev = read_all_shards('dev')
train = read_all_shards('train')

partitions = {'test': test, 'dev': dev, 'train': train}
for name, df in partitions.items():
    print('Dataset partition "%s" has %d sequences' % (name, len(df)))

Dataset partition "test" has 126171 sequences
Dataset partition "dev" has 126171 sequences
Dataset partition "train" has 1086741 sequences


In [10]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.utils import to_categorical

In [12]:
X_train, X_test = train['aligned_sequence'].values, test['aligned_sequence'].values
y_train_raw, y_test_raw = train['family_id'].values, test['family_id'].values
all_fam = {fam: idx for idx, fam in enumerate(list(set(y_train_raw)))}
y_train = np.array([all_fam.get(fam, len(all_fam)) for fam in y_train_raw])
y_test = np.array([all_fam.get(fam, len(all_fam)) for fam in y_test_raw])
#y_train = to_categorical(y_train, num_classes=len(all_fam))
#y_test = to_categorical(y_test, num_classes=len(all_fam))

In [32]:
genome_vocab = { amino_acid : encoding for encoding, amino_acid in enumerate(list(set(''.join(X_train[0:100000])))) }
print(genome_vocab)

{'Z': 0, 'M': 1, 'H': 2, 'B': 3, 'W': 4, 'R': 5, 'U': 6, 'I': 7, 'O': 8, 'L': 9, 'T': 10, 'D': 11, 'F': 12, 'X': 13, 'Q': 14, 'K': 15, 'N': 16, 'A': 17, 'E': 18, 'Y': 19, 'V': 20, '.': 21, 'S': 22, 'P': 23, 'C': 24, 'G': 25}
