## Packages

In [217]:
import os
import random
import itertools
import gzip
from collections import Counter


import numpy as np
from sklearn.preprocessing import OneHotEncoder

from tensorflow.keras.utils import Sequence
import keras
from keras import layers
from keras.callbacks import TensorBoard

In [11]:
DATA = 'DATA/'
RAWALGS = 'DATA/1236_raw_algs/'

## Data preparation

### Conservation extraction

In [33]:
def extract_cons(filename):
    path = RAWALGS + filename
    rawalg = str(gzip.open(path, 'rt').read()).split()
    someprot = rawalg[1]
    numdict = {}
    condict = {}
    for i in range(len(someprot)):
        aa = someprot[i]
        if aa != '-':
            numdict[i] = aa
            this_pos = []
            for line in rawalg:
                if not '>' in line:
                    this_pos.append(line[i])

            counter = Counter(this_pos)
            condict[i] = float(counter.most_common()[0][1])/len(this_pos)
    
    return numdict, condict

In [58]:
for filename in os.listdir(RAWALGS):
    name, raw, faa, gz = filename.split('.')
    prefix = 'PREPR_'

    with open(f'DATA/preprocessed/{prefix + name}.txt', 'w') as pr:
        a, b = extract_cons(filename)
        for k, v in list(a.items()):
            pr.write(str(k))
            pr.write('\t')
            pr.write(str(v))
            pr.write('\n')
        pr.write('--------------------------\n')
        for k, v in list(b.items()):
            pr.write(str(k))
            pr.write('\t')
            pr.write(str(v))
            pr.write('\n')
    

### Train test split

In [118]:
frnocog = open('results/sets_ROOT_noCOG_sorted.tsv')
groups = frnocog.read().split('\n')
random.seed(1337)
random.shuffle(groups)
g_train = groups[:int(9*len(groups)/12)] 
g_val = groups[int(9*len(groups)/12): int(10*len(groups)/12)] 
g_test = groups[int(10*len(groups)/12):] 

In [119]:
len(groups)

27295

In [120]:
len(g_test)

4550

In [121]:
def extract_names(g_list):
    name_list = []
    for group in g_list:
        gsplit = group.split('\t')[1:]
        name_list.append(gsplit)
    return name_list

In [122]:
train_list = list(itertools.chain(*extract_names(g_train)))
test_list = list(itertools.chain(*extract_names(g_test)))
val_list = list(itertools.chain(*extract_names(g_val)))

In [123]:
test_list

['1SDQW',
 '1TMK0',
 '1SCWA',
 '1RTHE',
 '1TKIQ',
 '1TD51',
 '1T99H',
 '1TAW7',
 '1S7R4',
 '1RTJS',
 '1SDS9',
 '1TGCW',
 '1T9DJ',
 '1TNF9',
 '1T9VM',
 '1S92E',
 '1T9UD',
 '1RVJ0',
 '1SS6M',
 '1TG6U',
 '1T9QB',
 '1SV0C',
 '1S8XC',
 '1SIN4',
 '1STIV',
 '1T7AN',
 '1SI1M',
 '1STAT',
 '1SI4R',
 '1T7X7',
 '1T7W1',
 '1TGB8',
 '1TB4W',
 '1TMGA',
 '1TDIK',
 '1SU0Z',
 '1T2NK',
 '1T28P',
 '1SPE7',
 '1SNZA',
 '1TDJK',
 '1SVF7',
 '1SNMP',
 '1RUGR',
 '1TID1',
 '1SRAJ',
 '1SPR7',
 '1SGG8',
 '1SD0I',
 '1SY5B',
 '1T5MN',
 '1SVRX',
 '1SMW2',
 '1THEH',
 '1SN3G',
 '1ST70',
 '1SS2N',
 '1TG7M',
 '1STF6',
 '1TJS4',
 '1SPW9',
 '1TKSC',
 '1TIXZ',
 '1S442',
 '1S3E1',
 '1T757',
 '1T7ZH',
 '1TNXR',
 '1TNGQ',
 '1SW96',
 '1SKW3',
 '1TJMG',
 '1RSYC',
 '1RSVI',
 '1TEJ6',
 '1TCTJ',
 '1S3Y0',
 '1RUHX',
 '1T664',
 '1T80F',
 '1T83E',
 '1SB91',
 '1TFQM',
 '1TIHA',
 '1TNGH',
 '1SA8J',
 '1TNGG',
 '1TFEF',
 '1SXBE',
 '1SXBF',
 '1SBF3',
 '1SAGH',
 '1RUS2',
 '1TI61',
 '1SWB8',
 '1SQZ4',
 '1SPB1',
 '1SICT',
 '1SQ26',
 '1SMCE',


### One-Hot Encoder

In [107]:
aas = np.array([aa for aa in 'ACDEFGHIKLMNPQRSTVWYX']).reshape(-1, 1)
enc = OneHotEncoder(handle_unknown='ignore')
print(enc.fit_transform(aas).toarray())
enc.categories_

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0.

[array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
        'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y'], dtype='<U1')]

In [131]:
name = '1RM7I'
c = 0
framelen = 51

numdict = {}
condict = {}

folder = 'train/'
namelist = train_list
for name in namelist:
    inp = open(DATA + 'preprocessed/PREPR_' + name + '.txt')

    state = 0
    for line in inp:
        if not '-' in line:
            spline = line.split('\t')
            if state == 0:
                numdict[spline[0]] = spline[1].replace('\n', '')
            else:
                condict[spline[0]] = spline[1].replace('\n', '')
        else:
            state = 1

    for i in range(len(numdict.items()) - framelen + 1):
        frame = np.array(list(numdict.values())[i: i+framelen]).reshape(-1, 1)
        encoded_frame = enc.transform(frame).toarray()
        if float(list(condict.values())[i + 25]) >= 0.95:
            y = np.array(1)
        else:
            y = np.array(0)
            
        np.save(f'{DATA}{folder}/X/X_{c}', encoded_frame)
        np.save(f'{DATA}{folder}/y/y_{c}', encoded_frame)
        
        c += 1


    inp.close()

KeyboardInterrupt: 

## Autoencoder

### Dataloader

In [135]:
a = np.load('DATA/train/X/X_1.npy')
b = np.load('DATA/train/X/X_2.npy')
np.append(a, b, axis = 0)

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [172]:
ids = ['X_' + str(i) + '.npy' for i in range(100000)]
yids = ['y_' + str(i) + '.npy' for i in range(100000)]

In [177]:
dg

<__main__.DataGenerator at 0x25c2e839bc8>

In [238]:
class DataGenerator(Sequence):
    """Generates data for Keras
    Sequence based data generator. Suitable for building data generator for training and prediction.
    """
    def __init__(self, list_IDs, labels, y_labels, image_path, mask_path,
                 to_fit=False, batch_size=256, dim=(256, 256),
                 n_channels=1, n_classes=10, shuffle=True):
        """Initialization
        :param list_IDs: list of all 'label' ids to use in the generator
        :param labels: list of file names
        :param image_path: path to X location
        :param mask_path: path to y location
        :param to_fit: True to return X and y, False to return X only
        :param batch_size: batch size at each iteration
        :param dim: tuple indicating dimension
        :param n_channels: number of channels
        :param n_classes: number of output classes
        :param shuffle: True to shuffle label indexes after every epoch
        """
        self.list_IDs = list_IDs
        self.labels = labels
        self.y_labels = y_labels
        self.image_path = image_path
        self.mask_path = mask_path
        self.to_fit = to_fit
        self.batch_size = batch_size
        self.dim = dim
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Denotes the number of batches per epoch
        :return: number of batches per epoch
        """
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Generate one batch of data
        :param index: index of the batch
        :return: X and y when fitting. X only when predicting
        """
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X = self._generate_X(list_IDs_temp)

        if self.to_fit:
            y = self._generate_y(list_IDs_temp)
            return X, y
        else:
            return X

    def on_epoch_end(self):
        """Updates indexes after each epoch
        """
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def _generate_X(self, list_IDs_temp):
        """Generates data containing batch_size images
        :param list_IDs_temp: list of label ids to load
        :return: batch of images
        """
        # Initialization
        X = np.empty((self.batch_size, *self.dim))

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            X[i,] = np.load(self.image_path + self.labels[i])

        return X

    def _generate_y(self, list_IDs_temp):
        """Generates data containing batch_size masks
        :param list_IDs_temp: list of label ids to load
        :return: batch if masks
        """
        y = np.empty((self.batch_size, *self.dim), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            y[i,] = np.load(self.mask_path + self.y_labels[i])

        return y

    def _load_grayscale_image(self, image_path):
        """Load grayscale image
        :param image_path: path to image to load
        :return: loaded image
        """
        np.load(image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img = img / 255
        return img

In [244]:
dg = DataGenerator(ids, ids,  yids, 'DATA/train/X/', 'DATA/train/y/', dim = (51, 21), batch_size = 5000)
dg[0][1:100]

array([[[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0.

In [246]:
dg[1][100:201]

array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

## Neural Network

In [262]:
inp = keras.Input(shape=(51,21))
encoding_dim = 32 

encoded = layers.Dense(encoding_dim, activation='relu')(inp)

decoded = layers.Dense(21, activation='sigmoid')(encoded)

autoencoder = keras.Model(inp, decoded)

In [264]:
encoder = keras.Model(inp, encoded)

In [263]:
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

In [228]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [255]:
%tensorboard --logdir logs/fit

ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 4768.

In [265]:
train = dg[0][:1000]
test = dg[1][1001:2000]

In [266]:
autoencoder.fit(train, train,
                epochs=50,
                batch_size=1,
                shuffle=True,
                validation_data=(test, test),
                callbacks=[TensorBoard(log_dir='logs/fit')])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x25c2fc1c948>

In [258]:
decoded = autoencoder.predict(dg[1][1001:2000])

In [259]:
decoded

array([[[3.7139228e-10, 5.5814614e-10, 5.3217247e-10, ...,
         3.9372333e-10, 6.2095003e-11, 9.6153530e-11],
        [3.7139228e-10, 5.5814614e-10, 5.3217247e-10, ...,
         3.9372333e-10, 6.2095003e-11, 9.6153530e-11],
        [5.6226412e-10, 8.6809215e-10, 3.2216815e-10, ...,
         6.3139871e-12, 3.0999675e-11, 1.1522551e-09],
        ...,
        [1.3249681e-09, 4.2378767e-09, 2.4003244e-09, ...,
         1.0000000e+00, 3.4303388e-10, 5.9838468e-09],
        [1.8140840e-12, 1.3034410e-10, 3.5324718e-10, ...,
         2.3455113e-10, 2.0967561e-10, 2.2176412e-11],
        [1.3862413e-10, 2.6668870e-10, 1.6326492e-11, ...,
         1.2417710e-09, 1.8309138e-10, 4.9745871e-17]],

       [[3.7139228e-10, 5.5814614e-10, 5.3217247e-10, ...,
         3.9372333e-10, 6.2095003e-11, 9.6153530e-11],
        [5.6226412e-10, 8.6809215e-10, 3.2216815e-10, ...,
         6.3139871e-12, 3.0999675e-11, 1.1522551e-09],
        [3.7713796e-10, 6.3126145e-12, 6.6469241e-10, ...,
         2.566