In [1]:
# Mount Google Drive inside the Colab VM. Every data and model path used in this
# notebook lives under /content/gdrive, so this cell must run first.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted — confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
!pip install biopython

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/28/15/8ac646ff24cfa2588b4d5e5ea51e8d13f3d35806bd9498fbf40ef79026fd/biopython-1.73-cp36-cp36m-manylinux1_x86_64.whl (2.2MB)
[K    100% |████████████████████████████████| 2.2MB 1.5MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.73


In [0]:
import pickle
import numpy as np
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import to_categorical
from random import shuffle

In [0]:
# Define hyperparameters
# NOTE(review): of these constants only BATCH_SIZE is read by the cells visible
# in this notebook (it is passed as batch_size to every model.predict call).
# The remaining values look like training-time settings kept for reference —
# confirm before removing them.

BATCH_SIZE = 32
NUM_EPOCHS = 50
HIDDEN_UNITS = 64
LEARNING_RATE = 0.01
HIDDEN_LAYERS = 1
DROP_OUT = 0.3

In [0]:
# Define amino-acid lookup tables: names, molecular weight, hydrophobicity and hydrophilicity.

# One-letter residue codes mapped to human-readable names. Besides the 20
# standard amino acids this includes the ambiguity codes 'B' and 'Z' and
# selenocysteine 'U'. Only the keys are used by the rest of the notebook.
AA_NAMES = {'A': 'alanine',
            'B': 'aspartate/asparagine',
            'C': 'cysteine',  # was 'cystine', which is the disulfide-linked dimer
            'D': 'aspartate',
            'E': 'glutamate',
            'F': 'phenylalanine',
            'G': 'glycine',
            'H': 'histidine',
            'I': 'isoleucine',
            'K': 'lysine',
            'L': 'leucine',
            'M': 'methionine',
            'N': 'asparagine',
            'P': 'proline',
            'Q': 'glutamine',
            'R': 'arginine',
            'S': 'serine',
            'T': 'threonine',
            'U': 'selenocysteine',
            'V': 'valine',
            'W': 'tryptophan',
            'Y': 'tyrosine',
            'Z': 'glutamate/glutamine'}

# Vocabulary used for the integer encoding of sequence windows: a residue's
# position in this list is its index. The order therefore must not change
# (presumably it has to match the vocabulary used when the sequence models were
# trained — confirm). It relies on dict insertion order being preserved.
AA_NAMES_LIST = list(AA_NAMES.keys())

# 'X' and '*' are appended last so they take the two highest indices.
# '*' is the padding character create_feature() uses for short sequences.
AA_NAMES_LIST.append('X')
AA_NAMES_LIST.append('*')

# Molecular weight per residue code for the 20 standard amino acids.
# NOTE(review): values look like free amino-acid weights in g/mol — confirm the unit.
AA_MOLECULAR_WEIGHTS = {
    'A': 89.1, 'C': 121.2, 'D': 133.1, 'E': 147.1, 'F': 165.2,
    'G': 75.1, 'H': 155.2, 'I': 131.2, 'K': 146.2, 'L': 131.2,
    'M': 149.2, 'N': 132.1, 'P': 115.1, 'Q': 146.2, 'R': 174.2,
    'S': 105.1, 'T': 119.1, 'V': 117.1, 'W': 204.2, 'Y': 181.2,
}

# Mean and standard deviation are taken over the 20 standard residues only,
# so they must be computed BEFORE the extra codes are inserted below.
_standard_molecular_weights = list(AA_MOLECULAR_WEIGHTS.values())
AA_MOLECULAR_WEIGHTS_MEAN = np.mean(_standard_molecular_weights)
AA_MOLECULAR_WEIGHTS_STD = np.std(_standard_molecular_weights)

# Non-standard codes: 'U' gets its own weight, the ambiguity codes 'B' (D or N)
# and 'Z' (E or Q) get the average of their two candidates, and the unknown
# residue 'X' gets the mean of the standard residues.
AA_MOLECULAR_WEIGHTS['U'] = 167.1
AA_MOLECULAR_WEIGHTS['B'] = (AA_MOLECULAR_WEIGHTS['D'] + AA_MOLECULAR_WEIGHTS['N']) / 2
AA_MOLECULAR_WEIGHTS['Z'] = (AA_MOLECULAR_WEIGHTS['E'] + AA_MOLECULAR_WEIGHTS['Q']) / 2
AA_MOLECULAR_WEIGHTS['X'] = AA_MOLECULAR_WEIGHTS_MEAN

# Z-scored copy of the table (all 24 codes, using the 20-residue statistics).
AA_MOLECULAR_WEIGHTS_NORM = {
    code: (weight - AA_MOLECULAR_WEIGHTS_MEAN) / AA_MOLECULAR_WEIGHTS_STD
    for code, weight in AA_MOLECULAR_WEIGHTS.items()
}


def seq_molecular_weight(proteins):
    """Return the mean per-residue molecular weight of one sequence.

    Despite its name, `proteins` is a single sequence (an iterable of
    one-letter codes), and the result is an average, not the total mass.

    Raises KeyError for any code missing from AA_MOLECULAR_WEIGHTS (for example
    the '*' padding character). An empty sequence yields NaN.
    """
    residue_weights = [AA_MOLECULAR_WEIGHTS[residue] for residue in proteins]
    return np.mean(residue_weights)

# Hydrophobicity score per residue code for the 20 standard amino acids.
# NOTE(review): the values appear to be the Kyte-Doolittle hydropathy scale — confirm.
AA_HYDROPHOBICITY = {
    'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8,
    'G': -0.4, 'H': -3.2, 'I': 4.5, 'K': -3.9, 'L': 3.8,
    'M': 1.9, 'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5,
    'S': -0.8, 'T': -0.7, 'V': 4.2, 'W': -0.9, 'Y': -1.3,
}

# Mean and standard deviation are taken over the 20 standard residues only,
# so they must be computed BEFORE the extra codes are inserted below.
_standard_hydrophobicity = list(AA_HYDROPHOBICITY.values())
AA_HYDROPHOBICITY_MEAN = np.mean(_standard_hydrophobicity)
AA_HYDROPHOBICITY_STD = np.std(_standard_hydrophobicity)

# Non-standard codes: 'U' reuses the value of 'C', the ambiguity codes 'B'
# (D or N) and 'Z' (E or Q) get the average of their two candidates, and the
# unknown residue 'X' gets the mean of the standard residues.
AA_HYDROPHOBICITY['U'] = AA_HYDROPHOBICITY['C']
AA_HYDROPHOBICITY['B'] = (AA_HYDROPHOBICITY['D'] + AA_HYDROPHOBICITY['N']) / 2
AA_HYDROPHOBICITY['Z'] = (AA_HYDROPHOBICITY['E'] + AA_HYDROPHOBICITY['Q']) / 2
AA_HYDROPHOBICITY['X'] = AA_HYDROPHOBICITY_MEAN

# Z-scored copy of the table (all 24 codes, using the 20-residue statistics).
AA_HYDROPHOBICITY_NORM = {
    code: (score - AA_HYDROPHOBICITY_MEAN) / AA_HYDROPHOBICITY_STD
    for code, score in AA_HYDROPHOBICITY.items()
}


def seq_hydrophobicity(proteins):
    """Return the mean per-residue hydrophobicity of one sequence.

    Despite its name, `proteins` is a single sequence (an iterable of
    one-letter codes).

    Raises KeyError for any code missing from AA_HYDROPHOBICITY (for example
    the '*' padding character). An empty sequence yields NaN.
    """
    residue_scores = [AA_HYDROPHOBICITY[residue] for residue in proteins]
    return np.mean(residue_scores)

# Hydrophilicity score per residue code for the 20 standard amino acids.
# NOTE(review): the values appear to be the Hopp-Woods hydrophilicity scale — confirm.
AA_HYDROPHILICITY = {
    'A': -0.5, 'C': -1.0, 'D': 3.0, 'E': 3.0, 'F': -2.5,
    'G': 0.0, 'H': -0.5, 'I': -1.8, 'K': 3.0, 'L': -1.8,
    'M': -1.3, 'N': 0.2, 'P': 0.0, 'Q': 0.2, 'R': 3.0,
    'S': 0.3, 'T': -0.4, 'V': -1.5, 'W': -3.4, 'Y': -2.3,
}

# Mean and standard deviation are taken over the 20 standard residues only,
# so they must be computed BEFORE the extra codes are inserted below.
_standard_hydrophilicity = list(AA_HYDROPHILICITY.values())
AA_HYDROPHILICITY_MEAN = np.mean(_standard_hydrophilicity)
AA_HYDROPHILICITY_STD = np.std(_standard_hydrophilicity)

# Non-standard codes: 'U' reuses the value of 'C', the ambiguity codes 'B'
# (D or N) and 'Z' (E or Q) get the average of their two candidates, and the
# unknown residue 'X' gets the mean of the standard residues.
AA_HYDROPHILICITY['U'] = AA_HYDROPHILICITY['C']
AA_HYDROPHILICITY['B'] = (AA_HYDROPHILICITY['D'] + AA_HYDROPHILICITY['N']) / 2
AA_HYDROPHILICITY['Z'] = (AA_HYDROPHILICITY['E'] + AA_HYDROPHILICITY['Q']) / 2
AA_HYDROPHILICITY['X'] = AA_HYDROPHILICITY_MEAN

# Z-scored copy of the table (all 24 codes, using the 20-residue statistics).
AA_HYDROPHILICITY_NORM = {
    code: (score - AA_HYDROPHILICITY_MEAN) / AA_HYDROPHILICITY_STD
    for code, score in AA_HYDROPHILICITY.items()
}


def seq_hydrophilicity(proteins):
    """Return the mean per-residue hydrophilicity of one sequence.

    Despite its name, `proteins` is a single sequence (an iterable of
    one-letter codes).

    Raises KeyError for any code missing from AA_HYDROPHILICITY (for example
    the '*' padding character). An empty sequence yields NaN.
    """
    residue_scores = [AA_HYDROPHILICITY[residue] for residue in proteins]
    return np.mean(residue_scores)
  
# Location labels, indexed by the argmax of the final model's output when the
# predictions are formatted. The order is therefore load-bearing.
# NOTE(review): presumably it matches the label encoding used at training — confirm.
CLASSES = ['cyto', 'secreted', 'mito', 'nucleus']

In [0]:
# preprocess test dataset

def parse_fasta(lines):
    """Collect the sequences of a FASTA file, one string per '>' header.

    Parameters
    ----------
    lines : iterable of str
        The lines of the file, with or without their line terminators.

    Returns
    -------
    list of str
        One concatenated sequence per header, in file order. A header with no
        sequence lines yields an empty string so that the result stays aligned
        with the headers.

    Surrounding whitespace (including '\\r' from CRLF files) is stripped from
    every line, blank lines are skipped, and anything before the first header is
    ignored. The last record is always flushed, whether or not the file ends
    with a newline.
    """
    sequences = []
    chunks = None  # stays None until the first header has been seen
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('>'):
            if chunks is not None:
                sequences.append(''.join(chunks))
            chunks = []
        elif chunks is not None:
            chunks.append(line)
    if chunks is not None:
        sequences.append(''.join(chunks))
    return sequences


file = '/content/gdrive/My Drive/MSc ML/0082/blind.fasta.txt'
# Text mode with an explicit encoding replaces the manual per-line bytes.decode("utf-8").
with open(file, 'r', encoding='utf-8') as f:
    proteins = parse_fasta(f)

# Cache the parsed sequences on Drive; the next cell reloads them from this file.
with open('/content/gdrive/My Drive/MSc ML/0082/test_proteins', 'wb') as f:
    pickle.dump(proteins, f)


In [0]:
# Reload the cached test sequences.
# NOTE: pickle.load can execute arbitrary code, so only ever point this at a
# file produced by this notebook itself.
test_proteins_path = '/content/gdrive/My Drive/MSc ML/0082/test_proteins'
with open(test_proteins_path, 'rb') as pickle_file:
    proteins_test = pickle.load(pickle_file)

In [0]:
# Create features

def create_feature(proteins):
    """Compute the per-protein features consumed by the prediction models.

    Parameters
    ----------
    proteins : sequence of str
        Amino-acid sequences in one-letter code.

    Returns
    -------
    tuple of 11 lists, each with one entry per input sequence, in this order:
        SEQ_LENGTH : sequence length
        GLO_COMP   : ProteinAnalysis.get_amino_acids_percent() of the whole sequence
        LOC_FIRST  : the same composition for the first 50 residues
        LOC_LAST   : the same composition for the last 50 residues
        ISO_P      : ProteinAnalysis.isoelectric_point()
        MOL_W      : seq_molecular_weight() (mean per-residue weight)
        SEQ_START  : first 100 residues
        SEQ_END    : last 100 residues
        AROM       : ProteinAnalysis.aromaticity()
        HYDROPHO   : seq_hydrophobicity()
        HYDROPHI   : seq_hydrophilicity()

    Sequences of 50 residues or fewer reuse the whole-sequence composition for
    both LOC_FIRST and LOC_LAST. Sequences shorter than 100 residues are padded
    on the right with '*' up to length 100, for SEQ_START and SEQ_END alike.
    NOTE(review): right-padding the END window presumably mirrors what the
    pre-trained models saw during training — confirm before changing it.
    """
    seq_lengths, global_comps, first_comps, last_comps = [], [], [], []
    iso_points, mean_weights = [], []
    start_windows, end_windows = [], []
    aromaticities, hydrophobicities, hydrophilicities = [], [], []

    for sequence in proteins:
        n_residues = len(sequence)
        seq_lengths.append(n_residues)

        whole = ProteinAnalysis(sequence)
        global_comps.append(whole.get_amino_acids_percent())

        # Local composition of the two termini (50-residue windows).
        if n_residues > 50:
            head = ProteinAnalysis(sequence[:50])
            first_comps.append(head.get_amino_acids_percent())
            tail = ProteinAnalysis(sequence[-50:])
            last_comps.append(tail.get_amino_acids_percent())
        else:
            first_comps.append(whole.get_amino_acids_percent())
            last_comps.append(whole.get_amino_acids_percent())

        iso_points.append(whole.isoelectric_point())
        mean_weights.append(seq_molecular_weight(sequence))

        # Fixed-length (100) raw-sequence windows for the sequence models.
        if n_residues >= 100:
            start_windows.append(sequence[:100])
            end_windows.append(sequence[-100:])
        else:
            padded = sequence + "*" * (100 - n_residues)
            start_windows.append(padded)
            end_windows.append(padded)

        aromaticities.append(whole.aromaticity())
        hydrophobicities.append(seq_hydrophobicity(sequence))
        hydrophilicities.append(seq_hydrophilicity(sequence))

    return (seq_lengths, global_comps, first_comps, last_comps, iso_points,
            mean_weights, start_windows, end_windows, aromaticities,
            hydrophobicities, hydrophilicities)
    

In [0]:
# Compute every feature list for the test proteins in one pass; the unpacking
# order must match the order of create_feature()'s return tuple.
SEQ_LENGTH, GLO_COMP, LOC_FIRST, LOC_LAST, ISO_P, MOL_W, SEQ_START, SEQ_END, AROM, HYDROPHO, HYDROPHI = create_feature(proteins_test)

In [10]:
# Quick sanity check on the test set: how many proteins, and how long they are.
seq_length_array = np.array(SEQ_LENGTH)
print('Number of protein is:', len(SEQ_LENGTH))
for label, statistic in (
    ('Mean of sequence length is:', np.mean),
    ('Standard deviation of sequence length is:', np.std),
    ('Maximum of sequence length is:', np.max),
    ('Minimum of sequence length is:', np.min),
):
    print(label, statistic(seq_length_array))

Number of protein is: 20
Mean of sequence length is: 546.6
Standard deviation of sequence length is: 421.4766185685749
Maximum of sequence length is: 1876
Minimum of sequence length is: 141


In [0]:
# Residue -> integer index lookup, built once. Previously this dict was rebuilt
# for every single residue inside the encoding comprehension.
AA_TO_INDEX = {residue: index for index, residue in enumerate(AA_NAMES_LIST)}


def _encode_windows(windows):
    """Integer-encode sequence windows: one row per window, one AA_NAMES_LIST index per residue.

    Raises KeyError for any character that is not in AA_NAMES_LIST.
    """
    return np.array([[AA_TO_INDEX[residue] for residue in window] for window in windows])


def _composition_matrix(compositions):
    """Stack composition dicts into an array: one row per dict, values in the dict's own key order."""
    return np.array([list(composition.values()) for composition in compositions])


SEQ_START_encoded = _encode_windows(SEQ_START)
SEQ_END_encoded = _encode_windows(SEQ_END)

GLO_COMP_encoded = _composition_matrix(GLO_COMP)
LOC_FIRST_encoded = _composition_matrix(LOC_FIRST)
LOC_LAST_encoded = _composition_matrix(LOC_LAST)

# Scalar features, one row per protein and one column per feature, in this
# column order. The misspelt name `fetures` is kept on purpose because the
# prediction cell reads it under that name.
fetures = np.array([SEQ_LENGTH, ISO_P, MOL_W, AROM, HYDROPHO, HYDROPHI]).T

In [12]:
# Reload our pre-trained models and predict

def _load_and_predict(model_path, inputs):
    """Load the saved Keras model at `model_path`, run it on `inputs`, and return (model, predictions)."""
    network = load_model(model_path)
    return network, network.predict(x=inputs, batch_size=BATCH_SIZE)


# Six base models, each fed its own encoding of the test proteins.
model1, SEQ_START_prediction = _load_and_predict(
    '/content/gdrive/My Drive/MSc ML/0082/Model_RNN_SEQ_START.h5', SEQ_START_encoded)
model2, SEQ_END_prediction = _load_and_predict(
    '/content/gdrive/My Drive/MSc ML/0082/Model_RNN_SEQ_END.h5', SEQ_END_encoded)
model3, GLO_COMP_prediction = _load_and_predict(
    '/content/gdrive/My Drive/MSc ML/0082/Model_NN_GLO_COMP.h5', GLO_COMP_encoded)
model4, LOC_FIRST_prediction = _load_and_predict(
    '/content/gdrive/My Drive/MSc ML/0082/Model_NN_LOC_FIRST.h5', LOC_FIRST_encoded)
model5, LOC_LAST_prediction = _load_and_predict(
    '/content/gdrive/My Drive/MSc ML/0082/Model_NN_LOC_LAST.h5', LOC_LAST_encoded)
model6, features_prediction = _load_and_predict(
    '/content/gdrive/My Drive/MSc ML/0082/Model_NN_features.h5', fetures)

# The final model is only loaded here; it is run later on the combined
# outputs of the six base models.
model7 = load_model('/content/gdrive/My Drive/MSc ML/0082/Model_NN.h5')


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
# Join the six base-model outputs side by side (along axis 1) to form the input
# of the final model.
# NOTE(review): the block order presumably has to match the order used when the
# final model was trained — confirm before reordering.
base_model_outputs = (
    SEQ_START_prediction,
    SEQ_END_prediction,
    GLO_COMP_prediction,
    LOC_FIRST_prediction,
    LOC_LAST_prediction,
    features_prediction,
)
PREDICT = np.concatenate(base_model_outputs, axis=1)

In [0]:
# Final prediction: one row of class scores per protein.
output = model7.predict(x=PREDICT, batch_size=BATCH_SIZE)
output_ = np.argmax(output, axis=1)

# Confidence = (2 * top score - runner-up score) / 2, i.e. the top score
# penalised by half of the second-best score.
# The runner-up is read from the sorted scores. The previous approach zeroed
# every entry equal to the maximum, which picked the wrong runner-up when two
# classes tied for first place and assumed all scores were non-negative.
# Requires at least two class columns in `output`.
ranked_scores = np.sort(output, axis=1)  # ascending along the class axis
top_score = ranked_scores[:, -1]
runner_up_score = ranked_scores[:, -2]
confidence = (2 * top_score - runner_up_score) / 2

# One 'location,confidence' string per protein, confidence shown to 2 decimals.
predicted_location_and_confidence = [
    CLASSES[class_index] + ',' + format(score, '.2f')
    for class_index, score in zip(output_, confidence)
]

In [17]:
# Predicted location and confidence: one 'location,confidence' string per test
# protein, in the same order as the sequences were read from the FASTA file.

predicted_location_and_confidence

['secreted,0.62',
 'secreted,1.00',
 'secreted,1.00',
 'nucleus,0.89',
 'nucleus,0.57',
 'cyto,0.55',
 'nucleus,0.50',
 'cyto,0.81',
 'nucleus,0.51',
 'mito,0.51',
 'mito,0.21',
 'mito,0.24',
 'secreted,1.00',
 'secreted,1.00',
 'nucleus,0.87',
 'nucleus,1.00',
 'cyto,0.42',
 'cyto,0.64',
 'cyto,0.21',
 'cyto,0.57']