In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import pathlib
import pickle
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, precision_score, auc, make_scorer, recall_score, matthews_corrcoef, f1_score, balanced_accuracy_score, average_precision_score

In [None]:
def afsm12_encode_data(data, input_size):
    """
    Encode a FASTA sequence as normalized ordinals, mirror-padded to ``input_size``.

    Each residue letter is mapped to an integer in 1..20. The encoded sequence
    is placed in the middle of a window of ``input_size`` positions, and the
    space on both sides is filled with repeated copies of the reversed
    sequence. The window is then normalized with fixed mean / standard
    deviation constants.

    Parameters
    ----------
    data : str
        Sequence of one-letter residue codes (anything else is passed through
        ``str()`` first).
    input_size : int
        Total length of the padded window.

    Returns
    -------
    fasta : numpy.ndarray
        Normalized window of shape ``(1, input_size, 1)``.
    split : int
        Index of the first real residue inside the window.
    stop_pos : int
        Index one past the last real residue inside the window.

    Raises
    ------
    KeyError
        If the sequence contains a character missing from the residue table.
    ValueError
        If the sequence is empty (nothing to mirror) or longer than
        ``input_size`` (it cannot fit in the window).
    """
    residue_dictionary = {"A": 1, "E": 2, "L": 3, "M": 4, "C": 5, "D": 6, "F": 7, "G": 8,
                          "H": 9, "K":10, "N": 11, "P": 12, "Q": 13, "R": 14, "S": 15,
                          "W": 16, "Y": 17, "T": 18, "V": 19, "I": 20}

    # Encode data
    encoded = [residue_dictionary[residue] for residue in str(data)]
    seq_len = len(encoded)
    if seq_len == 0:
        raise ValueError("cannot encode an empty sequence")
    if seq_len > input_size:
        raise ValueError(
            f"sequence length {seq_len} exceeds input_size {input_size}")

    # Pad data: the left side gets the smaller half when the gap is odd.
    split = (input_size - seq_len) // 2
    last_padding_len = input_size - seq_len - split
    stop_pos = split + seq_len

    # Repeat the reversed sequence just often enough to cover the longer side.
    # Only the head and the tail of this list are used, so the result is the
    # same as with a much longer repetition, without building a huge list.
    repeats = -(-last_padding_len // seq_len)  # ceiling division
    padding = encoded[::-1] * repeats

    # Slice from an explicit start index: padding[-split:] would return the
    # WHOLE list when split == 0 and corrupt the window.
    padding_1 = padding[len(padding) - split:]
    padding_2 = padding[:last_padding_len]
    padded = padding_1 + encoded + padding_2

    # Reshape data for input
    fasta = np.array(padded).reshape(-1, input_size, 1)
    # Normalize data by subtracting training mean and dividing by training std. deviation
    fasta = (fasta - 10.108613363425793)/6.034641898334733
    return fasta, split, stop_pos

def afsm3_encode_data(data, input_size):
    """
    Encode a FASTA sequence as normalized ordinals, mirror-padded to ``input_size``.

    Each residue letter is mapped to an integer in 1..20. The encoded sequence
    is placed in the middle of a window of ``input_size`` positions, and the
    space on both sides is filled with repeated copies of the reversed
    sequence. The window is then normalized with the fixed constants 10.15
    (mean) and 5.98 (standard deviation).

    Parameters
    ----------
    data : str
        Sequence of one-letter residue codes (anything else is passed through
        ``str()`` first).
    input_size : int
        Total length of the padded window.

    Returns
    -------
    fasta : numpy.ndarray
        Normalized window of shape ``(1, input_size, 1)``.
    split : int
        Index of the first real residue inside the window.
    stop_pos : int
        Index one past the last real residue inside the window.

    Raises
    ------
    KeyError
        If the sequence contains a character missing from the residue table.
    ValueError
        If the sequence is empty (nothing to mirror) or longer than
        ``input_size`` (it cannot fit in the window).
    """
    residue_dictionary = {"A": 1, "E": 2, "L": 3, "M": 4, "C": 5, "D": 6, "F": 7, "G": 8,
                          "H": 9, "K":10, "N": 11, "P": 12, "Q": 13, "R": 14, "S": 15,
                          "W": 16, "Y": 17, "T": 18, "V": 19, "I": 20}

    # Encode data
    encoded = [residue_dictionary[residue] for residue in str(data)]
    seq_len = len(encoded)
    if seq_len == 0:
        raise ValueError("cannot encode an empty sequence")
    if seq_len > input_size:
        raise ValueError(
            f"sequence length {seq_len} exceeds input_size {input_size}")

    # Pad data: the left side gets the smaller half when the gap is odd.
    split = (input_size - seq_len) // 2
    last_padding_len = input_size - seq_len - split
    stop_pos = split + seq_len

    # Repeat the reversed sequence just often enough to cover the longer side.
    # Only the head and the tail of this list are used, so the result is the
    # same as with a much longer repetition, without building a huge list.
    repeats = -(-last_padding_len // seq_len)  # ceiling division
    padding = encoded[::-1] * repeats

    # Slice from an explicit start index: padding[-split:] would return the
    # WHOLE list when split == 0 and corrupt the window.
    padding_1 = padding[len(padding) - split:]
    padding_2 = padding[:last_padding_len]
    padded = padding_1 + encoded + padding_2

    # Reshape data for input
    fasta = np.array(padded).reshape(-1, input_size, 1)
    # Normalize data by subtracting training mean and dividing by training std. deviation
    fasta = (fasta - 10.15)/5.98
    return fasta, split, stop_pos


def afsm12_predict_data(fasta, model, input_size):
    """
    Generate per-residue predictions for one sequence.

    The sequence is encoded and padded by ``afsm12_encode_data``, passed to
    ``model.predict``, and the output is cropped back to the positions of the
    real residues. Depending on the model passed in, the values are either
    predicted pae or plddt.

    Parameters
    ----------
    fasta : str
        Sequence of one-letter residue codes.
    model : object
        Any object with a ``predict`` method whose output holds exactly
        ``input_size`` values for a single encoded window.
    input_size : int
        Length of the padded window the model expects.

    Returns
    -------
    list of float
        One prediction per residue of ``fasta``.
    """

    data, start_pos, stop_pos = afsm12_encode_data(fasta, input_size)
    # Flatten to 1-D so every element is a scalar. Iterating over an
    # (input_size, 1) array yields 1-element arrays, and calling float() on
    # those is deprecated since NumPy 1.25.
    prediction = model.predict(data).reshape(input_size)

    return [float(value) for value in prediction[start_pos:stop_pos]]


def afsm3_predict_data(fasta, model, input_size):
    """
    Generate per-residue predictions for one sequence.

    The sequence is encoded and padded by ``afsm3_encode_data`` and passed to
    ``model.predict``. Column 1 of the first output item is kept and cropped
    to the positions of the real residues. The original notes describe this
    value as the probability of crystallization.

    Returns a list with one float per residue of ``fasta``.
    """

    encoded, first, last = afsm3_encode_data(fasta, input_size)
    # First (only) batch item -> column 1 -> positions of the real residues.
    column = list(model.predict(encoded)[0][:, 1])

    return [float(score) for score in column[first:last]]

def encode_sequence(fasta):
    """
    Map every residue letter of ``fasta`` to its ordinal code (1..20).

    The input is converted with ``str()`` first. Returns a list of ints in
    sequence order; a character that is not in the table raises ``KeyError``.
    """
    codes = {"A": 1, "E": 2, "L": 3, "M": 4, "C": 5, "D": 6, "F": 7, "G": 8,
             "H": 9, "K":10, "N": 11, "P": 12, "Q": 13, "R": 14, "S": 15,
             "W": 16, "Y": 17, "T": 18, "V": 19, "I": 20}

    return [int(codes[letter]) for letter in str(fasta)]

def disorder_list(sequence: str) -> list:
    """
    Score every full 23-residue window of ``sequence`` with ``pirate_model``.

    Four per-residue feature tracks are computed for the whole sequence:
    the afsm1 predictions, the afsm2 predictions divided by 100, the afsm3
    predictions, and the ordinal residue codes. A window of
    ``2 * win_size + 1`` residues is then moved along the sequence one
    position at a time. For each window the four slices are concatenated
    into one flat feature list, and the first element of
    ``pirate_model.predict_proba(...)`` is recorded.

    Relies on the module-level ``afsm1_model``, ``afsm2_model``,
    ``afsm3_model`` and ``pirate_model``.

    Parameters
    ----------
    sequence : str
        Sequence of one-letter residue codes.

    Returns
    -------
    list
        One entry per window, i.e. ``len(sequence) - 22`` entries; the list is
        empty when the sequence is shorter than one window (23 residues).
    """
    # generate encodings for sequence
    afsm1_pred = afsm12_predict_data(sequence, afsm1_model, 4096)
    afsm2_pred = list(np.array(afsm12_predict_data(sequence, afsm2_model, 4096))/100.0)
    afsm3_pred = afsm3_predict_data(sequence, afsm3_model, 2048)
    ordinal_list = encode_sequence(sequence)

    # window size of predictions: win_size residues on each side of a centre
    win_size = 11
    window_len = (win_size * 2) + 1

    predictions = []
    # range() is empty when the sequence is shorter than one window.
    for start in range(len(sequence) - window_len + 1):
        stop = start + window_len
        window_features = (afsm1_pred[start:stop] + afsm2_pred[start:stop] +
                           afsm3_pred[start:stop] + ordinal_list[start:stop])
        predictions.append(pirate_model.predict_proba(window_features)[0])

    return predictions

In [None]:
# Model locations: a "models" directory that sits next to the parent of the
# current working directory.
local_path = pathlib.Path().absolute()
model_path = str(local_path.parents[0])+"/models/"
afsm1_path = model_path+"afsm1"
afsm2_path = model_path+"afsm2"
afsm3_path = model_path+"afsm3"
pirate_path = model_path+"pirate.pkl"
input_size = 4096
presort_input = 2048

# Load the three Keras models.
afsm1_model = tf.keras.models.load_model(afsm1_path, custom_objects=None, compile=True, options=None)
print("afsm1 loaded")
afsm2_model = tf.keras.models.load_model(afsm2_path, custom_objects=None, compile=True, options=None)
print("afsm2 loaded")
afsm3_model = tf.keras.models.load_model(afsm3_path, custom_objects=None, compile=True, options=None)
print("afsm3 loaded")

# NOTE(security): unpickling executes arbitrary code from the file, so only
# load pirate.pkl from a source you trust.
# The context manager closes the file handle once the model is loaded.
with open(pirate_path, 'rb') as pirate_file:
    pirate_model = pickle.load(pirate_file)
print("pirate loaded")

In [None]:
# Parse 'disorder_pdb.fasta'. The file is read as repeating groups of three
# lines: identifier, residue sequence, label string (one character per label).
ids = []
labels = []
features = []

# The context manager guarantees the file is closed, even if parsing fails.
with open('disorder_pdb.fasta', 'r') as fasta_file:
    for line_number, line in enumerate(fasta_file):
        # Drop line endings, the FASTA '>' marker and surrounding whitespace.
        line = line.replace('\n', ' ').replace('\r', '').replace('>', '').strip()
        field = line_number % 3
        if field == 0:
            ids.append(line)
        elif field == 1:
            features.append(line)
        else:
            # Store labels as a list of single characters so they can be
            # sliced and filtered position by position later on.
            labels.append(list(line))

In [None]:
# Collect window-level predictions and the matching labels over all sequences.
probs = []
ground_truth = []

for count, sequence in enumerate(features):

    # Skip sequences that do not fit the 2048-wide afsm3 input window.
    if len(sequence) < 2048:

        # disorder_list runs all three networks itself, so nothing is
        # precomputed here (doing so would run every model twice per sequence).
        probabilities = disorder_list(sequence)

        # Windows cover 11 residues on each side of the scored position, so
        # the first and last 11 labels have no prediction.
        window_labels = labels[count][11:-11]
        if len(window_labels) != len(probabilities):
            raise ValueError(
                f"record {count}: {len(probabilities)} predictions but "
                f"{len(window_labels)} labels - sequence and label lines "
                "have different lengths")

        probs += probabilities
        ground_truth += window_labels

In [None]:
# Remove positions whose label is "-" (no annotation) together with the
# corresponding predictions, then convert labels to int and predictions to
# float. Any other non-numeric label makes int() raise ValueError.
if len(ground_truth) != len(probs):
    raise ValueError(
        f"{len(ground_truth)} labels but {len(probs)} predictions")

# Single pass instead of repeated `del`, which is quadratic on long lists.
labelled_pairs = [(int(label), float(prob))
                  for label, prob in zip(ground_truth, probs)
                  if label != "-"]

# Slice assignment updates the existing list objects in place.
ground_truth[:] = [label for label, _ in labelled_pairs]
probs[:] = [prob for _, prob in labelled_pairs]

In [None]:
# Sweep the decision threshold over 0.001 .. 0.999 and keep the first one
# that reaches the highest F1 score.
prob_array = np.asarray(probs, dtype=float)

# Start below any reachable F1 so best_preds is always populated, even when
# every threshold scores 0.
best_f1 = -1.0
best_threshold = 0
best_preds = []
for step in range(1, 1000, 1):
    threshold = step / 1000.0
    # One vectorised comparison; every position gets exactly one prediction
    # (a NaN probability compares False and becomes 0).
    new_preds = (prob_array >= threshold).astype(int)

    f1 = f1_score(ground_truth, new_preds)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold
        best_preds = new_preds.tolist()

In [None]:
# Metrics at the F1-optimal threshold (best_preds) plus the threshold-free
# ranking metrics computed from the raw probabilities.
accuracy = balanced_accuracy_score(ground_truth, best_preds)  # balanced accuracy
f1 = f1_score(ground_truth, best_preds)
mcc = matthews_corrcoef(ground_truth, best_preds)
# Stored as `roc_auc` so the `auc` function imported from sklearn.metrics is
# not overwritten by a float.
roc_auc = roc_auc_score(ground_truth, probs)
aps = average_precision_score(ground_truth, probs)
print(f"Accuracy: {accuracy}")
print(f"F1 max: {f1}")
print(f"MCC: {mcc}")
print(f"AUC: {roc_auc}")
print(f"APS: {aps}")
print(classification_report(ground_truth, best_preds))