### We used the python code from the GitHub repository "https://github.com/SysBioChalmers/DLKcat" to reproduce the DLKcat model and to make predictions for their test set:

In [1]:
import numpy as np
import pandas as pd
import os
from os.path import join
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import matplotlib as mpl
import json

from Bio import pairwise2
from Bio.Emboss.Applications import NeedleCommandline

import pickle
import torch

In [5]:
def load_tensor(file_name, dtype):
    return [dtype(d).to(device) for d in np.load(file_name + '.npy', allow_pickle=True)]


def load_pickle(file_name):
    with open(file_name, 'rb') as f:
        return pickle.load(f)

def shuffle_dataset(dataset, seed):
    np.random.seed(seed)
    np.random.shuffle(dataset)
    return dataset

def split_dataset(dataset, ratio):
    n = int(ratio * len(dataset))
    dataset_1, dataset_2 = dataset[:n], dataset[n:]
    return dataset_1, dataset_2

def calculate_identity(fasta_file_1, fasta_file_2):
    needle_cline = NeedleCommandline(asequence = fasta_file_1, bsequence = fasta_file_2,
                                     gapopen=10, gapextend=0.5,  filter = True)

    out = needle_cline()[0]
    out = out[out.find("Identity"):]
    out = out[:out.find("\n")]
    percent = float(out[out.find("(")+1 :out.find(")")-1].replace(" ", ""))
    return(percent)

device = "cpu"

## 1. Loading results from DLKcat prediction

In [8]:
with open(join("..", "..", "data", "DLKcat", "sequences_test.pkl"), 'rb') as f:
    sequences_test = pickle.load(f)
    
with open(join("..", "..", "data", "DLKcat", 'sequences_train.pkl'), 'rb') as f:
    sequences_train = pickle.load(f)
    
with open(join("..", "..", "data", "DLKcat", 'y_pred_test.pkl'), 'rb') as f:
    y_test_pred = pickle.load(f)

interactions = load_tensor(join("..", "..", "data", "DLKcat", 'regression'), torch.FloatTensor)
interactions = shuffle_dataset(interactions, 1234)
interactions_train, interactions_ = split_dataset(interactions, 0.8)
interactions_dev, interactions_test = split_dataset(interactions_, 0.5)

#converting kcat values from log2 to log10:
interactions_test = [np.log10(2**float(kcat)) for kcat in interactions_test]

df_pred = pd.DataFrame({"y_true" : interactions_test, "y_pred" : y_test_pred, "Sequence" : sequences_test})
df_pred

Unnamed: 0,y_true,y_pred,Sequence
0,-2.207608,-0.071899,MSAIDCIITAAGLSSRMGQWKMMLPWEQGTILDTSIKNALQFCSRI...
1,-3.657577,-2.707640,MKEFYLTVEQIGDSIFERYIDSNGRERTREVEYKPSLFAHCPESQA...
2,0.949390,0.831021,MSPSKMNATVGSTSEVEQKIRQELALSDEVTTIRRNAPAAVLYEDG...
3,1.672098,1.513026,MKNVGFIGWRGMVGSVLMQRMVEERDFDAIRPVFFSTSQLGQAAPS...
4,-1.790485,-2.830310,MATSTETISSLAQPFVHLENPINSPLVKETIRPRNDTTITPPPTQW...
...,...,...,...
1679,-1.920819,-0.281649,MNYPAEPFRIKSVETVSMIPRDERLKKMQEAGYNTFLLNSKDIYID...
1680,2.740363,0.945056,MIEADYLVIGAGIAGASTGYWLSAHGRVVVLEREAQPGYHSTGRSA...
1681,1.198657,1.115256,MNLREKYGEWGLILGATEGVGKAFCEKIAAGGMNVVMVGRREEKLN...
1682,0.740363,0.917627,MALLSQAGGSYTVVPSGVCSKAGTKAVVSGGVRNLDVLRMKEAFGS...


Calculating RMSE and coefficient of determination R²

In [9]:
np.sqrt(mean_squared_error(interactions_test,y_test_pred)), r2_score(interactions_test,y_test_pred)

(1.1195636742162083, 0.44447253110852536)

## 2. Calculating the maximal sequence identity compared to all sequences in the training set:

#### (a) Creating Fasta files for all training and test sequences:

In [29]:
for ind in df_pred.index:
    ofile = open(join("..", "..", "data", "DLKcat", "Fasta_files", "test_seq_" +str(ind) + ".fasta", "w")
    ofile.write("> seq_" + str(ind) + "\n" + df_pred["Sequence"][ind] + "\n")
    ofile.close()
    
for ind in range(len(sequences_train)):
    ofile = open(join("..", "..", "data", "DLKcat", "Fasta_files", "Fasta_files", "train_seq_" +str(ind) + ".fasta", "w")
    ofile.write("> seq_train_" + str(ind) + "\n" + sequences_train[ind] + "\n")
    ofile.close()

#### (b) Calculating the maximal pairwise sequence identity:

In [None]:
'''from Bio.Emboss.Applications import NeedleCommandline
import os
from os.path import join
import pandas as pd
import sys
import time
import numpy as np


arg = int(sys.argv[1])

    
def calculate_identity(fasta_file_1, fasta_file_2):
    needle_cline = NeedleCommandline(asequence = fasta_file_1, bsequence = fasta_file_2,
                                     gapopen=10, gapextend=0.5,  filter = True)

    out = needle_cline()[0]
    out = out[out.find("Identity"):]
    out = out[:out.find("\n")]
    percent = float(out[out.find("(")+1 :out.find(")")-1].replace(" ", ""))
    return(percent)


identities = []
for i in range(13470):
    ident = calculate_identity(
                fasta_file_1 = join("..", "..", "data", "DLKcat", "Fasta_files", "test_seq_" + str(arg) + ".fasta"),
               fasta_file_2 = join("..", "..", "data", "DLKcat", "Fasta_files", "train_seq_" + str(i) + ".fasta"))
    identities.append(ident)


ofile = open(join("..", "..", "data", "DLKcat", "DLkcat_ident", "test_seq" + str(arg) + ".txt"), "w")
ofile.write(str(max(identities)))
ofile.close()''';

Loading the results:

In [10]:
df_pred["max_ident"] = np.nan

for ind in df_pred.index:
    try:
        with open(join("..", "..", "data", "DLKcat", "DLkcat_ident", "test_seq" + str(ind) + ".txt")) as f:
            ident = f.readlines()
            ident = float(ident[0])
        df_pred["max_ident"][ind] = ident
    except FileNotFoundError:
        pass
df_pred

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,y_true,y_pred,Sequence,max_ident
0,-2.207608,-0.071899,MSAIDCIITAAGLSSRMGQWKMMLPWEQGTILDTSIKNALQFCSRI...,22.8
1,-3.657577,-2.707640,MKEFYLTVEQIGDSIFERYIDSNGRERTREVEYKPSLFAHCPESQA...,100.0
2,0.949390,0.831021,MSPSKMNATVGSTSEVEQKIRQELALSDEVTTIRRNAPAAVLYEDG...,100.0
3,1.672098,1.513026,MKNVGFIGWRGMVGSVLMQRMVEERDFDAIRPVFFSTSQLGQAAPS...,100.0
4,-1.790485,-2.830310,MATSTETISSLAQPFVHLENPINSPLVKETIRPRNDTTITPPPTQW...,99.4
...,...,...,...,...
1679,-1.920819,-0.281649,MNYPAEPFRIKSVETVSMIPRDERLKKMQEAGYNTFLLNSKDIYID...,100.0
1680,2.740363,0.945056,MIEADYLVIGAGIAGASTGYWLSAHGRVVVLEREAQPGYHSTGRSA...,100.0
1681,1.198657,1.115256,MNLREKYGEWGLILGATEGVGKAFCEKIAAGGMNVVMVGRREEKLN...,100.0
1682,0.740363,0.917627,MALLSQAGGSYTVVPSGVCSKAGTKAVVSGGVRNLDVLRMKEAFGS...,99.8


In [11]:
df_pred.to_pickle(join("..", "..", "data", "DLKcat", "df_pred.pkl"))

## Calculate model performance for different sequence identities:

In [12]:
lower_bounds = [0, 40, 80, 99]
upper_bounds = [40,80,99,100]


for i in range(len(lower_bounds)):
    lb, ub = lower_bounds[i], upper_bounds[i]
    help_df = df_pred.loc[df_pred["max_ident"] >=  lb].loc[df_pred["max_ident"] <= ub]
    if len(help_df) > 0:
        y_pred, y_true = np.array(help_df["y_pred"]), np.array(help_df["y_true"])
        R2 =  r2_score(y_true, y_pred)
        RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
        
        print("Lower bound: %s, upper bound: %s, no. of data points: %s, R2: %s, RMSE: %s" %  (lb, ub, len(help_df), R2, RMSE))

Lower bound: 0, upper bound: 40, no. of data points: 82, R2: -0.6072304105234347, RMSE: 2.0954145982947594
Lower bound: 40, upper bound: 80, no. of data points: 42, R2: 0.34280134977895493, RMSE: 1.2434758603533023
Lower bound: 80, upper bound: 99, no. of data points: 27, R2: 0.48622435213243465, RMSE: 1.1191845445308464
Lower bound: 99, upper bound: 100, no. of data points: 1536, R2: 0.5128517542754034, RMSE: 1.0371892758113466
