In [1]:
import pandas as pd
import numpy as np
import random
import pickle
import sys
import os
import logging
from os.path import join
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, Trials, rand
import xgboost as xgb


sys.path.append('.\\additional_code')
from data_preprocessing import *

CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

C:\Users\alexk\OneDrive\Dokumente\GitHub\SubFinder\notebooks_and_code


## 1. Loading and preprocessing data for model training and evaluation

In [2]:
def array_column_to_strings(df, column):
    df[column] = [str(list(df[column][ind])) for ind in df.index]
    return(df)

def string_column_to_array(df, column):
    df[column] = [np.array(eval(df[column][ind])) for ind in df.index]
    return(df)

### (a) Loading data: 
Only keeping data points from the GO Annotation database with experimental evidence

In [3]:
df_test  = pd.read_pickle(join(CURRENT_DIR, ".." ,"data","splits", "df_test_with_ESM1b_ts.pkl"))
df_test = df_test.loc[df_test["ESM1b"] != ""]
df_test = df_test.loc[df_test["type"] != "engqvist"]
df_test.reset_index(inplace = True, drop = True)

  result = libops.scalar_compare(x.ravel(), y, op)


In [4]:
df_train = pd.read_pickle(join(CURRENT_DIR, ".." ,"data","splits", "df_train_with_ESM1b_ts.pkl"))
df_train = df_train.loc[df_train["ESM1b"] != ""]
df_train = df_train.loc[df_train["type"] != "engqvist"]
df_train.reset_index(inplace = True, drop = True)

training_UIDs = list(set(df_train["Uniprot ID"]))
random.seed(1)
random.shuffle(training_UIDs)

  result = libops.scalar_compare(x.ravel(), y, op)


In [5]:
perc_train_UIDs = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8,0.9,1]

In [6]:
def create_input_and_output_data(df):
    X = ();
    y = ();
    
    for ind in df.index:
        emb = df["ESM1b_ts"][ind]
        ecfp = np.array(list(df["ECFP"][ind])).astype(int)
                
        X = X +(np.concatenate([ecfp, emb]), );
        y = y + (df["Binding"][ind], );

    return(X,y)

test_X, test_y =  create_input_and_output_data(df = df_test)
test_X  = np.array(test_X)
test_y  = np.array(test_y)

feature_names =  ["ECFP_" + str(i) for i in range(1024)]
feature_names = feature_names + ["ESM1b_ts_" + str(i) for i in range(1280)]

param = {'learning_rate': 0.31553117247348733,
         'max_delta_step': 1.7726044219753656,
         'max_depth': 10,
         'min_child_weight': 1.3845040588450772,
         'num_rounds': 342.68325188584106,
         'reg_alpha': 0.531395259755843,
         'reg_lambda': 3.744980563764689,
         'weight': 0.26187490421514203}

num_round = param["num_rounds"]
param["tree_method"] = "gpu_hist"
param["sampling_method"] = "gradient_based"
param['objective'] = 'binary:logistic'
weights = np.array([param["weight"] if binding == 0 else 1.0 for binding in df_train["Binding"]])

del param["num_rounds"]
del param["weight"]

In [7]:
accuracies = []
roc_auc_scores = []

for i, perc in enumerate(perc_train_UIDs):

    df_train = pd.read_pickle(join(CURRENT_DIR, ".." ,"data","splits", "df_train_with_ESM1b_ts.pkl"))
    df_train = df_train.loc[df_train["ESM1b"] != ""]
    df_train = df_train.loc[df_train["type"] != "engqvist"]
    df_train.reset_index(inplace = True, drop = True)

    use_UIDs = training_UIDs[:int(perc*len(training_UIDs))]
    df_train = df_train.loc[df_train["Uniprot ID"].isin(use_UIDs)]
    print("Number of training points: %s, Number of Uniprot IDs: %s, Percentage: %s " % (len(df_train), len (use_UIDs), perc))
    train_X, train_y =  create_input_and_output_data(df = df_train)
    train_X = np.array(train_X)
    train_y = np.array(train_y)
    
    weights = np.array([0.26187490421514203 if binding == 0 else 1.0 for binding in df_train["Binding"]])
    
    dtrain = xgb.DMatrix(np.array(train_X), weight = weights, label = np.array(train_y),
                feature_names= feature_names)
    dtest = xgb.DMatrix(np.array(test_X), label = np.array(test_y),
                        feature_names= feature_names)

    bst = xgb.train(param,  dtrain, int(num_round), verbose_eval=1)
    y_test_pred = np.round(bst.predict(dtest))
    acc_test = np.mean(y_test_pred == np.array(test_y))
    roc_auc = roc_auc_score(np.array(test_y), bst.predict(dtest))
    
    accuracies.append(acc_test), roc_auc_scores.append(roc_auc)
   
    print("Accuracy on test set: %s, ROC-AUC score for test set: %s"  % (acc_test, roc_auc))


  result = libops.scalar_compare(x.ravel(), y, op)


Number of training points: 16473, Number of Uniprot IDs: 2920, Percentage: 0.3 
Accuracy on test set: 0.8648043023603227, ROC-AUC score for test set: 0.9063167032103918


  result = libops.scalar_compare(x.ravel(), y, op)


Number of training points: 21898, Number of Uniprot IDs: 3894, Percentage: 0.4 
Accuracy on test set: 0.8775769345682701, ROC-AUC score for test set: 0.9162847896205941


  result = libops.scalar_compare(x.ravel(), y, op)


Number of training points: 27434, Number of Uniprot IDs: 4867, Percentage: 0.5 
Accuracy on test set: 0.8843740663280549, ROC-AUC score for test set: 0.9261187014624037


  result = libops.scalar_compare(x.ravel(), y, op)


Number of training points: 33076, Number of Uniprot IDs: 5841, Percentage: 0.6 
Accuracy on test set: 0.8920675231550642, ROC-AUC score for test set: 0.9331161241620587


  result = libops.scalar_compare(x.ravel(), y, op)


Number of training points: 38793, Number of Uniprot IDs: 6814, Percentage: 0.7 
Accuracy on test set: 0.8973707798028084, ROC-AUC score for test set: 0.9380667996093273


  result = libops.scalar_compare(x.ravel(), y, op)


Number of training points: 44564, Number of Uniprot IDs: 7788, Percentage: 0.8 
Accuracy on test set: 0.9038691365401852, ROC-AUC score for test set: 0.9430676271986395


  result = libops.scalar_compare(x.ravel(), y, op)


Number of training points: 50104, Number of Uniprot IDs: 8761, Percentage: 0.9 
Accuracy on test set: 0.9037197490289812, ROC-AUC score for test set: 0.9454433426384008


  result = libops.scalar_compare(x.ravel(), y, op)


Number of training points: 55742, Number of Uniprot IDs: 9735, Percentage: 1 
Accuracy on test set: 0.9081266806095011, ROC-AUC score for test set: 0.9494189013588201


In [9]:
accuracies = [0.8648043023603227, 0.8775769345682701, 0.8843740663280549, 0.8920675231550642,0.8973707798028084,
              0.9038691365401852, 0.9037197490289812, 0.9081266806095011]
roc_auc_scores = [0.9063167032103918, 0.9162847896205941, 0.9261187014624037, 0.9331161241620587, 0.9380667996093273,
                  0.9430676271986395, 0.9454433426384008, 0.9494189013588201]