In [1]:
import generate_encodings as ge
import prediction_models as pm
import tqdm
import os, sys
from joblib import parallel_backend


In [2]:
class HiddenPrints:
    """Context manager that silences ``print`` by pointing ``sys.stdout`` at ``os.devnull``.

    Only ``sys.stdout`` is redirected; ``sys.stderr`` is left untouched.
    Exceptions raised inside the ``with`` body are not suppressed.
    """

    def __enter__(self):
        # Remember the active stream so __exit__ can put it back.
        self._original_stdout = sys.stdout
        # Keep our own reference to the sink: __exit__ must close exactly this
        # file, not whatever sys.stdout happens to be bound to at exit time.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is usable again even if close() raises.
        sys.stdout = self._original_stdout
        self._devnull.close()

In [3]:
# Load the data file into a list of rows; each row is a list of string fields.
# NOTE(review): fields are split on every comma, so quoted fields containing
# commas would be broken apart — confirm the file never contains them.
import_data = "../data/NOD.csv"
input_data = []
with open(import_data, "r") as infile:
    # Iterate the file object directly instead of loading it all with readlines().
    for line in infile:
        # Strip only the line terminator. Slicing off the last character would
        # truncate the final field when the file does not end with a newline.
        input_data.append(line.rstrip("\n").split(","))

In [4]:
# Benchmark every (encoding, model) combination and collect the scores in
# benchmark_results under the key "<encoding>_<model>".
# stdout is silenced for the whole run; the tqdm bar is still shown in the
# recorded output (presumably because tqdm writes to stderr by default).
with HiddenPrints():
    k_folds = 5
    # Copy every row: encodings are written into column 0 below, and aliasing
    # input_data would overwrite the raw sequences, breaking a re-run of this cell.
    model_data = [list(row) for row in input_data]
    # Column 0 of every non-header row is the sequence to encode.
    to_encode = [line[0] for line in input_data[1:]]
    benchmark_results = dict()

    encodings = ["onehot", "esm1b", "esm2", "blosum45", "blosum50", "blosum62", "blosum80", "blosum90", "georgiev"]
    models = ["xgboost", "xgboost_rf", "rf", "svr", "adaboost", "gboost"]
    # Number of repeated train/evaluate runs per (encoding, model) pair.
    number_examples = 50

    with tqdm.tqdm(total=(len(encodings) * len(models) * number_examples)) as pbar:
        for e_type in encodings:
            # Kept under its own name so the `encodings` list of method names
            # is not rebound while it is being iterated.
            encoded_sequences = ge.generate_sequence_encodings(method=e_type, sequences=to_encode)
            for i, encoding in enumerate(encoded_sequences):
                # index i+1 because of header line, [0] to replace the sequence line
                model_data[i + 1][0] = encoding
            with parallel_backend('threading', n_jobs=12):
                for m_type in models:
                    scores = []
                    for _ in range(number_examples):
                        # NOTE(review): no seed is passed here, so repetitions differ only
                        # through whatever randomness ActivityPredictor uses internally — confirm.
                        model = pm.ActivityPredictor(model_type=m_type, data=model_data, x_column_index=0,
                                                     y_column_index=2)
                        model.train(k_folds)
                        scores.append(model.get_performance())
                        pbar.update()
                    benchmark_results[f'{e_type}_{m_type}'] = scores




100%|██████████| 2700/2700 [3:01:22<00:00,  4.03s/it]  


In [7]:
# Print one line per benchmark entry: the key followed by its score pairs,
# each component converted to float and rounded to three decimals.
for name, score_pairs in benchmark_results.items():
    rounded_pairs = [
        (round(float(first), 3), round(float(second), 3))
        for first, second in score_pairs
    ]
    print(name, rounded_pairs)

one_hot_xgboost [(0.449, 0.261), (0.757, 0.198), (0.439, 0.235), (0.821, 0.163), (0.655, 0.226), (0.595, 0.238), (0.473, 0.284), (0.681, 0.228), (0.694, 0.195), (0.474, 0.236), (0.746, 0.182), (0.579, 0.249), (0.698, 0.202), (0.691, 0.212), (0.52, 0.222), (0.615, 0.214), (0.637, 0.216), (0.466, 0.255), (0.41, 0.249), (0.514, 0.264), (0.792, 0.2), (0.707, 0.196), (0.753, 0.199), (0.394, 0.254), (0.696, 0.216), (0.428, 0.266), (0.699, 0.21), (0.643, 0.206), (0.666, 0.216), (0.585, 0.253), (0.633, 0.237), (0.723, 0.209), (0.617, 0.231), (0.501, 0.27), (0.52, 0.241), (0.74, 0.182), (0.658, 0.237), (0.695, 0.223), (0.665, 0.212), (0.695, 0.186), (0.705, 0.208), (0.515, 0.237), (0.691, 0.204), (0.661, 0.211), (0.77, 0.199), (0.673, 0.235), (0.699, 0.25), (0.552, 0.249), (0.747, 0.221), (0.467, 0.26)]
one_hot_xgboost_rf [(0.626, 0.229), (0.567, 0.224), (0.513, 0.248), (0.702, 0.193), (0.544, 0.253), (0.612, 0.199), (0.681, 0.216), (0.787, 0.161), (0.592, 0.211), (0.733, 0.174), (0.731, 0.19),

In [8]:
# Write every benchmark entry to a text file, one line per key:
# "<key> \t <list of rounded score pairs> \n".
non_esm_results = benchmark_results
out = "results_all.csv"
with open(out, "w") as outfile:
    for entry_name, score_pairs in non_esm_results.items():
        rounded_pairs = [
            (round(float(first), 3), round(float(second), 3))
            for first, second in score_pairs
        ]
        outfile.write(f"{entry_name} \t {rounded_pairs} \n")