In [1]:
import src.generate_encodings as ge
import src.prediction_models as pm
import tqdm
import os, sys
from joblib import parallel_backend
import warnings
import ast

In [1]:
class HiddenPrints:
    """Context manager that silences everything written to ``sys.stdout``.

    On entry ``sys.stdout`` is redirected to ``os.devnull``; on exit the
    stream that was active at entry is put back and the devnull handle is
    closed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference to the sink so __exit__ closes exactly the
        # file opened here, even if the body replaces sys.stdout again.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so sys.stdout never points at a closed file.
        sys.stdout = self._original_stdout
        self._devnull.close()


class HiddenWarnings():
    """Context manager that suppresses all warnings inside its block.

    The save/restore of the warning filter state is delegated to
    ``warnings.catch_warnings`` instead of reassigning ``warnings.filters``
    by hand, so the standard library's own bookkeeping is used when the
    previous filters are reinstated.
    """

    def __enter__(self):
        # Snapshot the current warning configuration.
        self._catcher = warnings.catch_warnings()
        self._catcher.__enter__()
        # Ignore all warnings
        warnings.filterwarnings("ignore")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore the original warning filter settings
        self._catcher.__exit__(exc_type, exc_val, exc_tb)


In [3]:
# --- Benchmark configuration -------------------------------------------
data_file = "../Data/NOD.csv"
k_folds = 5
number_examples = 50  # how many models are trained per (encoding, model type) pair

# --- Containers filled by the benchmark cell ----------------------------
model_data = list()
benchmark_results = {}

# Full lists of supported options; the trailing slice picks what this run uses.
e_types = [
    "esmc_300m",
    "esmc_600m",
    "georgiev",
    "one_hot",
    "blosum45",
    "blosum50",
    "blosum62",
    "blosum80",
    "blosum90",
][-2:-1]
models = [
    "lightgbm",
    "xgboost",
    "rf",
    "xgboost_rf",
    "adaboost",
    "svr",
][:1]

In [4]:
import torch
import gc

# Benchmark every selected encoding against every selected model type.
# Each (encoding, model type) pair is trained `number_examples` times and the
# resulting performance values are stored in `benchmark_results` under the
# key "<e_type>_<m_type>".
for e_type in e_types:
    # Start from an empty list so samples of different encodings never mix.
    model_data = []
    with open(data_file, "r") as infile:
        # Skip the header row and ignore blank rows (e.g. a trailing empty line).
        lines = [row for row in infile.readlines()[1:] if row.strip()]

    print(f"generate {e_type} encodings")
    with tqdm.tqdm(total=len(lines)) as pbar:
        for line in lines:
            # Strip only the line terminator. Unconditionally slicing off the
            # last character would truncate the final field whenever the file
            # does not end with a newline.
            fields = line.rstrip("\n").split(",")
            # Column 0 is what gets encoded, column 2 is the numeric target.
            to_encode = [fields[0]]
            representation = ge.generate_sequence_encodings(e_type, to_encode)
            score = float(fields[2])
            model_data.append((representation[0], score))
            pbar.update(1)

    with tqdm.tqdm(total=(len(models) * number_examples)) as pbar:
        with parallel_backend('threading', n_jobs=12):
            for m_type in models:
                scores = []
                for _ in range(number_examples):
                    # No seed is passed here, so each repetition presumably
                    # starts from fresh randomness — TODO confirm in
                    # pm.ActivityPredictor.
                    model = pm.ActivityPredictor(model_type=m_type, data=model_data, x_column_index=0,
                                                 y_column_index=1)
                    # Keep the progress bar readable: silence stdout for the
                    # whole step and warnings during training.
                    with HiddenPrints():
                        with HiddenWarnings():
                            model.train(k_folds)
                        scores.append(model.get_performance())
                    pbar.update()
                benchmark_results[f'{e_type}_{m_type}'] = scores
    # Free what this encoding left behind before moving on to the next one.
    gc.collect()
    torch.cuda.empty_cache()

generate blosum80 encodings


100%|██████████| 566/566 [00:00<00:00, 3399.52it/s]
100%|██████████| 50/50 [00:12<00:00,  4.13it/s]


In [5]:
# Persist the benchmark results, one line per "<encoding>_<model>" key.
# The file is opened in append mode, so re-running this cell adds new lines
# instead of overwriting earlier ones.
out = "bl_lgbm.csv"
with open(out, "a") as outfile:
    for key, pairs in benchmark_results.items():
        # Round both members of every result pair to three decimals.
        rounded = [(round(float(a), 3), round(float(b), 3)) for a, b in pairs]
        outfile.write(f"{key} \t {rounded} \n")

In [6]:
# Dump the raw, unrounded benchmark results for a quick visual check.
print(benchmark_results)

{'blosum80_lightgbm': [(0.7077307951427297, 0.20324797901413832), (0.7630661594384968, 0.17749796562056633), (0.619996204854276, 0.23203646654255153), (0.6933255043936438, 0.20476653232387565), (0.6355294990499714, 0.22513930558842307), (0.7878787153926969, 0.2024945908729646), (0.44662376025271106, 0.2376671378640582), (0.7061913816008868, 0.22115587386804286), (0.5775966248269186, 0.2108062758726385), (0.6034898679624479, 0.24721808389145875), (0.7265857088455923, 0.18237160455144127), (0.7565313710915273, 0.1750722546736129), (0.6409691667027577, 0.19880446625594322), (0.7207077447178498, 0.2213755951072351), (0.6433442067874064, 0.2228394242680954), (0.6929981099179241, 0.23258417391175673), (0.7500297191132963, 0.20601044548402564), (0.7277995445460079, 0.19414788962878185), (0.8133475211149233, 0.18805337656520266), (0.8070705029473938, 0.1899825797231421), (0.6987492020229062, 0.20114078154533957), (0.6881120536783503, 0.189029041607905), (0.6621586453950239, 0.24442147304541498

In [7]:
# for key in benchmark_results.keys():
#     print(key, [(round(float(a), 3), round(float(b), 3)) for a, b in benchmark_results[key]])

In [8]:
# esm2_results = benchmark_results
# out = "results_all.csv"
# with open(out, "w") as outfile:
#     for key in esm2_results.keys():
#         outfile.write(f"{key} \t {[(round(float(a), 3), round(float(b), 3)) for a, b in benchmark_results[key]]} \n")

In [9]:
# """recovering the Results script :)"""
#
# infile = "Results/results_all.csv"
# results = dict()
# with open(infile, "r") as infile:
#     for line in infile.readlines():
#         key = line[:-1].split(" , ")[0]
#         value = line[:-1].split(" , ")[1]
#         value = ast.literal_eval(value)
#         results.update({key: value})
#
# outfile = "results_all_recovered.csv"
# with open(outfile, "w") as outfile:
#     for key in results.keys():
#         outfile.write(f"{key} \t {[(round(float(a), 3), round(float(b), 3)) for a, b in results[key]]} \n")

In [10]:
# Two paired samples of ten values each, used as input for the Pearson
# correlation check further down in the notebook.
x = [3.63, 3.02, 3.82, 3.42, 3.59, 2.87, 3.03, 3.46, 3.36, 3.3]
y = [53.1, 49.7, 48.4, 54.2, 54.9, 43.7, 47.2, 45.2, 54.4, 50.4]

In [11]:
import numpy as np
import math


def PearsonCorrelationCoefficient(x_arr, y_arr):
    """Compute the Pearson correlation coefficient of two paired samples.

    Parameters
    ----------
    x_arr, y_arr : array-like of numbers
        Paired observations. Both inputs must have the same shape.

    Returns
    -------
    float
        The covariance term ``sum((x - mean(x)) * (y - mean(y)))`` divided by
        ``sqrt(sum((x - mean(x))**2) * sum((y - mean(y))**2))``.
        When that denominator is zero (at least one input has no variance)
        the coefficient is mathematically undefined; ``1.0`` is returned to
        keep the convention this function has always used.

    Raises
    ------
    ValueError
        If the inputs differ in shape. Pairing them anyway would silently
        drop the surplus values from the numerator while still counting them
        in the denominator, which yields a meaningless result.
    """
    x_vals = np.asarray(x_arr, dtype=float)
    y_vals = np.asarray(y_arr, dtype=float)
    if x_vals.shape != y_vals.shape:
        raise ValueError(
            f"x_arr and y_arr must have the same shape, got {x_vals.shape} and {y_vals.shape}"
        )

    # Deviations from the respective means, computed once and reused.
    x_dev = x_vals - x_vals.mean()
    y_dev = y_vals - y_vals.mean()

    numerator = np.sum(x_dev * y_dev)
    denominator = np.sqrt(np.sum(x_dev ** 2) * np.sum(y_dev ** 2))
    if denominator == 0:
        return 1.0
    return numerator / denominator


# Evaluate the implementation on the sample vectors x and y defined earlier.
print(PearsonCorrelationCoefficient(x, y))

0.47017723296840297
