In [None]:
import tensorflow as tf
from deeplcretrainer import deeplcretrainer
import pandas as pd
from tensorflow.python.eager import context
from deeplc import DeepLC
import statsmodels.api as sm

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error 
import numpy as np

from deeplc.feat_extractor import FeatExtractor
import os


In [None]:
# Shared configuration for the cells below.

# Feature extractor constructed from a modification CSV; it is later handed to
# DeepLC through the `f_extractor` argument.
# NOTE(review): the same CSV path is repeated as a literal in the retraining
# call (`costum_modification_file`) - keep the two in sync.
features = FeatExtractor("/home/ubuntu/NuXL_rescore/unimod/unimod_to_formula.csv")
# CSV given to deeplcretrainer.retrain() as its training input.
df_train_file = "/home/ubuntu/NuXL_rescore/Deeplc_retrain/Train_RNA_All.csv"
# Directory containing the pre-trained .hdf5 models.
# NOTE(review): this value ends with "/" and the code that uses it appends
# "/full_hc_...", so the resulting paths contain "//" (harmless on POSIX).
base_model = "/home/ubuntu/Rescoring/RT_deeplc_model/base_model/"


In [None]:
# Training: retrain the three pre-trained models in `base_model` on `df_train_file`.

# NOTE(review): the next three statements rely on TensorFlow's *private*
# eager-context API (tensorflow.python.eager.context). A variable is created,
# the module-level context is discarded and a new one is created, and only then
# is the inter-op thread count set. Presumably this works around TensorFlow
# refusing to change threading settings once its context is initialised -
# TODO confirm. The statement order matters; do not reorder.
_ = tf.Variable([1])
context._context = None
context._create_context()
tf.config.threading.set_inter_op_parallelism_threads(1) 

# The return value is stored in `models`; nothing else in the visible cells reads it.
models = deeplcretrainer.retrain(
    [df_train_file],  # list containing the single training CSV
    # Pre-trained models used as the starting point for transfer learning.
    mods_transfer_learning=[
        base_model +"/full_hc_train_1fd8363d9af9dcad3be7553c39396960.hdf5",
        base_model +"/full_hc_train_8c22d89667368f2f02ad996469ba157e.hdf5",
        base_model +"/full_hc_train_cb975cfdd4105f97efa0b3afffe075cc.hdf5"
    ],
    freeze_layers=True,
    n_epochs=30,
    # NOTE(review): "costum" (sic) - presumably this matches the keyword name in
    # deeplcretrainer's signature; verify before "correcting" the spelling.
    # Same CSV that the `features` extractor is built from.
    costum_modification_file = "/home/ubuntu/NuXL_rescore/unimod/unimod_to_formula.csv",
    freeze_after_concat=0,  # NOTE(review): interaction with freeze_layers=True is not visible here - confirm
    plot_results= True,
    write_csv_results = True,
    batch_size=32
)



In [None]:
def evaluate_linear_regression_plot_(df:pd.DataFrame, x="tr", y="predicted_rt", name="evaluate_regression", model_name = "base model"):
    """Regress predicted on observed retention time, plot both, and return fit metrics.

    A linear model ``df[y] ~ const + df[x]`` is fitted with statsmodels GLS,
    error metrics between the two columns are computed, a scatter plot is
    saved as a PDF, and all metrics are returned as a one-row DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns named by ``x`` and ``y``. Frames with more
        than 10,000,000 rows are randomly down-sampled (without replacement,
        unseeded) to that size before anything is computed.
    x : str
        Column holding the observed retention time (regressor; drawn on the
        plot's y-axis).
    y : str
        Column holding the predicted retention time (response; drawn on the
        plot's x-axis).
    name, model_name : str
        Concatenated without a separator to form the title prefix and the
        output file name ``name + model_name + ".pdf"``.

    Returns
    -------
    pd.DataFrame
        One row with the columns ``MAE``, ``perc95_calib`` (twice the 95th
        percentile of the absolute error), ``R_square``, ``R``
        (``sqrt(R_square)``, hence always non-negative), ``slope``,
        ``intercept`` and ``test_num`` (number of observations in the fit).

    Side effects
    ------------
    Prints the result of ``scipy.stats.pearsonr``, opens a new matplotlib
    figure (left open so the notebook can render it) and writes the PDF to
    the current working directory. Both plot axes are fixed to 0-10000.
    """
    from scipy.stats import pearsonr

    n_sample=10000000
    if len(df) > n_sample:
        df = df.sample(n_sample, replace=False)

    # add_constant() prepends a column named "const", so the fitted parameters
    # are indexed by "const" (intercept) and by `x` (slope).
    res = sm.GLS(df[y], sm.add_constant(df[x])).fit()

    # Take the statistics straight from the results object. The previous
    # approach rendered res.summary() to HTML, parsed it back with
    # pd.read_html and selected cells by fixed row/column positions: that
    # depends on the summary layout, needs an HTML parser backend, and only
    # yields the values rounded for display.
    R_square = float(res.rsquared)
    R = np.sqrt(R_square)
    n = float(res.nobs)
    b = float(res.params["const"])
    w = float(res.params[x])

    X_ = pearsonr(np.array(df[x]), np.array(df[y]))
    print("X_------", X_)

    MAE = mean_absolute_error (df[x], df[y])
    perc95_calib = np.percentile(abs(df[x]-df[y]),95)*2

    plt.figure(figsize=(10.0,8.5))
    plt.title(name + model_name +  f"\n R: {round(R,3)} - MAE: {round(MAE,2)} - 95th percentile: {round(perc95_calib,2)}", fontsize=16)
    # Predicted values go on the x-axis and observed values on the y-axis.
    plt.scatter(df[y],df[x],s=1,alpha=0.6, color="tab:blue")
    plt.ylabel("Observed retention time", fontsize=16)
    plt.xlabel("Predicted retention time", fontsize=16) 
    plt.ylim([0, 10000])
    plt.xlim([0, 10000])
    plt.savefig(name+ model_name +".pdf")

    return pd.DataFrame(
        dict(
            MAE=[MAE],perc95_calib=[perc95_calib], R_square=[R_square],R=[R],
            slope=[w],intercept=[b],test_num=[n]
        )
    )
    

In [None]:
def model_Results (model, cal_df, data, name, model_name):
    """Calibrate `model`, predict for `data`, evaluate the predictions and save them.

    Steps, in order:
      1. ``model.calibrate_preds`` is called with ``cal_df``.
      2. ``model.make_preds`` is called on ``data`` with ``calibrate=True`` and
         the result is stored in ``data['predicted_rt']``. `data` is modified
         in place, so the caller's frame gains/overwrites that column.
      3. ``evaluate_linear_regression_plot_`` compares the "tr" column with the
         new "predicted_rt" column; its one-row result frame is printed.
      4. `data` (including its index) is written to
         ``name + model_name + '.csv'`` and that path is printed.

    Nothing is returned.
    """
    model.calibrate_preds(seq_df=cal_df)
    data['predicted_rt'] = model.make_preds(seq_df=data, calibrate=True)

    print(evaluate_linear_regression_plot_(data, "tr", "predicted_rt", name, model_name))

    # Build the output path once and reuse it for both the write and the message.
    csv_path = name + model_name+'.csv'
    data.to_csv(csv_path)
    print("File saved at: ", csv_path)


In [None]:
# Predict the test dataset with the base (not retrained) models.
# Train_All is passed to model_Results as `cal_df` (calibration input) and
# Test_All as `data` (the frame predictions are made for and written out).
# NOTE(review): the Train_All path is the same string as `df_train_file`
# defined in the configuration cell - keep them in sync.
Train_All= pd.read_csv("/home/ubuntu/NuXL_rescore/Deeplc_retrain/Train_RNA_All.csv")
Test_All = pd.read_csv("/home/ubuntu/NuXL_rescore/Deeplc_retrain/Test_RNA_All.csv") 

# NOTE(review): same private TensorFlow eager-context sequence as in the
# training cell (create a variable, drop the context, create a new one, then
# set the inter-op thread count). Presumably needed so the thread setting can
# be changed after TensorFlow was already initialised - TODO confirm.
# The statement order matters.
_ = tf.Variable([1])
context._context = None
context._create_context()
tf.config.threading.set_inter_op_parallelism_threads(1) 

# DeepLC predictor backed by the three pre-trained base models and the shared
# `features` extractor.
dlc = DeepLC(
        path_model=[
                base_model +"/full_hc_train_1fd8363d9af9dcad3be7553c39396960.hdf5",
                base_model +"/full_hc_train_8c22d89667368f2f02ad996469ba157e.hdf5",
                base_model +"/full_hc_train_cb975cfdd4105f97efa0b3afffe075cc.hdf5"
        ],
        batch_num=1024000,
        pygam_calibration=True,
        f_extractor = features
)

# model_Results concatenates the last two arguments without a separator, so
# this writes "AllDeepLC (base model).csv" and "AllDeepLC (base model).pdf".
# It also adds a 'predicted_rt' column to Test_All in place.
model_Results(dlc, Train_All, Test_All, "All", "DeepLC (base model)")

In [None]:
# Predict the test dataset with the generic models.
# Relies on Train_All and Test_All having been loaded by the base-model cell.
# NOTE(review): the file names below contain "Train_RNA_All", so these are
# presumably the models produced by the retraining cell. That cell's retrain()
# call names no output directory, so how they reach this folder is not visible
# here - TODO confirm.
generic_model = "/home/ubuntu/NuXL_rescore/RT_deeplc_model/generic_model"

# NOTE(review): same private TensorFlow eager-context sequence as in the
# earlier cells (create a variable, drop the context, create a new one, then
# set the inter-op thread count). The statement order matters.
_ = tf.Variable([1])
context._context = None
context._create_context()
tf.config.threading.set_inter_op_parallelism_threads(1) 

# Rebinds `dlc`; the predictor created for the base models is replaced.
dlc = DeepLC(
        path_model=[
                generic_model + "/full_hc_Train_RNA_All_1fd8363d9af9dcad3be7553c39396960.hdf5",
                generic_model + "/full_hc_Train_RNA_All_8c22d89667368f2f02ad996469ba157e.hdf5",
                generic_model + "/full_hc_Train_RNA_All_cb975cfdd4105f97efa0b3afffe075cc.hdf5"
        ],
        batch_num=1024000,
        pygam_calibration=True,
        f_extractor = features
)

# Writes "AllDeepLC (generic model).csv" / ".pdf" and overwrites the
# 'predicted_rt' column that model_Results stores in Test_All.
model_Results(dlc, Train_All, Test_All, "All", "DeepLC (generic model)")
