In [1]:
import tensorflow as tf
from deeplcretrainer import deeplcretrainer
import pandas as pd
from tensorflow.python.eager import context
from deeplc import DeepLC
import statsmodels.api as sm

import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
import numpy as np

from deeplc.feat_extractor import FeatExtractor
import os


2023-08-11 20:28:34.287048: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-11 20:28:38.811050: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-11 20:28:38.817083: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

In [2]:
features = FeatExtractor("/home/ubuntu/NuXL_rescore/unimod/unimod_to_formula.csv")
df_train_file = "/home/ubuntu/NuXL_rescore/Deeplc_retrain/Train_RNA_UV.csv"
base_model = "/home/ubuntu/Rescoring/RT_deeplc_model/base_model/"


In [None]:
#Training 

_ = tf.Variable([1])
context._context = None
context._create_context()
tf.config.threading.set_inter_op_parallelism_threads(1) 

models = deeplcretrainer.retrain(
    [df_train_file],
    mods_transfer_learning=[
        base_model +"/full_hc_train_1fd8363d9af9dcad3be7553c39396960.hdf5",
        base_model +"/full_hc_train_8c22d89667368f2f02ad996469ba157e.hdf5",
        base_model +"/full_hc_train_cb975cfdd4105f97efa0b3afffe075cc.hdf5"
    ],
    freeze_layers=True,
    n_epochs=40,
    costum_modification_file = "/home/ubuntu/NuXL_rescore/unimod/unimod_to_formula.csv",
    freeze_after_concat=0,
    plot_results= True,
    write_csv_results = True,
    regularizer_val=[0.000025]
)



In [3]:
def evaluate_linear_regression_plot_(df:pd.DataFrame, x="tr", y="predicted_rt", name="evaluate_regression", model_name = "base model"):
    ci=95
    n_sample=10000000
    if len(df) > n_sample:
        df = df.sample(n_sample, replace=False)
    gls = sm.GLS(df[y], sm.add_constant(df[x]))
    res = gls.fit()
    summary = res.summary(alpha=1-ci/100.0)
    dfs = []
    results_as_html = summary.tables[0].as_html()
    dfs.append(pd.read_html(results_as_html, index_col=None)[0])
    results_as_html = summary.tables[1].as_html()
    dfs.append(pd.read_html(results_as_html, index_col=None)[0])
    summary = pd.concat(dfs, ignore_index=True)
    R_square = float(summary.loc[0,3])
    R = np.sqrt(R_square)
    n,b,w = summary.loc[[5,10,11],1].values.astype(float)
    
    from scipy.stats import pearsonr
    X_ = pearsonr(np.array(df[x]), np.array(df[y]))
    print("X_------", X_)

    MAE = mean_absolute_error (df[x], df[y])
    perc95_calib = np.percentile(abs(df[x]-df[y]),95)*2
    
    plt.figure(figsize=(10.0,8.5))
    plt.title(name + model_name +  f"\n R: {round(R,3)} - MAE: {round(MAE,2)} - 95th percentile: {round(perc95_calib,2)}", fontsize=16)#  R_Square: {round(R_square,3)}") # \n slope: {round(w,3)} intercept: {round(b,3)} samples: {n}")
    plt.scatter(df[y],df[x],s=1,alpha=0.6, color="tab:blue")
    plt.ylabel("Observed retention time", fontsize=16)
    plt.xlabel("Predicted retention time", fontsize=16) 
    plt.ylim([0, 10000])
    plt.xlim([0, 10000])
    plt.savefig(name+ model_name +".pdf")

    return pd.DataFrame(
        dict(
            MAE=[MAE],perc95_calib=[perc95_calib], R_square=[R_square],R=[R],
            slope=[w],intercept=[b],test_num=[n]
        )
    )
    

In [4]:
def model_Results (model, cal_df, data, name, model_name):
    model.calibrate_preds(seq_df=cal_df)
    preds_new = model.make_preds(seq_df=data, calibrate=True)
    data['predicted_rt'] = preds_new
    n = name
    result_df = evaluate_linear_regression_plot_(data,"tr", "predicted_rt", n, model_name)
    print(result_df)
    data.to_csv(name + model_name+'.csv')
    print("File saved at: ", name + model_name+'.csv')


In [None]:
#Taking predictions of test dataset from base model
Train_UV= pd.read_csv("/home/ubuntu/NuXL_rescore/Deeplc_retrain/Train_RNA_UV.csv")
Test_UV = pd.read_csv("/home/ubuntu/NuXL_rescore/Deeplc_retrain/Test_RNA_UV.csv") 

_ = tf.Variable([1])
context._context = None
context._create_context()
tf.config.threading.set_inter_op_parallelism_threads(1) 

dlc = DeepLC(
        path_model=[
                base_model +"/full_hc_train_1fd8363d9af9dcad3be7553c39396960.hdf5",
                base_model +"/full_hc_train_8c22d89667368f2f02ad996469ba157e.hdf5",
                base_model +"/full_hc_train_cb975cfdd4105f97efa0b3afffe075cc.hdf5"
        ],
        batch_num=1024000,
        pygam_calibration=True,
        f_extractor = features
)

model_Results(dlc, Train_UV, Test_UV, "UV ", "DeepLC (base model)")

In [None]:
#Taking predictions of test dataset from generic model
generic_model = "/home/ubuntu/NuXL_rescore/RT_deeplc_model/generic_model"

_ = tf.Variable([1])
context._context = None
context._create_context()
tf.config.threading.set_inter_op_parallelism_threads(1) 

dlc = DeepLC(
        path_model=[
                generic_model + "/full_hc_Train_RNA_All_1fd8363d9af9dcad3be7553c39396960.hdf5",
                generic_model + "/full_hc_Train_RNA_All_8c22d89667368f2f02ad996469ba157e.hdf5",
                generic_model + "/full_hc_Train_RNA_All_cb975cfdd4105f97efa0b3afffe075cc.hdf5"
        ],
        batch_num=1024000,
        pygam_calibration=True,
        f_extractor = features
)

model_Results(dlc, Train_UV, Test_UV, "UV ", "DeepLC (generic model)")

In [None]:
#Training_Graph
specific_model = "/home/ubuntu/NuXL_rescore/RT_deeplc_model/specific_model"

_ = tf.Variable([1])
context._context = None
context._create_context()
tf.config.threading.set_inter_op_parallelism_threads(1) 

dlc = DeepLC(
        path_model=[
                specific_model + "/full_hc_Train_RNA_UV_1fd8363d9af9dcad3be7553c39396960.hdf5",
                specific_model + "/full_hc_Train_RNA_UV_8c22d89667368f2f02ad996469ba157e.hdf5",
                specific_model + "/full_hc_Train_RNA_UV_cb975cfdd4105f97efa0b3afffe075cc.hdf5"
        ],
        batch_num=1024000,
        pygam_calibration=True,
        f_extractor = features
)

model_Results(dlc, Train_UV, Test_UV, "UV ", "DeepLC (specific model)") 



In [5]:
#Taking predictions from specific model on all test data
specific_model = "/home/ubuntu/NuXL_rescore/RT_deeplc_model/specific_model"
Train_UV= pd.read_csv("/home/ubuntu/NuXL_rescore/Deeplc_retrain/Train_RNA_UV.csv")
Test_All = pd.read_csv("/home/ubuntu/NuXL_rescore/Deeplc_retrain/Test_RNA_All.csv")

_ = tf.Variable([1]) 
context._context = None
context._create_context()
tf.config.threading.set_inter_op_parallelism_threads(1) 

dlc = DeepLC(
        path_model=[ 
                specific_model + "/full_hc_Train_RNA_UV_1fd8363d9af9dcad3be7553c39396960.hdf5",
                specific_model + "/full_hc_Train_RNA_UV_8c22d89667368f2f02ad996469ba157e.hdf5",
                specific_model + "/full_hc_Train_RNA_UV_cb975cfdd4105f97efa0b3afffe075cc.hdf5"
        ],
        batch_num=1024000,
        pygam_calibration=True,
        f_extractor = features
)

model_Results(dlc, Train_UV, Test_All, "UV ", "DeepLC (specific model) All Test data")

2023-08-11 20:29:25.225690: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-11 20:29:25.227190: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-08-11 20:29:25.734426: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/t



This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.








This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.








This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.










X_------ PearsonRResult(statistic=0.6508709990903075, pvalue=0.0)
           MAE  perc95_calib  R_square         R   slope  intercept  test_num
0  1591.304037   6584.930202     0.424  0.651153  0.5762   2069.556    4367.0
File saved at:  UV DeepLC (specific model) All Test data.csv
