# 1. Environment and data preparation

## 1.1. Imports

In [1]:
import pandas as pd
import os
import re
from tqdm import tqdm
import importlib
from matplotlib.pyplot import figure
from ZHMolGraph.import_modules import *
from ZHMolGraph import ZHMolGraph
import pickle

2024-07-18 10:53:20.906750: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-18 10:53:20.931681: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Re-import the ZHMolGraph module so that edits made to its source file since the
# first import are picked up without restarting the kernel.
importlib.reload(ZHMolGraph)  

<module 'ZHMolGraph.ZHMolGraph' from '/home/quan/RNA_Protein_Network/MY-ZHMolRPGraph/Github_package/code/ZHMolGraph/ZHMolGraph.py'>

## 1.2. GPU settings

In [3]:
# Show the current GPU status as a list with one element per line of `nvidia-smi` output.
# The command is run directly (no shell needed) and its bytes are decoded before
# splitting, so the result carries no "b'...'" repr prefix or escaped newlines.
subprocess.check_output(['nvidia-smi']).decode().splitlines()

["b'Thu Jul 18 10:53:22 2024       ",
 '+---------------------------------------------------------------------------------------+',
 '| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |',
 '|-----------------------------------------+----------------------+----------------------+',
 '| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |',
 '| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |',
 '|                                         |                      |               MIG M. |',
 '|   0  NVIDIA GeForce RTX 3090        Off | 00000000:01:00.0 Off |                  N/A |',
 '| 31%   53C    P8              24W / 350W |  12857MiB / 24576MiB |      0%      Default |',
 '|                                         |                      |                  N/A |',
 '+-----------------------------------------+----------------------+----------------------+',
 '                    

In [4]:
# Expose only GPU 0 to CUDA-based libraries running in this process.
# NOTE(review): TensorFlow has already been imported at this point (its log lines
# appear under the import cell) -- confirm the variable still takes effect here.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Print the GPUs that TensorFlow can see.
print(tensorflow.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2024-07-18 10:53:22.261703: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-18 10:53:22.281236: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-18 10:53:22.281411: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

## 1.3. Define the VecNN model object

In [5]:
# Name of the RPI dataset; it is interpolated into the data and model paths used
# below and passed to the benchmark call at the end of the notebook.
Dataset="NPInter2"

In [6]:
# Load the RNA and protein embedding tables of the selected dataset; they are
# handed to the ZHMolGraph constructor in the next cell.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
rna_pickle_path = f'data/Mol2Vec/RPI_{Dataset}_rnafm_embed_normal.pkl'
protein_pickle_path = f'data/Mol2Vec/RPI_{Dataset}_proteinprottrans_embed_normal.pkl'

with open(rna_pickle_path, 'rb') as rna_handle:
    rnas = pkl.load(rna_handle)

with open(protein_pickle_path, 'rb') as protein_handle:
    proteins = pkl.load(protein_handle)


In [7]:
# Build the ZHMolGraph object for the selected dataset.
vecnn_object = ZHMolGraph.ZHMolGraph(
    # Interactions are given as a file location; 'Y' names the label column.
    interactions_location=f'data/interactions/dataset_RPI_{Dataset}_RP.csv',
    interactions=None,
    interaction_y_name='Y',
    # No separate absolute-negative set is supplied.
    absolute_negatives_location=None,
    absolute_negatives=None,
    # RNA table: passed as the already-loaded object instead of a file location.
    rnas_location=None,
    rnas_dataframe=rnas,
    rna_seq_name='RNA_aa_code',
    # Protein table: passed as the already-loaded object instead of a file location.
    proteins_location=None,
    proteins_dataframe=proteins,
    protein_seq_name='target_aa_code',
    # Directory used for the model artefacts of this dataset.
    model_out_dir=f'trained_model/ZHMolGraph_VecNN_model_RPI_{Dataset}/',
    debug=False,
)

In [8]:
# Make sure the model output directory exists (no error if it is already there).
os.makedirs(vecnn_object.model_out_dir, exist_ok=True)

In [9]:
# Attach the loaded RNA and protein tables to the object directly.
# NOTE(review): the same objects were already passed to the constructor as
# rnas_dataframe / proteins_dataframe; presumably this overrides whatever the
# constructor stored -- confirm it is still needed.
vecnn_object.rnas = rnas
vecnn_object.proteins = proteins

In [10]:
# Collect the interaction samples held by the object.
# The list given to pd.concat has a single element, so no other frame is merged in here.
all_samples = pd.concat([vecnn_object.interactions])

In [11]:
# Keep only the rows whose label equals 0 (the negative samples).
# The label column is looked up through vecnn_object.interaction_y_name (set to 'Y'
# when the object was created) instead of being hard-coded, so this filter stays
# consistent with the column selection that builds vecnn_object.negatives.
Negative_samples = all_samples[all_samples[vecnn_object.interaction_y_name] == 0]

In [12]:
# Store the negative samples on the object, restricted to the RNA-sequence,
# protein-sequence and label columns, with a fresh 0..n-1 index.
negative_columns = [
    vecnn_object.rna_seq_name,
    vecnn_object.protein_seq_name,
    vecnn_object.interaction_y_name,
]
vecnn_object.negatives = Negative_samples[negative_columns].reset_index(drop=True)


## 1.4. Load the train and test sets

In [13]:
# Load the previously saved train / test sets from the model output directory.
# os.path.join replaces the string concatenation because model_out_dir was
# configured with a trailing '/', which made the old paths contain '//'.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
train_sets_file = os.path.join(vecnn_object.model_out_dir, 'train_sets.pkl')
with open(train_sets_file, 'rb') as f:
    vecnn_object.train_sets = pickle.load(f)

test_sets_file = os.path.join(vecnn_object.model_out_dir, 'test_sets.pkl')
with open(test_sets_file, 'rb') as f:
    vecnn_object.test_sets = pickle.load(f)

## 1.5. Load the precomputed RNA and protein embeddings

In [14]:
# Read the RNA and protein embedding tables from disk again (the same two pickle
# files that were loaded before the object was created) and attach the freshly
# loaded tables to the object.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
rna_pickle_path = f'data/Mol2Vec/RPI_{Dataset}_rnafm_embed_normal.pkl'
protein_pickle_path = f'data/Mol2Vec/RPI_{Dataset}_proteinprottrans_embed_normal.pkl'

with open(rna_pickle_path, 'rb') as rna_handle:
    rnas = pkl.load(rna_handle)

with open(protein_pickle_path, 'rb') as protein_handle:
    proteins = pkl.load(protein_handle)

vecnn_object.rnas = rnas
vecnn_object.proteins = proteins

In [15]:
# Number of values in one embedding vector per RNA / per protein; used to size the
# arrays built below and passed to the benchmark call.
# NOTE(review): presumably the RNA-FM (640) and ProtTrans (1024) output sizes,
# judging by the pickle file names -- confirm.
rna_vector_length = 640
protein_vector_length = 1024

In [16]:
# Stack the per-RNA embedding vectors into one dense array with one row per RNA
# and rna_vector_length columns, then hand it to the object.
rna_embeddings = rnas['normalized_embeddings']
n_rnas = len(rna_embeddings)
rna_array = np.zeros((n_rnas, rna_vector_length))

# Copy the vectors in one row at a time, with a progress bar.
for row in tqdm(range(n_rnas)):
    rna_array[row, :] = rna_embeddings.iloc[row]

vecnn_object.rna_embeddings = rna_array

  0%|          | 0/4580 [00:00<?, ?it/s]

In [17]:

# Stack the per-protein embedding vectors into one dense array with one row per
# protein and protein_vector_length columns, then hand it to the object.
protein_embeddings = proteins['normalized_embeddings']
n_proteins = len(protein_embeddings)
protein_array = np.zeros((n_proteins, protein_vector_length))

# Copy the vectors in one row at a time, with a progress bar.
for row in tqdm(range(n_proteins)):
    protein_array[row, :] = protein_embeddings.iloc[row]

vecnn_object.protein_embeddings = protein_array


  0%|          | 0/439 [00:00<?, ?it/s]

# 2. Get the benchmark validation results of ZHMolGraph

In [18]:
# Run the benchmark validation for the selected dataset, using the embedding
# vector lengths defined above.
# NOTE(review): 'Pretrain' presumably selects the precomputed embeddings loaded
# earlier -- confirm against get_benchmark_validation_ZHMolGraph_results.
embedding_type = 'Pretrain'
vecnn_object.get_benchmark_validation_ZHMolGraph_results(rna_embedding_length = rna_vector_length, 
                                                  protein_embedding_length = protein_vector_length, 
                                                  dataset = Dataset, 
                                                  embedding_type = embedding_type)

  0%|                                                   | 0/5 [00:00<?, ?it/s]

————————————————————————————————————————
Run_0
————————————————————————————————————————
using device 0 NVIDIA GeForce RTX 3090
DEVICE: cuda
(4580, 100)
(4580, 640)


 20%|████████▌                                  | 1/5 [00:05<00:21,  5.39s/it]

torch.Size([1666, 4488])
torch.Size([1666, 2952])
torch.Size([4164, 4488])
torch.Size([4164, 2952])
——————————————————————————————————————————————————
Performance of Run_0
——————————————————————————————————————————————————
accuracy : 0.957
sensitivity : 0.973
specificity : 0.941
precision : 0.945
mcc : 0.915
tn : 1926
fp : 121
fn : 57
tp : 2060

————————————————————————————————————————
Run_1
————————————————————————————————————————
using device 0 NVIDIA GeForce RTX 3090
DEVICE: cuda
(4580, 100)
(4580, 640)


 40%|█████████████████▏                         | 2/5 [00:08<00:12,  4.07s/it]

torch.Size([1666, 4488])
torch.Size([1666, 2952])
torch.Size([4164, 4488])
torch.Size([4164, 2952])
——————————————————————————————————————————————————
Performance of Run_1
——————————————————————————————————————————————————
accuracy : 0.957
sensitivity : 0.98
specificity : 0.934
precision : 0.934
mcc : 0.915
tn : 1990
fp : 140
fn : 40
tp : 1994

————————————————————————————————————————
Run_2
————————————————————————————————————————
using device 0 NVIDIA GeForce RTX 3090
DEVICE: cuda
(4580, 100)
(4580, 640)


 60%|█████████████████████████▊                 | 3/5 [00:11<00:07,  3.63s/it]

torch.Size([1666, 4488])
torch.Size([1666, 2952])
torch.Size([4164, 4488])
torch.Size([4164, 2952])
——————————————————————————————————————————————————
Performance of Run_2
——————————————————————————————————————————————————
accuracy : 0.955
sensitivity : 0.976
specificity : 0.933
precision : 0.938
mcc : 0.91
tn : 1918
fp : 137
fn : 51
tp : 2058

————————————————————————————————————————
Run_3
————————————————————————————————————————
using device 0 NVIDIA GeForce RTX 3090
DEVICE: cuda
(4580, 100)
(4580, 640)


 80%|██████████████████████████████████▍        | 4/5 [00:14<00:03,  3.42s/it]

torch.Size([1666, 4488])
torch.Size([1666, 2952])
torch.Size([4164, 4488])
torch.Size([4164, 2952])
——————————————————————————————————————————————————
Performance of Run_3
——————————————————————————————————————————————————
accuracy : 0.95
sensitivity : 0.965
specificity : 0.935
precision : 0.937
mcc : 0.901
tn : 1953
fp : 135
fn : 72
tp : 2004

————————————————————————————————————————
Run_4
————————————————————————————————————————
using device 0 NVIDIA GeForce RTX 3090
DEVICE: cuda
(4580, 100)
(4580, 640)


100%|███████████████████████████████████████████| 5/5 [00:17<00:00,  3.57s/it]

torch.Size([1666, 4488])
torch.Size([1666, 2952])
torch.Size([4168, 4488])
torch.Size([4168, 2952])
——————————————————————————————————————————————————
Performance of Run_4
——————————————————————————————————————————————————
accuracy : 0.956
sensitivity : 0.98
specificity : 0.933
precision : 0.936
mcc : 0.914
tn : 1952
fp : 140
fn : 42
tp : 2034

——————————————————————————————————————————————————
Validation Performance of Dataset NPInter2: 
——————————————————————————————————————————————————
Accuracy :  0.955 +/- 0.002536484778969642
Sensitivity :  0.975 +/- 0.005462948928435651
Specificity :  0.935 +/- 0.002865821174204263
Precision :  0.938 +/- 0.0035346844072795787
MCC :  0.911 +/- 0.005213818486901342
写入result文件夹



