# 1. Environment and data preparation

## 1.1. Imports

In [None]:
import pandas as pd
import os
import re
from tqdm import tqdm
import importlib
from matplotlib.pyplot import figure
from ZHMolGraph.import_modules import *
from ZHMolGraph import ZHMolGraph
import pickle

In [None]:
# Re-execute the already-imported ZHMolGraph module (e.g. to pick up edits
# made to its source since the import above).
importlib.reload(ZHMolGraph)  

## 1.2. GPU setting

In [None]:
# Show the current GPU status as a list of text lines.
# The raw bytes are decoded (instead of taking the `b'...'` repr via str())
# and split on real newlines, so the list holds clean lines without the
# leading "b'" / trailing "'" artifacts. No shell is needed to run one binary.
subprocess.check_output(['nvidia-smi']).decode('utf-8', errors='replace').splitlines()

In [None]:
# Restrict CUDA to device index 0, then print the GPUs TensorFlow reports.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(tensorflow.config.list_physical_devices('GPU'))

## 1.3. Define the vecnn object

In [None]:
# Dataset identifier; it is interpolated into the embedding / interaction
# file paths and the model output directory used in the cells below.
Dataset="RPI7317"

In [None]:
# Load the pickled RNA and protein embedding tables for the selected dataset;
# both are later handed to the ZHMolGraph object.
rna_embed_path = f'data/Mol2Vec/RPI_{Dataset}_rnafm_embed_normal.pkl'
protein_embed_path = f'data/Mol2Vec/RPI_{Dataset}_proteinprottrans_embed_normal.pkl'

with open(rna_embed_path, 'rb') as rna_file:
    rnas = pkl.load(rna_file)

with open(protein_embed_path, 'rb') as protein_file:
    proteins = pkl.load(protein_file)

# Quick look at what was loaded: contents / size of the RNA table, then
# contents / type of the protein table (one item per line).
print(rnas, len(rnas), proteins, type(proteins), sep='\n')


In [None]:
# Build the ZHMolGraph object for the selected dataset.
interactions_csv = f'data/interactions/dataset_RPI_{Dataset}_RP.csv'
model_dir = f'trained_model/ZHMolGraph_VecNN_model_RPI_{Dataset}/'

vecnn_object = ZHMolGraph.ZHMolGraph(
    # Interactions: only a file location is given, no in-memory table.
    interactions_location=interactions_csv,
    interactions=None,
    interaction_y_name='Y',
    # No separate set of absolute negatives is supplied.
    absolute_negatives_location=None,
    absolute_negatives=None,
    # RNAs: passed as the already-loaded table rather than a location.
    rnas_location=None,
    rnas_dataframe=rnas,
    rna_seq_name='RNA_aa_code',
    # Proteins: passed as the already-loaded table rather than a location.
    proteins_location=None,
    proteins_dataframe=proteins,
    protein_seq_name='target_aa_code',
    # Directory used for this dataset's model files.
    model_out_dir=model_dir,
    debug=False,
)

In [None]:
# Make sure the model output directory exists (no error if it already does).
os.makedirs(vecnn_object.model_out_dir, exist_ok=True)

In [None]:
# Attach the loaded RNA / protein tables directly to the object.
# NOTE(review): the same tables were already passed to the constructor as
# rnas_dataframe / proteins_dataframe — confirm this reassignment is needed.
vecnn_object.rnas = rnas
vecnn_object.proteins = proteins

In [None]:
# pd.concat over a single-element list: yields a new object holding the same
# rows as vecnn_object.interactions.
all_samples = pd.concat([vecnn_object.interactions])

In [None]:
# Keep only the rows whose label equals 0 (the negative samples).
# The label column name is read from the object instead of being hard-coded,
# so it stays in sync with the `interaction_y_name` given at construction
# (and with the column selection that builds `vecnn_object.negatives`).
Negative_samples = all_samples[all_samples[vecnn_object.interaction_y_name] == 0]

In [None]:
# Store the negative samples on the object, reduced to the RNA sequence,
# protein sequence and label columns, with the row index renumbered from 0.
negative_columns = [
    vecnn_object.rna_seq_name,
    vecnn_object.protein_seq_name,
    vecnn_object.interaction_y_name,
]
selected_negatives = pd.concat(
    [Negative_samples[column] for column in negative_columns],
    axis=1,
)
vecnn_object.negatives = selected_negatives.reset_index(drop=True)

## 1.4. Load the train and test datasets

In [None]:
# Load the saved train / test sets from the model output directory.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load files that this project produced.
train_sets_file = f'{vecnn_object.model_out_dir}/train_sets.pkl'
test_sets_file = f'{vecnn_object.model_out_dir}/test_sets.pkl'

with open(train_sets_file, 'rb') as train_handle:
    vecnn_object.train_sets = pickle.load(train_handle)

with open(test_sets_file, 'rb') as test_handle:
    vecnn_object.test_sets = pickle.load(test_handle)

## 1.5. Load the RNA and protein embeddings

In [None]:
# Read the pickled RNA and protein embedding tables for the selected dataset
# and attach them to the ZHMolGraph object.
rna_embed_path = f'data/Mol2Vec/RPI_{Dataset}_rnafm_embed_normal.pkl'
protein_embed_path = f'data/Mol2Vec/RPI_{Dataset}_proteinprottrans_embed_normal.pkl'

with open(rna_embed_path, 'rb') as rna_file:
    rnas = pkl.load(rna_file)

with open(protein_embed_path, 'rb') as protein_file:
    proteins = pkl.load(protein_file)

vecnn_object.rnas, vecnn_object.proteins = rnas, proteins

In [None]:
# Embedding widths: used as the column counts of the embedding arrays built
# below and passed to the benchmark call as the embedding lengths.
# NOTE(review): presumably the output sizes of the RNA-FM / ProtTrans models
# named in the pickle paths — confirm against the stored vectors.
rna_vector_length = 640
protein_vector_length = 1024

In [None]:
# Pack the per-RNA embedding vectors into one dense array with one row per
# RNA and rna_vector_length columns, then store it on the object.
rna_embeddings = rnas['normalized_embeddings']
rna_array = np.zeros((len(rna_embeddings), rna_vector_length))
# Fill the array row by row (with a progress bar); each stored vector is
# written into the matching row.
for row_idx, rna_vector in enumerate(tqdm(rna_embeddings)):
    rna_array[row_idx, :] = rna_vector

vecnn_object.rna_embeddings = rna_array

In [None]:

# Pack the per-protein embedding vectors into one dense array with one row
# per protein and protein_vector_length columns, then store it on the object.
protein_embeddings = proteins['normalized_embeddings']
protein_array = np.zeros((len(protein_embeddings), protein_vector_length))
# Fill the array row by row (with a progress bar); each stored vector is
# written into the matching row.
for row_idx, protein_vector in enumerate(tqdm(protein_embeddings)):
    protein_array[row_idx, :] = protein_vector

vecnn_object.protein_embeddings = protein_array


# 2. Get the benchmark validation results of ZHMolGraph

In [None]:
# Run the benchmark validation for the selected dataset, passing the
# embedding widths defined earlier and the embedding type label.
embedding_type = 'Pretrain'
vecnn_object.get_benchmark_validation_ZHMolGraph_results(
    rna_embedding_length=rna_vector_length,
    protein_embedding_length=protein_vector_length,
    dataset=Dataset,
    embedding_type=embedding_type,
)