In [1]:
import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
from torch_geometric.loader import DataLoader
from tqdm import tqdm
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import MessagePassing, global_mean_pool, global_max_pool
from sklearn.metrics import average_precision_score, roc_auc_score
from absl import app, flags
import time
import datetime
from torch_scatter import scatter
from multiprocessing import Pool
from featurizing_data import *
from train_gnn import *

In [9]:

# --- feature dimensions ---
# Node / edge feature widths are derived from the packed widths below.
# NOTE(review): the "*8" factor and the "-4" / "-2" offsets presumably mirror the
# bit-packing done in featurizing_data — confirm against that module.
PACK_NODE_DIM=5
PACK_EDGE_DIM=1
NODE_DIM=PACK_NODE_DIM*8-4
EDGE_DIM=PACK_EDGE_DIM*8-2

# ---hyper-parameters (must match the values the checkpoint was trained with,
# otherwise load_state_dict below fails on mismatched keys/shapes)
input_dim = NODE_DIM
edge_dim = EDGE_DIM

emb_dim = 96
num_layers = 4
dropout_rate = 0.3
out_channels = 3

# Prefer the first GPU, but fall back to CPU so the notebook still runs on a
# machine without CUDA instead of failing at `.to(device)`.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# define model
model = GNNModel(in_dim=input_dim, edge_dim=edge_dim, emb_dim=emb_dim, num_layers=num_layers,
                    out_channels = out_channels,dropout=dropout_rate).to(device)

# Directory holding the checkpoints and receiving the prediction CSVs.
# (Earlier runs used 'results/gnn_valid_byte_10m' and 'results/75M_randomslpit'.)
results_dir = 'results/75M_randomslpit_80epoch/'

# Load the epoch-41 checkpoint; swap the file name for 'best_val.pth' to use the
# best-validation weights instead. map_location keeps the load working when the
# checkpoint was saved from a device that is not available here.
model.load_state_dict(torch.load(os.path.join(results_dir,'Epoch_41.pth'), map_location=device))



<All keys matched successfully>

In [4]:
# Compact dtypes requested from the CSV parser: int16 for the three building-block
# id columns, single bytes for the per-protein binding flags.
# NOTE(review): the recorded preview of this file shows is_BRD4 / is_HSA / is_sEH
# columns rather than binds_*, so the binds_* entries look unused here — confirm.
dtypes = {
    **dict.fromkeys(
        ['buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles'],
        np.int16,
    ),
    **dict.fromkeys(['binds_BRD4', 'binds_HSA', 'binds_sEH'], np.byte),
}

# Read the shrunken test table, report its row count, then preview the first rows.
test_df = pd.read_csv('../shrunken_data/test.csv', dtype=dtypes)
print(len(test_df))
test_df.head()

878022


Unnamed: 0,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,is_BRD4,is_HSA,is_sEH
0,0,17,17,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,True,True,True
1,0,17,87,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,True,True,True
2,0,17,99,C#CCCC[C@H](Nc1nc(NCC2(O)CCCC2(C)C)nc(Nc2ccc(C...,True,True,True
3,0,17,244,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2sc(Cl)c...,True,True,True
4,0,17,394,C#CCCC[C@H](Nc1nc(NCC2CCC(SC)CC2)nc(Nc2ccc(C=C...,True,True,True


In [10]:
# SMILES strings of every test molecule, in test_df row order.
smiles_list_test = test_df['molecule_smiles'].tolist()

# Featurize the test molecules unless a previous run of this cell already left
# `test_data` in the kernel. Without this guard the cell raised NameError on a
# fresh kernel, because the featurization step had been commented out.
# NOTE(review): smile_to_graph is expected to come from the featurizing_data
# star-import — confirm.
if 'test_data' not in globals():
    print('----Featurizing testing data -----')
    # Cap the worker count at the machine's CPU count; imap keeps input order,
    # so test_data[i] corresponds to smiles_list_test[i].
    with Pool(processes=min(64, os.cpu_count() or 1)) as pool:
        test_data = list(pool.imap(smile_to_graph, smiles_list_test, chunksize=1000))

# Predict
test_idx = np.array(test_df.index)
predictions = predict_with_model(model, test_data,device,test_idx)

# save predictions
# The original (un-shrunken) test CSV is passed so the helper can attach ids.
test_csv_for_ids_path = "../data/test.csv"
y_pred_and_ids_df = select_and_save_predictions_with_ids(predictions,test_df,test_csv_for_ids_path)

# File name records which checkpoint produced these predictions.
output_path = os.path.join(results_dir,'ids_pred_results_epoch41.csv')
y_pred_and_ids_df.to_csv(output_path,index=False)

----Featurizing testing data -----


In [15]:
# Weighted ensemble: combine the 'binds' column of several prediction CSVs
# (all located in results_dir) into one weighted sum and save it with the test ids.

# List of CSV files containing predictions
csv_files = ['ids_pred_results_best_model.csv', 'ids_pred_results_epoch41.csv', 'ids_pred_results_epoch52.csv']

# Corresponding weights for each model's predictions
weights = [0.6, 0.3, 0.1]

# zip() would silently drop the unmatched tail, so fail loudly instead.
if len(csv_files) != len(weights):
    raise ValueError(f"{len(csv_files)} prediction files but {len(weights)} weights")

weighted_predictions = None

# Loop through each file and weight
for file, weight in zip(csv_files, weights):
    # Read predictions from the current file
    in_path = os.path.join(results_dir,file)
    df = pd.read_csv(in_path)

    # Ensure the 'binds' column exists
    if 'binds' not in df.columns:
        raise ValueError(f"'binds' column not found in {file}")

    # Multiply the predictions by the corresponding weight
    weighted_pred = df['binds'] * weight

    if weighted_predictions is None:
        # First model: initialize the running sum
        weighted_predictions = weighted_pred
    else:
        # Series addition aligns on the index; a row-count mismatch would
        # silently yield NaN for the unmatched rows.
        if len(weighted_pred) != len(weighted_predictions):
            raise ValueError(
                f"{file} has {len(weighted_pred)} rows, expected {len(weighted_predictions)}"
            )
        # Add the weighted predictions to the ensemble
        weighted_predictions += weighted_pred

# Ids come from the original test CSV.
test = pd.read_csv('../data/test.csv',index_col=False)
test_ids = pd.DataFrame(test.id)

# concat(axis=1) also aligns on the index, so guard against NaN-padded rows.
if len(test_ids) != len(weighted_predictions):
    raise ValueError(
        f"test.csv has {len(test_ids)} ids but the ensemble has {len(weighted_predictions)} predictions"
    )

# Create a DataFrame for the ensemble predictions
ensemble_df = pd.concat([test_ids,weighted_predictions],axis=1)

# Save the ensemble predictions to a new CSV file
output_path = os.path.join(results_dir,'ensemble_predictions.csv')
ensemble_df.to_csv(output_path, index=False)



In [16]:
# Show the ensemble predictions (id + weighted 'binds') for a visual sanity check.
ensemble_df

Unnamed: 0,id,binds
0,295246830,1.658991e-06
1,295246831,2.562397e-05
2,295246832,2.294762e-07
3,295246833,3.253262e-05
4,295246834,5.909419e-04
...,...,...
1674891,296921721,1.942364e-04
1674892,296921722,1.922892e-03
1674893,296921723,6.047438e-05
1674894,296921724,5.464500e-04


In [14]:
# Inspect `df`, the per-model prediction frame assigned inside the ensembling loop.
# NOTE(review): this cell's execution count (14) is lower than the ensembling
# cell's (15), so the output shown came from an earlier run — re-run top to bottom to confirm.
df

Unnamed: 0,id,binds
0,295246830,0.000009
1,295246831,0.000142
2,295246832,0.000002
3,295246833,0.000128
4,295246834,0.001050
...,...,...
1674891,296921721,0.000190
1674892,296921722,0.000487
1674893,296921723,0.000215
1674894,296921724,0.000435
