In [1]:
import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
from torch_geometric.loader import DataLoader
from tqdm import tqdm
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import MessagePassing, global_mean_pool, global_max_pool
from sklearn.metrics import average_precision_score, roc_auc_score
from absl import app, flags
import time
import datetime
from torch_scatter import scatter
from multiprocessing import Pool
from featurizing_data import *


In [3]:
# --- Run configuration ------------------------------------------------------
# Architecture used for single-model inference: 'GNN' or '1DCNN'.
MODEL_TYPE = '1DCNN'
# Use the first GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Original test file; later cells read its `id` / `protein_name` columns.
test_csv_for_ids_path = "../data/test.csv"
# False: predict with the single model selected by MODEL_TYPE.
# True: blend previously saved prediction CSVs instead of running a model.
ENSEMBLE = False
if ENSEMBLE:
    # One prediction CSV per ensemble member; each must contain a 'binds' column.
    csv_files = ['results/75M_randomslpit_80epoch/ids_pred_results.csv',
                 'results/1dcnn/ids_pred_results_best.csv',
                 'results/1dcnn_selfies/ids_pred_results_Epoch31.csv']
    # One raw weight per entry of csv_files (same order), normalised to sum to 1.
    # NOTE(review): the raw values are presumably per-model validation scores — confirm.
    weights = np.array([0.6323, 0.61397, 0.6352])
    weights = weights / np.sum(weights)

In [42]:
# Scratch check: normalise the three raw ensemble weights so they sum to 1,
# then display the result.
w = np.array([0.6323, 0.61397, 0.6352])
w /= w.sum()
w

array([0.33606701, 0.32632463, 0.33760836])

In [4]:

# Build the network selected by MODEL_TYPE, then restore its trained weights.
if MODEL_TYPE == 'GNN':
    # NOTE(review): star import — GNNModel comes from here, and later cells may
    # rely on other names it exports, so it is kept as-is.
    from train_gnn import *

    # Node / edge feature widths fed to the model.
    # NOTE(review): the "8 * packed - offset" form presumably mirrors bit-packed
    # features produced by the featurizer — confirm against featurizing_data.
    PACK_NODE_DIM = 5
    PACK_EDGE_DIM = 1
    NODE_DIM = PACK_NODE_DIM * 8 - 4
    EDGE_DIM = PACK_EDGE_DIM * 8 - 2

    # --- hyper-parameters (load_state_dict below needs the architecture to
    # match the one the checkpoint was saved from)
    input_dim = NODE_DIM
    edge_dim = EDGE_DIM

    emb_dim = 96
    num_layers = 4
    dropout_rate = 0.3
    out_channels = 3

    # define model
    model = GNNModel(in_dim=input_dim, edge_dim=edge_dim, emb_dim=emb_dim, num_layers=num_layers,
                     out_channels=out_channels, dropout=dropout_rate).to(device)

    # Earlier runs used 'results/gnn_valid_byte_10m' and 'results/75M_randomslpit'.
    results_dir = 'results/75M_randomslpit_80epoch/'
else:
    # NOTE(review): star import — oneDCNN comes from here; TensorDataset used by
    # the inference cell is not imported anywhere else in this notebook.
    from train_1dcnn import *

    # NOTE(review): 41 is passed positionally; presumably the token vocabulary
    # size — confirm against train_1dcnn.oneDCNN.
    model = oneDCNN(41).to(device)
    # Earlier runs used 'results/1dcnn' and 'results/1dcnn_selfies'.
    results_dir = 'results/1dcnn_selfies_all'


# Load the best-validation checkpoint. map_location remaps the stored tensors
# onto `device`, so a checkpoint saved on another GPU (or on GPU while running
# on CPU) still loads.
model.load_state_dict(torch.load(os.path.join(results_dir, 'best_val.pth'), map_location=device))
# NOTE(review): the file name says "Epoch_24" although best_val.pth is what is
# loaded above — confirm the intended output name.
output_path = os.path.join(results_dir, 'ids_pred_results_Epoch_24.csv')




In [5]:
# Compact dtypes for the integer-encoded building blocks and the byte-sized
# binding flags, applied while parsing the shrunken test CSV.
dtypes = {
    **{f'buildingblock{block}_smiles': np.int16 for block in (1, 2, 3)},
    **{f'binds_{protein}': np.byte for protein in ('BRD4', 'HSA', 'sEH')},
}

test_df = pd.read_csv('../shrunken_data/test.csv', dtype=dtypes)
# Row count first, then a preview of the leading rows as the cell output.
print(len(test_df))
test_df.head()

878022


Unnamed: 0,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,is_BRD4,is_HSA,is_sEH
0,0,17,17,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,True,True,True
1,0,17,87,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,True,True,True
2,0,17,99,C#CCCC[C@H](Nc1nc(NCC2(O)CCCC2(C)C)nc(Nc2ccc(C...,True,True,True
3,0,17,244,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2sc(Cl)c...,True,True,True
4,0,17,394,C#CCCC[C@H](Nc1nc(NCC2CCC(SC)CC2)nc(Nc2ccc(C=C...,True,True,True


In [35]:
def predict_with_model(model, val_loader, device, unpack_first=True):
    """Run ``model`` over every batch of ``val_loader`` and collect sigmoid outputs.

    Args:
        model: Torch module to evaluate. It is switched to eval mode and left
            in that mode.
        val_loader: Iterable yielding batches.
        device: Device the model input is moved to before the forward pass.
        unpack_first: When True (the default, and the original behaviour) the
            model input is taken as ``batch[0]``, which suits loaders that
            yield tuples/lists such as those built from a ``TensorDataset``.
            When False the batch object itself is passed to the model.
            NOTE(review): the GNN inference call passes ``False`` as a fourth
            positional argument; presumably its batches must be passed
            whole — confirm.

    Returns:
        list: ``torch.sigmoid(output).tolist()`` of every batch, concatenated
        in loader order.
    """
    model.eval()
    predictions = []

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for batch in val_loader:
            model_input = batch[0] if unpack_first else batch
            output = model(model_input.to(device))
            predictions.extend(torch.sigmoid(output).tolist())

    return predictions

In [6]:
# Single-model inference: build the test inputs for the selected model,
# predict, and write the result to `output_path`.
if not ENSEMBLE:
    if MODEL_TYPE == 'GNN':
        print('----Featurizing testing data -----')
        smiles_list_test = test_df['molecule_smiles'].tolist()
        # chunksize > 1 amortises inter-process overhead over the long SMILES
        # list; imap still yields results in input order.
        with Pool(processes=64) as pool:
            test_dataset = list(pool.imap(smile_to_graph, smiles_list_test, chunksize=1024))

        # Predict
        test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)
        # NOTE(review): `False` is passed as a fourth positional argument —
        # confirm that the predict_with_model in scope accepts it.
        predictions = predict_with_model(model, test_loader,device,False)
        y_pred_and_ids_df = select_and_save_predictions_with_ids(predictions,test_df,test_csv_for_ids_path)
        y_pred_and_ids_df.to_csv(output_path,index=False)
    else:
        # Number of encoded token columns (enc0 .. enc129) read from the parquet.
        # NOTE(review): 142 was noted as an alternative length, presumably for
        # the non-SELFIES 'test_enc.parquet' encoding — confirm.
        MAX_LEN = 130
        FEATURES = [f'enc{i}' for i in range(MAX_LEN)]
        test_data = pd.read_parquet('test_enc_selfies.parquet')

        test_idx = np.array(test_data.index)
        X_test = torch.tensor(test_data.loc[test_idx, FEATURES].values, dtype=torch.int)
        # Create TensorDatasets
        test_dataset = TensorDataset(X_test)
        test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)
        predictions = np.array(predictions_list := predict_with_model(model, test_loader, device))

        tst = pd.read_csv(test_csv_for_ids_path, index_col=False)
        # The boolean masks below index `predictions` row-for-row with `tst`,
        # so the two must have the same number of rows.
        if len(predictions) != len(tst):
            raise ValueError(
                f"got {len(predictions)} predictions for {len(tst)} rows of {test_csv_for_ids_path}"
            )

        # Float from the start: filling an int column with floats relies on a
        # silent upcast that newer pandas versions deprecate.
        tst['binds'] = 0.0
        # Model output columns are ordered BRD4, HSA, sEH; each row keeps only
        # the probability for its own protein.
        for column, protein in enumerate(('BRD4', 'HSA', 'sEH')):
            is_protein = (tst['protein_name'] == protein).values
            tst.loc[is_protein, 'binds'] = predictions[is_protein, column]
        tst[['id', 'binds']].to_csv(output_path, index = False)



In [28]:
# Inspect the per-row binding probabilities; `tst` only exists after the
# 1D-CNN branch of the single-model inference cell has run.
tst['binds']

0          4.814573e-06
1          2.401210e-05
2          1.750874e-09
3          5.170739e-10
4          1.157709e-06
               ...     
1674891    2.709554e-06
1674892    2.382939e-07
1674893    3.778671e-06
1674894    1.530312e-04
1674895    8.083007e-07
Name: binds, Length: 1674896, dtype: float64

In [45]:
# Weighted ensemble: blend the 'binds' column of several saved prediction
# files and write the result next to the currently selected model's results.
if ENSEMBLE:
    # zip() would silently drop unmatched entries, so check the pairing first.
    if len(csv_files) == 0 or len(csv_files) != len(weights):
        raise ValueError(
            f"need one weight per prediction file, got {len(csv_files)} files and {len(weights)} weights"
        )

    weighted_predictions = None

    # Loop through each file and weight
    for file, weight in zip(csv_files, weights):
        # Read predictions from the current file
        df = pd.read_csv(file)

        # Ensure the 'binds' column exists
        if 'binds' not in df.columns:
            raise ValueError(f"'binds' column not found in {file}")

        # Multiply the predictions by the corresponding weight
        weighted_pred = df['binds'] * weight

        # If it's the first model, initialize the weighted_predictions
        if weighted_predictions is None:
            weighted_predictions = weighted_pred
        else:
            # Series addition aligns on the index; a different row count would
            # silently produce NaN instead of failing.
            if len(weighted_pred) != len(weighted_predictions):
                raise ValueError(
                    f"{file} has {len(weighted_pred)} rows, expected {len(weighted_predictions)}"
                )
            # Add the weighted predictions to the ensemble
            weighted_predictions += weighted_pred

    test = pd.read_csv(test_csv_for_ids_path, index_col=False)
    if len(test) != len(weighted_predictions):
        raise ValueError(
            f"{len(weighted_predictions)} ensemble rows do not match {len(test)} ids in {test_csv_for_ids_path}"
        )
    test_ids = pd.DataFrame(test.id)
    # Create a DataFrame for the ensemble predictions: columns 'id' and 'binds'.
    ensemble_df = pd.concat([test_ids, weighted_predictions], axis=1)

    # Save the ensemble predictions to a new CSV file.
    # NOTE(review): `results_dir` comes from the model-setup cell, so the file
    # lands in whichever model directory was selected there — confirm intended.
    output_path = os.path.join(results_dir,'ensemble_predictions_gnn_1dcnnsmiles_1dcnnselfies_weights_adjusted.csv')

    ensemble_df.to_csv(output_path, index=False)



: 

In [30]:
# Show the path predictions are written to (set in the model-setup cell and
# reassigned by the ensemble cell when ENSEMBLE is enabled).
output_path

'results/1dcnn_selfies/ids_pred_results_epoch29.csv'