In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import random
import time
import gc
from sklearn import metrics
from torch.cuda.amp import autocast, GradScaler
# torch.autograd.set_detect_anomaly(True)
# A bare `CUDA_LAUNCH_BLOCKING=1` only binds a Python variable; the setting is
# read from the process environment, so it has to be exported through os.environ.
# NOTE(review): this must happen before the first CUDA call in the process to
# take effect -- confirm nothing above touches the GPU.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
CUDA_LAUNCH_BLOCKING = 1  # old module-level name kept for backward compatibility


# fixed random seed for reproduction
seed = 10
random.seed(seed)             # Python's built-in RNG
torch.manual_seed(seed)       # PyTorch RNG
np.random.seed(seed)          # NumPy legacy global RNG
torch.cuda.manual_seed(seed)  # RNG of the current CUDA device
# cuDNN settings: deterministic kernels, no autotuner, and finally cuDNN is
# switched off altogether.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False
print('Random seed :', seed)

from collections import OrderedDict
@torch.no_grad()
def update_ema(ema_model, model, decay=0.9999):
    """
    Blend the current weights of ``model`` into ``ema_model`` in place.

    Parameters are paired by name; each EMA parameter becomes
    ``decay * ema + (1 - decay) * param``. Nothing is returned.
    """
    shadow_by_name = dict(ema_model.named_parameters())
    blend = 1 - decay

    for key, live in model.named_parameters():
        # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed
        shadow = shadow_by_name[key]
        shadow.mul_(decay)
        shadow.add_(live.data, alpha=blend)

def requires_grad(model, flag=True):
    """
    Switch gradient tracking on or off for every parameter of ``model``.

    ``flag`` is written to the ``requires_grad`` attribute of each parameter
    yielded by ``model.parameters()``.
    """
    for weight in model.parameters():
        weight.requires_grad = flag

from sklearn import metrics
def compute_AUC(y, pred, n_class=1):
    """
    Compute the ROC AUC of ``pred`` against the labels ``y``.

    Args:
        y: ground-truth labels, passed unchanged to ``metrics.roc_auc_score``.
        pred: prediction scores, passed unchanged to ``metrics.roc_auc_score``.
        n_class: 1 for a single-score output, 2 for a two-class output. Both
            cases are scored identically, so for ``n_class == 2`` the caller
            presumably passes the positive-class score already
            (e.g. ``pred[:, 1]``) -- TODO confirm against callers.

    Returns:
        The value returned by ``metrics.roc_auc_score(y, pred)``.

    Raises:
        ValueError: if ``n_class`` is neither 1 nor 2. Previously this case
            fell through both branches and crashed with ``UnboundLocalError``.
    """
    if n_class not in (1, 2):
        raise ValueError(f"n_class must be 1 or 2, got {n_class!r}")
    return metrics.roc_auc_score(y, pred)

pad_len = 150 # best
conf = 10 # best
# Split to evaluate. The notebook previously switched between "week1", "week2",
# "week3" and "week4" by (un)commenting whole dataset lines; change this one
# value instead.
test_mode = "week4"

##########################################
#########  construct dataloader  ######### 
##########################################

from herg_cls_datapipeline import HERG_LMDBDataset 
test_dataset = HERG_LMDBDataset(conf=conf, pad_len=pad_len, mode=test_mode)
# Evaluation loader: fixed order, every sample kept, DataLoader's default collate.
test_set = DataLoader(
    test_dataset,
    batch_size=32,
    drop_last=False,
    shuffle=False,
    pin_memory=True,
    num_workers=4,
    worker_init_fn=test_dataset.worker_init_fn,
)


##########################################
######  build model and optimizer  ####### 
##########################################
dev = "cuda" if torch.cuda.is_available() else "cpu"
from CLIP import clip
# CLIP is loaded on the CPU; the assembled model is moved to `dev` afterwards.
clip_model, preprocess = clip.load(name="ViT-B/16", device="cpu", download_root="/home/jovyan/clip_download_root")
from model_zoo import CLIP_Protein
model = CLIP_Protein(clip_model, conf, pad_len=pad_len).to(dev)

# Checkpoint under evaluation. Other checkpoints tried earlier, all under
# /home/jovyan/prompts_learning/trained_weight/:
#   HERG_cliP_Epoch14_val_auc_0.82784.pth                  # conf 200 prompts 1
#   A_10_9_HERG_prompts2_cliP_Epoch18_val_auc_0.82123.pth  # 2 prompts
#   HERG_cliP_Epoch18_val_auc_0.82686.pth                  # 4 prompts
#   A_10_9_HERG_cliP_Epoch18_val_auc_0.82286.pth
#   HERG_cliPm_Epoch9_val_auc_0.86128.pth
path  = "/home/jovyan/prompts_learning/trained_weight/A_10_10_HERG_cliPm_Epoch43_val_auc_0.85830.pth" # multi prompts
# map_location remaps the stored tensors onto `dev`, so a checkpoint written
# from GPU tensors can also be loaded when `dev` falls back to "cpu".
sd = torch.load(path, map_location=dev)
model.load_state_dict(sd)
print("pre-trained weights loaded...")

                
    
##########################################
####### start evaluating our model #######
##########################################
model.eval()
print("evaluating...")
# Per-batch results are gathered in lists and concatenated once after the loop,
# rather than rebuilding a growing tensor with torch.cat on every step.
pred_chunks = []
label_chunks = []
with torch.no_grad():
    for datas in test_set:
        atoms = datas["atoms"].to(dev, non_blocking=True).long()
        pair = datas["distance"].to(dev, non_blocking=True).float()
        spd = datas["SPD"].to(dev, non_blocking=True).float()
        edge = datas["edge"].to(dev, non_blocking=True).float()
        # Labels are only needed for the CPU-side metric, so they stay off `dev`.
        label = datas["label"].float()

        # Squash the raw model output into (0, 1) scores.
        pred = torch.sigmoid(model(atoms, pair, spd, edge))

        pred_chunks.append(pred.cpu())
        label_chunks.append(label.cpu())

if not pred_chunks:
    # Previously an empty loader crashed later with AttributeError on None.
    raise RuntimeError("test_set yielded no batches; cannot compute AUC")

all_pred = torch.cat(pred_chunks, dim=0)
all_lab = torch.cat(label_chunks, dim=0)
auc = compute_AUC(all_lab, all_pred)
print(f"test AUC: {auc:.5f}")



In [2]:
import torch
from torch_geometric.data import InMemoryDataset


class PubChemDataset(InMemoryDataset):
    """
    In-memory dataset backed by a single file saved with ``torch.save``.

    The file at ``path`` is expected to unpack into a ``(data, slices)`` pair,
    which is stored on the instance.
    """

    def __init__(self, path):
        super().__init__()
        loaded = torch.load(path)
        self.data, self.slices = loaded
        # Dump the slice table of the loaded file.
        print(self.slices)

    def __getitem__(self, idx):
        # NOTE(review): this replaces the parent's __getitem__ with a direct
        # call to get(); presumably only plain integer indices are intended --
        # confirm.
        return self.get(idx)

if __name__ == '__main__':
    # Quick probe of the pre-training set: its size and one text field.
    dataset = PubChemDataset('./pretrain_data/PubChem324kV2/pretrain.pt')
    n_items = len(dataset)
    print(n_items)
    # Each item can also be indexed by 'x', 'edge_index', 'edge_attr',
    # 'smiles' and 'cid'; only the text of item 1 is shown here.
    probe = dataset[1]
    print(probe['text'])
    

defaultdict(<class 'dict'>, {'x': tensor([       0,       14,       21,  ..., 10549705, 10549732, 10549760]), 'edge_index': tensor([       0,       26,       38,  ..., 22561532, 22561592, 22561658]), 'edge_attr': tensor([       0,       26,       38,  ..., 22561532, 22561592, 22561658]), 'text': tensor([     0,      1,      2,  ..., 298081, 298082, 298083]), 'smiles': tensor([     0,      1,      2,  ..., 298081, 298082, 298083]), 'cid': tensor([     0,      1,      2,  ..., 298081, 298082, 298083])})
298083
3-chloro-1,1,1-trifluoropropane appears as a colorless odorless nonflammable liquid. Poisonous by inhalation. Emits toxic fumes of chlorine and fluorine when heated to decomposition.


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
import random
import time
import gc
from sklearn import metrics
from torch.cuda.amp import autocast, GradScaler
# torch.autograd.set_detect_anomaly(True)
# A bare `CUDA_LAUNCH_BLOCKING=1` only binds a Python variable; the setting is
# read from the process environment, so it has to be exported through os.environ.
# NOTE(review): this must happen before the first CUDA call in the process to
# take effect -- confirm nothing above touches the GPU.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
CUDA_LAUNCH_BLOCKING = 1  # old module-level name kept for backward compatibility


# fixed random seed for reproduction
seed = 10
random.seed(seed)             # Python's built-in RNG
torch.manual_seed(seed)       # PyTorch RNG
np.random.seed(seed)          # NumPy legacy global RNG
torch.cuda.manual_seed(seed)  # RNG of the current CUDA device
# cuDNN settings: deterministic kernels, no autotuner, and finally cuDNN is
# switched off altogether.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False
print('Random seed :', seed)

from collections import OrderedDict
@torch.no_grad()
def update_ema(ema_model, model, decay=0.9999):
    """
    Move the EMA copy one step towards the live model, in place.

    For every named parameter of ``model`` the same-named parameter of
    ``ema_model`` is scaled by ``decay`` and then receives ``(1 - decay)``
    times the live value. Returns nothing.
    """
    ema_lookup = dict(ema_model.named_parameters())
    step = 1 - decay

    for pname, current in model.named_parameters():
        # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed
        averaged = ema_lookup[pname]
        averaged.mul_(decay)
        averaged.add_(current.data, alpha=step)

def requires_grad(model, flag=True):
    """
    Apply one ``requires_grad`` setting to all parameters of ``model``.

    Every parameter returned by ``model.parameters()`` gets its
    ``requires_grad`` attribute set to ``flag``.
    """
    for tensor in model.parameters():
        tensor.requires_grad = flag

from sklearn import metrics
def compute_AUC(y, pred, n_class=1):
    """
    Compute the ROC AUC of ``pred`` against the labels ``y``.

    Args:
        y: ground-truth labels, passed unchanged to ``metrics.roc_auc_score``.
        pred: prediction scores, passed unchanged to ``metrics.roc_auc_score``.
        n_class: 1 for a single-score output, 2 for a two-class output. Both
            cases are scored identically, so for ``n_class == 2`` the caller
            presumably passes the positive-class score already
            (e.g. ``pred[:, 1]``) -- TODO confirm against callers.

    Returns:
        The value returned by ``metrics.roc_auc_score(y, pred)``.

    Raises:
        ValueError: if ``n_class`` is neither 1 nor 2. Previously this case
            fell through both branches and crashed with ``UnboundLocalError``.
    """
    if n_class not in (1, 2):
        raise ValueError(f"n_class must be 1 or 2, got {n_class!r}")
    return metrics.roc_auc_score(y, pred)

pad_len = 150 # best
conf = 10 # best
# Split to evaluate. The notebook previously switched between "week1", "week2",
# "week3" and "week4" by (un)commenting whole dataset lines; change this one
# value instead.
test_mode = "week1"

##########################################
#########  construct dataloader  ######### 
##########################################
from tune_cls_pipeline import MoleculeHERGDataset, MoleculeHERGTestDataset
from KPGT.src.data.featurizer import Vocab, N_BOND_TYPES, N_ATOM_TYPES
from KPGT.src.data.collator_tune import Collator_pretrain, Collator_tune
from KPGT.src.model_config import config_dict
config = config_dict['base']


test_dataset = MoleculeHERGTestDataset(test_mode)
# `vocab` was used below without ever being defined in this cell. The otherwise
# unused Vocab / N_ATOM_TYPES / N_BOND_TYPES imports suggest this construction.
# NOTE(review): confirm the argument order against KPGT.src.data.featurizer.Vocab.
vocab = Vocab(N_ATOM_TYPES, N_BOND_TYPES)
collator = Collator_tune(
    vocab,
    max_length=config['path_length'],
    n_virtual_nodes=2,
    candi_rate=config['candi_rate'],
    fp_disturb_rate=config['fp_disturb_rate'],
    md_disturb_rate=config['md_disturb_rate'],
)
# The loader must wrap the dataset built above; `val_dataset` does not exist here.
test_set = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=8, drop_last=False, collate_fn=collator)


##########################################
######  build model and optimizer  ####### 
##########################################
dev = "cuda" if torch.cuda.is_available() else "cpu"
from CLIP import clip
# CLIP is loaded on the CPU; the assembled model is moved to `dev` afterwards.
clip_model, preprocess = clip.load(name="ViT-B/16", device="cpu", download_root="/home/jovyan/clip_download_root")

from KPGT.src.model.light import LiGhTPredictor as LiGhT
# Fingerprint / descriptor widths are read from the evaluation dataset; this
# cell never defines `train_dataset`.
# NOTE(review): assumes MoleculeHERGTestDataset exposes `d_fps` and `d_mds`
# like the training dataset does -- confirm.
kpgt = LiGhT(
    d_node_feats=config['d_node_feats'],
    d_edge_feats=config['d_edge_feats'],
    d_g_feats=config['d_g_feats'],
    d_fp_feats=test_dataset.d_fps,
    d_md_feats=test_dataset.d_mds,
    d_hpath_ratio=config['d_hpath_ratio'],
    n_mol_layers=config['n_mol_layers'],
    path_length=config['path_length'],
    n_heads=config['n_heads'],
    n_ffn_dense_layers=config['n_ffn_dense_layers'],
    input_drop=config['input_drop'],
    attn_drop=config['attn_drop'],
    feat_drop=config['feat_drop'],
    n_node_types=vocab.vocab_size,
)
# The stand-alone KPGT weights (/home/jovyan/prompts_learning/KPGT/src/models/base.pth)
# are not loaded here; the combined checkpoint below is loaded into the whole model.

from model_zoo import CLIPM
model = CLIPM(kpgt, clip_model).to(dev)

path = "/home/jovyan/prompts_learning/trained_weight/cliPM_tune_herg_Epoch33_val_auc_0.87989.pth"
# map_location remaps the stored tensors onto `dev`, so a checkpoint written
# from GPU tensors can also be loaded when `dev` falls back to "cpu".
sd = torch.load(path, map_location=dev)
model.load_state_dict(sd)
print("pre-trained weights loaded...")

                
    
##########################################
####### start evaluating our model #######
##########################################
model.eval()
print("evaluating...")
# Per-batch results are gathered in lists and concatenated once after the loop,
# rather than rebuilding a growing tensor with torch.cat on every step.
pred_chunks = []
label_chunks = []
with torch.no_grad():
    # The loop variable has to match the name unpacked below; it used to be
    # `datas`, which left `batched_data` undefined (NameError on the first batch).
    for batched_data in test_set:
        # Collator output; only the graph, fingerprints, descriptors and label
        # are used for evaluation.
        (_, batched_graph, fps, mds, _, _, _, label, logd, logp, pka, pkb, logsol, wlogsol) = batched_data
        batched_graph = batched_graph.to(dev)
        fps = fps.to(dev)
        mds = mds.to(dev)

        # Squash the raw model output into (0, 1) scores.
        pred = torch.sigmoid(model([batched_graph, fps, mds]))

        pred_chunks.append(pred.cpu())
        label_chunks.append(label.cpu())

if not pred_chunks:
    # Previously an empty loader crashed later with AttributeError on None.
    raise RuntimeError("test_set yielded no batches; cannot compute AUC")

all_pred = torch.cat(pred_chunks, dim=0)
all_lab = torch.cat(label_chunks, dim=0)
auc = compute_AUC(all_lab, all_pred)
print(f"test AUC: {auc:.5f}")



Random seed : 10


OSError: libcusparse.so.11: cannot open shared object file: No such file or directory