# PSDTA Data Preparation

Your CSV should look something like this:

| ID |  Protein  |          Ligand          | Target_Chain | regression_label |
|:----------:|:---------:|:------------------------:|:------------:|:----------------:|
| 1a30| PQITL.... |       CC(C)C[C...        |      A       |       4.3        |
| 1bcu| ADCGL.... | Nc1ccc2cc3ccc(N)cc3nc2c1 |      B       |       3.8        |
|...|    ...    |           ...            |     ...      |       ...        |
|1bzc| TEMEKE... |        NC(=O)...         |      A       |       6.6        |

You also need to prepare the PDB file corresponding to each protein.


# PSDTA Hyperparameter Tuning

The config.json file contains the hyperparameter settings.

When the dataset is small, the attention mechanism can be set to 2 layers.

For larger datasets, it is recommended to set it to 3 layers.

For PNA, different combinations of aggregators show varying performance. Currently, the combination of mean + min achieves the best results.

In [None]:
{
    "params": {
        "mol_in_channels": 43,
        "prot_in_channels": 33,
        "prot_evo_channels": 1280,
        "hidden_channels":200,
        "aggregators": [
            "mean",
            "min"
        ],
        "scalers": [
            "identity",
            "amplification",
            "linear"
        ],
        "pre_layers": 2,
        "post_layers": 1,
        "total_layer": 3,
        "K": [
            5,
            10,
            20
        ],
        "dropout": 0,
        "dropout_attn_score": 0,
        "heads": 5
    }
}


In addition, some hyperparameters are also defined in main.py.

In [None]:
# Excerpt of the command-line options declared in main.py.
# NOTE(review): several options below use type=bool. argparse then calls bool() on the
# raw string, so every non-empty value (including "False") is parsed as True.
### Seed and device
parser.add_argument('--seed', type=int, default=2)
parser.add_argument('--device', type=str, default='cuda:0', help='')
parser.add_argument('--config_path', type=str, default='config.json')
### Data and Pre-processing
parser.add_argument('--datafolder', type=str, default='./dataset/davis/', help='protein data path')  
parser.add_argument('--result_path', type=str, default='./result/', help='path to save results') 
# NOTE(review): the help text below duplicates --result_path; the option itself is a boolean flag.
parser.add_argument('--save_interpret', type=bool, default=True, help='path to save results')

# For PDBBIND datasets - we train for 30K iteration
parser.add_argument('--regression_task', type=bool, help='True if regression else False')
# For any classification type - we train for 100 epochs (same as DrugBAN) [change --total_iters = None]
parser.add_argument('--classification_task', type=bool, help='True if classification else False')
parser.add_argument('--mclassification_task', type=int, help='number of multiclassification, 0 if no multiclass task')
# Epoch-based schedule: number of epochs and how often (in epochs) to evaluate.
parser.add_argument('--epochs', type=int, default=200 , help='')
parser.add_argument('--evaluate_epoch', type=int, default=1)

# Iteration-based schedule: when --total_iters is set, main.py ignores --epochs.
parser.add_argument('--total_iters', type=int, default=None)
parser.add_argument('--evaluate_step', type=int, default=500)

# optimizer params - only change this for PDBBind v2016
parser.add_argument('--lrate', type=float, default=1e-5,  #change to 1e-5 for PDBv2016  1e-4 for PDB2020
                    help='learning rate for PSICHIC')  # change to 1e-5 for LargeScaleInteractionDataset
parser.add_argument('--eps', type=float, default=1e-5, help='higher = closer to SGD')  # change to 1e-5 for PDBv2016, 1e-08 for PDB2020
# The string default is converted by tuple_type, so it must be a valid tuple literal.
parser.add_argument('--betas', type=tuple_type, default="(0.9,0.999)")  # change to (0.9,0.99) for PDBv2016  (0.9,0.999) for PDB2020
# batch size
parser.add_argument('--batch_size', type=int, default=14)
# sampling method - only used for pretraining large-scale interaction dataset ; allow self specified weights to the samples
parser.add_argument('--sampling_col', type=str, default='')
# Folder of a previously trained model; an empty string means training starts from scratch.
parser.add_argument('--trained_model_path', type=str, default='', #./result/PDB2016_BENCHMARK/save_model_seed2/
                    help='This does not need to be perfectly aligned, as you can add prediction head for some other tasks as well!')
# Parsed by list_type, so the value must be a Python list literal.
parser.add_argument('--finetune_modules', type=list_type, default=None)
# notebook mode?
parser.add_argument('--nb_mode', type=bool, default=False)

# Running PSDTA

Once the feature files are prepared, specify the dataset folder (`--datafolder`) and the result path (`--result_path`), then run main.py to start training the model.

For example, for PDBBind2016:

In [None]:
python main.py --datafolder dataset/pdb2016 --result_path result/PDB2016_BENCHMARK --regression_task True


In [None]:
import json
import pandas as pd
import torch
import numpy as np
import os
import random
# Utils
from utils.utils import DataLoader, compute_pna_degrees, virtual_screening, CustomWeightedRandomSampler
from utils.dataset import *  # data
from utils.trainer import Trainer
from utils.metrics import *
from utils.utils import extract_data_from_files
# Preprocessing
from utils import protein_init, ligand_init
# Model
# from models.net import net
import argparse
import ast
from AAA.net_tsmiles import net_tsmiles
# from BBB.sub_net import net
# from BBB.sub_trainer import sub_Trainer
from BAN.ban_net import net
from utils.draw import Trainer_draw
from utils.eval_pocket import Trainer_eval_pocket
from Protein_family.family_trainer import Trainer_family

def tuple_type(s):
    """Argparse ``type=`` converter that reads *s* as a Python tuple literal.

    Args:
        s: Text taken from the command line (or a string default).

    Returns:
        The tuple obtained by evaluating *s* as a literal.

    Raises:
        argparse.ArgumentTypeError: If *s* is not a valid literal or the
            literal is not a tuple.
    """
    try:
        # literal_eval only accepts Python literals, so no code is executed.
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError):
        raise argparse.ArgumentTypeError(f"Invalid tuple value: {s}")
    if isinstance(parsed, tuple):
        return parsed
    raise argparse.ArgumentTypeError(f"Invalid tuple value: {s}")


def list_type(s):
    """Argparse ``type=`` converter that reads *s* as a Python list literal.

    Args:
        s: Text taken from the command line (or a string default).

    Returns:
        The list obtained by evaluating *s* as a literal.

    Raises:
        argparse.ArgumentTypeError: If *s* is not a valid literal or the
            literal is not a list.
    """
    try:
        # literal_eval only accepts Python literals, so no code is executed.
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError):
        raise argparse.ArgumentTypeError(f"Invalid list value: {s}")
    if isinstance(parsed, list):
        return parsed
    raise argparse.ArgumentTypeError(f"Invalid list value: {s}")


def bool_type(s):
    """Argparse ``type=`` converter that turns command-line text into a bool.

    ``type=bool`` calls ``bool()`` on the raw string, which is True for every
    non-empty value, so ``--regression_task False`` would silently enable the
    task. This converter parses the text instead.

    Args:
        s: Text from the command line (an actual bool is returned unchanged).

    Returns:
        True for true/t/yes/y/1, False for false/f/no/n/0 or an empty string
        (case-insensitive).

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(s, bool):
        return s
    text = str(s).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays False, matching what bool('') used to give.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f"Invalid boolean value: {s}")


parser = argparse.ArgumentParser()

### Seed and device
parser.add_argument('--seed', type=int, default=2)  # seed 2 gave the best results
parser.add_argument('--device', type=str, default='cuda:0', help='')
parser.add_argument('--config_path', type=str, default='config.json')
### Data and Pre-processing
# Both paths are normally given on the command line, so the defaults here rarely matter.
parser.add_argument('--datafolder', type=str, default='./dataset/davis/', help='protein data path')
parser.add_argument('--result_path', type=str, default='./result/', help='path to save results')
parser.add_argument('--save_interpret', type=bool_type, default=True,
                    help='whether to save interpretation results')

# For PDBBIND datasets - we train for 30K iteration
parser.add_argument('--regression_task', type=bool_type, help='True if regression else False')
# For any classification type - we train for 100 epochs (same as DrugBAN) [change --total_iters = None]
parser.add_argument('--classification_task', type=bool_type, help='True if classification else False')
parser.add_argument('--mclassification_task', type=int, help='number of multiclassification, 0 if no multiclass task')
parser.add_argument('--epochs', type=int, default=200, help='')
parser.add_argument('--evaluate_epoch', type=int, default=1)

# When --total_iters is given, the script trains by iterations and ignores --epochs.
parser.add_argument('--total_iters', type=int, default=None)
parser.add_argument('--evaluate_step', type=int, default=500)

# optimizer params - only change this for PDBBind v2016
parser.add_argument('--lrate', type=float, default=1e-5,  # change to 1e-5 for PDBv2016  1e-4 for PDB2020
                    help='learning rate for PSICHIC')  # change to 1e-5 for LargeScaleInteractionDataset
parser.add_argument('--eps', type=float, default=1e-5, help='higher = closer to SGD')  # change to 1e-5 for PDBv2016, 1e-08 for PDB2020
# argparse runs string defaults through `type`, so this default becomes a real tuple.
parser.add_argument('--betas', type=tuple_type, default="(0.9,0.999)")  # change to (0.9,0.99) for PDBv2016  (0.9,0.999) for PDB2020
# batch size
parser.add_argument('--batch_size', type=int, default=14)
# sampling method - only used for pretraining large-scale interaction dataset ; allow self specified weights to the samples
parser.add_argument('--sampling_col', type=str, default='')
parser.add_argument('--trained_model_path', type=str, default='',  # ./result/PDB2016_BENCHMARK/save_model_seed2/
                    help='This does not need to be perfectly aligned, as you can add prediction head for some other tasks as well!')
parser.add_argument('--finetune_modules', type=list_type, default=None)
# notebook mode?
parser.add_argument('--nb_mode', type=bool_type, default=False)

args = parser.parse_args()

# Pick the config file. A fresh run reads --config_path; a run that starts from a
# trained model reuses the config.json stored in that model's folder.
# (The original branches were swapped, so --config_path was ignored for fresh runs.)
if args.trained_model_path:
    config_file = os.path.join(args.trained_model_path, 'config.json')
else:
    config_file = args.config_path
with open(config_file, 'r') as f:
    config = json.load(f)

# Command-line values take precedence over whatever the config file contains.
config['optimizer']['lrate'] = args.lrate
config['optimizer']['eps'] = args.eps
config['optimizer']['betas'] = args.betas
config['tasks']['regression_task'] = args.regression_task  # regression task
config['tasks']['classification_task'] = args.classification_task  # classification task
config['tasks']['mclassification_task'] = args.mclassification_task  # multi-class classification task

# Compute device requested on the command line.
device = torch.device(args.device)

# Output folders: the result root, plus per-seed folders for the saved model and
# for the interpretation results. Each one is created only if it is missing.
model_path = os.path.join(args.result_path, f'save_model_seed{args.seed}')
interpret_path = os.path.join(args.result_path, f'interpretation_result_seed{args.seed}')
for out_dir in (args.result_path, model_path, interpret_path):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

# Iteration-based training wins when both schedules are specified.
both_schedules_set = args.epochs is not None and args.total_iters is not None
if both_schedules_set:
    print('If epochs and total iters are both not None, then we only use iters.')
    args.epochs = None

# Echo the effective arguments and keep a copy next to the results.
print(args)
with open(os.path.join(args.result_path, 'model_params.txt'), 'w') as f:
    f.write(str(args))

# Seed every random number generator used by the run.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
os.environ['PYTHONHASHSEED'] = str(args.seed)


## 2016 import files
# Locations of the CSV splits inside the data folder. Note that the validation
# split is read from test.csv: valid_file and valid_path both point at it.
train_file = os.path.join(args.datafolder, 'train.csv')
valid_file = os.path.join(args.datafolder, 'test.csv')  # valid.csv
test_file = os.path.join(args.datafolder, 'test.csv')  # test.csv
valid_path = os.path.join(args.datafolder, 'test.csv')
files = [train_file, valid_file, test_file]  # list of CSV file paths

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)
# The validation table is only loaded when its file exists.
valid_df = pd.read_csv(valid_path) if os.path.exists(valid_path) else None

# Protein entries are gathered from all listed CSV files by the project helper.
protein_tuples = extract_data_from_files(files)

# Unique ligand SMILES strings over every table that was loaded.
all_ligands = train_df['Ligand'].tolist() + test_df['Ligand'].tolist()
if valid_df is not None:
    all_ligands = all_ligands + valid_df['Ligand'].tolist()
ligand_smiles = list(set(all_ligands))
# Protein graphs are cached in the data folder and rebuilt only when the cache is missing.
# (Original note: davis uses protein_v2.pt, the 2020 set uses protein.pt.)
protein_path = os.path.join(args.datafolder, 'protein.pt')
if not os.path.exists(protein_path):
    print('Initialising Protein Sequence to Protein Graph...')
    # Per the original author's note, the result holds the sequence, sequence features,
    # token features, residue count, position index, adjacency index and adjacency weights.
    protein_dict = protein_init(protein_tuples)
    torch.save(protein_dict, protein_path)  # write the cache for later runs
else:
    print('Loading Protein Graph data...')
    protein_dict = torch.load(protein_path)

# Ligand graphs follow the same cache-or-build scheme.
ligand_path = os.path.join(args.datafolder, 'ligand.pt')
if not os.path.exists(ligand_path):
    print('Initialising Ligand SMILES to Ligand Graph...')
    ligand_dict = ligand_init(ligand_smiles)
    torch.save(ligand_dict, ligand_path)
else:
    print('Loading Ligand Graph data...')
    ligand_dict = torch.load(ligand_path)

# Release cached GPU memory that PyTorch allocated but no longer uses.
torch.cuda.empty_cache()
##TODO: drop any invalid smiles
##
## training loader
train_shuffle = True
train_sampler = None

if args.sampling_col:
    # Per-sample weights come from the user-selected column of the training table.
    train_weights = torch.from_numpy(train_df[args.sampling_col].values)
    # Weighted sampler that draws with replacement, so a row can be picked more than once.
    train_sampler = CustomWeightedRandomSampler(train_weights, len(train_weights), replacement=True)
    # Shuffling is switched off whenever an explicit sampler is used.
    train_shuffle = False

if train_sampler is not None:
    print('shuffle should be False: ', train_shuffle)

# Attributes that get their own batch-assignment vector when samples are collated.
# Original note: the loader concatenates the samples of a batch, and since samples
# differ in size a batch index is needed to tell which graph each entry belongs to.
# The test loader originally omitted 'tsml' while the train and validation loaders
# followed it; all three now share one list so every batch has the same fields.
# NOTE(review): assumes the project DataLoader treats follow_batch like PyG's - confirm.
follow_keys = ['mol_x', 'clique_x', 'prot_node_aa', 'tsml']

# Each dataset wraps one table together with the shared ligand / protein graph dictionaries.
train_dataset = ProteinMoleculeDataset(train_df, ligand_dict, protein_dict, device=args.device)
test_dataset = ProteinMoleculeDataset(test_df, ligand_dict, protein_dict, device=args.device)

train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=train_shuffle,
                          sampler=train_sampler, follow_batch=list(follow_keys))

test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False,
                         follow_batch=list(follow_keys))

# The validation dataset and loader exist only when a validation table was loaded.
valid_dataset, valid_loader = None, None
if valid_df is not None:
    valid_dataset = ProteinMoleculeDataset(valid_df, ligand_dict, protein_dict, device=args.device)
    valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False,
                              follow_batch=list(follow_keys))


# Degree statistics for the PNA layers.
if args.trained_model_path:
    # Starting from a trained model: reuse its degree file and remember its checkpoint path.
    degree_dict = torch.load(os.path.join(args.trained_model_path, 'degree.pt'))
    param_dict = os.path.join(args.trained_model_path, 'best.pt')  # model_test
    mol_deg, prot_deg = degree_dict['ligand_deg'], degree_dict['protein_deg']
else:
    degree_path = os.path.join(args.datafolder, 'degree.pt')
    if os.path.exists(degree_path):
        # A degree file already sits in the data folder: load it.
        degree_dict = torch.load(degree_path)
        mol_deg = degree_dict['ligand_deg']
        clique_deg = degree_dict['clique_deg']
        prot_deg = degree_dict['protein_deg']
    else:
        print('Computing training data degrees for PNA')
        # Original note: these are degree histograms, i.e. how often each degree occurs.
        mol_deg, clique_deg, prot_deg = compute_pna_degrees(train_loader)
        degree_dict = {'ligand_deg': mol_deg, 'clique_deg': clique_deg, 'protein_deg': prot_deg}

    # Keep a copy of the degrees next to the model that is about to be trained.
    torch.save(degree_dict, os.path.join(args.result_path, f'save_model_seed{args.seed}', 'degree.pt'))

# Build the network from the architecture section of the config and the task switches.
model_cfg = config['params']
task_cfg = config['tasks']
model = net(
    mol_deg, prot_deg,
    # MOLECULE
    mol_in_channels=model_cfg['mol_in_channels'],
    prot_in_channels=model_cfg['prot_in_channels'],
    prot_evo_channels=model_cfg['prot_evo_channels'],
    hidden_channels=model_cfg['hidden_channels'],
    pre_layers=model_cfg['pre_layers'],
    post_layers=model_cfg['post_layers'],
    aggregators=model_cfg['aggregators'],
    scalers=model_cfg['scalers'],
    total_layer=model_cfg['total_layer'],
    K=model_cfg['K'],
    heads=model_cfg['heads'],
    dropout=model_cfg['dropout'],
    dropout_attn_score=model_cfg['dropout_attn_score'],
    # output
    regression_head=task_cfg['regression_task'],
    classification_head=task_cfg['classification_task'],
    multiclassification_head=task_cfg['mclassification_task'],
    device=device,
).to(device)

# Re-initialise the weights, then overlay a trained checkpoint when one was given.
# strict=False lets the checkpoint and the model differ in some parameter names.
model.reset_parameters()
if args.trained_model_path:
    pretrained_state = torch.load(param_dict, map_location=args.device)
    model.load_state_dict(pretrained_state, strict=False)
    print('Pretrained model loaded!!!')

nParams = sum(p.nelement() for p in model.parameters())
print('Model loaded with number of parameters being:', str(nParams))

# Store the effective config beside the model so the run can be reproduced.
with open(os.path.join(args.result_path, f'save_model_seed{args.seed}', 'config.json'), 'w') as f:
    json.dump(config, f, indent=4)

# Metric name handed to the trainer as evaluate_metric.
evaluation_metric = 'rmse'

# All optimiser and schedule settings come from the 'optimizer' section of the config.
optim_cfg = config['optimizer']
engine = Trainer(
    model=model,
    lrate=optim_cfg['lrate'],
    min_lrate=optim_cfg['min_lrate'],
    wdecay=optim_cfg['weight_decay'],
    betas=optim_cfg['betas'],
    eps=optim_cfg['eps'],
    amsgrad=optim_cfg['amsgrad'],
    clip=optim_cfg['clip'],
    steps_per_epoch=len(train_loader),
    num_epochs=args.epochs,
    total_iters=args.total_iters,
    warmup_iters=optim_cfg['warmup_iters'],
    lr_decay_iters=optim_cfg['lr_decay_iters'],
    schedule_lr=optim_cfg['schedule_lr'],
    regression_weight=1,
    classification_weight=1,
    evaluate_metric=evaluation_metric,
    result_path=args.result_path,
    runid=args.seed,
    finetune_modules=args.finetune_modules,
    device=device,
)

print('-' * 50)
print('start training model')
# args.epochs was cleared earlier if --total_iters was given, so a falsy value here
# selects iteration-based training.
if not args.epochs:
    engine.train_step(train_loader, val_loader=valid_loader, test_loader=test_loader,
                      evaluate_step=args.evaluate_step)
else:
    engine.train_epoch(train_loader, val_loader=valid_loader, test_loader=test_loader,
                       evaluate_epoch=args.evaluate_epoch)

print('finished training model')
print('-' * 50)

print('loading best checkpoint and predicting test data')
print('-' * 50)
# Checkpoint inside this run's model folder.
# NOTE(review): assumes the trainer saved its best weights as model.pt - confirm.
stat_dict_path = os.path.join(args.result_path, 'save_model_seed{}'.format(args.seed), 'model.pt')
# The original called torch.load() with no arguments, which raises a TypeError;
# load the checkpoint path computed above and map it onto the selected device.
model.load_state_dict(torch.load(stat_dict_path, map_location=args.device))

# Run the model over the test loader; results go to the per-seed interpretation folder.
screen_df = virtual_screening(test_df, model, test_loader,
                              result_path=os.path.join(args.result_path,
                                                       "interpretation_result_seed{}".format(args.seed)),
                              save_interpret=args.save_interpret,
                              ligand_dict=ligand_dict, device=args.device)

# Write the returned predictions as a CSV in the result root.
screen_df.to_csv(os.path.join(args.result_path, 'test_prediction_seed{}.csv'.format(args.seed)), index=False)

#-----Plotting------------------
# model.reset_parameters()
#
# param_dict = os.path.join(args.trained_model_path, 'model_test.pt')
# model.load_state_dict(torch.load(param_dict, map_location=args.device), strict=False)
# draw_train =  Trainer_draw(model=model, device=device)
# # draw_train.draw(train_loader, valid_loader, test_loader, args.result_path)
# eval_result = draw_train.test_epoch(test_loader)
# print(eval_result)

#-----Print pocket accuracy--------------
# model.reset_parameters()
# param_dict = os.path.join(args.trained_model_path, 'model_test.pt')
# model.load_state_dict(torch.load(param_dict, map_location=args.device), strict=False)
# eval_pocket_train =  Trainer_eval_pocket(model=model, device=device)
# eval_result = eval_pocket_train.test_epoch(test_loader)
# print(eval_result)

#------Protein family test-------------
# model.reset_parameters()
# param_dict = os.path.join(args.trained_model_path, 'model_test.pt')
# model.load_state_dict(torch.load(param_dict, map_location=args.device), strict=False)
# draw_train =  Trainer_family(model=model, device=device)
# eval_result = draw_train.test_epoch(test_loader)