In [14]:
# Setup cell: imports plus process-wide configuration used by the screening cell below.
import os
import torch
import torch.nn.functional as F
import numpy as np
import sys
# Raise the interpreter's recursion limit far above the default.
# NOTE(review): presumably needed for (un)pickling deeply nested objects — confirm.
sys.setrecursionlimit(50000)
import pickle
# Let cuDNN auto-tune its algorithms for the input sizes it encounters.
torch.backends.cudnn.benchmark = True
# Newly created float tensors are allocated on the GPU by default, so this
# notebook requires a CUDA device (torch.Tensor(...) below yields CUDA tensors).
torch.set_default_tensor_type('torch.cuda.FloatTensor')
# NOTE(review): legacy torch.load flag related to source-change patches for
# pickled modules; appears to be a no-op on recent PyTorch — confirm.
torch.nn.Module.dump_patches = True
import pandas as pd
# Project-local featurisation helpers (build / look up per-SMILES feature arrays).
from AttentiveFP import save_smiles_dicts, get_smiles_array
from rdkit import Chem

In [24]:
# --- Load the screening table and derive the companion file names -----------
raw_filename = "../data/data_example.csv"
feature_filename = raw_filename.replace('.csv','.pickle')
filename = raw_filename.replace('.csv','')
prefix_filename = raw_filename.split('/')[-1].replace('.csv','')
smiles_tasks_df = pd.read_csv(raw_filename)

smilesList = smiles_tasks_df.smiles.values
print("number of all smiles: ",len(smilesList))
batch_size = 20

# --- Canonicalise every SMILES, dropping the ones RDKit cannot handle -------
# The three lists are only appended to after a molecule was fully processed,
# so they always stay aligned with each other.
atom_num_dist = []
remained_smiles = []
canonical_smiles_list = []
for smiles in smilesList:
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            raise ValueError("RDKit could not parse SMILES")
        atom_count = len(mol.GetAtoms())
        # Reuse the parsed molecule instead of parsing the string a second time.
        canonical = Chem.MolToSmiles(mol, isomericSmiles=True)
    except Exception:
        # Best effort: report and skip this entry. `Exception` (rather than a
        # bare except) keeps Ctrl-C / SystemExit working.
        print("not successfully processed smiles: ", smiles)
        continue
    atom_num_dist.append(atom_count)
    canonical_smiles_list.append(canonical)
    remained_smiles.append(smiles)

# Work on an explicit copy so adding a column does not write into a slice of
# the frame returned by read_csv (avoids SettingWithCopyWarning).
smiles_tasks_df = smiles_tasks_df[smiles_tasks_df["smiles"].isin(remained_smiles)].copy()
smiles_tasks_df['cano_smiles'] = canonical_smiles_list

# --- Build or load the per-molecule feature dictionaries --------------------
# Only molecules with fewer than 200 atoms are featurised.
smilesList = [smiles for smiles in canonical_smiles_list if len(Chem.MolFromSmiles(smiles).GetAtoms())<200]
if os.path.isfile(feature_filename):
    # NOTE: pickle can execute arbitrary code on load; only use cache files
    # produced by this project. The cache is not invalidated when the CSV changes.
    with open(feature_filename, "rb") as feature_file:
        feature_dicts = pickle.load(feature_file)
else:
    feature_dicts = save_smiles_dicts(smilesList,filename)

# Keep only the rows for which features are available; copy so downstream
# column assignments operate on an independent frame.
remained_df = smiles_tasks_df[smiles_tasks_df["cano_smiles"].isin(feature_dicts['smiles_to_atom_mask'].keys())].copy()

smiles_tasks_df = remained_df
# --- Load the trained classifier and switch it to inference mode ------------
# NOTE(review): this unpickles a full model object; only load trusted files.
# Recent PyTorch releases may require an explicit weights_only=False here.
model = torch.load('saved_models/PBMT_chemical_screening_model.pt')
model.eval()

# --- Split the row positions into consecutive mini-batches ------------------
valList = np.arange(0,smiles_tasks_df.shape[0])
batch_list = [valList[start:start+batch_size] for start in range(0, smiles_tasks_df.shape[0], batch_size)]

# --- Predict a binary label for every molecule ------------------------------
y_pred_label = []
with torch.no_grad():  # inference only: skip building the autograd graph
    for eval_batch in batch_list:
        # eval_batch holds *positions*, not index labels. The frame was
        # filtered upstream, so its labels may have gaps; .iloc is required
        # (.loc would raise KeyError or pick the wrong rows).
        batch_df = smiles_tasks_df.iloc[eval_batch,:]
        smiles_list = batch_df.cano_smiles.values
        x_atom, x_bonds, x_atom_index, x_bond_index, x_mask, smiles_to_rdkit_list = get_smiles_array(smiles_list,feature_dicts)
        atoms_prediction, mol_prediction = model(torch.Tensor(x_atom),torch.Tensor(x_bonds),torch.cuda.LongTensor(x_atom_index),torch.cuda.LongTensor(x_bond_index),torch.Tensor(x_mask))
        # Softmax over the last dimension; column 1 is used as the score of
        # the positive class.
        y_pred_adjust = F.softmax(mol_prediction,dim=-1).data.cpu().numpy()[:,1]
        # Threshold at 0.5 to obtain hard 0/1 labels.
        y_pred_label.extend(1 if y > 0.5 else 0 for y in y_pred_adjust)

# --- Attach the predictions and write the result table ----------------------
# assign() returns a new frame, so no column is written into a filtered slice.
smiles_tasks_df = smiles_tasks_df.assign(pred_label=y_pred_label)
# Make sure the output directory exists before writing.
os.makedirs("../results", exist_ok=True)
smiles_tasks_df.to_csv("../results/screening_results.csv", index=False)

number of all smiles:  30
feature dicts file saved as ../data/data_example.pickle
