# Imports

In [1]:
_fscores = None
import os.path as op
import pandas as pd
from rdkit import Chem
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
import torch
import pickle
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Descriptors import ExactMolWt
import pickle
import torch.nn.functional as F
from tqdm import tqdm
from rdkit import RDLogger, Chem, DataStructs
from rdkit.Chem import AllChem, Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole
import math
from collections import defaultdict
from rdkit.Chem import Crippen

import os.path as op



# Util functions

In [3]:
def tokenize_smiles(smiles):
    """Split a SMILES string into a list of its individual characters.

    Tokenization is purely character-level, so a two-letter element symbol
    such as ``Cl`` comes back as two separate tokens.
    """
    return [*smiles]
class SMILESDataset(Dataset):
    """Torch ``Dataset`` that encodes SMILES strings into token-id tensors on access.

    Args:
        smiles_list: Indexable, sized collection of SMILES strings.
        tokenizer: Object exposing ``encode(text, return_tensors='pt')``.
            The result is assumed to carry a leading batch dimension of
            size 1 (confirm against the tokenizer in use).
    """

    def __init__(self, smiles_list, tokenizer):
        self.smiles_list = smiles_list
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of SMILES strings in the dataset."""
        return len(self.smiles_list)

    def __getitem__(self, idx):
        """Return the 1-D token-id tensor for the SMILES string at ``idx``."""
        smiles = self.smiles_list[idx]
        encoded = self.tokenizer.encode(smiles, return_tensors='pt')
        # Drop only the leading batch dimension. A bare ``squeeze()`` would
        # also collapse a single-token sequence to a 0-d tensor, which breaks
        # length-based padding (``item.size(0)``) downstream.
        return encoded.squeeze(0)
def collate_fn(batch, pad_token_id=None):
    """Right-pad a batch of 1-D token-id tensors to a common length and stack them.

    Args:
        batch: Non-empty sequence of 1-D tensors.
        pad_token_id: Value written into the padding positions. When omitted,
            ``tokenizer.pad_token_id`` of the module-level ``tokenizer`` is
            used, which must then be defined before the first call.

    Returns:
        Tensor of shape ``(len(batch), longest_item_length)``.
    """
    if pad_token_id is None:
        # Late lookup keeps the one-argument call made by a DataLoader working.
        pad_token_id = tokenizer.pad_token_id
    max_length = max(item.size(0) for item in batch)
    return torch.stack([
        F.pad(item, (0, max_length - item.size(0)), value=pad_token_id)
        for item in batch
    ])

In [4]:
def validate_mols(list_of_smiles):
    """Parse SMILES strings and keep only the ones that parse.

    Args:
        list_of_smiles: Iterable of SMILES strings.

    Returns:
        List, in input order, of every molecule for which
        ``Chem.MolFromSmiles`` did not return ``None``.
    """
    parsed = (Chem.MolFromSmiles(smi) for smi in list_of_smiles)
    return [mol for mol in parsed if mol is not None]

def convert_mols_to_smiles(list_of_mols):
    """Return the ``Chem.MolToSmiles`` string of every molecule, in input order."""
    smiles_out = []
    for mol in list_of_mols:
        smiles_out.append(Chem.MolToSmiles(mol))
    return smiles_out

def smi_to_mols(smiles):
    """Convert SMILES strings to molecules, skipping strings that fail to parse.

    Args:
        smiles: Iterable of SMILES strings.

    Returns:
        List of the molecules ``Chem.MolFromSmiles`` produced, in input
        order; entries for which it returned ``None`` are left out.
    """
    kept = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue
        kept.append(mol)
    return kept

def mol_to_smi(mols):
    """Serialise each molecule in ``mols`` with ``Chem.MolToSmiles``, keeping order."""
    serialised = []
    for mol in mols:
        serialised.append(Chem.MolToSmiles(mol))
    return serialised

In [5]:

def readFragmentScores(name='fpscores'):
    """Load the fragment score table into the module-level ``_fscores``.

    Reads the gzipped pickle ``<name>.pkl.gz`` and flattens it into a
    ``{fragment_id: score}`` dict.

    Args:
        name: Path of the score file without the ``.pkl.gz`` suffix. The
            default resolves to ``fpscores.pkl.gz`` relative to the current
            working directory.

    Note:
        ``pickle`` can execute arbitrary code while loading; only point this
        at a score file obtained from a trusted source.
    """
    import gzip
    global _fscores
    # The context manager closes the handle; a bare gzip.open() leaked it.
    with gzip.open('%s.pkl.gz' % name) as handle:
        data = pickle.load(handle)
    # Each record is laid out as [score, id_1, id_2, ...]: every id after the
    # first element is mapped to that record's score.
    out_dict = {}
    for record in data:
        score = float(record[0])
        for fragment_id in record[1:]:
            out_dict[fragment_id] = score
    _fscores = out_dict


def numBridgeheadsAndSpiro(mol, ri=None):
    """Count the bridgehead and spiro atoms of ``mol``.

    Args:
        mol: Molecule handed to the ``rdMolDescriptors`` counters.
        ri: Accepted for call compatibility; not used.

    Returns:
        Tuple ``(n_bridgehead, n_spiro)``.
    """
    spiro_count = rdMolDescriptors.CalcNumSpiroAtoms(mol)
    bridgehead_count = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
    return (bridgehead_count, spiro_count)


def calculateScore(m):
    """Return the synthetic accessibility score of molecule ``m``.

    The raw score is the sum of three terms: the count-weighted mean fragment
    contribution looked up in ``_fscores`` (unknown fragments count as -4),
    a complexity penalty (size, stereo centres, spiro atoms, bridgeheads,
    macrocycles) and a fingerprint-density correction. The raw value is then
    mapped onto the 1..10 scale.

    Args:
        m: RDKit molecule.

    Returns:
        Float clamped to ``[1.0, 10.0]``, or ``None`` when the Morgan
        fingerprint of ``m`` has no entries (e.g. a molecule without atoms),
        because there is nothing to average over.
    """
    if _fscores is None:
        readFragmentScores()

    # fragment score
    fp = rdMolDescriptors.GetMorganFingerprint(m, 2)  # 2 is the *radius* of the circular fingerprint
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    for bitId, v in fps.items():
        nf += v
        score1 += _fscores.get(bitId, -4) * v
    if nf == 0:
        # Empty fingerprint: bail out instead of dividing by zero below.
        return None
    score1 /= nf

    # features score
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    # Rings with more than 8 atoms are counted as macrocycles.
    nMacrocycles = sum(1 for ring in ri.AtomRings() if len(ring) > 8)

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    # ---------------------------------------
    # This differs from the paper, which defines:
    #  macrocyclePenalty = math.log10(nMacrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    macrocyclePenalty = math.log10(2) if nMacrocycles > 0 else 0.

    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty

    # correction for the fingerprint density
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthesise
    score3 = 0.
    if nAtoms > len(fps):
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # need to transform "raw" value into scale between 1 and 10
    # (named raw_min/raw_max so the builtins min/max are not shadowed)
    raw_min = -4.0
    raw_max = 2.5
    sascore = 11. - (sascore - raw_min + 1) / (raw_max - raw_min) * 9.
    # smooth the 10-end
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore

In [6]:

def generate_smiles(prompt, max_length=100):
    """Generate one sequence from ``prompt`` and return it as decoded text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: Text that is encoded and passed to ``model.generate``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        Decoded text of the first returned sequence, special tokens skipped.
    """
    # The GPU transfer (.to('cuda')) is disabled here.
    prompt_ids = tokenizer.encode(prompt, return_tensors='pt')
    generated = model.generate(prompt_ids, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


In [7]:
def curate(l):
    """Strip the ``!`` and ``<BOS>`` markers from every string in ``l``.

    Returns a new list; ``!`` is removed before ``<BOS>``.
    """
    cleaned = []
    for text in l:
        without_bang = text.replace('!', '')
        cleaned.append(without_bang.replace('<BOS>', ''))
    return cleaned

In [8]:

def iterate_alpha(alpha_code):
    """Return the code that follows ``alpha_code`` in A..Z odometer order.

    Only the first four characters take part in the increment: position 3 is
    the fastest-moving digit and carries into positions 2, 1 and 0. Any
    characters after the fourth are copied through unchanged.

    Raises:
        ValueError: If all four positions would step past ``Z``
            (e.g. ``'ZZZZ'``).
    """
    code_points = [ord(letter) for letter in alpha_code]

    # Walk from the fastest digit to the slowest, carrying while a digit
    # would step past 'Z' (code point 90).
    for position in (3, 2, 1, 0):
        if code_points[position] + 1 > 90:
            code_points[position] = 65  # wrap to 'A' and carry on
            continue
        code_points[position] += 1
        break
    else:
        raise ValueError('Too long for alpha code')

    return "".join(chr(point) for point in code_points)

In [9]:

def write_gen_to_sdf(mols_for_export, generation, codes=None):
    """Tag each molecule with an ID title and write one generation to two SDF files.

    Molecules receive consecutive four-letter IDs starting at ``AAAA`` and a
    ``Title`` property of the form ``id<ID>gen<generation>``. The first three
    quarters of the molecules go to ``./generations/gen<generation>-1.sdf``
    and the remainder to ``./generations/gen<generation>-2.sdf``.

    Args:
        mols_for_export: Sequence of RDKit molecules.
        generation: Generation label; converted with ``str`` for both the
            title and the file names.
        codes: Optional list that receives the assigned IDs. A fresh list is
            created when omitted.

    Returns:
        The list holding the assigned ID codes (``codes`` itself when given).

    Raises:
        ValueError: Propagated from ``iterate_alpha`` once the four-letter ID
            space is exhausted.
    """
    # Local import: the ``PropertyMol`` submodule is not guaranteed to be
    # loaded by ``from rdkit import Chem`` alone.
    from rdkit.Chem.PropertyMol import PropertyMol

    if codes is None:
        codes = []

    id_code = 'AAAA'
    mols_to_export = []
    for mol in mols_for_export:
        codes.append(id_code)
        pm = PropertyMol(mol)
        # The title lets each molecule be tracked in the PyRx GUI and in
        # PyRx results exports.
        pm.SetProp('Title', 'id' + str(id_code) + 'gen' + str(generation))
        mols_to_export.append(pm)
        id_code = iterate_alpha(id_code)

    # The split point sits at 3/4 of the list, not at the middle.
    split_at = int(len(mols_to_export) * 3 / 4)
    batches = (
        ('-1.sdf', mols_to_export[:split_at]),
        ('-2.sdf', mols_to_export[split_at:]),
    )
    for suffix, batch in batches:
        writer = Chem.SDWriter('./generations/gen' + str(generation) + suffix)
        try:
            for pm in batch:
                writer.write(pm)
        finally:
            # Closing flushes the final record. This replaces the former
            # workaround of writing a throwaway molecule to
            # ./generations/junk/todelete.sdf, which also failed with a
            # NameError whenever a batch was empty.
            writer.close()

    return codes

In [10]:
from Bio.PDB import PDBParser
import numpy as np

def calculate_center_and_size(pdb_file, binding_site_residues, chain_id='A'):
    """Compute the centre and extent of the box spanned by selected residues.

    Args:
        pdb_file: Path handed to ``PDBParser.get_structure``.
        binding_site_residues: Residue ids used to index the chain.
        chain_id: Chain looked up in the first model of the structure.

    Returns:
        Tuple ``(center, size)``: the mean atom coordinate and the per-axis
        ``max - min`` span over all atoms of the selected residues.
    """
    structure = PDBParser().get_structure('protein', pdb_file)
    chain = structure[0][chain_id]

    # Gather the coordinates of every atom in the binding-site residues.
    atom_coords = np.array([
        atom.coord
        for res_id in binding_site_residues
        for atom in chain[res_id]
    ])

    center = atom_coords.mean(axis=0)
    size = atom_coords.max(axis=0) - atom_coords.min(axis=0)
    return center, size

## Lipinski rule of 5 / Ro5

The rule of 5 indicates that poor absorption is more likely to occur when there are more than (i) 5 hydrogen-bond donors, (ii) 10 (5 × 2) hydrogen-bond acceptors, (iii) a molecular weight greater than 500 (5 × 100), and (iv) a calculated Log P (cLogP) greater than 5. 

In [32]:
def GetMolW(smi):
    """Return ``Descriptors.MolWt`` for the molecule parsed from SMILES ``smi``."""
    return Descriptors.MolWt(Chem.MolFromSmiles(smi))

In [15]:
# Spot-check on ethanol; the recorded output of this cell is 46.069.
GetMolW('CCO')

46.069

In [16]:
def getHbondDonors(smi):
    """Return ``Lipinski.NumHDonors`` for the molecule parsed from SMILES ``smi``."""
    return Chem.Lipinski.NumHDonors(Chem.MolFromSmiles(smi))

In [18]:
# Spot-check on ethanol; the recorded output of this cell is 1.
getHbondDonors('CCO')

1

In [19]:
def getHbondAcceptors(smi):
    """Return ``Lipinski.NumHAcceptors`` for the molecule parsed from SMILES ``smi``."""
    return Chem.Lipinski.NumHAcceptors(Chem.MolFromSmiles(smi))

In [20]:
# Spot-check on ethanol; the recorded output of this cell is 1.
getHbondAcceptors('CCO')

1

In [21]:
def getLogP(smi):
    """Return ``Crippen.MolLogP`` for the molecule parsed from SMILES ``smi``."""
    return Crippen.MolLogP(Chem.MolFromSmiles(smi))

In [22]:
# Spot-check on ethanol; the recorded output of this cell is about -0.0014.
getLogP('CCO')

-0.0014000000000000123

In [23]:
def getQED(smi):
    """Return the ``QED.qed`` score for the molecule parsed from SMILES ``smi``."""
    return Chem.QED.qed(Chem.MolFromSmiles(smi))

The closer the score is to 1, the more drug-like the molecule.

In [24]:
# Spot-check on ethanol; the recorded output of this cell is about 0.407.
getQED('CCO')

0.40680796565539457

In [25]:
def GetSAS(smi):
    """Return ``calculateScore`` for the molecule parsed from SMILES ``smi``."""
    return calculateScore(Chem.MolFromSmiles(smi))
    

In [26]:
# Spot-check on ethanol; the recorded output of this cell is about 1.98.
GetSAS('CCO')

1.9802570386349831

In [28]:
def GetGoodDrugs(smiles, *, max_mol_weight=500, max_h_donors=5,
                 max_h_acceptors=10, max_logp=5, min_qed=0.65, max_sas=3.5):
    """Keep the SMILES that pass the rule-of-five, QED and SA-score filters.

    Args:
        smiles: Iterable of SMILES strings.
        max_mol_weight: Upper bound on ``GetMolW``.
        max_h_donors: Upper bound on ``getHbondDonors``.
        max_h_acceptors: Upper bound on ``getHbondAcceptors``.
        max_logp: Upper bound on ``getLogP``.
        min_qed: Lower bound on ``getQED``.
        max_sas: Upper bound on ``GetSAS``.

    Returns:
        List of the input strings, in input order, that satisfy every bound.
        The defaults reproduce the previously hard-coded thresholds.
    """
    def passes_all_filters(smile):
        # ``and`` short-circuits, so a descriptor is only computed for
        # candidates that passed every earlier check -- the same work the
        # former chain of staged loops did, without the intermediate lists.
        return (
            GetMolW(smile) <= max_mol_weight
            and getHbondDonors(smile) <= max_h_donors
            and getHbondAcceptors(smile) <= max_h_acceptors
            and getLogP(smile) <= max_logp
            and getQED(smile) >= min_qed
            and GetSAS(smile) <= max_sas
        )

    return [smile for smile in smiles if passes_all_filters(smile)]
    

In [30]:
listofsmiles = ['O=C1Nc2cc(NC(=O)c3c[nH]cc(-c4ccc(C(F)(F)F)cc4)c3=O)ccc2C1=Cc1ccc[nH]1','CC(NC(=O)Nc1cc2[nH]nc(N3CC(C(C)(C)O)C3)c2cn1)c1ccccc1','N=C(N)C1CCCC(NC(=O)CN2CCCCC(NS(=O)(=O)c3ccccc3)C2=O)C1O','CCN1C(C(=O)NC(Cc2ccccc2)C(=O)C(=O)NCCCN2CCCC2=O)Cc2cc3c(cc2S1(=O)=O)OCCO3','COCC(=O)NC1CCC(CCN2CCC(c3cccc4occc34)CC2)CC1','O=C(c1cc(-c2ccc3[nH]ncc3c2)on1)N1CCCCC1','CN(c1ccc(C(O)(C(F)(F)F)C(F)(F)F)cc1)S(=O)(=O)c1cccc(F)c1','CC1(C)Cc2ccccc2N1C(=O)CN1CCN(Cc2ccc(Cl)cc2)CC1','CCC(C)C(=O)C1CCC2C3CCC4NC(=O)C=CC4(C)C3CCC12C','C=CCc1cnc(Cc2cc(C3OC(CO)C(O)C(O)C3O)ccc2Cl)s1','CC(C)CC(NC(=O)c1ccc(N2CCN(C)CC2)cc1)C(=O)N1CCC2OCC(=O)C21','CCOc1ccc(-c2cccc(S(=O)(=O)NC(Cc3cccc(C(=N)N)c3)C(=O)N3CCC(CCN)CC3)c2)cc1','COC(=O)C1OC(SNCc2ccc(S(N)(=O)=O)cc2)C(OC(C)=O)C(OC(C)=O)C1OC(C)=O','Cc1noc(C)c1CCC1CCN(S(=O)(=O)CC2(N(O)C=O)CCCCCC2)CC1','CCC(=O)NCC1CCCc2c1c1cc(OC(F)(F)F)ccc1n2C','Cc1ccnc(-c2nc3cc(F)cc(F)c3c(N3CC4(CCOCC4)c4ncc(N5CCOCC5)cc43)c2C)c1','CC1CN(Cc2cccc(-c3cc(CNC(=O)c4cccc(CN5CCNCC5)c4)ccc3F)c2)CCN1','O=C(O)C1CCCCC1NC(=O)C1CCCN1S(=O)(=O)c1cc(Cl)cc(Cl)c1','CCCCC(CC)C(=O)OCC1(CO)CC(=Cc2ccccc2C(F)(F)F)C(=O)O1','COc1cc(C(=O)NC2(C(=O)NC(C)c3ccc(-c4cc(Cl)cc(F)c4-c4nnn(C)n4)cc3F)COC2)on1','O=S(=O)(c1cccc2ccccc12)n1ccc2c(N3CCNCC3)c(Cl)ccc21','Cn1c(=O)c2c(nc(OCc3cccc(C(F)(F)F)c3)n2C)n(C)c1=O','CCNC(=O)c1noc(-c2cc(C(C)C)c(O)cc2O)c1-c1ccc(CN2CCCCC2)cc1','COc1ccccc1NC(=O)N1CCN(Cc2ccc(Br)c(Br)c2)CC1','CCn1c(=O)c2cn[nH]c2c2ccc(-c3ccccc3CN(C)C)cc21','COc1ccc(C(=O)NCC(c2ccccc2)N2CCN(CC(O)COc3ccc(Cl)cc3)CC2)cc1','CCCCC(CC(CCc1ccc(-c2ccc(C=O)cc2)cc1)C(=O)NC(C(=O)NC)C(C)(C)C)C(=O)O','CN1C2CCC1C(c1cnc(Cl)c(-c3ccccc3)c1)C2','Cc1cc(O)cc(C)c1CC(N)C(=O)NCCc1nc(O)c(CCNC(=O)C(N)Cc2c(C)cc(O)cc2C)nc1C','CC1(c2cc(NC(=O)c3cnc(-c4ncco4)cn3)ccc2F)C=CSC(N)=N1','COc1ccc(C(F)(F)F)cc1-c1cccn2nc(Nc3ccc4c(c3)CCN(CC(=O)N(C)C)C4)nc12','O=C(CCCN1C2CCC1c1c([nH]c3ccccc13)C2)c1cccs1','N=C(N)Nc1ncc(Cl)c2ccc(-c3cccc(C(=O)O)c3)cc12','CC(C)CCn1c(=O)c(C2=NS(=O)(=O)c3cc(OC(C)C(N)=O)ccc3N2)c(O)c2cccnc21','CCC1(O)C(=O)OCc2c1cc1n
(c2=O)Cc2c-1nc1ccccc1c2C=Nc1ccccc1C','COC(=O)c1ccc2nc(Nc3c(C)cccc3Cl)c3cncn3c2c1','Cc1cc(NC(C)C)nc2ccc(NC(=O)COc3ccc(C(F)(F)F)cc3)cc12','NS(=O)(=O)c1ccc(NC(S)=NC(CCC(=O)O)C(=O)O)cc1','CC(C)(C)C1(O)CCN(CC2c3ccccc3C=Cc3ccccc32)CC1','CC(C)C(=O)NC1CC2(O)C3Cc4ccc(O)cc4C2(CCN3CC2CC2)CC1=O','CCOC(=O)c1cncc(-c2cccc(C(C)Nc3cc(-c4ccc5occ(C)c5c4)nc(C)n3)c2)c1','COc1ccc2c(c1)C(c1ccc(Cl)cc1)=NC(CC(=O)Nc1cccs1)c1nnc(C)n1-2','CC(CNc1ccc(C(=O)O)cc1)NCC(O)c1cccc(Cl)c1','CN(C)CCNc1ccc2c(=O)n(CCN(C)C)c(=O)n3c4ccccc4c(=O)c1c23','CC(C)Oc1ccc(-c2cc3ncccc3c(OCC3CNC(=O)C3)n2)cc1','Nc1ncnc2c1ncn2C1OC(C(=O)NC23CC4CC(C2)C(C4)C3)C(O)C1O','O=C(c1cc(Br)c2c(c1)C(O)(C(F)(F)F)c1ccccc1-2)N1CCC1','NC(=O)C1C2C=CC(C2)C1Nc1nc(Nc2cnn(CC3CCCN3)c2)ncc1Cl','COc1cc(OCc2csc(N3CCN(c4ccccc4)CC3)n2)c2cc(-c3cn4nc(C)ccc4n3)oc2c1','O=C(O)c1ccc(-c2ccc(Cl)c(OC3OC(CO)C(O)C(O)C3O)c2)cc1','CC(C)(C)n1ncc2c1C(=O)NC1(CCN(C(=O)c3ccc4n[nH]cc4c3)CC1)C2','O=C1C(SCCO)=C(SCCO)C(=O)N1c1ccc(-c2ccccc2)cc1','O=C1C2CCCCN2C(=O)N1CCCCN1CCN(c2ccccc2)CC1','COc1noc2c(F)c3c(cc12)CC1(C(=O)NC(=O)NC1=O)C1C(C)OC(C)CN31','CN(C)CCNC(=O)c1cccc(Nc2nc3cc(C(=O)O)ccc3c3sccc23)c1','CCN(c1ccc(OCC(C)C)c(C(C)C)c1)c1ccc(C(=O)O)cn1','CCCOc1ccc(-c2ccc(-c3ccccc3Cl)n2CC(=O)N=C(N)NCc2cccs2)cc1','OC(CN1CCN(C(c2ccccc2)c2ccccc2)CC1)Cn1cnc2c(-n3cccn3)ncnc21','COc1ccccc1N1CCN(CCCCn2nc(-n3ccnc3)ccc2=O)CC1','CC(C)(C)c1ccc(NC(=O)c2cccc(N3CCc4nc(N)ncc4C3)c2)cc1','CC(C)(N)c1cc2nc(-c3cnc(N)nc3)nc(N3CCOCC3)c2s1','CCc1cc(C(=O)O)c(NC(=O)c2ccc([N+](=O)[O-])o2)s1','O=C(Nc1ccc(I)cc1)c1ccc(Cl)[n+]([O-])c1','O=C(CCCCCCc1ccccc1)c1nc(C(F)(F)F)co1','Cc1scnc1C(=O)Nc1ccc(N2Cc3c(Cl)cccc3C2=O)c(Cl)c1','NC(=Nc1ccc2ccn(CCCN3CCOCC3)c2c1)c1cccs1','O=S(=O)(CC(F)(F)F)N(Cc1cccnc1)c1cccc(OC2CCC2)c1','CCOC(=O)C1CCC2(CC1)OOC1(OO2)C2CC3CC(C2)CC1C3','OC1(c2ccccc2)CCN(Cc2cc3ccccc3s2)CC1','N#Cc1cccc(-c2nc(N)c3cc(CN4CCOCC4)sc3n2)c1','CC(C)n1cc(C(=O)c2cncc(NC(=O)Cn3cnc(Cl)c3Cl)c2)c2cncnc21','CN(C)CCn1c(=O)c2ccc3c4c(nn3CCN3CCCC3)-c3ccccc3-n(c1=O)c24','CCOC(=O)c1cn2ncnc(Nc3ccc(Br)c(C(=O)NOC)c3)c2c1C(C)C
','CC(O)c1ccc(-c2c(O)ccc3[nH]c(=O)c4sccc4c23)cc1','CC(Nc1nc(NCCc2ccc(S(N)(=O)=O)cc2)nc(NC(C)C(=O)O)n1)C(=O)O','CC(=O)Nc1cccc(-c2ccc(C(O)(c3c[nH]cn3)C(C)C)cc2)n1','CC1(N2CCc3c(-c4cnc(N)nc4)nc(N4CCOCC4)nc32)CCN(C(=O)CC(F)(F)F)C1','CCCCCC(O)c1ccc(OCc2ccc3ccccc3n2)cc1','CC(Cc1ccccn1)N1C(=O)c2ccccc2C1C(=O)NCc1ccc(OC(F)(F)F)cc1','CC(Nc1cc(F)cc(F)c1)c1cc(C(=O)N2CCSCC2)cc2c(=O)cc(N3CCOCC3)oc12','OC1C(CCl)OC(n2cnc3c(NC4CCCC4)nc(Cl)nc32)C1O','COc1ccc2cccc(OCC(=O)NCC(O)CN3CCc4ccccc4C3)c2n1','C=C(c1cc(Cl)cc(Cl)c1OCC(O)CNC(C)(C)C)n1ccnc1','O=C(O)CCC(c1nc2ccccc2o1)n1cc(C=CC(=O)NO)nn1','COc1cccc(OC)c1-c1cccc2c1CCC(N(C)C)C2','Nc1nc(OC(c2ccccc2-c2ccco2)C(F)(F)F)cc(-c2ccc(CC(N)C(=O)O)cc2)n1','COc1ccccc1-n1nc(C)c2c1C(=O)N(c1cc(C)c3nnc(C)n3c1)C2c1ccc(F)cc1F','Cc1noc(-c2ccccc2C(=O)NC2CCCC2Nc2ncc(C(F)(F)F)cc2F)n1','COc1ccc(NC(C)=O)c(OCC(N)CN2CCC3(CC2)Cc2cc(Cl)ccc2O3)c1','O=[N+]([O-])c1cccc(N2CCN(c3ncc(C(O)(C(F)(F)F)C(F)(F)F)s3)CC2)c1','Cn1ncc(Br)c1-c1cc(NC(=O)Nc2ccc(Cl)cc2)ccc1O','FC(F)Oc1ccc(-c2nnc3cncc(Oc4ccc(C(F)(F)F)cc4)n23)cc1','COC(=O)NC1CCC2(C)C(=CCC3C2CCC2(C)C(n4ccnc4)=CCC32)C1','COc1cc(-c2cc3ncccc3c(OC(C)C3CNC(=O)C3)n2)cc(OC)c1Cl','CC1CN(c2sc(C(F)(F)F)nc2-c2nc3ccccc3[nH]2)CCN1C(=O)Cn1cnc2c1CN(C)CC2','COc1cc(C2c3cc4c(cc3C=C(C=NNc3ccccc3)C2CO)OCO4)cc(OC)c1OC','O=c1[nH]nc2c3cc(CN4CCC(O)CC4)ccc3oc3cccc1c32','O=C(COc1cccc(Cl)c1)NC1CCN(Cc2ccc3ccccc3c2)CC1','C#Cc1cc(C)cc(N2C(=O)c3ccccc3C2(O)c2ccc3[nH]c(NC(=O)OC)nc3c2)c1','Cc1[nH]nc2c1N=C(c1ccccc1)c1cc(F)ccc1N2']

In [34]:
# Run the full filter chain over the sample list; the survivors are shown in the cell output.
GetGoodDrugs(listofsmiles)

['COCC(=O)NC1CCC(CCN2CCC(c3cccc4occc34)CC2)CC1',
 'O=C(c1cc(-c2ccc3[nH]ncc3c2)on1)N1CCCCC1',
 'CN(c1ccc(C(O)(C(F)(F)F)C(F)(F)F)cc1)S(=O)(=O)c1cccc(F)c1',
 'CC1(C)Cc2ccccc2N1C(=O)CN1CCN(Cc2ccc(Cl)cc2)CC1',
 'CC(C)CC(NC(=O)c1ccc(N2CCN(C)CC2)cc1)C(=O)N1CCC2OCC(=O)C21',
 'CCC(=O)NCC1CCCc2c1c1cc(OC(F)(F)F)ccc1n2C',
 'O=C(O)C1CCCCC1NC(=O)C1CCCN1S(=O)(=O)c1cc(Cl)cc(Cl)c1',
 'Cn1c(=O)c2c(nc(OCc3cccc(C(F)(F)F)c3)n2C)n(C)c1=O',
 'COc1ccccc1NC(=O)N1CCN(Cc2ccc(Br)c(Br)c2)CC1',
 'CC(C)Oc1ccc(-c2cc3ncccc3c(OCC3CNC(=O)C3)n2)cc1',
 'O=C(c1cc(Br)c2c(c1)C(O)(C(F)(F)F)c1ccccc1-2)N1CCC1',
 'O=C1C(SCCO)=C(SCCO)C(=O)N1c1ccc(-c2ccccc2)cc1',
 'CC(C)(C)c1ccc(NC(=O)c2cccc(N3CCc4nc(N)ncc4C3)c2)cc1',
 'CC(C)(N)c1cc2nc(-c3cnc(N)nc3)nc(N3CCOCC3)c2s1',
 'O=S(=O)(CC(F)(F)F)N(Cc1cccnc1)c1cccc(OC2CCC2)c1',
 'OC1(c2ccccc2)CCN(Cc2cc3ccccc3s2)CC1',
 'N#Cc1cccc(-c2nc(N)c3cc(CN4CCOCC4)sc3n2)c1',
 'CC(=O)Nc1cccc(-c2ccc(C(O)(c3c[nH]cn3)C(C)C)cc2)n1',
 'C=C(c1cc(Cl)cc(Cl)c1OCC(O)CNC(C)(C)C)n1ccnc1',
 'COc1cccc(OC)c1-c1cccc2c1C

In [35]:
def get_metrics(Smiles):
    """Compute validity and uniqueness ratios for a batch of SMILES strings.

    Args:
        Smiles: Sized collection of SMILES strings.

    Returns:
        Dict with ``"Validity"`` (share of entries that ``validate_mols``
        keeps) and ``"Uniqueness"`` (share of distinct strings), each between
        0 and 1. Both are ``0.0`` for an empty batch.
    """
    total = len(Smiles)
    if total == 0:
        # Nothing to score: report zeros rather than dividing by zero.
        return {"Validity": 0.0, "Uniqueness": 0.0}
    val = len(validate_mols(Smiles)) / total
    uniqueness = len(set(Smiles)) / total
    # define other metrics here
    return {"Validity": val, "Uniqueness": uniqueness}

# Loading the model

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('fine_tuned_gpt2_smiles')
tokenizer = GPT2Tokenizer.from_pretrained('fine_tuned_gpt2_smiles')

# Ensure the pad token is set
# (the EOS token doubles as the pad token)
tokenizer.pad_token = tokenizer.eos_token

# Number of sequences to sample; each loop iteration below produces one.
num_smiles = 3
generated_smiles_list = []

for _ in tqdm(range(num_smiles)):
    # Every sample starts from the same '<BOS>' prompt.
    input_ids = tokenizer.encode('<BOS>', return_tensors='pt')
    # Mask is 1 wherever the prompt token differs from the pad id.
    attention_mask = input_ids.ne(tokenizer.pad_token_id).long()
    
    # Sampling-based decoding (do_sample=True) with top-k and top-p limits.
    generated_ids = model.generate(
        input_ids, 
        attention_mask=attention_mask, 
        max_length=100, 
        temperature=1.0, 
        top_k=50, 
        top_p=0.95, 
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    generated_smiles = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    generated_smiles_list.append(generated_smiles)
# Strip the '!' and '<BOS>' markers from the decoded strings.
generated_smiles_list = curate(generated_smiles_list)

print(generated_smiles_list)


In [None]:
# Parse the generated strings; only the ones that parse are kept, so the
# length shown is the number of parseable samples.
mols = smi_to_mols(generated_smiles_list)
len(mols)

In [None]:
# Print the synthetic accessibility score of every parsed molecule.
for m in mols:
    print(calculateScore(m))

In [None]:

# Example usage
# Derives a box (centre and per-axis extent) from the atoms of the listed
# residues of one chain in the PDB file.
pdb_file = '8pn6.pdb'
binding_site_residues = [50, 51, 52, 53, 54, 55]  # Replace with your binding site residues
chain_id = 'A'  # Adjust if necessary

center, size = calculate_center_and_size(pdb_file, binding_site_residues, chain_id)
print(f"Center: {center}")
print(f"Size: {size}")


In [None]:
import subprocess

# Convert SMILES to 3D structure and save as PDB
smiles = "CCO"  # Example SMILES for ethanol
mol = Chem.MolFromSmiles(smiles)
mol = Chem.AddHs(mol)
# EmbedMolecule returns -1 when no conformer could be generated; stop here
# instead of optimising and exporting a molecule without 3D coordinates.
if AllChem.EmbedMolecule(mol, AllChem.ETKDG()) == -1:
    raise RuntimeError(f"3D embedding failed for SMILES {smiles!r}")
AllChem.UFFOptimizeMolecule(mol)
Chem.MolToPDBFile(mol, "ligand.pdb")
mgltools_path = r'C:\Program Files (x86)\MGLTools-1.5.7\Lib\site-packages\AutoDockTools'

# Convert PDB to PDBQT using MGLTools
# check=True makes a failed conversion raise CalledProcessError right away,
# so later steps never run against missing or stale .pdbqt files.
subprocess.run(["python", f"{mgltools_path}\\Utilities24\\prepare_ligand4.py", "-l", "ligand.pdb", "-o", "ligand.pdbqt"], check=True)
subprocess.run(["python", f"{mgltools_path}\\Utilities24\\prepare_receptor4.py", "-r", "8pn6.pdb", "-o", "8pn6.pdbqt"], check=True)

# Create Vina configuration file
# NOTE(review): the box centre/size are hard-coded; presumably copied from a
# calculate_center_and_size run -- confirm they match the current receptor.
config = """
receptor = 8pn6.pdbqt
ligand = ligand.pdbqt
center_x = -26.03602
center_y = 14.734461
center_z = -14.99357
size_x = 14.377998
size_y = 16.296
size_z = 18.43
"""

with open("config.txt", "w") as file:
    file.write(config)

# Run AutoDock Vina
# NOTE(review): --log is presumably only accepted by older Vina releases --
# confirm against the installed version.
command = [
    "vina",
    "--config", "config.txt",
    "--log", "log.txt",
    "--out", "out.pdbqt"
]

# Fail loudly if docking itself fails, rather than printing an old log below.
subprocess.run(command, check=True)

# Analyze the results (optional, for example purposes)
with open("log.txt") as log_file:
    print(log_file.read())