In [1]:
import openai
import random
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.metrics import f1_score,accuracy_score
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import DataStructs
from rdkit.Chem import rdMolDescriptors
from rdkit import Chem
import warnings
from rdkit import RDLogger
from steamship import Steamship
import datetime
import os 

Data preprocessing

In [2]:
random.seed(42) 
#read bace dataset
bace = pd.read_csv("/home/kguo2/PycharmProjects/GPT_test/Propperty_prediction/data/BACE.csv")
sample_size = 100
bace_sample= bace.sample(sample_size)
bace.drop(bace_sample.index, inplace = True)

In [3]:
##save sampled dataset

bace_sample.to_csv("/home/kguo2/PycharmProjects/MolR/data/BACE/BACE_test.csv",index = False)
bace.to_csv("/home/kguo2/PycharmProjects/MolR/data/BACE/BACE_train.csv",index =False)
print(bace_sample['Class'].value_counts())

0    57
1    43
Name: Class, dtype: int64


Generate result by GPT(Davinci-003,GPT-3.5, GPT-4)

In [4]:
def generate_response_by_gpt4(prompt):
    client = Steamship(workspace="gpt-4-g4d")

    # Create an instance of this generator
    generator = client.use_plugin('gpt-4', config={"temperature":0.7, "n": 5})

    # Generate text
    task = generator.generate(text=prompt)
    # Wait for completion of the task.
    task.wait()
    # Print the output
#     message = task.output.blocks[0].text.strip()
    message = task.output.blocks
    message = [i.text.strip() for i in message]
    return message

def generate_response_by_gpt35(prompt, model_engine = "gpt-3.5-turbo"):
    completion = openai.ChatCompletion.create(
        model=model_engine, temperature=1, n=5, 
        messages=[{"role": "user", "content": prompt}],
    )
#     message = completion.choices[0].message.content.strip()

    message = completion.choices
    message = [i.message.content.strip() for i in message]
    return message


def generate_response_by_davinci(prompt, model_engine = 'text-davinci-003'):
    completion = openai.Completion.create(
      engine=model_engine,
      prompt=prompt,
      temperature=1.2,
      max_tokens=256,
#       top_p=1.0,
      frequency_penalty=0.0,
      presence_penalty=0.0, 
      n=5,
    )
#     message = completion.choices[0]['text'].strip()
    
    message = completion.choices
    message = [i['text'].strip() for i in message]
    return message

Sampling strategy: 1. random sample, 2.scaffold sample


In [5]:
# random sampling
def radom_sample_examples(bace,sample_size):
    positive_examples = bace[bace["Class"] == 1].sample(int(sample_size/2))
    negative_examples = bace[bace["Class"] == 0].sample(int(sample_size/2))
    smiles = positive_examples["mol"].tolist() + negative_examples["mol"].tolist()
    
    class_label = positive_examples["Class"].tolist() + negative_examples["Class"].tolist()
    #convert 1 to "Yes" and 0 to "No"" in class_label
    class_label = ["Yes" if i == 1 else "No" for i in class_label]
    bace_examples = list(zip(smiles, class_label))
    return bace_examples

# scaffold sampling

def top_k_scaffold_similar_molecules(target_smiles, bace_data, k):
    #drop the target_smiles from the dataset
    bace_data = bace_data[bace_data["mol"] != target_smiles]
    molecule_smiles_list = bace_data['mol'].tolist()
    label_list = bace_data['Class'].tolist()
    label_list = ["Yes" if i == 1 else "No" for i in label_list]

    target_mol = Chem.MolFromSmiles(target_smiles)
    if target_mol is not None:
        target_scaffold = MurckoScaffold.GetScaffoldForMol(target_mol)
    else:
        print("Error: Unable to create a molecule from the provided SMILES string.")
        #drop the target_smiles from the dataset
        return None

    target_scaffold = MurckoScaffold.GetScaffoldForMol(target_mol)
    target_fp = rdMolDescriptors.GetMorganFingerprint(target_scaffold, 2)
    RDLogger.DisableLog('rdApp.warning')
    warnings.filterwarnings("ignore", category=UserWarning)
    similarities = []
    
    for i,smiles in enumerate(molecule_smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        try:
            scaffold = MurckoScaffold.GetScaffoldForMol(mol)
            scaffold_fp = rdMolDescriptors.GetMorganFingerprint(scaffold, 2)
            tanimoto_similarity = DataStructs.TanimotoSimilarity(target_fp, scaffold_fp)
            # print(tanimoto_similarity)
            similarities.append((smiles, tanimoto_similarity,label_list[i]))
        except:
            continue
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_5_similar_molecules = similarities[:k]
    return top_5_similar_molecules

Sampling examples

In [6]:
sample_size = 4
target_smiles = "O1C[C@]2(N=C1N)c1cc(ccc1Oc1c2cc(OCC(C)C)cc1)-c1cncnc1"
random_examples = radom_sample_examples(bace_sample,sample_size)
print("randomly sampling examples", radom_sample_examples(bace_sample,sample_size))
print("scaffold sampling examples", top_k_scaffold_similar_molecules(target_smiles, bace_sample,sample_size))

randomly sampling examples [('O(C)c1cc(ccc1)-c1cc(ccc1)C1(N=C(N)N(C)C(=O)C1)C', 'Yes'), ('O(C)c1ccc(cc1)C1(N=C(N2C1=NCCC2)N)c1cc(ccc1)-c1cccnc1', 'Yes'), ('O(C)c1cc(ccc1OC)C(=O)N[C@@H](Cc1ccc([N+](=O)[O-])cc1)C(=O)N[C@H]([C@@H](O)C[C@H](C(=O)NCC(C)C)C)CC(C)C', 'No'), ('Clc1cc2CC([NH+]=C(N[C@@H](Cc3ccccc3)C=3NC(=O)C(=CN=3)C#N)c2cc1)(C)C', 'No')]
scaffold sampling examples [('O1CC(N=C1N)(c1cc(ccc1)-c1cccnc1)c1ccc(OC)cc1', 0.4536082474226804, 'No'), ('Fc1cc(cc(c1)-c1cncnc1)C1(N=C(N)N(C)C1=O)c1ccc(OC(F)(F)F)cc1', 0.4444444444444444, 'No'), ('n1ccc(cc1)C1(N=C(N)c2c1cccc2)c1cc(ccc1)-c1cncnc1', 0.44339622641509435, 'No'), ('Fc1c2c(ccc1)C(N=C2N)(c1cc(C#N)c(OC)c(c1)C)c1cc(ccc1)-c1cncnc1', 0.44339622641509435, 'Yes')]


In [8]:
openai.api_key = "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" #enter your key her

In [12]:
def create_bace_prompt(input_smiles,pp_examples):
    prompt = "You are an expert chemist, your task is to predict the property of molecule using your experienced chemical property prediction knowledge.\nPlease strictly follow the format, no other information can be provided. Given the SMILES string of a molecule, predict the molecular properties of a given chemical compound based on its structure, by analyzing wether it can inhibit(Yes) the Beta-site Amyloid Precursor Protein Cleaving Enzyme 1 (BACE1) or cannot inhibit(No) BACE1. Consider factors such as molecular weight, atom count, bond types, and functional groups in order to assess the compound's drug-likeness and its potential to serve as an effective therapeutic agent for Alzheimer's disease,please answer with only Yes or No. A few examples are provided in the beginning.\n"
    for example in pp_examples:
        prompt += f"SMILES: {example[0]}\nBACE-1 Inhibit: {example[-1]}\n"
    prompt += f"SMILES: {input_smiles}\nBACE-1 Inhibit:\n"
    return prompt

In [13]:
input_smiles = "O1C[C@]2(N=C1N)c1cc(ccc1Oc1c2cc(OCC(C)C)cc1)-c1cncnc1"
example_prompt = create_bace_prompt(input_smiles,random_examples)
print(example_prompt)

You are an expert chemist, your task is to predict the property of molecule using your experienced chemical property prediction knowledge.
Please strictly follow the format, no other information can be provided. Given the SMILES string of a molecule, predict the molecular properties of a given chemical compound based on its structure, by analyzing wether it can inhibit(Yes) the Beta-site Amyloid Precursor Protein Cleaving Enzyme 1 (BACE1) or cannot inhibit(No) BACE1. Consider factors such as molecular weight, atom count, bond types, and functional groups in order to assess the compound's drug-likeness and its potential to serve as an effective therapeutic agent for Alzheimer's disease,please answer with only Yes or No. A few examples are provided in the beginning.
SMILES: S1(=O)(=O)N(c2cc(cc3n(cc(CC1)c23)CC)C(=O)NC([C@H](O)C[NH2+]CCC(F)(F)F)Cc1ccccc1)C
BACE-1 Inhibit: Yes
SMILES: s1cc(cc1)[C@@]1(N=C(N)N(C)C1=O)c1cc(ccc1)-c1cccnc1
BACE-1 Inhibit: Yes
SMILES: O(C)c1ccc(cc1CC)C1(N=C(N)N(C

Inference gpt models for In-context Learning

In [None]:
model_engine = ['gpt-4','gpt-3.5','davinci-003']
# model_engine = [ 'davinci-003']
sample_nums = [4,8]
sample_methods = ['random','scaffold']
detail_save_folder = '' # path to save the generated result 
paras = 0
for sample_method in sample_methods:
    for sample_num in sample_nums:
        for model in model_engine:
            if paras < 0:
                paras += 1
                continue
            # elif paras == 3:
            #     #terminate the program
            #     exit()
            detail_predict_file = detail_save_folder + 'test_{}_{}_{}_{}.csv'.format('bace', model, sample_num, sample_method)
            log_file = detail_save_folder + 'test_{}_{}_{}_{}.log'.format('bace', model, sample_num, sample_method)
            print(detail_predict_file)
            print()
            
            if os.path.exists(detail_predict_file):
                detail_results = pd.read_csv(detail_predict_file)
                #convert the column to list
                detail_results = detail_results.values.tolist()
            else:
                detail_results = []
                
                
            now = datetime.datetime.now()
            date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
            with open(log_file, "a") as file:
                file.write("=" * 30 + date_time_str + "=" * 30 + "\n")
                
            if sample_method == 'random':
                para_index = 0 
                bace_examples = radom_sample_examples(bace,sample_num)
                for i in tqdm(range(0, len(bace_sample))):
                    # print(para_index)
                    if para_index < 0:
                        para_index += 1
                        continue
                    example = [(bace_sample.iloc[i]['mol'],bace_sample.iloc[i]['Class'])]
                    pred_y = []
                    generated_results = []
                    for text in example:
                        prompt = create_bace_prompt(text[0], bace_examples)
                        with open(log_file, "a") as file:
                            file.write(prompt + "\n")
                            file.write("=" * 50 + "\n")
                        if model == 'davinci-003':
                            generated_p = generate_response_by_davinci(prompt)    
                        elif model == 'gpt-3.5-turbo':
                            generated_p = generate_response_by_gpt35(prompt)
                        elif model == 'gpt-4':
                            generated_p = generate_response_by_gpt4(prompt)

                        # generated_p = [1 if i == "Yes" else 0 for i in generated_p]
                        generated_results.append(generated_p)
                        detail_results.append([text[0]] + [text[-1]] + generated_p)
                        
                        print('training iterations',i)
                        if (i+1) % 10 == 0:
                            details_df = pd.DataFrame(detail_results, columns=['bace_smiles', 'class_label', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5'])
                            details_df.to_csv(detail_predict_file, index=False)
                            print('save file')
                details_df = pd.DataFrame(detail_results, columns=['bace_smiles', 'class_label', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5'])
                details_df.to_csv(detail_predict_file, index=False)
            
            elif sample_method == 'scaffold':
                para_index = 0
                for i in tqdm(range(0, len(bace_sample))):
                    example = [(bace_sample.iloc[i]['mol'],bace_sample.iloc[i]['Class'])]
                    pred_y = []
                    generated_results = []
                    # print(para_index)
                    if para_index < 0:
                        para_index += 1
                        continue
                    for text in example:
                        bace_examples = top_k_scaffold_similar_molecules(text[0], bace, sample_num)
                        prompt = create_bace_prompt(text[0],bace_examples)
                        # print(prompt)
                        with open(log_file, "a") as file:
                            file.write(prompt + "\n")
                            file.write("=" * 50 + "\n")
                        if model == 'davinci-003':
                            generated_p = generate_response_by_davinci(prompt)    
                        elif model == 'gpt-3.5-turbo':
                            generated_p = generate_response_by_gpt35(prompt)
                        elif model == 'gpt-4':
                            generated_p = generate_response_by_gpt4(prompt)
                        # convert the result to 1 if "Yes" in the result string else 0
                
                        # generated_p = [1 if "Yes" in i else 0 for i in generated_p]   
                        generated_results.append(generated_p)
                        detail_results.append([text[0]] + [text[-1]] + generated_p)
                        
                        print('training iterations',i)
                        if (i+1) % 10 == 0:
                            details_df = pd.DataFrame(detail_results, columns=['bace_smiles', 'class_label', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5'])
                            details_df.to_csv(detail_predict_file, index=False)
                details_df = pd.DataFrame(detail_results, columns=['bace_smiles', 'class_label', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5'])
                details_df.to_csv(detail_predict_file, index=False)