# load data

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from rdkit import Chem
from tqdm import tqdm
import os
import openai
import datetime


def molToCanonical(smiles):
    mol = Chem.MolFromSmiles(smiles)
    canonical_smiles = Chem.MolToSmiles(mol)
    return canonical_smiles

from steamship import Steamship
# Set up the OpenAI API client
openai.api_key = 'Your API Key'

In [2]:
df = pd.read_csv("Name Prediction/pubchem.csv")

In [3]:
df = df[~df['iupac'].isna()]

In [4]:
train, test = train_test_split(df, test_size=530, random_state=42)

valid, test = train_test_split(test, test_size=500, random_state=42)

In [29]:
def top_n_scaffold_similar_molecules(target_smiles, molecule_scaffold_list, molecule_smiles_list, n=5):
    target_mol = Chem.MolFromSmiles(target_smiles)
    target_scaffold = MurckoScaffold.GetScaffoldForMol(target_mol)
    target_fp = rdMolDescriptors.GetMorganFingerprint(target_scaffold, 2)

    similarities = []

    for idx, scaffold_fp in enumerate(molecule_scaffold_list):
        try:
            tanimoto_similarity = DataStructs.TanimotoSimilarity(target_fp, scaffold_fp)
            similarities.append((idx, tanimoto_similarity))
        except Exception as e:
            print(e)
            continue

    similarities.sort(key=lambda x: x[1], reverse=True)
    top_5_similar_molecules = similarities[:n]

    return [molecule_smiles_list[i[0]] for i in top_5_similar_molecules]

In [30]:
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import DataStructs
from rdkit.Chem import rdMolDescriptors
from tqdm import tqdm

def get_scaffold_fp(x):
    try:
        mol = Chem.MolFromSmiles(x)
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        scaffold_fp = rdMolDescriptors.GetMorganFingerprint(scaffold, 2)
        return scaffold_fp
    except:
        return None

In [31]:
train['scaffold_fp'] = train['smiles'].apply(lambda x: get_scaffold_fp(x))

[16:49:16] Explicit valence for atom # 39 Cl, 3, is greater than permitted
[16:49:22] Explicit valence for atom # 2 Cl, 3, is greater than permitted
[16:49:25] Explicit valence for atom # 7 Br, 3, is greater than permitted
[16:49:28] Explicit valence for atom # 25 Br, 3, is greater than permitted
[16:49:28] Explicit valence for atom # 2 Br, 3, is greater than permitted
[16:49:29] Explicit valence for atom # 18 Br, 3, is greater than permitted


In [32]:
import numpy as np

In [33]:
train = train[~train['scaffold_fp'].isna()]

In [34]:
len(train)

9356

# Prompt

In [18]:
def create_prompt_smiles2iupac(input_text, examples):
    prompt = "You are an expert chemist. Given the molecular SMILES, your task is to predict the IUPAC name using your experienced chemical IUPAC name knowledge. \n\
Please strictly follow the format, no other information can be provided.\n"
    
    for example in examples:
        prompt += f"Molecular SMILES: {example[0]}\nMolecular IUPAC name: {example[1]}\n"
    prompt += f"Molecular SMILES: {input_text}\nMolecular IUPAC name:"
    return prompt

In [19]:
def create_prompt_iupac2smiles(input_text, examples):
    prompt = "You are an expert chemist. Given the molecular IUPAC name, your task is to predict the molecular SMILES using your experienced chemical IUPAC name knowledge. \n\
Please strictly follow the format, no other information can be provided. You should only reply with molecular SMILES string notations to represent the IUPAC name. The SMILES must be valid and chemically reasonable. \n"
    for example in examples:
        prompt += f"Molecular IUPAC name: {example[0]}\nMolecular SMILES: {example[1]}\n"
    prompt += f"Molecular IUPAC name: {input_text}\nMolecular SMILES:"
    return prompt

In [20]:
def create_prompt_smiles2formula(input_text, examples):
    prompt = "You are an expert chemist. Given the molecular SMILES, your task is to predict the molecular formula using your experienced chemical molecular formula knowledge. \n\
Please strictly follow the format, no other information can be provided.\n"
    
    for example in examples:
        prompt += f"Molecular SMILES: {example[0]}\nMolecular formula: {example[1]}\n"
    prompt += f"Molecular SMILES: {input_text}\nMolecular formula:"
    return prompt

In [21]:
def create_prompt_iupac2formula(input_text, examples):
    prompt = "You are an expert chemist. Given the molecular formula, your task is to predict the molecular SMILES using your experienced chemical molecular formula knowledge. \n\
Please strictly follow the format, no other information can be provided. You should only reply with molecular SMILES string notations to represent the molecular formula. The SMILES must be valid and chemically reasonable.\n"
    
    for example in examples:
        prompt += f"Molecular formula: {example[0]}\nMolecular SMILES: {example[1]}\n"
    prompt += f"Molecular formula: {input_text}\nMolecular SMILES:"
    return prompt

# Validation

In [22]:
import time
def generate_response_by_davinci(prompt, model_engine = 'text-davinci-003'):
    time.sleep(2)
    completion = openai.Completion.create(
      engine=model_engine,
      prompt=prompt,
      temperature=0.2,
      max_tokens=256,
#       top_p=1.0,
      frequency_penalty=0.0,
      presence_penalty=0.0, 
      n=5,
    )
#     message = completion.choices[0]['text'].strip()
    
    message = completion.choices
    message = [i['text'].strip() for i in message]
    return message

def generate_response_by_gpt35(prompt, model_engine = "gpt-3.5-turbo"):
    time.sleep(2)
    completion = openai.ChatCompletion.create(
        model=model_engine, temperature=0.2, n=5, 
        messages=[{"role": "user", "content": prompt}],
    )
#     message = completion.choices[0].message.content.strip()

    message = completion.choices
    message = [i.message.content.strip() for i in message]
    return message

def generate_response_by_gpt4(prompt):
    time.sleep(2)
    # Create a Steamship client
    # NOTE: When developing a package, just use `self.client`
    client = Steamship(workspace="gpt-4-67j")

    # Create an instance of this generator
    generator = client.use_plugin('gpt-4', config={"temperature":0.2, "n": 5})

    # Generate text
    task = generator.generate(text=prompt)
    # Wait for completion of the task.
    task.wait()
    # Print the output
#     message = task.output.blocks[0].text.strip()
    message = task.output.blocks
    message = [i.text.strip() for i in message]
    return message

In [23]:
valid.head(3)

Unnamed: 0,CID,smiles,iupac,formula,mol_length
4909,110709753,CC(C)(C)C(=O)CN1CCN(CC1)C(=O)C2=CC(=CC=C2)Cl,"1-[4-(3-chlorobenzoyl)piperazin-1-yl]-3,3-dime...",C17H23ClN2O2,44
7558,65924153,CC1CCCC(C1)(CN)CN2CCN(CC2)C3=NC=CS3,"[3-methyl-1-[[4-(1,3-thiazol-2-yl)piperazin-1-...",C16H28N4S,35
7806,142384555,CCC(C)(CC)C=CC(=O)O,(E)-4-ethyl-4-methylhex-2-enoic acid,C9H16O2,19


In [24]:
def get_input_output_columns_by_task(task):
    if task == 'smiles2iupac':
        return "smiles", "iupac"
    elif task == 'smiles2formula':
        return "smiles", "formula"
    elif task == 'iupac2smiles':
        return "iupac", "smiles"
    elif task == 'formula2smiles':
        return "formula", 'smiles'

In [25]:
def create_prompt(reactant, examples, task):
    if task == 'smiles2iupac':
        return create_prompt_smiles2iupac(reactant, examples)
    elif task == 'smiles2formula':
        return create_prompt_smiles2formula(reactant, examples)
    elif task == 'iupac2smiles':
        return create_prompt_iupac2smiles(reactant, examples)
    elif task == 'formula2smiles':
        return create_prompt_formula2smiles(reactant, examples)

In [26]:
import difflib

def similarity_ratio(s1, s2):
    # Calculate the similarity ratio between the two strings
    ratio = difflib.SequenceMatcher(None, s1, s2).ratio()
    
    # Return the similarity ratio
    return ratio

# Example usage:
s1 = "python"
s2 = "java"
ratio = similarity_ratio(s1, s2)
print(ratio)  # Output: 0.0


def top_n_similar_strings(query, candidates, n=5):
    # Calculate the Levenshtein distance between the query and each candidate
    distances = [(c, similarity_ratio(query, c)) for c in candidates]
    
    # Sort the candidates by their Levenshtein distance to the query
    sorted_distances = sorted(distances, key=lambda x: x[1], reverse=True)
    
    # Get the top n candidates with the smallest Levenshtein distance
    top_candidates = [d[0] for d in sorted_distances[:n]]
    
    # Return the top n candidates
    return top_candidates
top_n_similar_strings("pathon", ['python', 'java'])

0.0


['python', 'java']

In [10]:
def molToCanonical(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        canonical_smiles = Chem.MolToSmiles(mol)
        return canonical_smiles
    except:
        return smiles


In [28]:
valid['smiles'] = valid['smiles'].apply(lambda x: molToCanonical(x))

In [29]:
train['smiles'] = train['smiles'].apply(lambda x: molToCanonical(x))

In [11]:
test['smiles'] = test['smiles'].apply(lambda x: molToCanonical(x))

[16:53:23] Explicit valence for atom # 9 Br, 3, is greater than permitted


In [144]:
# format the running grid search 
# evaluate and save

Models = ['davinci003', 'gpt4', 'gpt35']
ICL_Sample_methods = ['Scaffold_SIM', 'Fixed_ICL']
ICL_Samples = [5, 20]
NAME_Format = ['SMILES']
TASK_Types = ['iupac2smiles', 'formula2smiles', 'smiles2iupac', 'smiles2formula']
detail_save_folder = 'Name Prediction/results/'

params_idx = 0

# 5， 26， 40,  44

for sample_method in ICL_Sample_methods:
    for sample_num in ICL_Samples:
        for name in NAME_Format:
            for task in TASK_Types:
                for model in Models:
                    
                    details_results = []
                    performance_results = []
                    
                    if params_idx < 44:
                        params_idx +=1
                        continue
                    
                    detail_predict_file = detail_save_folder + 'valid_{}_{}_{}_{}_{}.csv'.format(task, model, name, sample_num, sample_method)
                    log_file = detail_save_folder + 'valid_{}_{}_{}_{}_{}.log'.format(task, model, name, sample_num, sample_method)
                    performance_file = detail_save_folder + 'valid_performance_{}_{}_{}_{}_{}.csv'.format(task, model, name, sample_num, sample_method)
                    print(detail_predict_file)

                    # append new date
                    # Get the current date and time
                    now = datetime.datetime.now()
                    # Convert the date and time to a string
                    date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
                    with open(log_file, "w+") as file:
                        file.write("=" * 30 + date_time_str + "=" * 30 + "\n")

                    # init var
                    predicted_products = []
                    
                    input_col, output_col = get_input_output_columns_by_task(task)

                    # query
                    for idx, row in tqdm(valid.iterrows()):
                        reactant = row[input_col]
                        product = row[output_col]

                        # ICL examples
                        if sample_method == 'Fixed_ICL':
                            chunk = train.sample(sample_num, random_state=42)
                        elif sample_method == 'Scaffold_SIM':
                            if input_col == 'smiles':
                                sim = top_n_scaffold_similar_molecules(reactant, list(train['scaffold_fp']), list(train['smiles']), n=sample_num)
                            else:
                                # similarity by leven
                                sim = top_n_similar_strings(reactant, list(train[input_col]), n=sample_num)
                            chunk = train[train[input_col].isin(sim)]
                            
                        examples = list(zip(chunk[input_col].values, chunk[output_col].values))

                        # build prompt and save
                        prompt = create_prompt(reactant, examples, task)
                        
                        with open(log_file, "a") as file:
                            file.write(prompt + "\n")
                            file.write("=" * 50 + "\n")

                        # different model 
                        if model == 'davinci003':
                            predicted_product = generate_response_by_davinci(prompt)
                        elif model == 'gpt35':
                            predicted_product = generate_response_by_gpt35(prompt)
                        elif model == 'gpt4':
                            predicted_product = generate_response_by_gpt4(prompt)

                        predicted_products.append(predicted_product)
                        details_results.append([reactant] + [product] + predicted_product)

                    # evaluate
                    acc_list = []
                    for repeat in range(5):
                        tpredicted_products = [i[repeat] for i in predicted_products]
                        correct = 0
                        all_sample_num = len(valid)
                        for idx, gt in enumerate(list(valid[output_col])):
                            pred = tpredicted_products[idx]
                            if task in ['iupac2smiles', 'formula2smiles']:
                                try:
                                    mol = Chem.MolFromSmiles(pred)
                                    pred = Chem.MolToSmiles(mol)
                                except Exception as e:
                                    continue
                                    
                            if gt == pred:
                                correct += 1
                        acc = correct / all_sample_num
                        acc_list.append(acc)

                    # save to file
                    performance_results.append([task, model, name, sample_num, sample_method, np.mean(acc_list)] + acc_list)
                    print(performance_results)
                    
                    # performance save based on the task
                    tem = pd.DataFrame(performance_results, columns=['task', 'model', 'name', 'sample_num', 'sample_method', 'avg_metric'] + ['metric_{}'.format(i) for i in range(5)])
                    tem.to_csv(performance_file, index=False)
                    
                    details_df = pd.DataFrame(details_results, columns=[input_col, output_col, 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5'])
                    details_df.to_csv(detail_predict_file, index=False)
                    
                    params_idx += 1
                    print(params_idx)

Name Prediction/results/valid_smiles2iupac_gpt35_SMILES_20_Fixed_ICL.csv


30it [03:26,  6.87s/it]


[['smiles2iupac', 'gpt35', 'SMILES', 20, 'Fixed_ICL', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
45
Name Prediction/results/valid_smiles2formula_davinci003_SMILES_20_Fixed_ICL.csv


30it [05:28, 10.96s/it]


[['smiles2formula', 'davinci003', 'SMILES', 20, 'Fixed_ICL', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
46
Name Prediction/results/valid_smiles2formula_gpt4_SMILES_20_Fixed_ICL.csv


30it [03:08,  6.27s/it]


[['smiles2formula', 'gpt4', 'SMILES', 20, 'Fixed_ICL', 0.03333333333333334, 0.03333333333333333, 0.06666666666666667, 0.0, 0.0, 0.06666666666666667]]
47
Name Prediction/results/valid_smiles2formula_gpt35_SMILES_20_Fixed_ICL.csv


30it [01:35,  3.19s/it]

[['smiles2formula', 'gpt35', 'SMILES', 20, 'Fixed_ICL', 0.12666666666666665, 0.13333333333333333, 0.13333333333333333, 0.13333333333333333, 0.13333333333333333, 0.1]]
48





In [145]:
all_performance = pd.DataFrame()
for sample_method in ICL_Sample_methods:
    for sample_num in ICL_Samples:
        for name in NAME_Format:
            for task in TASK_Types:
                for model in Models:
                    
                    performance_file = detail_save_folder + 'valid_performance_{}_{}_{}_{}_{}.csv'.format(task, model, name, sample_num, sample_method)
                    tem = pd.read_csv(performance_file)
                    all_performance = all_performance.append(tem)

  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.appe

In [158]:
all_performance = all_performance.drop_duplicates(subset=['task', 'model', 'name', 'sample_num', 'sample_method'])

In [159]:
all_performance['task'].value_counts()

iupac2smiles      12
formula2smiles    12
smiles2iupac      12
smiles2formula    12
Name: task, dtype: int64

In [164]:
all_performance.to_csv(detail_save_folder + "results_summary.csv", index=False)

In [91]:
cols = ['metric_{}'.format(i) for i in range(5)]

In [92]:
import statistics
all_performance['std'] = all_performance[cols].apply(lambda row: statistics.stdev(row), axis=1)

In [95]:
all_performance['metric'] = all_performance.apply(lambda row: "{:.3f} $\pm$ {:.3f}".format(row['avg_metric'], row['std']), axis=1)

In [108]:
all_performance[all_performance['task'] == 'smiles2formula']

Unnamed: 0,task,model,name,sample_num,sample_method,avg_metric,metric_0,metric_1,metric_2,metric_3,metric_4,std,metric
0,smiles2formula,gpt4,SMILES,20,Scaffold_SIM,0.086,0.06,0.12,0.09,0.04,0.12,0.035777,0.086 $\pm$ 0.036
0,smiles2formula,davinci003,SMILES,20,Scaffold_SIM,0.006,0.01,0.0,0.0,0.01,0.01,0.005477,0.006 $\pm$ 0.005
0,smiles2formula,gpt35,SMILES,20,Scaffold_SIM,0.052,0.05,0.06,0.05,0.05,0.05,0.004472,0.052 $\pm$ 0.004
0,smiles2formula,gpt4,SMILES,5,Scaffold_SIM,0.058,0.08,0.04,0.06,0.05,0.06,0.014832,0.058 $\pm$ 0.015
0,smiles2formula,gpt4,SMILES,20,Fixed_ICL,0.07,0.12,0.05,0.08,0.06,0.04,0.031623,0.070 $\pm$ 0.032


# Test

In [32]:
test = test.reset_index()

In [33]:
import os

In [34]:
# fix 
# real-time evaluation past 50 samples
# rate limit , sleep 2min, rerun 

In [22]:
params = [
    ('Scaffold_SIM', 20, 'SMILES', 'gpt4'),
    ('Scaffold_SIM', 20, 'SMILES', 'davinci003'),
    ('Scaffold_SIM', 20, 'SMILES', 'gpt35'),
    
    ('Scaffold_SIM', 5, 'SMILES', 'gpt4'),
    ('Fixed_ICL', 20, 'SMILES', 'gpt4'),
]

In [39]:
test = test.head(100)

In [59]:
# format the running grid search 
# evaluate and save

detail_save_folder = 'Name Prediction/results_test/'
TASK_Types = ['iupac2smiles', 'formula2smiles', 'smiles2iupac', 'smiles2formula']

for task in TASK_Types:
    for sample_method, sample_num, name, model in params:
        
        details_results = []
        performance_results = []

        detail_predict_file = detail_save_folder + 'valid_{}_{}_{}_{}_{}.csv'.format(task, model, name, sample_num, sample_method)
        log_file = detail_save_folder + 'valid_{}_{}_{}_{}_{}.log'.format(task, model, name, sample_num, sample_method)
        performance_file = detail_save_folder + 'valid_performance_{}_{}_{}_{}_{}.csv'.format(task, model, name, sample_num, sample_method)
        print(detail_predict_file)
        
        
        if os.path.exists(performance_file):
            print("{} exits, continue to next params".format(performance_file))
            continue
        

        # append new date
        # Get the current date and time
        now = datetime.datetime.now()
        # Convert the date and time to a string
        date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
        with open(log_file, "a") as file:
            file.write("=" * 30 + date_time_str + "=" * 30 + "\n")

        # init var
        predicted_products = []
        
        # restore previous data and skip 
        previous_index = []
        if os.path.exists(detail_predict_file):
            previous = pd.read_csv(detail_predict_file)
            previous_index = list(previous['index'])

            # previous data => details_results 
            details_results = previous.values.tolist()
            predicted_products = previous[['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']].values.tolist()

        input_col, output_col = get_input_output_columns_by_task(task)

        # query
        for idx, row in tqdm(test.iterrows()):
            reactant = row[input_col]
            product = row[output_col]
            index = row['index']
        
            if index in previous_index:
                continue

            # ICL examples
            if sample_method == 'Fixed_ICL':
                chunk = train.sample(sample_num, random_state=42)
            elif sample_method == 'Scaffold_SIM':
                if input_col == 'smiles':
                    sim = top_n_scaffold_similar_molecules(reactant, list(train['scaffold_fp']), list(train['smiles']), n=sample_num)
                else:
                    # similarity by leven
                    sim = top_n_similar_strings(reactant, list(train[input_col]), n=sample_num)
                chunk = train[train[input_col].isin(sim)]

            examples = list(zip(chunk[input_col].values, chunk[output_col].values))

            # build prompt and save
            prompt = create_prompt(reactant, examples, task)

            with open(log_file, "a") as file:
                file.write(prompt + "\n")
                file.write("=" * 50 + "\n")

            # different model 
            if model == 'davinci003':
                predicted_product = generate_response_by_davinci(prompt)
            elif model == 'gpt35':
                predicted_product = generate_response_by_gpt35(prompt)
            elif model == 'gpt4':
                predicted_product = generate_response_by_gpt4(prompt)

            predicted_products.append(predicted_product)
            details_results.append([index, reactant] + [product] + predicted_product)
            
            details_df = pd.DataFrame(details_results, columns=['index', input_col, output_col, 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5'])
            details_df.to_csv(detail_predict_file, index=False)
            
            # evaluate
            if len(predicted_products) % 50 == 0:         
                # evaluate
                acc_list = []
                for repeat in range(5):
                    tpredicted_products = [i[repeat] for i in predicted_products]
                    correct = 0
                    all_sample_num = len(predicted_products)
                    for idx, gt in enumerate(list(test.head(all_sample_num)[output_col])):
                        pred = tpredicted_products[idx]
                        if task in ['iupac2smiles', 'formula2smiles']:
                            try:
                                mol = Chem.MolFromSmiles(pred)
                                pred = Chem.MolToSmiles(mol)
                            except Exception as e:
                                continue

                        if gt == pred:
                            correct += 1
                    acc = correct / all_sample_num
                    acc_list.append(acc)
                print(np.mean(acc_list))
            
        # evaluate
        acc_list = []
        for repeat in range(5):
            tpredicted_products = [i[repeat] for i in predicted_products]
            correct = 0
            all_sample_num = len(test)
            for idx, gt in enumerate(list(test[output_col])):
                pred = tpredicted_products[idx]
                if task in ['iupac2smiles', 'formula2smiles']:
                    try:
                        mol = Chem.MolFromSmiles(pred)
                        pred = Chem.MolToSmiles(mol)
                    except Exception as e:
                        continue

                if gt == pred:
                    correct += 1
            acc = correct / all_sample_num
            acc_list.append(acc)

        # save to file
        performance_results.append([task, model, name, sample_num, sample_method, np.mean(acc_list)] + acc_list)
        print(performance_results)

        # performance save based on the task
        tem = pd.DataFrame(performance_results, columns=['task', 'model', 'name', 'sample_num', 'sample_method', 'avg_metric'] + ['metric_{}'.format(i) for i in range(5)])
        tem.to_csv(performance_file, index=False)


Name Prediction/results_test/valid_iupac2smiles_gpt4_SMILES_20_Scaffold_SIM.csv
Name Prediction/results_test/valid_performance_iupac2smiles_gpt4_SMILES_20_Scaffold_SIM.csv exits, continue to next params
Name Prediction/results_test/valid_iupac2smiles_davinci003_SMILES_20_Scaffold_SIM.csv
Name Prediction/results_test/valid_performance_iupac2smiles_davinci003_SMILES_20_Scaffold_SIM.csv exits, continue to next params
Name Prediction/results_test/valid_iupac2smiles_gpt35_SMILES_20_Scaffold_SIM.csv
Name Prediction/results_test/valid_performance_iupac2smiles_gpt35_SMILES_20_Scaffold_SIM.csv exits, continue to next params
Name Prediction/results_test/valid_iupac2smiles_gpt4_SMILES_5_Scaffold_SIM.csv
Name Prediction/results_test/valid_performance_iupac2smiles_gpt4_SMILES_5_Scaffold_SIM.csv exits, continue to next params
Name Prediction/results_test/valid_iupac2smiles_gpt4_SMILES_20_Fixed_ICL.csv
Name Prediction/results_test/valid_performance_iupac2smiles_gpt4_SMILES_20_Fixed_ICL.csv exits, con

50it [04:01,  5.41s/it]

0.072


100it [08:27,  5.07s/it]


0.057999999999999996
[['smiles2formula', 'gpt4', 'SMILES', 5, 'Scaffold_SIM', 0.057999999999999996, 0.08, 0.04, 0.06, 0.05, 0.06]]
Name Prediction/results_test/valid_smiles2formula_gpt4_SMILES_20_Fixed_ICL.csv


50it [04:46,  6.51s/it]

0.084


100it [09:57,  5.97s/it]

0.06999999999999999
[['smiles2formula', 'gpt4', 'SMILES', 20, 'Fixed_ICL', 0.06999999999999999, 0.12, 0.05, 0.08, 0.06, 0.04]]





In [23]:
import pandas as pd
import os
detail_save_folder = 'Name Prediction/results_test/'
TASK_Types = ['iupac2smiles', 'formula2smiles', 'smiles2iupac', 'smiles2formula']

In [24]:
all_performance = pd.DataFrame()
for task in TASK_Types:
    for sample_method, sample_num, name, model in params: 
        performance_file = detail_save_folder + 'valid_performance_{}_{}_{}_{}_{}.csv'.format(task, model, name, sample_num, sample_method)
        tem = pd.read_csv(performance_file)
        all_performance = all_performance.append(tem)

  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)
  all_performance = all_performance.append(tem)


In [88]:
all_performance[all_performance['task'] == 'iupac2smiles']

Unnamed: 0,task,model,name,sample_num,sample_method,avg_metric,metric_0,metric_1,metric_2,metric_3,metric_4
0,iupac2smiles,gpt4,SMILES,20,Scaffold_SIM,0.012,0.01,0.01,0.01,0.02,0.01
0,iupac2smiles,davinci003,SMILES,20,Scaffold_SIM,0.0,0.0,0.0,0.0,0.0,0.0
0,iupac2smiles,gpt35,SMILES,20,Scaffold_SIM,0.01,0.01,0.01,0.01,0.01,0.01
0,iupac2smiles,gpt4,SMILES,5,Scaffold_SIM,0.014,0.01,0.02,0.02,0.0,0.02
0,iupac2smiles,gpt4,SMILES,20,Fixed_ICL,0.01,0.0,0.01,0.01,0.01,0.02


# Check

In [44]:
def top_n_similar_strings_sim(query, candidates, n=5):
    # Calculate the Levenshtein distance between the query and each candidate
    distances = [(c, similarity_ratio(query, c)) for c in candidates]
    
    # Sort the candidates by their Levenshtein distance to the query
    sorted_distances = sorted(distances, key=lambda x: x[1], reverse=True)
    
    # Get the top n candidates with the smallest Levenshtein distance
    top_candidates = [d[0] for d in sorted_distances[:n]]
    
    # Return the top n candidates
    return top_candidates, [d[1] for d in sorted_distances[:n]]

In [54]:
test.iloc[0]['formula']

'C28H35ClN6O5'

In [55]:
a, b = top_n_similar_strings_sim(test.iloc[0]['formula'], list(train['formula']), n=20)

In [57]:
train[train['formula'].isin(a)]

Unnamed: 0,CID,smiles,iupac,formula,mol_length,scaffold_fp
9086,18784961,CCCCC(CNC(Cc1ccc(O)cc1)C(=O)O)NC(=O)OC(C)(C)C,3-(4-hydroxyphenyl)-2-[2-[(2-methylpropan-2-yl...,C20H32N2O5,48,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
2243,41120438,O=C(COC(=O)CCNC1=NS(=O)(=O)c2ccccc21)Nc1cccc(C...,"[2-(3-chloroanilino)-2-oxoethyl] 3-[(1,1-dioxo...",C18H16ClN3O5S,58,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
2523,73750156,Cc1ccc(C=NNc2nnc(C)n2N)o1.Cl,5-methyl-3-N-[(5-methylfuran-2-yl)methylidenea...,C9H13ClN6O,32,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
3823,35517793,Cc1ccc(OCC(=O)OCC(=O)NCCN2CC(C)OC(C)C2)cc1C,"[2-[2-[(2S,6S)-2,6-dimethylmorpholin-4-yl]ethy...",C20H30N2O5,48,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
3959,3778798,CC(C)Oc1cc(NC(=O)c2sccc2Oc2ccc([N+](=O)[O-])cc...,"N-(2,4-dichloro-5-propan-2-yloxyphenyl)-3-(2-f...",C20H15Cl2FN2O5S,73,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
318,135797389,CCCCCCCCCCCCCCCC(=NCc1cccc(OC)c1)C1=C(O)C(CO)O...,3-hydroxy-2-(hydroxymethyl)-4-[N-[(3-methoxyph...,C29H45NO5,53,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
587,108532930,Cc1ccc(C(=O)CCC(=O)N2CCN(C(=O)CCNC(=O)OC(C)(C)...,tert-butyl N-[3-[4-[4-(4-methylphenyl)-4-oxobu...,C23H33N3O5,58,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
4704,38308997,CCCn1c(=O)[nH]c(=O)c2c(C(=O)OC(C)C(=O)Nc3c(CC)...,"[(2S)-1-(2,6-diethylanilino)-1-oxopropan-2-yl]...",C27H34N4O5,70,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
5794,76749614,Cc1cc(C)n(C(=O)N(C)CCCc2[nH]c(C=C3C(=O)Nc4ccc(...,N-[3-[5-[[5-[(3-chlorophenyl)-methylsulfamoyl]...,C37H43ClN8O5S,109,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
9150,11721092,C=CCOC(=O)C=C[N+](CC)(CC)CC.O=[N+]([O-])c1ccc(...,(2-chloro-4-nitrophenyl)-[(2-chloro-4-nitrophe...,C24H28Cl2N6O6,93,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...


In [60]:
def top_n_scaffold_similar_molecules_sim(target_smiles, molecule_scaffold_list, molecule_smiles_list, n=5):
    target_mol = Chem.MolFromSmiles(target_smiles)
    target_scaffold = MurckoScaffold.GetScaffoldForMol(target_mol)
    target_fp = rdMolDescriptors.GetMorganFingerprint(target_scaffold, 2)

    similarities = []

    for idx, scaffold_fp in enumerate(molecule_scaffold_list):
        try:
            tanimoto_similarity = DataStructs.TanimotoSimilarity(target_fp, scaffold_fp)
            similarities.append((idx, tanimoto_similarity))
        except Exception as e:
            print(e)
            continue

    similarities.sort(key=lambda x: x[1], reverse=True)
    top_5_similar_molecules = similarities[:n]

    return [molecule_smiles_list[i[0]] for i in top_5_similar_molecules], [i[1] for i in top_5_similar_molecules]

In [64]:
test.iloc[0]['smiles'],

('CC(C)(C)OC(=O)NC1CCN(C(=O)CN2CCOCC2C(=O)Nc2cc(Cl)cc3c2[nH]c2cnccc23)CC1',)

In [62]:
a, b = top_n_scaffold_similar_molecules_sim(test.iloc[0]['smiles'], list(train['scaffold_fp']), list(train['smiles']), n=20)

In [63]:
train[train['smiles'].isin(a)]

Unnamed: 0,CID,smiles,iupac,formula,mol_length,scaffold_fp
980,110266602,CN(C)c1ncc(-c2ccccc2F)c(C2CCCN(C(=O)CCN3CCOCC3...,1-[3-[2-(dimethylamino)-5-(2-fluorophenyl)pyri...,C24H32FN5O2,57,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
1170,75092174,CC(CCN1CCCCC1)C(=O)NC1NNC(c2cnc3ccccc3c2)C1F,N-(4-fluoro-5-quinolin-3-ylpyrazolidin-3-yl)-2...,C22H30FN5O,51,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
3067,27495086,O=C(Nc1cccnc1)c1cn(CC2CCCN(C(=O)C3CC3)C2)c2ccc...,1-[[(3R)-1-(cyclopropanecarbonyl)piperidin-3-y...,C24H26N4O2,59,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
5397,99071111,COC(=O)N1CCCN(C(=O)C(CCSC)NC(=O)c2ccccc2Cl)CC1,methyl 4-[(2R)-2-[(2-chlorobenzoyl)amino]-4-me...,C19H26ClN3O4S,49,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
6536,32535358,Cc1ccc(OCC(=O)N2CCN(C(=O)c3ccoc3)CC2)c2c1C(C)C...,(3R)-7-[2-[4-(furan-3-carbonyl)piperazin-1-yl]...,C22H24N2O5,57,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
1951,77903191,CN(C(=O)C1CCCN1C(=O)C(N)Cc1ccc(O)cc1)C(Cc1cccc...,1-[2-[[1-[2-amino-3-(4-hydroxyphenyl)propanoyl...,C29H36N4O6,73,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
1178,26670304,CC1c2ccsc2CCN1CC(=O)Nc1cc(S(=O)(=O)N2CCCCC2)cc...,N-(2-chloro-5-piperidin-1-ylsulfonylphenyl)-2-...,C21H26ClN3O3S2,59,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
4404,100652034,Cc1cc(Br)cc(C(=O)N2CCCC(O)(C(N)=O)C2)c1,(3R)-1-(3-bromo-5-methylbenzoyl)-3-hydroxypipe...,C14H17BrN2O3,42,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
288,32181817,CCS(=O)(=O)N1CCC(NC(=O)c2cccc3c4c([nH]c23)CCCC...,"N-(1-ethylsulfonylpiperidin-4-yl)-5,6,7,8,9,10...",C21H29N3O3S,51,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...
680,159041195,CC(=O)N(C)C1C(C)CN(c2ccncc2Cc2ncc3ccc(-c4c(F)c...,"N-[(3R,4S,5S)-3-amino-1-[3-[[2-[2,6-difluoro-4...",C30H35F2N7O2,82,<rdkit.DataStructs.cDataStructs.UIntSparseIntV...


# Zero-shot

In [73]:
def create_zero_shot_prompt(input_text, task):
    if task == 'smiles2iupac':
        
        prompt = "You are an expert chemist. Given the molecular SMILES: {}, predict the molecular IUPAC name using your experienced chemical molecular SMILES and IUPAC name knowledge. No explanations and other information. \
Only return the molecular IUPAC name.".format(input_text)
        
    elif task == 'smiles2formula':
        
        prompt = "You are an expert chemist. Given the molecular SMILES: {}, predict the chemical molecular formula using your experienced chemical molecular SMILES and formula knowledge. No explanations and other information. \
Only return the molecular formula.".format(input_text)
        
    elif task == 'iupac2smiles':
        
        prompt = "You are an expert chemist. Given the molecular IUPAC name: {}, predict the molecular SMILES using your experienced chemical molecular IUPAC name and SMILES knowledge. No explanations and other information. \
Only return the molecular SMILES.".format(input_text)
        
    elif task == 'formula2smiles':
        
        prompt = "You are an expert chemist. Given the molecular formula: {}, predict the molecular SMILES using your experienced chemical molecular formula and SMILES knowledge. No explanations and other information. \
Only return the molecular SMILES.".format(input_text)
        
    return prompt

In [74]:
model = 'gpt4'

In [76]:
# format the running grid search 
# evaluate and save

detail_save_folder = 'Name Prediction/results_test/'
TASK_Types = ['iupac2smiles', 'formula2smiles', 'smiles2iupac', 'smiles2formula']

for task in TASK_Types:

    details_results = []
    performance_results = []

    detail_predict_file = detail_save_folder + '{}_{}_zero_shot.csv'.format(task, model)
    log_file = detail_save_folder + '{}_{}_zero_shot.log'.format(task, model)
    performance_file = detail_save_folder + '{}_{}_zero_shot_performance.csv'.format(task, model)
    print(detail_predict_file)


    if os.path.exists(performance_file):
        print("{} exits, continue to next params".format(performance_file))
        continue


    # append new date
    # Get the current date and time
    now = datetime.datetime.now()
    # Convert the date and time to a string
    date_time_str = now.strftime("%Y-%m-%d %H:%M:%S")
    with open(log_file, "a") as file:
        file.write("=" * 30 + date_time_str + "=" * 30 + "\n")

    # init var
    predicted_products = []

    # restore previous data and skip 
    previous_index = []
    if os.path.exists(detail_predict_file):
        previous = pd.read_csv(detail_predict_file)
        previous_index = list(previous['index'])

        # previous data => details_results 
        details_results = previous.values.tolist()
        predicted_products = previous[['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5']].values.tolist()

    input_col, output_col = get_input_output_columns_by_task(task)

    # query
    for idx, row in tqdm(test.iterrows()):
        reactant = row[input_col]
        product = row[output_col]
        index = row['index']

        if index in previous_index:
            continue

    
        prompt = create_zero_shot_prompt(reactant, task)
        with open(log_file, "a") as file:
            file.write(prompt + "\n")
            file.write("=" * 50 + "\n")

        # different model 
        if model == 'davinci003':
            predicted_product = generate_response_by_davinci(prompt)
        elif model == 'gpt35':
            predicted_product = generate_response_by_gpt35(prompt)
        elif model == 'gpt4':
            predicted_product = generate_response_by_gpt4(prompt)

        predicted_products.append(predicted_product)
        details_results.append([index, reactant] + [product] + predicted_product)

        details_df = pd.DataFrame(details_results, columns=['index', input_col, output_col, 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5'])
        details_df.to_csv(detail_predict_file, index=False)

    # evaluate
    acc_list = []
    for repeat in range(5):
        tpredicted_products = [i[repeat] for i in predicted_products]
        correct = 0
        all_sample_num = len(test)
        for idx, gt in enumerate(list(test[output_col])):
            pred = tpredicted_products[idx]
            if task in ['iupac2smiles', 'formula2smiles']:
                try:
                    mol = Chem.MolFromSmiles(pred)
                    pred = Chem.MolToSmiles(mol)
                except Exception as e:
                    continue

            if gt == pred:
                correct += 1
        acc = correct / all_sample_num
        acc_list.append(acc)

    # save to file
    performance_results.append([task, np.mean(acc_list)] + acc_list)
    print(performance_results)

    # performance save based on the task
    tem = pd.DataFrame(performance_results, columns=['task', 'avg_metric'] + ['metric_{}'.format(i) for i in range(5)])
    tem.to_csv(performance_file, index=False)


Name Prediction/results_test/iupac2smiles_gpt4_zero_shot.csv
Name Prediction/results_test/iupac2smiles_gpt4_zero_shot_performance.csv exits, continue to next params
Name Prediction/results_test/formula2smiles_gpt4_zero_shot.csv


100it [04:28,  2.68s/it]
[10:21:49] Can't kekulize mol.  Unkekulized atoms: 13 14 15 16 17 18 19 21 22
[10:21:49] SMILES Parse Error: extra close parentheses while parsing: C1=C(NC(=O)NS(=O)(=O)C2=C(C=CC=C2)F)C(=O)C2=CC(NNC3=NC4=C(C=O)N3C3=CCCCC3)=C(CC3=CC(=CC=C3)F)C=C4)=CC=C12
[10:21:49] SMILES Parse Error: Failed parsing SMILES 'C1=C(NC(=O)NS(=O)(=O)C2=C(C=CC=C2)F)C(=O)C2=CC(NNC3=NC4=C(C=O)N3C3=CCCCC3)=C(CC3=CC(=CC=C3)F)C=C4)=CC=C12' for input: 'C1=C(NC(=O)NS(=O)(=O)C2=C(C=CC=C2)F)C(=O)C2=CC(NNC3=NC4=C(C=O)N3C3=CCCCC3)=C(CC3=CC(=CC=C3)F)C=C4)=CC=C12'
[10:21:49] Explicit valence for atom # 24 S, 7, is greater than permitted
[10:21:49] SMILES Parse Error: syntax error while parsing: C16H27F3N2O3
[10:21:49] SMILES Parse Error: Failed parsing SMILES 'C16H27F3N2O3' for input: 'C16H27F3N2O3'
[10:21:49] SMILES Parse Error: syntax error while parsing: C54H58FN3O10
[10:21:49] SMILES Parse Error: Failed parsing SMILES 'C54H58FN3O10' for input: 'C54H58FN3O10'
[10:21:49] Explicit valence for ato

[['formula2smiles', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
Name Prediction/results_test/smiles2iupac_gpt4_zero_shot.csv


100it [21:24, 12.85s/it]


[['smiles2iupac', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
Name Prediction/results_test/smiles2formula_gpt4_zero_shot.csv


100it [10:45,  6.45s/it]

[['smiles2formula', 0.048, 0.02, 0.05, 0.08, 0.05, 0.04]]





In [50]:
import statistics

In [82]:
tem = pd.read_csv(detail_save_folder + "formula2smiles_gpt4_zero_shot_performance.csv")

In [83]:
metric_cols = ['metric_{}'.format(i) for i in range(5)]

In [84]:
metric_cols

['metric_0', 'metric_1', 'metric_2', 'metric_3', 'metric_4']

In [85]:
statistics.stdev(list(tem.iloc[0][metric_cols]))

0.0

In [86]:
tem

Unnamed: 0,task,avg_metric,metric_0,metric_1,metric_2,metric_3,metric_4
0,formula2smiles,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
detail_save_folder

'Name Prediction/results_test/'

In [157]:
df = pd.read_csv(detail_save_folder + "valid_formula2smiles_gpt4_SMILES_20_Scaffold_SIM.csv")

In [158]:
idx = 2

In [159]:
df.head()

Unnamed: 0,index,formula,smiles,pred_1,pred_2,pred_3,pred_4,pred_5
0,2928,C28H35ClN6O5,CC(C)(C)OC(=O)NC1CCN(C(=O)CN2CCOCC2C(=O)Nc2cc(...,CCc1cc(C(=O)N2CCN(C)CC2)c(NNC(=O)COC(=O)C23CCN...,CC(=O)Nc1cc(CC(=O)COC(=O)C2(C)CCCC(=O)N(CCC)n3...,COc1cc(NS(=O)(=O)c2ccccc2NCCOc2cccc(Cl)c2)ccc1...,COCCN1C(=O)c2cc(Cl)c(OC)c(OCc3ccc(N4CCC=N4)cc3...,COc1ccc(CN2CCN(CCNC(=O)c3cc(NCc4ccccc4Cl)nc([N...
1,5735,C21H26N2O5S,Cc1nc2cc(C(O)CC(=O)N3C4CC5CCC4(CS3(=O)=O)C5(C)...,CCOC(=O)C1=C(NC(=O)CSc2ncccc2)C(O)CC(C)C1OC,COc1ccc(S(=O)(=O)N2CCN(C(=O)c3ccccc3)CCC2)c2c1...,COc1ccc(C(=O)Nc2cccc(OS(=O)(=O)N3CCCC3)c2)cc1OC,CCSCC(NC(=O)OC(=O)C1=CC=C(C)N=C1)C(=O)N1Cc2ccc...,CCOc1ccc(NC(=O)CCC(=O)NNC2CCCS2)cc1OCC
2,1624,C8H10N4O2,Cc1noc(CCn2cc[nH]c2=O)n1,CCn1cnc(=O)c(=O)n1Cc1cccnc1,Nc1nc(O)ccc1NC(=O)Nc1ccc[nH]1,NNc1ccc(N)cc1OC(=O)C=N,COc1ccnc(OC)c1N1C(=O)C(=O)N=N1,Cc1nc(NC(=O)C=N)c(N)nc1O
3,3047,C8H13NO3,CCOC(=O)C(C(C)=O)=C(C)N,CC(=O)N1CCCC1C(=O)O,CC(=O)N1CCCC1C(=O)O,CC(=O)NCC(C)C(O)C(=O)O,CC1(N)CCCC(C(=O)O)C1,CC(C(=O)O)N1CCCCCC1=O
4,8848,C22H17N3O3,CC(=O)Nc1cccc(-c2nc(-c3ccc(Oc4ccccc4)cc3)no2)c1,Cc1ccc2c(c1)C(=O)N(C)C3CCC(N3c2)c1ccc(=O)n(C)n1,O=C(Nc1cccc(Nc2ccncc2)c1)c1cc(-c2ccccn2)ccc1O,O=C(Nc1cccc2c1cccc2)C1=C(C=CC=N1)c1cc(C)nc2c1c...,O=C(Nc1cccc2ccccc12)c1cccnc1-c1cnc2c(=O)[nH]cn...,O=C(Nc1ccccc1)c1cc2c(c(N3CCOCC3)c1C=O)CCCC2


In [163]:
df.iloc[idx]['smiles']

'Cc1noc(CCn2cc[nH]c2=O)n1'

In [164]:
df.iloc[idx]['formula']

'C8H10N4O2'

In [165]:
df.iloc[idx]['pred_1']

'CCn1cnc(=O)c(=O)n1Cc1cccnc1'