In [1]:
from pathlib import Path

import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

In [2]:
# Load the full extraction database; the column names are taken from the
# second row of the sheet (header=1).
ori_df = pd.read_excel(
    io="../s0_prepData/s0e_prepData_LnAn/db_LnAn_full.xlsx",
    header=1,
)
print(ori_df.shape)
ori_df.head(3)

(8075, 17)


Unnamed: 0,Entry_ID,Extractants_count,SMILES,Extractant_conc_M,Solvent_A,Solvent_B,Volume_fraction_A,Volume_fraction_B,Metal,Metal_conc_mM,Acid_type,Acid_conc_M,Temperature_K,Distribution_ratio,Log_D,DOI,Comments
0,0,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,La(III),0.01,HNO3,4.3,295.15,0.321764,-0.492462,https://doi.org/10.1002/chem.201806161,DGA LANL
1,1,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Ce(III),0.01,HNO3,4.3,295.15,0.622257,-0.20603,https://doi.org/10.1002/chem.201806161,DGA LANL
2,2,1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,0.1,hydrogenated tetrapropylene,,1.0,0.0,Pr(III),0.01,HNO3,4.3,295.15,0.757525,-0.120603,https://doi.org/10.1002/chem.201806161,DGA LANL


In [3]:
# Notebook configuration.
# metal1 / metal2 are matched against the 'Metal' column in check_D; metal1's
# label ends up in 'Target_metal' and metal2's in 'Other_metal'.
metal1, metal2 = "Am(III)", "Eu(III)"
# NOTE(review): SF_threshold is not referenced by any other cell visible here.
SF_threshold = 10

In [4]:
# One row per distinct SMILES string, in order of first appearance, with a
# fresh 0..n-1 index.
uni_smiles_df = (
    ori_df['SMILES']
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)
print(uni_smiles_df.shape)
uni_smiles_df.head(3)

(295, 1)


Unnamed: 0,SMILES
0,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...
1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@@H](C)C(=O...
2,CN(C(=O)COCC(=O)N(C)c1ccccc1)c1ccccc1


## Eval_DOI

In [5]:
# Custom sort order for (Target_metal, Other_metal) label combinations.
# The position in the list below is the rank: 0 sorts first, 15 sorts last.
combo_order = {
    combo: rank
    for rank, combo in enumerate([
        # Both metals measured
        ("ORGANIC", "AQUEOUS"),
        ("ORGANIC", "UNSELECTIVE"),
        ("UNSELECTIVE", "AQUEOUS"),
        ("ORGANIC", "ORGANIC"),
        ("UNSELECTIVE", "UNSELECTIVE"),
        ("AQUEOUS", "AQUEOUS"),
        ("UNSELECTIVE", "ORGANIC"),
        ("AQUEOUS", "UNSELECTIVE"),
        ("AQUEOUS", "ORGANIC"),
        # Only the target metal measured
        ("ORGANIC", "UNTESTED"),
        ("UNSELECTIVE", "UNTESTED"),
        ("AQUEOUS", "UNTESTED"),
        # Target metal not measured
        ("UNTESTED", "AQUEOUS"),
        ("UNTESTED", "UNSELECTIVE"),
        ("UNTESTED", "ORGANIC"),
        ("UNTESTED", "UNTESTED"),
    ])
}

In [6]:
# With DOI

def classify_D(D_value):
    """Turn a distribution ratio into a phase-preference label.

    Parameters
    ----------
    D_value : float, bool or None
        The distribution ratio. Callers pass ``False`` to mean "no measurement".

    Returns
    -------
    str
        "UNTESTED"    - no usable value (None, NaN/NA, False or 0)
        "ORGANIC"     - D_value > 1
        "UNSELECTIVE" - 0.1 <= D_value <= 1
        "AQUEOUS"     - any other value (below 0.1)
    """
    # NaN is truthy and fails every comparison, so without this explicit check
    # a missing measurement would fall through to the final "AQUEOUS" branch.
    # pd.isna is evaluated before the truthiness test because bool(pd.NA) raises.
    if D_value is None or pd.isna(D_value) or not D_value:
        # A value of exactly 0 is still reported as UNTESTED because it cannot
        # be told apart from the False sentinel used by callers.
        return "UNTESTED"
    if D_value > 1:
        return "ORGANIC"
    if D_value >= 0.1:
        return "UNSELECTIVE"
    return "AQUEOUS"

def extract_metal_label(OUA_tuple_list):
    """Return the best-ranked (metal1, metal2) label pair from a list of pairs.

    Each pair is scored by looking it up in the module-level ``combo_order``
    table; the pair with the lowest score wins, and the first one is kept
    when several share that score.

    Parameters
    ----------
    OUA_tuple_list : list of tuple
        Candidate ``(metal1_label, metal2_label)`` pairs. Must not be empty.

    Returns
    -------
    tuple
        ``(metal1_label, metal2_label)`` of the winning pair.
    """
    best_pair = min(OUA_tuple_list, key=lambda pair: combo_order.get(pair))
    metal1_label, metal2_label = best_pair
    return metal1_label, metal2_label

def check_D(smiles):
    """Summarise how one extractant behaves towards ``metal1`` and ``metal2``.

    The rows of the module-level ``ori_df`` whose 'SMILES' equals ``smiles``
    are split by 'Metal' into metal1 and metal2 measurements.

    * If at least one metal1 row and one metal2 row share identical conditions
      (extractant concentration, both solvents, acid type and concentration,
      temperature), only those matched pairs are classified.
    * Otherwise every available row is classified on its own and the other
      metal is passed to ``classify_D`` as "no measurement". With no rows for
      either metal, both sides are "no measurement".

    ``extract_metal_label`` then reduces the candidate pairs to a single one.

    Parameters
    ----------
    smiles : str
        SMILES string to look up in ``ori_df``.

    Returns
    -------
    pandas.Series
        Indexed by "Target_metal" (label for metal1), "Other_metal" (label for
        metal2) and "DOI" (comma-separated distinct DOIs of the metal1/metal2
        rows; an empty string when there are none).
    """
    smiles_df = ori_df[ori_df['SMILES'] == smiles]

    condition_cols = ['Extractant_conc_M', 'Solvent_A', 'Solvent_B', 'Acid_type', 'Acid_conc_M', 'Temperature_K']
    ratio_col = 'Distribution_ratio'

    is_metal1 = smiles_df['Metal'] == metal1
    is_metal2 = smiles_df['Metal'] == metal2
    metal1_rows = smiles_df.loc[is_metal1, condition_cols + [ratio_col]]
    metal2_rows = smiles_df.loc[is_metal2, condition_cols + [ratio_col]]

    # Missing DOIs are dropped: str.join raises TypeError on a NaN entry.
    doi_values = (
        smiles_df.loc[is_metal1 | is_metal2, 'DOI']
        .dropna()
        .astype(str)
        .drop_duplicates()
    )

    # Pair up metal1/metal2 measurements taken under identical conditions.
    # Series.equals treats NaN in the same position as equal, so rows with the
    # same missing field (e.g. no Solvent_B) still match.
    OUA_tuple_list = []
    for _, row1 in metal1_rows.iterrows():
        conditions1 = row1[condition_cols]
        for _, row2 in metal2_rows.iterrows():
            if conditions1.equals(row2[condition_cols]):
                OUA_tuple_list.append((classify_D(row1[ratio_col]), classify_D(row2[ratio_col])))

    if not OUA_tuple_list:
        # No comparable pair: classify each metal alone. False is the
        # "no measurement" sentinel understood by classify_D.
        not_measured = False
        OUA_tuple_list.extend(
            (classify_D(not_measured), classify_D(ratio)) for ratio in metal2_rows[ratio_col]
        )
        OUA_tuple_list.extend(
            (classify_D(ratio), classify_D(not_measured)) for ratio in metal1_rows[ratio_col]
        )
        if not OUA_tuple_list:
            # Neither metal was measured with this extractant.
            OUA_tuple_list.append((classify_D(not_measured), classify_D(not_measured)))

    metal1_label, metal2_label = extract_metal_label(OUA_tuple_list)

    return pd.Series([metal1_label, metal2_label, ",".join(doi_values)], index=["Target_metal", "Other_metal", 'DOI'])
    

In [7]:
# Start from a copy of the unique-SMILES table and add the fixed annotation
# columns; the empty-string columns are placeholders that are filled in later.
orig_eval_DOI_df = uni_smiles_df.assign(
    Target_metal='',
    Other_metal='',
    Source='Experimental',
    Similarity_to_Experimental='Reference',
    Similarity_to_Generated='Reference',
    LogP='',
)

# Classify every SMILES; check_D returns the two metal labels plus the DOI string.
orig_eval_DOI_df[["Target_metal", "Other_metal", 'DOI']] = orig_eval_DOI_df['SMILES'].apply(check_D)
print(orig_eval_DOI_df.shape)
orig_eval_DOI_df.head(2)

(295, 8)


Unnamed: 0,SMILES,Target_metal,Other_metal,Source,Similarity_to_Experimental,Similarity_to_Generated,LogP,DOI
0,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@H](C)C(=O)...,UNTESTED,UNSELECTIVE,Experimental,Reference,Reference,,https://doi.org/10.1002/chem.201806161
1,CCCCCCCCN(CCCCCCCC)C(=O)[C@H](C)O[C@@H](C)C(=O...,UNTESTED,ORGANIC,Experimental,Reference,Reference,,https://doi.org/10.1002/chem.201806161


In [8]:
# Order the rows by the rank of their (Target_metal, Other_metal) combination
# in combo_order. Combinations missing from the table get rank 16, i.e. after
# every listed one. The helper column only exists inside the chain.
orig_eval_DOI_df = (
    orig_eval_DOI_df
    .assign(
        sort_key=lambda frame: frame.apply(
            lambda row: combo_order.get((row['Target_metal'], row['Other_metal']), 16),
            axis=1,
        )
    )
    .sort_values(by="sort_key")
    .drop(columns="sort_key")
)
orig_eval_DOI_df.head(2)

Unnamed: 0,SMILES,Target_metal,Other_metal,Source,Similarity_to_Experimental,Similarity_to_Generated,LogP,DOI
61,CC12CCC(c3nnc(-c4cccc(-c5nnc6c(n5)C5(C)CCC6C5(...,ORGANIC,AQUEOUS,Experimental,Reference,Reference,,"https://doi.org/10.1080/07366299.2011.539129,h..."
172,CC(C)(C)c1cc(-c2cc(C(C)(C)C)cc(-c3nnc4c(n3)C(C...,ORGANIC,AQUEOUS,Experimental,Reference,Reference,,https://cordis.europa.eu/project/id/211267


In [9]:
def classify_logP(smiles):
    """Label a molecule by its Crippen logP as computed by RDKit.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    str
        'Invalid'     - RDKit could not parse the SMILES
        'ORGANIC'     - logP > 3
        'UNSELECTIVE' - 2 <= logP <= 3
        'AQUEOUS'     - logP < 2
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 'Invalid'

    # CalcCrippenDescriptors returns several values; the first one is logP.
    crippen_values = rdMolDescriptors.CalcCrippenDescriptors(mol)
    logP = crippen_values[0]

    if logP > 3:
        return 'ORGANIC'
    if logP >= 2:
        return 'UNSELECTIVE'
    return 'AQUEOUS'

# Replace the placeholder 'LogP' column with the label computed for each SMILES.
orig_eval_DOI_df['LogP'] = orig_eval_DOI_df['SMILES'].map(classify_logP)

orig_eval_DOI_df.head(2)

Unnamed: 0,SMILES,Target_metal,Other_metal,Source,Similarity_to_Experimental,Similarity_to_Generated,LogP,DOI
61,CC12CCC(c3nnc(-c4cccc(-c5nnc6c(n5)C5(C)CCC6C5(...,ORGANIC,AQUEOUS,Experimental,Reference,Reference,ORGANIC,"https://doi.org/10.1080/07366299.2011.539129,h..."
172,CC(C)(C)c1cc(-c2cc(C(C)(C)C)cc(-c3nnc4c(n3)C(C...,ORGANIC,AQUEOUS,Experimental,Reference,Reference,ORGANIC,https://cordis.europa.eu/project/id/211267


In [None]:
# orig_eval_DOI_df.to_excel('expSMILESeval_AmEu/expSMILESeval_DOI.xlsx', index=False)

## Remove DOI

In [11]:
# Keep every evaluation column except 'DOI', in the order used for export.
orig_eval_df = orig_eval_DOI_df.loc[
    :,
    [
        'SMILES',
        'Target_metal',
        'Other_metal',
        'Source',
        'Similarity_to_Experimental',
        'Similarity_to_Generated',
        'LogP',
    ],
]

In [12]:
# DataFrame.to_excel does not create missing parent folders, so make sure the
# output directory exists before writing; otherwise a fresh run fails here.
eval_output_path = Path('expSMILESeval_AmEu/expSMILESeval.xlsx')
eval_output_path.parent.mkdir(parents=True, exist_ok=True)
orig_eval_df.to_excel(eval_output_path, index=False)