In [89]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import uuid
import os
import selfies as sf

In [90]:
# Load the Kaggle drug-discovery table, keep only the rows whose pIC50 entry
# is not the literal "BLINDED" marker, and drop the two columns that are not
# used further down in this notebook.
df = (
    pd.read_csv("../BioT5/data/kaggle/DDH Data.csv")
    .loc[lambda frame: frame["pIC50 (IC50 in microM)"] != "BLINDED"]
    .drop(columns=["Unnamed: 3", "Compound No."])
)
df.head()

Unnamed: 0,SMILES,pIC50 (IC50 in microM)
0,ClC1=CC(NC(=O)CSC2=NC=CC(=N2)C2=CSC(=N2)C2=CC=...,-0.477121255
1,CN1N=C(C=C1C(F)(F)F)C1=CC=C(S1)C1=CC=NC(SCC(=O...,-1.0
2,CSC1=C(C(C)=C(S1)C1=NC(C)=CS1)C1=CC=NC(SCC(=O)...,-1.041392685
4,CC1=NC(=CS1)C1=NC(=CS1)C1=NC(SCC(=O)NC2=CC=C(C...,-1.146128036
5,ClC1=CC=C(NC(=O)CSC2=NC=CC(=N2)C2=CC(=NO2)C2=C...,-1.176091259


In [91]:
def get_selfies(row):
    """Return the SELFIES encoding of the SMILES string held in ``row``.

    Args:
        row: Mapping-like record (e.g. one DataFrame row) with a "SMILES" entry.

    Returns:
        Whatever ``sf.encoder`` produces for that SMILES string.
    """
    smiles = row["SMILES"]
    return sf.encoder(smiles)

# Encode every row's SMILES and store the result in a new "selfies" column.
df["selfies"] = df.apply(get_selfies, axis="columns")


In [92]:
def get_activity(row, threshold=-1):
    """Label one compound as active ("Yes.") or inactive ("No.").

    Args:
        row: Mapping-like record (e.g. one DataFrame row) whose
            "pIC50 (IC50 in microM)" entry can be converted with ``float``.
        threshold: Cut-off for the pIC50 value. Values strictly greater than
            it are labelled active. Defaults to -1, the value that was
            previously hard-coded here.

    Returns:
        "Yes." when the pIC50 value is strictly above ``threshold``,
        otherwise "No.".

    Raises:
        ValueError / TypeError: propagated from ``float`` when the entry is
            not numeric.
    """
    pic50 = float(row["pIC50 (IC50 in microM)"])
    # Strict comparison: a value exactly equal to the threshold is "No.".
    return "Yes." if pic50 > threshold else "No."

# Label every row with get_activity, then show a preview together with the
# current number of rows.
df["activity"] = df.apply(get_activity, axis="columns")
(df.head(), len(df))

(                                              SMILES pIC50 (IC50 in microM)  \
 0  ClC1=CC(NC(=O)CSC2=NC=CC(=N2)C2=CSC(=N2)C2=CC=...           -0.477121255   
 1  CN1N=C(C=C1C(F)(F)F)C1=CC=C(S1)C1=CC=NC(SCC(=O...                     -1   
 2  CSC1=C(C(C)=C(S1)C1=NC(C)=CS1)C1=CC=NC(SCC(=O)...           -1.041392685   
 4  CC1=NC(=CS1)C1=NC(=CS1)C1=NC(SCC(=O)NC2=CC=C(C...           -1.146128036   
 5  ClC1=CC=C(NC(=O)CSC2=NC=CC(=N2)C2=CC(=NO2)C2=C...           -1.176091259   
 
                                              selfies activity  
 0  [Cl][C][=C][C][Branch2][Ring2][Ring1][N][C][=B...     Yes.  
 1  [C][N][N][=C][Branch1][=N][C][=C][Ring1][Branc...      No.  
 2  [C][S][C][=C][Branch2][Ring1][Branch1][C][Bran...      No.  
 4  [C][C][=N][C][=Branch1][Branch1][=C][S][Ring1]...      No.  
 5  [Cl][C][=C][C][=C][Branch2][Ring2][Ring1][N][C...      No.  ,
 94)

In [93]:
# Add data from TTD. Assume all drugs in TTD are active.
drugs_from_ttd = pd.read_csv("../BioT5/data/ttd/processed_dataset.csv")

# Membership has to be tested against the SELFIES *values*. `x in series`
# checks the Series index labels instead, which would never match a SELFIES
# string and would let every duplicate through. The set also grows as rows
# are accepted, so repeats inside the TTD file itself are caught too.
seen_selfies = set(df["selfies"])

# Collect the new rows first and concatenate once, instead of rebuilding the
# whole frame on every iteration.
ttd_records = []
for _, row in drugs_from_ttd.iterrows():
    if row["selfies"] in seen_selfies:
        print("duplicate", row["drug_name"])
        continue
    seen_selfies.add(row["selfies"])
    ttd_records.append({
        "SMILES": row["smiles"],
        # Every TTD row gets the same constant 0 here (presumably a
        # placeholder, since no measured value is read from the TTD file).
        "pIC50 (IC50 in microM)": 0,
        "selfies": row["selfies"],
        "activity": "Yes.",
    })

if ttd_records:
    # ignore_index=True renumbers the combined frame 0..n-1.
    df = pd.concat([df, pd.DataFrame(ttd_records)], ignore_index=True)
df.head(), len(df)

(                                              SMILES pIC50 (IC50 in microM)  \
 0  ClC1=CC(NC(=O)CSC2=NC=CC(=N2)C2=CSC(=N2)C2=CC=...           -0.477121255   
 1  CN1N=C(C=C1C(F)(F)F)C1=CC=C(S1)C1=CC=NC(SCC(=O...                     -1   
 2  CSC1=C(C(C)=C(S1)C1=NC(C)=CS1)C1=CC=NC(SCC(=O)...           -1.041392685   
 3  CC1=NC(=CS1)C1=NC(=CS1)C1=NC(SCC(=O)NC2=CC=C(C...           -1.146128036   
 4  ClC1=CC=C(NC(=O)CSC2=NC=CC(=N2)C2=CC(=NO2)C2=C...           -1.176091259   
 
                                              selfies activity  
 0  [Cl][C][=C][C][Branch2][Ring2][Ring1][N][C][=B...     Yes.  
 1  [C][N][N][=C][Branch1][=N][C][=C][Ring1][Branc...      No.  
 2  [C][S][C][=C][Branch2][Ring1][Branch1][C][Bran...      No.  
 3  [C][C][=N][C][=Branch1][Branch1][=C][S][Ring1]...      No.  
 4  [Cl][C][=C][C][=C][Branch2][Ring2][Ring1][N][C...      No.  ,
 255)

In [94]:
task_id = 203 #arbitrary id number, biot5's task ids go up to 180.

# Create every directory this cell writes into, including the tasks folder.
os.makedirs("../BioT5/data/tasks", exist_ok=True)
os.makedirs("../BioT5/data/splits/covid/prediction", exist_ok=True)
os.makedirs("../deepchem_property_prediction/data", exist_ok=True)

# 75% train; the remaining 25% is halved into validation and test.
# Both splits are stratified on the activity label with a fixed seed.
train_df, val_test_df = train_test_split(df, test_size=0.25, random_state=42, stratify=df[["activity"]])
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=42, stratify=val_test_df[["activity"]])

# The loop variable is deliberately NOT called `df`: reusing that name would
# rebind the notebook-level frame to the last split and corrupt any re-run.
splits = {"train": train_df, "validation": val_df, "test": test_df}
for split_name, split_df in splits.items():
    task_name = f"task{task_id}_COVID_drug_activity_prediction_{split_name}"

    # One instance per molecule; ids are random, so they differ between runs.
    instances = [
        {
            "id": f"task{task_id}-{uuid.uuid4().hex}",
            "input": f'Molecule: <bom>{row["selfies"]}<eom>',
            "output": [row["activity"]],
        }
        for _, row in split_df.iterrows()
    ]

    # NOTE(review): the "Definition" prompt below talks about HIV replication
    # while the task name and output paths say COVID. Left unchanged because
    # it is the text fed to the model; confirm the intended target.
    content = {
        "Contributors": ["Nguyen Duy Minh"],
        "Categories": ["Classification"],
        "Reasoning": [],
        "URL": ["https://www.kaggle.com/datasets/divyansh22/drug-discovery-data"],
        "Instruction_language": ["English"],
        "Domains": ["Chemistry"],
        "Positive Examples": [],
        "Negative Examples": [],
        "Instances": instances,
        "Source": ["Predict the property of the given molecule."],
        "Definition": ["You are given a molecule SELFIES. If the given molecule can inhibit HIV replication, indicate via \"Yes\". Otherwise, response via \"No\"."],
        "Input_language": ["SELFIES"],
        "Output_language": ["English"],
        "Instance License": ["Unknown"],
    }

    # Task file with the instances for this split.
    with open(f"../BioT5/data/tasks/{task_name}.json", "w") as out:
        json.dump(content, out, indent=4)
    # Split manifest: a single line naming the task file above.
    with open(f"../BioT5/data/splits/covid/prediction/{split_name}_tasks.txt", "w") as out:
        out.write(task_name)
    # Same split as a plain CSV in the deepchem data folder.
    split_df.to_csv(f"../deepchem_property_prediction/data/{split_name}.csv")