In [7]:
# ── Download the complete hERG dataset from TDC and save to disk ──
from pathlib import Path
from tdc.single_pred import Tox

# 1. Choose a local folder (relative to the notebook's working directory)
#    and create it, including missing parents, if it does not exist yet.
root = Path("Data_v2/hERG")
root.mkdir(parents=True, exist_ok=True)

# 2. Fetch the full raw dataframe.
#    The recorded run of this cell returned 655 rows x 3 columns, i.e. a few
#    hundred compounds rather than thousands.
#    NOTE(review): the three columns are presumably 'Drug_ID', 'Drug' (SMILES)
#    and 'Y' (label) -- confirm with df_all.columns before relying on them.
df_all = Tox(name="hERG").get_data()      # no split, whole dataset

# 3. Save as CSV (index=False keeps the pandas row index out of the file)
csv_path = root / "all.csv"
df_all.to_csv(csv_path, index=False)
print(f"✅ Saved full hERG dataset to {csv_path}  |  shape = {df_all.shape}")


Found local copy...
Loading...
Done!


✅ Saved full hERG dataset to Data_v2\hERG\all.csv  |  shape = (655, 3)


In [3]:
from rdkit import Chem, RDLogger
# Disable every RDKit log channel matching 'rdApp.*' so that parser messages
# do not clutter the output of later cells.
RDLogger.DisableLog('rdApp.*')   # silence warnings
# Smoke test that RDKit imports and can parse a SMILES string (phenol here).
mol = Chem.MolFromSmiles("c1ccccc1O")
print(mol.GetNumAtoms())         # should print 7 (six ring carbons + one oxygen)

7


In [4]:
import os
import numpy as np
import pandas as pd
import deepchem as dc

# ========== Config ==========
# Output folder for the exported Tox21 files; created if it does not exist.
save_dir = "Data_v2/tox21"
os.makedirs(save_dir, exist_ok=True)

# Choose a single task for binary classification.
# The value is resolved with tasks.index(...) further down, so it must match an
# entry of the loaded `tasks` list exactly.
# NOTE(review): Tox21 task names normally carry an NR-/SR- prefix (for example
# 'NR-AR', 'NR-ER'); a bare 'AR' or 'ER' would presumably fail the lookup with
# ValueError -- check print(tasks) before changing this.
selected_task = 'SR-ARE'  # Must be one of the names in `tasks`

# ========== Load Tox21 ==========
# Loads Tox21 through DeepChem's MoleculeNet loader. The call yields the list
# of task names, a 3-tuple of datasets (unpacked below as train/valid/test) and
# the transformers associated with them.
# The three splits are concatenated again later in this cell, so the split
# choice here only influences the row order of the exported files.
# NOTE(review): recent DeepChem releases name the split keyword `splitter`;
# confirm that `split=` is honoured by the installed version.
print("Downloading and loading Tox21 dataset...")
tasks, datasets, transformers = dc.molnet.load_tox21(
    featurizer='ECFP',  # Other featurizers exist (e.g. 'GraphConv'); NOTE(review): confirm which string names the installed DeepChem accepts
    split='random'
)
train_dataset, valid_dataset, test_dataset = datasets

# ========== Extract and Combine Data ==========
def extract_data(dataset, task=None, task_names=None):
    """Pull one task's labels and weights out of a multitask dataset.

    Parameters
    ----------
    dataset : object exposing ``X``, ``y``, ``w`` and ``ids``
        ``y`` and ``w`` are indexed as 2-D arrays with one column per task;
        ``ids`` holds one identifier per row (SMILES strings for Tox21).
    task : str, optional
        Name of the task column to extract. When omitted, the notebook-level
        ``selected_task`` is used, which keeps ``extract_data(dataset)``
        working exactly as before.
    task_names : sequence of str, optional
        Task names in column order. When omitted, the notebook-level ``tasks``
        list is used.

    Returns
    -------
    tuple
        ``(frame, X)`` where ``frame`` is a DataFrame with the columns
        ``smiles``, ``label`` and ``weight`` (one row per entry of
        ``dataset.ids``) and ``X`` is ``dataset.X`` returned unchanged.

    Raises
    ------
    ValueError
        If ``task`` is not present in ``task_names``.
    """
    # Fall back to the notebook globals only when the caller gave nothing.
    if task is None:
        task = selected_task
    if task_names is None:
        task_names = tasks

    names = list(task_names)
    try:
        # Resolve the column once and reuse it for both labels and weights.
        column = names.index(task)
    except ValueError:
        raise ValueError(
            f"Unknown task {task!r}; available tasks: {names}"
        ) from None

    frame = pd.DataFrame({
        "smiles": dataset.ids,
        "label": dataset.y[:, column],
        "weight": dataset.w[:, column],  # zero marks a missing label
    })
    return frame, dataset.X

# Extract the selected task from each split, then stack the splits back into
# one table / one feature matrix (rows stay aligned: same order in both).
df_train, X_train = extract_data(train_dataset)
df_valid, X_valid = extract_data(valid_dataset)
df_test,  X_test  = extract_data(test_dataset)

df_all = pd.concat([df_train, df_valid, df_test], axis=0).reset_index(drop=True)
X_all = np.vstack([X_train, X_valid, X_test])

# Filter out invalid labels (weight == 0).
# The test must be "!= 0", not "== 1": load_tox21 applies a balancing
# transformer by default, which rescales the non-zero weights per class, so
# labelled rows generally do not have a weight of exactly 1. Missing labels
# keep a weight of 0 either way.
labeled_mask = (df_all["weight"] != 0).to_numpy()

# NOTE: from here on df_valid / X_valid mean "rows with a valid label", not
# the validation split extracted above. The names are kept so that any later
# cell reading them keeps working.
df_valid = df_all[labeled_mask].drop(columns=["weight"]).reset_index(drop=True)
X_valid = X_all[labeled_mask]
assert len(df_valid) == len(X_valid), "labels and features are out of sync"

# ========== Save to Disk ==========
# Build each path once so the printed location always matches the file written.
csv_out = f"{save_dir}/tox21_{selected_task}.csv"
npy_out = f"{save_dir}/X_{selected_task}.npy"
df_valid.to_csv(csv_out, index=False)
np.save(npy_out, X_valid)

print(f"✅ Saved SMILES + labels to: {csv_out}")
print(f"✅ Saved features to:       {npy_out}")


ImportError: Traceback (most recent call last):
  File "d:\Coding Projects\Predicting-Drug-Response-Using-Multi-Omics-Data-with-XAI\god\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

In [12]:
import pandas as pd

# --- 1. Input workbook and sheet selector --------------------------------------
# sheet_name accepts a position (0 = first sheet) or the sheet's name as a string.
sheet_name = 0
file_path = "Data_v2/drug screening/GDSC2_fitted_dose_response_27Oct23.xlsx"   # edit to point at your copy

# --- 2. Load the chosen sheet ---------------------------------------------------
# The engine is spelled out so the openpyxl dependency is visible to the reader.
df = pd.read_excel(file_path, sheet_name=sheet_name, engine="openpyxl")

# --- 3. Quick look: dimensions, then the first ten rows -------------------------
print("DataFrame shape:", df.shape)
# display() is provided by IPython/Jupyter; outside a notebook fall back to print().
display(df.head(10))


DataFrame shape: (242036, 19)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC2,343,15946310,683667,PFSK-1,SIDM01132,MB,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.463887,0.93022,0.089052,0.433123
1,GDSC2,343,15946548,684052,A673,SIDM00848,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-4.869455,0.61497,0.111351,-1.4211
2,GDSC2,343,15946830,684057,ES5,SIDM00263,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-3.360586,0.791072,0.142855,-0.599569
3,GDSC2,343,15947087,684059,ES7,SIDM00269,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-5.04494,0.59266,0.135539,-1.516647
4,GDSC2,343,15947369,684062,EW-11,SIDM00203,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-3.741991,0.734047,0.128059,-0.807232
5,GDSC2,343,15947651,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-5.142961,0.582439,0.137581,-1.570016
6,GDSC2,343,15947932,687448,COLO-829,SIDM00909,SKCM,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.235034,0.867348,0.09347,0.557727
7,GDSC2,343,15948212,687452,5637,SIDM00807,BLCA,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-2.632632,0.834067,0.076169,-0.203221
8,GDSC2,343,15948491,687455,RT4,SIDM01085,BLCA,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-2.963191,0.821438,0.094466,-0.3832
9,GDSC2,343,15948772,687457,SW780,SIDM01160,BLCA,1003,Camptothecin,TOP1,DNA replication,1046,Y,0.0001,0.1,-1.449138,0.90505,0.074109,0.441154
