# Data preparation

The aim of this notebook is to prepare the data for downstream processing. In particular, we will load the Tahoe dataset, create the hold-outs, and save them into the corresponding folders.

### Import the libraries

In [32]:

import scanpy as sc
import numpy as np 
from scanpy import AnnData
from tqdm import tqdm
import ast
import pandas as pd

### Create different data splits

In [4]:

# Load the Tahoe dataset from its .h5ad file.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable constant so the notebook runs on other machines.
adata_full = sc.read_h5ad(
    '/home/ubuntu/anatoly-tahoe-100/data/datatahoe-100m.h5ad'
)




### Create the data slicing

In [27]:

def safe_parse(val):
    """Parse `val` as a Python literal, returning None when it is not one.

    Args:
        val: Text holding a Python literal, e.g. "[('drug', 0.05, 'uM')]".

    Returns:
        The evaluated literal, or None if `ast.literal_eval` rejects the input.
    """
    try:
        return ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError):
        # TypeError covers input that parses but cannot be built, such as
        # "{[1]: 2}" (unhashable dict key); literal_eval lets it propagate.
        return None


In [35]:

# Row used for any entry that cannot be turned into a dose triple.
_UNPARSED_DOSE = (None, None, None)


def _first_dose(raw):
    """Return the first (drug_name, concentration, concentration_unit) triple in `raw`.

    `raw` is expected to look like "[('8-Hydroxyquinoline', 0.05, 'uM')]":
    the text form of a list of 3-tuples. Only the first tuple is kept.

    Returns `_UNPARSED_DOSE` when `raw` is not a valid literal, has no first
    element, or its first element is not a 3-item tuple/list. Rejecting the
    last case keeps every row the same width for the DataFrame built below.
    """
    try:
        first = ast.literal_eval(raw)[0]
    except (ValueError, TypeError, SyntaxError, LookupError, MemoryError, RecursionError):
        # Exceptions literal_eval can raise, plus indexing failures
        # (IndexError/KeyError via LookupError, TypeError for non-subscriptables).
        return _UNPARSED_DOSE
    if not isinstance(first, (tuple, list)) or len(first) != 3:
        return _UNPARSED_DOSE
    return tuple(first)


# Step 1: parse one dose triple per row.
# The column repeats the same strings across many rows, so each distinct
# string is parsed once and then served from the cache.
dose_cache = {}
parsed_list = []
for val in adata_full.obs["drugname_drugconc_x"]:
    if isinstance(val, str):
        if val not in dose_cache:
            dose_cache[val] = _first_dose(val)
        parsed_list.append(dose_cache[val])
    else:
        # Non-string entries are not cached; they go through the same fallback path.
        parsed_list.append(_first_dose(val))

# Report fallbacks instead of hiding them; the rows are still kept.
n_unparsed = sum(row is _UNPARSED_DOSE for row in parsed_list)
if n_unparsed:
    print(
        f"Warning: {n_unparsed} of {len(parsed_list)} 'drugname_drugconc_x' "
        "entries could not be parsed; their parsed columns are left empty."
    )

# Step 2: Convert to DataFrame
parsed_df = pd.DataFrame(parsed_list, columns=["drug_name", "concentration", "concentration_unit"])

# Step 3: add the parsed columns to adata_full.obs.
# Assign raw arrays so values are matched by position, not by index label.
for column in parsed_df.columns:
    adata_full.obs[column] = parsed_df[column].to_numpy()


In [36]:

# Display the obs table to check that the drug_name, concentration and
# concentration_unit columns were added (the notebook shows a truncated preview).
adata_full.obs


Unnamed: 0,drug,sample,BARCODE_SUB_LIB_ID,cell_line_id,moa-fine,canonical_smiles,pubchem_cid,plate,mean_gene_count_x,mean_tscp_count_x,...,mean_pcnt_mito_x,drugname_drugconc_x,mean_gene_count_y,mean_tscp_count_y,mean_mread_count_y,mean_pcnt_mito_y,drugname_drugconc_y,drug_name,concentration,concentration_unit
0,8-Hydroxyquinoline,smp_1783,01_001_052-lib_1105,CVCL_0480,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,...,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",8-Hydroxyquinoline,0.05,uM
1,8-Hydroxyquinoline,smp_1783,01_001_105-lib_1105,CVCL_0546,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,...,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",8-Hydroxyquinoline,0.05,uM
2,8-Hydroxyquinoline,smp_1783,01_001_165-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,...,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",8-Hydroxyquinoline,0.05,uM
3,8-Hydroxyquinoline,smp_1783,01_003_094-lib_1105,CVCL_1717,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,...,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",8-Hydroxyquinoline,0.05,uM
4,8-Hydroxyquinoline,smp_1783,01_003_164-lib_1105,CVCL_1056,unclear,C1=CC2=C(C(=C1)O)N=CC=C2,1923.0,plate4,1478.268171,2341.339094,...,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",1478.268171,2341.339094,2738.463797,0.023783,"[('8-Hydroxyquinoline', 0.05, 'uM')]",8-Hydroxyquinoline,0.05,uM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,(R)-Verapamil (hydrochloride),smp_1799,17_160_097-lib_1124,CVCL_1495,unclear,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,170014.0,plate4,1513.610111,2523.233013,...,0.069336,"[('(R)-Verapamil (hydrochloride)', 0.05, 'uM')]",1513.610111,2523.233013,2950.299370,0.069336,"[('(R)-Verapamil (hydrochloride)', 0.05, 'uM')]",(R)-Verapamil (hydrochloride),0.05,uM
999996,(R)-Verapamil (hydrochloride),smp_1799,17_160_108-lib_1124,CVCL_1125,unclear,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,170014.0,plate4,1513.610111,2523.233013,...,0.069336,"[('(R)-Verapamil (hydrochloride)', 0.05, 'uM')]",1513.610111,2523.233013,2950.299370,0.069336,"[('(R)-Verapamil (hydrochloride)', 0.05, 'uM')]",(R)-Verapamil (hydrochloride),0.05,uM
999997,(R)-Verapamil (hydrochloride),smp_1799,17_160_112-lib_1124,CVCL_0320,unclear,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,170014.0,plate4,1513.610111,2523.233013,...,0.069336,"[('(R)-Verapamil (hydrochloride)', 0.05, 'uM')]",1513.610111,2523.233013,2950.299370,0.069336,"[('(R)-Verapamil (hydrochloride)', 0.05, 'uM')]",(R)-Verapamil (hydrochloride),0.05,uM
999998,(R)-Verapamil (hydrochloride),smp_1799,17_160_165-lib_1124,CVCL_0546,unclear,CC(C)C(CCCN(C)CCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,170014.0,plate4,1513.610111,2523.233013,...,0.069336,"[('(R)-Verapamil (hydrochloride)', 0.05, 'uM')]",1513.610111,2523.233013,2950.299370,0.069336,"[('(R)-Verapamil (hydrochloride)', 0.05, 'uM')]",(R)-Verapamil (hydrochloride),0.05,uM
