In [1]:
import os
import glob
import pickle
import numpy as np
import pandas as pd
import datasetdatabase as dsdb

# Publish the process limit in this process's environment.
# NOTE(review): presumably read by datasetdatabase to cap its worker processes —
# confirm, and confirm it is read lazily, since the package is imported before
# this statement runs.
os.environ.update({"DSDB_PROCESS_LIMIT": "16"})

# MEM, DNA and FOV Datasets

Connecting to dsdb

In [2]:
# Absolute path of the JSON config handed to DatasetDatabase.
PROD_CONFIG_PATH = "/allen/aics/assay-dev/Analysis/QCB_database/prod_config.json"

# Database handle used by the purge, upload and preview cells further down.
prod = dsdb.DatasetDatabase(config=PROD_CONFIG_PATH)

Loading metadata

In [3]:
# Read the pickled per-cell metadata table; its columns are later used to
# attach identifiers to the feature tables.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
meta_path = "../../data-raw/QCB_drug_cell_meta.pkl"
with open(meta_path, "rb") as meta_file:
    df_meta = pickle.load(meta_file)

# Show the first rows as a quick sanity check of the loaded table.
df_meta.head()

Unnamed: 0,cell_id,cell_ver,czi_filename,drug_label,idx_in_stack,roi,str_ver,structure_name
0,0ade9e16-db16-420a-9ccb-8cad3b32ee04,0.1.0,3500001171_100X_20170811_1-Scene-1-P7-D05.czi,Paclitaxol,1,"[35, 141, 113, 380, 264, 469]",0.1.0,tubulin
1,ca1f5d66-da95-4208-bd89-65c6c614a9f1,0.1.0,3500001171_100X_20170811_1-Scene-1-P7-D05.czi,Paclitaxol,2,"[35, 139, 183, 427, 327, 608]",0.1.0,tubulin
2,662de948-69c0-4b93-92f8-f3bcc7e38655,0.1.0,3500001171_100X_20170811_1-Scene-1-P7-D05.czi,Paclitaxol,3,"[35, 139, 225, 458, 23, 406]",0.1.0,tubulin
3,7806500f-6df9-4b9f-8b23-fde6fd09d40e,0.1.0,3500001171_100X_20170811_1-Scene-1-P7-D05.czi,Paclitaxol,4,"[35, 144, 403, 529, 156, 439]",0.1.0,tubulin
4,17fcb999-72a5-425d-b485-b02edb16651c,0.1.0,3500001171_100X_20170811_1-Scene-1-P7-D05.czi,Paclitaxol,9,"[35, 147, 293, 450, 168, 397]",0.1.0,tubulin


Upload existing local pkl files

In [4]:
# Ingestion manifest for the membrane and nucleus feature tables.
# Per entry:
#   file        - pickle file name
#   name        - dataset name used when purging, uploading and previewing
#   description - free-text description passed to dsdb.Dataset
#   na_allowed  - when False, the upload loop rejects frames with missing values
#   meta_id     - df_meta column attached to the features (None to skip)
DSToIngest = [
    dict(
        file="QCB_DRUG_MEM_feature.pkl",
        name="QCB_DRUG_MEM_feature",
        description="Cell membrane features extracted from QCB DRUG data",
        na_allowed=True,
        meta_id="cell_id",
    ),
    dict(
        file="QCB_DRUG_DNA_feature.pkl",
        name="QCB_DRUG_DNA_feature",
        description="Nucleus features extracted from QCB DRUG data",
        na_allowed=True,
        meta_id="cell_id",
    ),
]

Purge datasets first

In [5]:
# Best-effort purge of every dataset listed in DSToIngest before re-uploading.
# A failed purge must not stop the loop, but the reason is reported instead of
# being silently discarded, so a purge that did not happen is visible in the
# cell output.
for data in DSToIngest:

    print("Purging: ",data["file"])

    try:
        # NOTE(review): `_purge_dataset` is a private dsdb method — confirm it is
        # the intended way to remove a dataset.
        prod._purge_dataset(name=data["name"])
    except Exception as err:
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate so the cell can still be interrupted.
        print("  purge skipped for", data["name"], "-", repr(err))

Purging:  QCB_DRUG_MEM_feature.pkl
Purging:  QCB_DRUG_DNA_feature.pkl


In [6]:
# Ingest each feature table listed in DSToIngest:
#   1. load the pickled DataFrame from ../../data-raw/,
#   2. reject missing values when the entry sets na_allowed to False,
#   3. attach the identifier column named by meta_id from df_meta,
#   4. wrap the frame in a dsdb.Dataset and upload it to `prod`.
for data in DSToIngest:

    print("running:",data["file"])

    # NOTE: pickle.load can execute arbitrary code — only load trusted files.
    with open(os.path.join("../../data-raw/",data["file"]), "rb") as fp:
        df = pickle.load(fp)

    # Explicit exception instead of `assert`: it carries a message naming the
    # offending file and is not stripped when Python runs with -O.
    if data["na_allowed"] is False and df.isnull().values.any():
        raise ValueError(
            "{} contains missing values but na_allowed is False".format(data["file"]))

    if data["meta_id"] is not None:
        meta_col = df_meta[data["meta_id"]]

        # Features and metadata are paired by row position, so a length mismatch
        # would silently pad with NaN and mislabel cells; fail loudly instead.
        if len(meta_col) != len(df):
            raise ValueError(
                "{}: {} feature rows but {} metadata rows".format(
                    data["file"], len(df), len(meta_col)))

        # Reset BOTH indexes: concat(axis=1) aligns on index labels, so leaving
        # df_meta's index untouched would misalign rows whenever it is not 0..n-1.
        df = pd.concat(
            [df.reset_index(drop=True), meta_col.reset_index(drop=True)],
            axis=1)

    ds = dsdb.Dataset(df, name=data["name"], description=data["description"])

    ds.upload_to(prod)

running: QCB_DRUG_MEM_feature.pkl
Storing algorithm parameters.
Input dataset already exists in database. 1
Tearing down object...
running: QCB_DRUG_DNA_feature.pkl
Storing algorithm parameters.
Input dataset already exists in database. 1
Input dataset already exists in database. 57


Checking uploaded datasets

In [36]:
# Print a preview of every dataset listed in DSToIngest to confirm the upload.
# A failing preview is reported together with its cause and the loop continues
# with the next dataset.
for data in DSToIngest:

    try:
        print(prod.preview(name=data["name"]))
    except Exception as err:
        # `Exception` (not a bare except) keeps the cell interruptible, and the
        # repr of the error explains why the preview failed.
        print(data["name"]+" fail:", repr(err))

{'info': {'id': 56, 'name': 'QCB_DRUG_MEM_feature', 'description': 'Cell membrane features extracted from QCB DRUG data', 'introspector': 'datasetdatabase.introspect.dataframe.DataFrameIntrospector', 'created': datetime.datetime(2018, 10, 9, 0, 23, 46, 716721)}, 'shape': (1519, 102), 'keys': ['str_1st_axis_length_mean', 'str_1st_axis_x_mean', 'str_1st_axis_y_mean', 'str_1st_axis_z_mean', 'str_1st_eigenvalue_mean', 'str_2nd_axis_length_mean', 'str_2nd_axis_x_mean', 'str_2nd_axis_y_mean', 'str_2nd_axis_z_mean', 'str_2nd_eigenvalue_mean', 'str_3rd_axis_length_mean', 'str_3rd_axis_x_mean', 'str_3rd_axis_y_mean', 'str_3rd_axis_z_mean', 'str_3rd_eigenvalue_mean', 'str_equator_eccentricity_mean', 'str_highest_z_mean', 'str_lowest_z_mean', 'str_meridional_eccentricity_mean', 'str_sphericity_mean', 'str_surface_area_mean', 'str_volume_mean', 'str_x_centroid_mean', 'str_y_centroid_mean', 'str_z_centroid_mean', 'str_1st_axis_length_std', 'str_1st_axis_x_std', 'str_1st_axis_y_std', 'str_1st_axis_z

# Structures Datasets

Uploading existing structures pkl files

In [11]:
# Ingestion manifest for the per-structure feature tables (Golgi, ER,
# microtubules). This rebinds DSToIngest, replacing the earlier manifest.
# Per entry:
#   file        - pickle file name (opened relative to the working directory)
#   name        - dataset name passed to dsdb.Dataset
#   description - free-text description passed to dsdb.Dataset
#   na_allowed  - when False, the upload loop rejects frames with missing values
#   meta_id     - identifier column name
DSToIngest = [
    dict(
        file="tmp_str_golgi_drug_ds.pkl",
        name="QCB_DRUG_ST6GAL_feature",
        # NOTE(review): unlike the two entries below, this description says
        # "QCB images" rather than "QCB DRUG images" — confirm whether intended.
        description="Features extracted from QCB images of ST6GAL-tagged Golgi.",
        na_allowed=True,
        meta_id="cell_id",
    ),
    dict(
        file="tmp_str_er_drug_ds.pkl",
        name="QCB_DRUG_SEC61B_feature",
        description="Features extracted from QCB DRUG images of SEC61B-tagged ER.",
        na_allowed=True,
        meta_id="cell_id",
    ),
    dict(
        file="tmp_str_tubulin_drug_ds.pkl",
        name="QCB_DRUG_TUBA1B_feature",
        description="Features extracted from QCB DRUG images of TUBA1B-tagged microtubules.",
        na_allowed=True,
        meta_id="cell_id",
    ),
]

In [12]:
# Ingest each structure feature table listed in DSToIngest:
#   1. load the pickled DataFrame (path relative to the working directory),
#   2. reject missing values when the entry sets na_allowed to False,
#   3. turn the frame's index into a regular "cell_id" column,
#   4. wrap the frame in a dsdb.Dataset and upload it to `prod`.
for data in DSToIngest:

    print("running:",data["file"])

    # NOTE: pickle.load can execute arbitrary code — only load trusted files.
    with open(data["file"], "rb") as fp:
        df = pickle.load(fp)

    # Explicit exception instead of `assert`: it carries a message naming the
    # offending file and is not stripped when Python runs with -O.
    if data["na_allowed"] is False and df.isnull().values.any():
        raise ValueError(
            "{} contains missing values but na_allowed is False".format(data["file"]))

    # Move the index into a column; an unnamed index lands in a column called
    # "index", which is then renamed to "cell_id".
    df = df.reset_index(drop=False)
    # NOTE(review): `index=str` also converts the fresh row labels to strings —
    # confirm this is intended and not a leftover from the pandas docs example.
    df = df.rename(index=str, columns={"index":"cell_id"})

    ds = dsdb.Dataset(df, name=data["name"], description=data["description"])

    ds.upload_to(prod)

running: tmp_str_golgi_drug_ds.pkl
Storing algorithm parameters.
Input dataset already exists in database. 1
Tearing down object...


In [7]:
# Display the `md5` attribute of `ds`.
# NOTE(review): `ds` is whichever Dataset an upload loop assigned last, so the
# value shown depends on cell execution order (this cell's execution count is
# lower than the cells above it) — confirm which dataset this hash belongs to.
ds.md5

'c062b067037fe289f7b8e4787542c38d'