# Chapter 2 - Accessing the Dataset

- 
-
-
-
-

#### Assets 

In [1]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nbvv
from upath import UPath as Path
from aicsimageio import AICSImage
from ome_zarr.reader import Reader
from ome_zarr.io import parse_url
import logging
# Silence sub-ERROR log records from the image I/O libraries so they do not
# flood the notebook output.
for _noisy_logger in ("bfio", "aicsimageio"):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)
del _noisy_logger


def read_ome_zarr(path, level=0, image_name="default"):
    """Read an OME-Zarr image and wrap it in an ``AICSImage``.

    Parameters
    ----------
    path : str or path-like
        Location of the OME-Zarr store.
    level : int, default 0
        Index into the node's ``data`` list (presumably the resolution
        level, 0 being the first one stored -- confirm against the writer).
    image_name : str or None, default "default"
        Sub-path appended to ``path``. When ``None``, ``path`` is used as is.

    Returns
    -------
    AICSImage
        Image built from the computed array at ``level``. Channel names come
        from the node metadata entry ``"name"``; physical pixel sizes are the
        last three values of the first coordinate transformation's
        ``"scale"``.
    """
    if image_name is not None:
        path = Path(path) / image_name
    zarr_reader = Reader(parse_url(str(path)))

    # Only the first node yielded by the reader is used.
    first_node = next(iter(zarr_reader()))
    metadata = first_node.metadata
    scale = metadata["coordinateTransformations"][0][0]["scale"]
    pixel_sizes = scale[-3:]

    return AICSImage(
        first_node.data[level].compute(),
        channel_names=metadata["name"],
        physical_pixel_sizes=pixel_sizes,
    )

def rescale_image(img_data, channels):
    """Rescale every non-segmentation channel of an image independently.

    For each channel whose name does not contain ``"_seg"``, the values are
    shifted down by 1; shifted values below 0 are set to -1 and the remaining
    values are divided by that channel's own maximum. Channels whose name
    contains ``"_seg"`` are left untouched.

    The divisor is computed per channel. Using the maximum of the whole array
    would make each channel's scale depend on the other channels (including
    segmentation channels) and on the order in which channels are processed.

    Parameters
    ----------
    img_data : numpy.ndarray
        Image array. Singleton axes are removed with ``squeeze`` before
        processing, after which the first axis is indexed by channel.
        NOTE(review): a channel axis of length 1 would be squeezed away too --
        confirm callers always pass more than one channel.
    channels : sequence of str
        Channel names, in the order of the first axis of the squeezed array.

    Returns
    -------
    numpy.ndarray
        ``float16`` array with the shape of the squeezed input.
    """
    img_data = img_data.squeeze().astype(np.float32)

    for ix, channel in enumerate(channels):
        if "_seg" in channel:
            continue

        shifted = img_data[ix] - 1
        peak = shifted.max()
        # A channel with no positive value after the shift has nothing to
        # scale; dividing by 1 avoids the 0/0 -> NaN of a zero maximum.
        scale = peak if peak > 0 else 1.0

        img_data[ix] = np.where(shifted >= 0, shifted / scale, -1)

    return img_data.astype(np.float16)


In [2]:
# Load the dataset manifest; each row of the dataframe represents one cell.
MANIFEST_URI = "s3://variance-dataset/processed/manifest.parquet"
df = pd.read_parquet(MANIFEST_URI)

#### Jupyter Notebook Help

#### Python Pandas Resources

## Chapter 2.1 Understanding the Dataset

### The hiPSC Single-Cell Image Dataset
Our dataset consists of 3D live-cell images of *normal human induced pluripotent stem cells (hiPSCs)*, which are a useful human cell model system. The dataset includes 25 cell lines representing fluorescently tagged major organelles, cellular structures, and compartments. Images of these cells were obtained in 3D using spinning-disk confocal microscopes and were further processed using deep learning-based segmentation. For the purpose of this hackathon, this dataset has been filtered down to 214,037 cells.

<img src="resources/hipsc_single_cell_image_dataset_summary.png"/>

### From FOVs to single-cell images
The hiPSC Single-Cell Image dataset consists of **215,081** single-cell images which were extracted from **18,100** fields of view (FOVs) of our cell colonies. Each row of the dataframe represents a cell, while the columns contain metadata, metrics, imaging data, etc.

In [7]:
# Brief Pre-Processing Description

In [None]:
# Some Visualizations of the dataset

## Chapter 2.2 Sub-Datasets

### Dataset Categories

The dataset columns have been broken down into 4 categories [`cell metadata`, `field-of-view metadata`, `cell metrics`, `cell images`]. These categories are recognized as a secondary header.

In [4]:
# Preview the first five rows of the manifest.
df.head(n=5)

Unnamed: 0,CellId,roi,crop_raw,crop_seg,name_dict,fov_path,fov_seg_path,struct_seg_path,structure_name,this_cell_nbr_complete,...,max_projection_y,mean_projection_y,median_projection_y,max_projection_x,mean_projection_x,median_projection_x,center_slice,fits_z,fits_y,fits_x
0,230741,"[27, 146, 267, 548, 476, 744]",/allen/aics/assay-dev/computational/data/dna_c...,/allen/aics/assay-dev/computational/data/dna_c...,"{'crop_raw': ['dna', 'membrane', 'structure'],...",/allen/programs/allencell/data/proj0/df4/7de/f...,/allen/programs/allencell/data/proj0/bf4/d80/8...,/allen/programs/allencell/data/proj0/4c8/d6b/c...,TOMM20,1,...,s3://variance-dataset/max_projection_y/230741....,s3://variance-dataset/mean_projection_y/230741...,s3://variance-dataset/median_projection_y/2307...,s3://variance-dataset/max_projection_x/230741....,s3://variance-dataset/mean_projection_x/230741...,s3://variance-dataset/median_projection_x/2307...,s3://variance-dataset/center_slice/230741.ome....,True,True,True
1,230745,"[27, 132, 124, 412, 443, 755]",/allen/aics/assay-dev/computational/data/dna_c...,/allen/aics/assay-dev/computational/data/dna_c...,"{'crop_raw': ['dna', 'membrane', 'structure'],...",/allen/programs/allencell/data/proj0/df4/7de/f...,/allen/programs/allencell/data/proj0/bf4/d80/8...,/allen/programs/allencell/data/proj0/4c8/d6b/c...,TOMM20,1,...,s3://variance-dataset/max_projection_y/230745....,s3://variance-dataset/mean_projection_y/230745...,s3://variance-dataset/median_projection_y/2307...,s3://variance-dataset/max_projection_x/230745....,s3://variance-dataset/mean_projection_x/230745...,s3://variance-dataset/median_projection_x/2307...,s3://variance-dataset/center_slice/230745.ome....,True,True,True
2,230746,"[27, 130, 224, 559, 334, 574]",/allen/aics/assay-dev/computational/data/dna_c...,/allen/aics/assay-dev/computational/data/dna_c...,"{'crop_raw': ['dna', 'membrane', 'structure'],...",/allen/programs/allencell/data/proj0/df4/7de/f...,/allen/programs/allencell/data/proj0/bf4/d80/8...,/allen/programs/allencell/data/proj0/4c8/d6b/c...,TOMM20,0,...,s3://variance-dataset/max_projection_y/230746....,s3://variance-dataset/mean_projection_y/230746...,s3://variance-dataset/median_projection_y/2307...,s3://variance-dataset/max_projection_x/230746....,s3://variance-dataset/mean_projection_x/230746...,s3://variance-dataset/median_projection_x/2307...,s3://variance-dataset/center_slice/230746.ome....,True,True,True
3,230748,"[27, 138, 287, 577, 386, 614]",/allen/aics/assay-dev/computational/data/dna_c...,/allen/aics/assay-dev/computational/data/dna_c...,"{'crop_raw': ['dna', 'membrane', 'structure'],...",/allen/programs/allencell/data/proj0/df4/7de/f...,/allen/programs/allencell/data/proj0/bf4/d80/8...,/allen/programs/allencell/data/proj0/4c8/d6b/c...,TOMM20,0,...,s3://variance-dataset/max_projection_y/230748....,s3://variance-dataset/mean_projection_y/230748...,s3://variance-dataset/median_projection_y/2307...,s3://variance-dataset/max_projection_x/230748....,s3://variance-dataset/mean_projection_x/230748...,s3://variance-dataset/median_projection_x/2307...,s3://variance-dataset/center_slice/230748.ome....,True,True,True
4,230754,"[27, 141, 11, 227, 502, 808]",/allen/aics/assay-dev/computational/data/dna_c...,/allen/aics/assay-dev/computational/data/dna_c...,"{'crop_raw': ['dna', 'membrane', 'structure'],...",/allen/programs/allencell/data/proj0/df4/7de/f...,/allen/programs/allencell/data/proj0/bf4/d80/8...,/allen/programs/allencell/data/proj0/4c8/d6b/c...,TOMM20,0,...,s3://variance-dataset/max_projection_y/230754....,s3://variance-dataset/mean_projection_y/230754...,s3://variance-dataset/median_projection_y/2307...,s3://variance-dataset/max_projection_x/230754....,s3://variance-dataset/mean_projection_x/230754...,s3://variance-dataset/median_projection_x/2307...,s3://variance-dataset/center_slice/230754.ome....,True,True,True


### Creating Sub-Datasets

In [None]:
# By Category

In [None]:
# By Columns

## Chapter 2.3 Interacting with the Dataset

In [None]:
# Querying

In [6]:
# Grouping

## Chapter 2.4 Column Descriptions


The dataset contains over 1200 columns with important metrics which you may want to incorporate during the challenge. We have filtered this list down to the 87 columns that we think will be most important to your efforts. In this section we'll create a tiny Dash app that runs within the notebook in order to display an interactive table of the columns and their descriptions. **The table is searchable**, which makes looking up a column more efficient.

In [8]:
# Dash pieces for the in-notebook column lookup table.
# NOTE: the `jupyter_dash` package must be installed in the kernel's
# environment; without it this cell fails with ModuleNotFoundError.
from jupyter_dash import JupyterDash
import dash
from dash import dcc, html

# Presumably detects the Jupyter proxy settings so the app can be reached when
# the notebook server sits behind a proxy -- confirm in the jupyter_dash docs.
JupyterDash.infer_jupyter_proxy_config()

ModuleNotFoundError: No module named 'jupyter_dash'

In [None]:
# Read the column lookup file (no header row, ":"-separated) and label its
# two columns.
col_df = pd.read_csv(
    "resources/colname_test.csv", sep=":", header=None
).set_axis(["column name", "description"], axis=1)

In [5]:
# Interactive lookup table for the column definitions held in `col_df`.
from dash import dash_table

app = JupyterDash(__name__)
server = app.server

# One table column per `col_df` column; the native filter row is what makes
# the table searchable.
app.layout = dash_table.DataTable(
    data=col_df.to_dict('records'),
    columns=[{"name": col, "id": col} for col in col_df.columns],
    filter_action="native",
    style_cell={'textAlign': 'left'},
    style_header={
        'backgroundColor': 'rgb(30,30,30)',
        'color': 'white',
    },
    # Let long descriptions wrap instead of being cut off.
    style_data={
        'whiteSpace': 'normal',
        'height': 'auto',
        'lineHeight': '15px',
        'backgroundColor': 'rgb(50,50,50)',
        'color': 'white',
    },
)



In [None]:
# Serve the table inline in the notebook on a fixed port. You may need to
# enable port forwarding for this port on your machine.
# TODO: work with Gui to enable this directly from the app rather than port
# forwarding.
app.run_server(mode="inline", port=8588)