In [1]:
import os
import pandas as pd
os.environ["FSSPEC_CONFIG_DIR"] = "/allen/aics/assay-dev/users/Alex/cytodata/fsspec.d"

In [2]:
# Read in Datasets
df = pd.read_parquet("s3://variance-dataset/processed/manifest.parquet")
columns = pd.read_csv("./resources/cytodata_column_manifest.csv.csv")
cell_line_anno = pd.read_csv("./resources/cell_line_annotation.txt",delimiter="\t")
updated_edge = pd.read_csv("./resources/updatedEDGEcalls_22020914.csv")
human_mito_annotations = pd.read_csv("./resources/draft_plus_human_mito_annotations.csv")

In [3]:
# Remove spherical harmonics coeff columns
df_filt = df[df.columns.drop(list(df.filter(regex='shcoeffs')))]
df_filt.columns

Index(['CellId', 'roi', 'crop_raw', 'crop_seg', 'name_dict', 'fov_path',
       'fov_seg_path', 'struct_seg_path', 'structure_name',
       'this_cell_nbr_complete', 'this_cell_nbr_dist_2d', 'scale_micron',
       'edge_flag', 'FOVId', 'this_cell_index', 'PlateId', 'WellId',
       'cell_stage', 'InstrumentId', 'WorkflowId', 'ProtocolId', 'PiezoId',
       'ChannelNumberStruct', 'ChannelNumberBrightfield', 'ChannelNumber405',
       'ChannelNumber638', 'meta_fov_position', 'meta_imaging_mode',
       'meta_fov_outside_overview', 'meta_fov_xcoord', 'meta_fov_ycoord',
       'meta_fov_edgedist', 'meta_colony_label', 'meta_colony_centroid',
       'meta_colony_area', 'meta_plate_bad_segmentation',
       'meta_plate_confluency', 'meta_well_passage_at_imaging',
       'meta_well_passage_at_thaw', 'outlier', 'NUC_shape_volume',
       'NUC_position_depth', 'NUC_roundness_surface_area', 'MEM_shape_volume',
       'MEM_position_depth', 'MEM_roundness_surface_area', 'STR_shape_volume',
       

In [4]:
# Add Updated Edge data
df_filt = pd.merge(df_filt, updated_edge, on='CellId')
df_filt = df_filt.drop(columns=['edge_flag','edge_flag_OLDandWRONG','Unnamed: 0'])
df_filt = df_filt.rename(columns={"edge_flag_NEWandCORRECT": "edge_flag"})
df_filt.shape

(215081, 86)

In [5]:
#  Add Cell line Annotations (4 New Columns)
df_filt_cell_anno = pd.merge(df_filt,cell_line_anno, left_on='structure_name',right_on='Gene', how = 'outer' )
df_filt = df_filt_cell_anno.drop(columns = ['Gene'])
df_filt.shape

(215082, 90)

In [6]:
# Add Mito Annotations (6 New Columns)
df_filt = pd.merge(df_filt,human_mito_annotations, on = ['CellId','FOVId'], how = 'outer')
df_filt = df_filt.drop(columns=['Unnamed: 0'])
df_filt.shape

(215082, 96)

In [7]:
# Remove Outlier Cells 
df_filt = df_filt[df_filt['outlier']  == 'No']
df_filt.shape

(214037, 96)

In [8]:
# Drop Columns that contain /allen paths (+ outlier)
column_criteria = [
    'success',
    'roi',
    'crop_raw',
    'crop_seg',
    'name_dict',
    'fov_path',
    'fov_seg_path',
    'struct_seg_path'
]

df_filt = df_filt.drop(columns= column_criteria, axis = 1)

In [9]:
# Drop Misc Columns
column_criteria = [
    'outlier',
    'ChannelNumber405',
    'ChannelNumber638',
    'ChannelNumberBrightfield',
    'ChannelNumberStruct',
    'Draft M6/M7 complete',
    'Expert mitotic state resolved',
    'CellIndex',
    'Draft mitotic state coarse',
    'Expert mitotic state coarse',
]

df_filt = df_filt.drop(columns = column_criteria,axis = 1 )
df_filt.shape

(214037, 78)

In [10]:
#Alphabatize
df_filt = df_filt[sorted(df_filt.columns.tolist(), key=str.casefold)]

In [11]:
# Fliter Columns dataframe to just needed Columns Should have shape [(# of columns), 7]
lst = df_filt.columns.tolist()
columns_filtered  = columns.query('Col_ID in @lst')

In [12]:
# Adding Catagory as a Secondary Header 
headers = [columns_filtered['Category'], df_filt.columns.tolist()]
df_filt.columns = headers

In [13]:
# Rename Confusing Columns 
new_names = {
    "structure_name":"gene",
    "NUC_shape_volume":"nuclear_volume",
    "NUC_position_depth":"nuclear_height",
    "NUC_roundness_surface_area":"nuclear_surface_area",
    "MEM_shape_volume":"cell_volume",
    "MEM_position_depth":"cell_height",
    "MEM_roundness_surface_area":"cell_surface_area",
    "STR_shape_volume":"structure_volume",
    "STR_connectivity_cc":"structure_connected_components",
    "NUC_MEM_PC1":"shape_mode_1_height",
    "NUC_MEM_PC2":"shape_mode_2_volume",
    "NUC_MEM_PC3":"shape_mode_3_major_tilt",
    "NUC_MEM_PC4":"shape_mode_4_minor_tilt",
    "NUC_MEM_PC5":"shape_mode_5_elongation",
    "NUC_MEM_PC6":"shape_mode_6_bean-ness",
    "NUC_MEM_PC7":"shape_mode_7_pear-ness",
    "NUC_MEM_PC8":"shape_mode_8_wedge",
}

df_filt.rename(columns = new_names,inplace=True, level = 1)

In [14]:
df_filt.columns

MultiIndex([(           'cell metric',                          'angle'),
            (           'cell metric',                     'bbox_max_x'),
            (           'cell metric',                     'bbox_max_y'),
            (           'cell metric',                     'bbox_max_z'),
            (           'cell metric',                     'bbox_min_x'),
            (           'cell metric',                     'bbox_min_y'),
            (           'cell metric',                     'bbox_min_z'),
            (           'cell metric',                     'bf_clip_hi'),
            (           'cell metric',                     'bf_clip_lo'),
            (         'cell metadata',                     'cell_stage'),
            (         'cell metadata',                         'CellId'),
            (         'cell metadata',             'Cellular Component'),
            (           'cell images',                   'center_slice'),
            (         'cell metadata',