In [20]:
import os
import pandas as pd
os.environ["FSSPEC_CONFIG_DIR"] = "/allen/aics/assay-dev/users/Alex/cytodata/fsspec.d"

In [24]:
# Read in Datasets
df = pd.read_parquet("s3://variance-dataset/processed/manifest.parquet")
columns = pd.read_csv("./resources/cytodata_column_manifest.csv")
cell_line_anno = pd.read_csv("./resources/cell_line_annotation.txt",delimiter="\t")
updated_edge = pd.read_csv("./resources/updatedEDGEcalls_22020914.csv")
human_mito_annotations = pd.read_csv("./resources/draft_plus_human_mito_annotations.csv")

In [25]:
# Remove spherical harmonics coeff columns
df_filt = df[df.columns.drop(list(df.filter(regex='shcoeffs')))]
df_filt.columns

Index(['CellId', 'roi', 'crop_raw', 'crop_seg', 'name_dict', 'fov_path',
       'fov_seg_path', 'struct_seg_path', 'structure_name',
       'this_cell_nbr_complete', 'this_cell_nbr_dist_2d', 'scale_micron',
       'edge_flag', 'FOVId', 'this_cell_index', 'PlateId', 'WellId',
       'cell_stage', 'InstrumentId', 'WorkflowId', 'ProtocolId', 'PiezoId',
       'ChannelNumberStruct', 'ChannelNumberBrightfield', 'ChannelNumber405',
       'ChannelNumber638', 'meta_fov_position', 'meta_imaging_mode',
       'meta_fov_outside_overview', 'meta_fov_xcoord', 'meta_fov_ycoord',
       'meta_fov_edgedist', 'meta_colony_label', 'meta_colony_centroid',
       'meta_colony_area', 'meta_plate_bad_segmentation',
       'meta_plate_confluency', 'meta_well_passage_at_imaging',
       'meta_well_passage_at_thaw', 'outlier', 'NUC_shape_volume',
       'NUC_position_depth', 'NUC_roundness_surface_area', 'MEM_shape_volume',
       'MEM_position_depth', 'MEM_roundness_surface_area', 'STR_shape_volume',
       

In [26]:
# Add Updated Edge data
df_filt = pd.merge(df_filt, updated_edge, on='CellId')
df_filt = df_filt.drop(columns=['edge_flag','edge_flag_OLDandWRONG','Unnamed: 0'])
df_filt = df_filt.rename(columns={"edge_flag_NEWandCORRECT": "edge_flag"})
df_filt.shape

(215081, 86)

In [27]:
#  Add Cell line Annotations (4 New Columns)
df_filt_cell_anno = pd.merge(df_filt,cell_line_anno, left_on='structure_name',right_on='Gene', how = 'outer' )
df_filt = df_filt_cell_anno.drop(columns = ['Gene'])
df_filt.shape

(215082, 90)

In [28]:
# Add Mito Annotations (6 New Columns)
df_filt = pd.merge(df_filt,human_mito_annotations, on = ['CellId','FOVId'], how = 'outer')
df_filt = df_filt.drop(columns=['Unnamed: 0'])
df_filt.shape

(215082, 96)

In [29]:
# Remove Outlier Cells 
df_filt = df_filt[df_filt['outlier']  == 'No']
df_filt.shape

(214037, 96)

In [30]:
# Drop Columns that contain /allen paths (+ outlier)
column_criteria = [
    'success',
    'roi',
    'crop_raw',
    'crop_seg',
    'name_dict',
    'fov_path',
    'fov_seg_path',
    'struct_seg_path'
]

df_filt = df_filt.drop(columns= column_criteria, axis = 1)

In [31]:
# Drop Misc Columns
column_criteria = [
    'outlier',
    'ChannelNumber405',
    'ChannelNumber638',
    'ChannelNumberBrightfield',
    'ChannelNumberStruct',
    'Draft M6/M7 complete',
    'Expert mitotic state resolved',
    'CellIndex',
    'Draft mitotic state coarse',
    'Expert mitotic state coarse',
]

df_filt = df_filt.drop(columns = column_criteria,axis = 1 )
df_filt.shape

(214037, 78)

In [32]:
#Alphabatize
df_filt = df_filt[sorted(df_filt.columns.tolist(), key=str.casefold)]

In [33]:
# Fliter Columns dataframe to just needed Columns Should have shape [(# of columns), 7]
lst = df_filt.columns.tolist()
columns_filtered  = columns.query('Col_ID in @lst')

In [34]:
# Adding Catagory as a Secondary Header 
headers = [columns_filtered['Category'], df_filt.columns.tolist()]
df_filt.columns = headers

In [35]:
# Rename Confusing Columns 
new_names = {
    "structure_name":"gene",
    "NUC_shape_volume":"nuclear_volume",
    "NUC_position_depth":"nuclear_height",
    "NUC_roundness_surface_area":"nuclear_surface_area",
    "MEM_shape_volume":"cell_volume",
    "MEM_position_depth":"cell_height",
    "MEM_roundness_surface_area":"cell_surface_area",
    "STR_shape_volume":"structure_volume",
    "STR_connectivity_cc":"structure_connected_components",
    "NUC_MEM_PC1":"shape_mode_1_height",
    "NUC_MEM_PC2":"shape_mode_2_volume",
    "NUC_MEM_PC3":"shape_mode_3_major_tilt",
    "NUC_MEM_PC4":"shape_mode_4_minor_tilt",
    "NUC_MEM_PC5":"shape_mode_5_elongation",
    "NUC_MEM_PC6":"shape_mode_6_bean-ness",
    "NUC_MEM_PC7":"shape_mode_7_pear-ness",
    "NUC_MEM_PC8":"shape_mode_8_wedge",
}

df_filt.rename(columns = new_names,inplace=True, level = 1)

In [36]:
df_filt.columns

MultiIndex([(           'cell metric',                          'angle'),
            (           'cell metric',                     'bbox_max_x'),
            (           'cell metric',                     'bbox_max_y'),
            (           'cell metric',                     'bbox_max_z'),
            (           'cell metric',                     'bbox_min_x'),
            (           'cell metric',                     'bbox_min_y'),
            (           'cell metric',                     'bbox_min_z'),
            (           'cell metric',                     'bf_clip_hi'),
            (           'cell metric',                     'bf_clip_lo'),
            (         'cell metadata',                     'cell_stage'),
            (         'cell metadata',                         'CellId'),
            (         'cell metadata',             'Cellular Component'),
            (           'cell images',                   'center_slice'),
            (         'cell metadata',

In [38]:
df_filt.head()

Category,cell metric,cell metric,cell metric,cell metric,cell metric,cell metric,cell metric,cell metric,cell metric,cell metadata,...,cell metric,cell metadata,cell images,cell metric,cell metadata,cell metadata,cell metadata,cell metric,field-of-view metadata,field-of-view metadata
Unnamed: 0_level_1,angle,bbox_max_x,bbox_max_y,bbox_max_z,bbox_min_x,bbox_min_y,bbox_min_z,bf_clip_hi,bf_clip_lo,cell_stage,...,structure_volume,Structure,structure_clip_hi,structure_clip_lo,gene,this_cell_index,this_cell_nbr_complete,this_cell_nbr_dist_2d,WellId,WorkflowId
0,-86.508019,252.0,247.0,114.0,39.0,49.0,10.0,46104.0,32994.0,M4M5,...,195930.0,mitochondria,514.0,407.0,TOMM20,1.0,1.0,"[(230745, 112.01229336146552), (230746, 158.51...",24822.0,['Pipeline 4']
1,5.235116,308.0,271.0,96.0,73.0,57.0,6.0,44990.0,34108.0,M0,...,116763.0,mitochondria,527.0,405.0,TOMM20,5.0,1.0,"[(230741, 112.01229336146552), (230746, 187.55...",24822.0,['Pipeline 4']
2,-60.156382,361.0,274.0,103.0,86.0,135.0,15.0,45393.0,33473.0,M0,...,70647.0,mitochondria,546.0,411.0,TOMM20,6.0,0.0,"[(230741, 158.5175782133776), (230745, 187.555...",24822.0,['Pipeline 4']
3,82.318691,272.0,215.0,105.0,51.0,62.0,9.0,45295.0,33806.0,M6M7_single,...,88514.0,mitochondria,507.0,410.0,TOMM20,8.0,0.0,"[(230741, 115.43538034418327), (230745, 193.90...",24822.0,['Pipeline 4']
4,-3.020769,274.0,197.0,108.0,42.0,57.0,9.0,45120.0,34334.0,M0,...,113858.0,mitochondria,547.0,409.0,TOMM20,14.0,0.0,"[(230757, 195.03496287454146), (230758, 101.72...",24822.0,['Pipeline 4']


In [39]:
df_filt.to_parquet("./resources/hackathon_manifest_092022.parquet")