In [None]:
import os
import pandas as pd
# Directory exported through the FSSPEC_CONFIG_DIR environment variable.
# NOTE(review): presumably this is where fsspec finds the settings needed for
# the s3:// read below; it is an absolute, machine-specific path — confirm it
# exists wherever this notebook is run.
FSSPEC_CONFIG_DIR = "/allen/aics/assay-dev/users/Alex/cytodata/fsspec.d"
os.environ["FSSPEC_CONFIG_DIR"] = FSSPEC_CONFIG_DIR

In [None]:
# Load the cell manifest from S3.
df = pd.read_parquet("s3://variance-dataset/processed/manifest.parquet")

# Load the local resource tables: the column manifest, the tab-separated
# cell line annotations, the updated edge calls and the mito annotations.
columns = pd.read_csv("./resources/cytodata_column_manifest_092122.csv")
cell_line_anno = pd.read_csv("./resources/cell_line_annotation.txt", sep="\t")
updated_edge = pd.read_csv("./resources/updatedEDGEcalls_22020914.csv")
human_mito_annotations = pd.read_csv("./resources/draft_plus_human_mito_annotations.csv")

In [None]:
# Exclude every column whose name matches 'shcoeffs' (the spherical harmonics
# coefficients); the remaining columns keep their original order.
shcoeff_columns = df.filter(regex='shcoeffs').columns
df_filt = df.drop(columns=shcoeff_columns)
df_filt.columns

In [None]:
# Swap in the updated edge calls.
# The join on CellId uses merge's default (inner), so only cells present in
# both tables are kept. The superseded flag columns and the 'Unnamed: 0'
# column (presumably a saved CSV index) are discarded, and the corrected call
# is exposed under the plain 'edge_flag' name.
df_filt = (
    df_filt
    .merge(updated_edge, on='CellId')
    .drop(columns=['edge_flag', 'edge_flag_OLDandWRONG', 'Unnamed: 0'])
    .rename(columns={"edge_flag_NEWandCORRECT": "edge_flag"})
)
df_filt.shape

In [None]:
# Add cell line annotations (4 new columns).
# AAVS1 is renamed first so that it matches the spelling used in the
# annotation table's 'Gene' column.
df_filt['structure_name'] = df_filt['structure_name'].replace({'AAVS1': 'Safe Harbor Locus (AAVS1)'})

# Left join: this step is meant to add columns, not rows. An outer join would
# also append one row per annotation gene that has no cells; those rows carry
# NaN in every manifest column (which upcasts integer columns to float).
df_filt_cell_anno = pd.merge(
    df_filt,
    cell_line_anno,
    left_on='structure_name',
    right_on='Gene',
    how='left',
)

# 'Gene' duplicates 'structure_name' after the join, so it is dropped.
df_filt = df_filt_cell_anno.drop(columns=['Gene'])
df_filt.shape

In [None]:
# Attach the mito annotations (6 new columns), matching on CellId and FOVId,
# then discard the 'Unnamed: 0' column brought in by the annotation table.
# NOTE(review): how='outer' also keeps annotation rows that have no matching
# cell in df_filt (their manifest columns come back as NaN) — confirm this is
# intended rather than how='left'.
df_filt = (
    df_filt
    .merge(human_mito_annotations, on=['CellId', 'FOVId'], how='outer')
    .drop(columns=['Unnamed: 0'])
)
df_filt.shape

In [None]:
# Keep only the rows whose 'outlier' value is exactly 'No'; any other value,
# including missing values, is filtered out.
is_not_outlier = df_filt['outlier'] == 'No'
df_filt = df_filt.loc[is_not_outlier]
df_filt.shape

In [None]:
# Drop the columns that, per the original note, contain /allen paths.
# ('outlier' is not removed here; it is in the list of the following cell.)
column_criteria = [
    'success', 'roi', 'crop_raw', 'crop_seg',
    'name_dict', 'fov_path', 'fov_seg_path', 'struct_seg_path',
]

df_filt = df_filt.drop(columns=column_criteria)

In [None]:
# Drop miscellaneous columns: the outlier flag (already used for filtering),
# the channel-number columns, the cell index, and several draft/expert
# mitotic-state columns.
column_criteria = [
    'outlier',
    'ChannelNumber405', 'ChannelNumber638',
    'ChannelNumberBrightfield', 'ChannelNumberStruct',
    'Draft M6/M7 complete', 'Expert mitotic state resolved',
    'CellIndex',
    'Draft mitotic state coarse', 'Expert mitotic state coarse',
]

df_filt = df_filt.drop(columns=column_criteria)
df_filt.shape

In [None]:
# Alphabetize the columns, ignoring case.
alphabetical_columns = sorted(df_filt.columns, key=str.casefold)
df_filt = df_filt.loc[:, alphabetical_columns]

In [None]:
# Filter the column manifest down to the columns still present in df_filt.
# Per the original note the result should have shape [(# of columns), 7].
lst = list(df_filt.columns)
columns_filtered = columns.loc[columns['Col_ID'].isin(lst)]

In [None]:
# Adding Category as a Secondary Header
#headers = [columns_filtered['Category'], df_filt.columns.tolist()]
#df_filt.columns = headers

In [None]:
# Give the confusingly named columns more descriptive names.
new_names = {
    # Gene / structure label
    "structure_name": "gene",
    # Nucleus measurements
    "NUC_shape_volume": "nuclear_volume",
    "NUC_position_depth": "nuclear_height",
    "NUC_roundness_surface_area": "nuclear_surface_area",
    # Cell (membrane) measurements
    "MEM_shape_volume": "cell_volume",
    "MEM_position_depth": "cell_height",
    "MEM_roundness_surface_area": "cell_surface_area",
    # Structure measurements
    "STR_shape_volume": "structure_volume",
    "STR_connectivity_cc": "structure_connected_components",
    # Shape modes (principal components 1-8)
    "NUC_MEM_PC1": "shape_mode_1_height",
    "NUC_MEM_PC2": "shape_mode_2_volume",
    "NUC_MEM_PC3": "shape_mode_3_major_tilt",
    "NUC_MEM_PC4": "shape_mode_4_minor_tilt",
    "NUC_MEM_PC5": "shape_mode_5_elongation",
    "NUC_MEM_PC6": "shape_mode_6_bean-ness",
    "NUC_MEM_PC7": "shape_mode_7_pear-ness",
    "NUC_MEM_PC8": "shape_mode_8_wedge",
}

df_filt = df_filt.rename(columns=new_names)

In [None]:
# List the column names after renaming.
list(df_filt.columns)

In [None]:
# Preview the first rows of nuclear_height before it is rescaled.
df_filt['nuclear_height'].head(n=5)

In [None]:
# Scale size measurements from pixels to microns.
# NOTE: the columns are overwritten, so running this cell a second time
# applies the scaling again.
pix_size = 0.108333  # factor applied per pixel; NOTE(review): presumably microns per pixel — confirm

# Power of pix_size applied to each column:
# 1 for heights, 2 for surface areas, 3 for volumes.
scale_exponents = {
    'nuclear_height': 1,
    'cell_height': 1,
    'nuclear_surface_area': 2,
    'cell_surface_area': 2,
    'nuclear_volume': 3,
    'cell_volume': 3,
    'structure_volume': 3,
}

for column_name, exponent in scale_exponents.items():
    df_filt[column_name] = df_filt[column_name] * pix_size**exponent



In [None]:
# Display nuclear_height after scaling.
df_filt.loc[:, 'nuclear_height']

In [None]:
# Write the processed manifest to the local resources folder.
output_path = "./resources/hackathon_manifest_09292022.parquet"
df_filt.to_parquet(output_path)