In [None]:
import pandas as pd
import os
import pycytominer

In [None]:
# Build `groups`: the distinct sets of plate IDs listed together in the
# per-plate *_spherized.csv.gz CellProfiler profiles.
# CellProfiler profiles can be found in /scr/nmoshkov/jump_evaluation_fbm or in CellPainting Gallery AWS
cellprofiler_dir = './cellprofiler_profiles/2020_11_04_CPJUMP1'

# Keep only directories so stray files (e.g. .DS_Store) do not break the loop,
# and sort so the processing order is reproducible between runs.
all_plates = sorted(
    entry for entry in os.listdir(cellprofiler_dir)
    if os.path.isdir(os.path.join(cellprofiler_dir, entry))
)

# NOTE(review): presumably each spherized file holds the rows of every plate
# that was spherized together with it, so its unique Metadata_Plate values
# define one normalization group - confirm against the profile pipeline.
unique_groups = set()
for plate in all_plates:
    # Only the plate column is needed; skip parsing the feature columns.
    plate_ids = pd.read_csv(
        f'{cellprofiler_dir}/{plate}/{plate}_spherized.csv.gz',
        usecols=['Metadata_Plate'],
    )['Metadata_Plate'].unique()
    # Sort inside the tuple so the same set of plates listed in a different
    # order collapses into a single group instead of being processed twice.
    unique_groups.add(tuple(sorted(plate_ids)))

# List of lists of plate IDs, in deterministic order.
groups = [list(plate_ids) for plate_ids in sorted(unique_groups)]

In [None]:
# Normalize DeepProfiler CP-CNN features
#
# For every plate group: attach the CellProfiler metadata columns to the
# plate's CP-CNN embeddings, save that raw per-plate table, spherize the whole
# group with pycytominer, and save the normalized rows back per plate.

cellprofiler_dir = './cellprofiler_profiles/2020_11_04_CPJUMP1'
cpcnn_dir = './cpcnn_profiles/2020_11_04_CPJUMP1'

feature_columns = ['emb_' + str(i) for i in range(672)] # 672 - feature vector size

for group in groups:
    # Collect the plates and concatenate once below; growing a DataFrame with
    # pd.concat inside the loop re-copies all previous rows on every iteration
    # and relies on concatenating an empty seed frame.
    plate_frames = []
    for plate in group:
        # Only the Metadata* columns are used, so skip parsing everything else.
        cellprofiler_plate = pd.read_csv(
            f'{cellprofiler_dir}/{plate}/{plate}_normalized_feature_select_negcon_batch.csv.gz',
            usecols=lambda column: 'Metadata' in column,
        )
        deepprofiler_plate = pd.read_parquet(f'{cpcnn_dir}/{plate}/{plate}.parquet')

        # Left merge: every CellProfiler well is kept, whether or not it has
        # a matching (plate, well) row in the embeddings table.
        deepprofiler_plate = pd.merge(
            cellprofiler_plate,
            deepprofiler_plate,
            how='left',
            left_on=['Metadata_Plate', 'Metadata_Well'],
            right_on=['plate', 'well'],
        ).reset_index(drop=True)
        deepprofiler_plate = deepprofiler_plate.drop(columns=['source', 'batch', 'plate', 'well'])

        # Expand the 'all_emb' column into one column per embedding dimension
        # (apply(pd.Series) labels them 0..N-1) and name them emb_0..emb_671.
        # errors='raise' fails loudly if any expected dimension is missing.
        embeddings = deepprofiler_plate['all_emb'].apply(pd.Series)
        embeddings = embeddings.rename(
            columns=dict(zip(range(len(feature_columns)), feature_columns)),
            errors='raise',
        )
        deepprofiler_plate = pd.concat(
            [deepprofiler_plate.drop(columns=['all_emb']), embeddings],
            axis=1,
        ).reset_index(drop=True)

        deepprofiler_plate.to_csv(f'{cpcnn_dir}/{plate}/{plate}_raw.csv.gz', compression='gzip', index=False)
        plate_frames.append(deepprofiler_plate)

    group_df = pd.concat(plate_frames, ignore_index=True)

    # Spherize the whole group at once; `samples` selects the negative-control
    # rows passed to pycytominer, and the epsilon (1e-3) matches the "0.001"
    # suffix of the output file name.
    normalized_df = pycytominer.normalize(
        group_df,
        features=feature_columns,
        meta_features='infer',
        method='spherize',
        spherize_epsilon=1e-3,
        samples="Metadata_control_type == 'negcon'",
    )

    # Write the normalized profiles back out, one file per plate.
    for plate in normalized_df.Metadata_Plate.unique():
        to_save = normalized_df[normalized_df.Metadata_Plate == plate].reset_index(drop=True)
        to_save.to_csv(f'{cpcnn_dir}/{plate}/{plate}_group_spherized_0.001.csv.gz', compression='gzip', index=False)


In [None]:
# Normalize DINO4Cells features
#
# For every plate group: attach the CellProfiler metadata columns to the
# plate's DINO features, save that raw per-plate table, spherize the whole
# group with pycytominer, and save the normalized rows back per plate.

cellprofiler_dir = './cellprofiler_profiles/2020_11_04_CPJUMP1'
dino_dir = './dino4cells/2020_11_04_CPJUMP1'

# NOTE(review): assumes the DINO parquet files already store the features as
# columns emb_0..emb_383 (no expansion step is done here) - confirm.
feature_columns = ['emb_' + str(i) for i in range(384)] # 384 - feature vector size

for group in groups:
    # Collect the plates and concatenate once below; growing a DataFrame with
    # pd.concat inside the loop re-copies all previous rows on every iteration
    # and relies on concatenating an empty seed frame.
    plate_frames = []
    for plate in group:
        # Only the Metadata* columns are used, so skip parsing everything else.
        cellprofiler_plate = pd.read_csv(
            f'{cellprofiler_dir}/{plate}/{plate}_normalized_feature_select_negcon_batch.csv.gz',
            usecols=lambda column: 'Metadata' in column,
        )
        dino_plate = pd.read_parquet(f'{dino_dir}/{plate}/{plate}.parquet')

        # Left merge: every CellProfiler well is kept, whether or not it has
        # a matching (plate, well) row in the DINO table.
        dino_plate = pd.merge(
            cellprofiler_plate,
            dino_plate,
            how='left',
            left_on=['Metadata_Plate', 'Metadata_Well'],
            right_on=['plate', 'well'],
        ).reset_index(drop=True)
        dino_plate = dino_plate.drop(columns=['source', 'batch', 'plate', 'well'])

        dino_plate.to_csv(f'{dino_dir}/{plate}/{plate}_raw.csv.gz', compression='gzip', index=False)
        plate_frames.append(dino_plate)

    group_df = pd.concat(plate_frames, ignore_index=True)

    # Spherize the whole group at once; `samples` selects the negative-control
    # rows passed to pycytominer, and the epsilon (1e-3) matches the "0.001"
    # suffix of the output file name.
    normalized_df = pycytominer.normalize(
        group_df,
        features=feature_columns,
        meta_features='infer',
        method='spherize',
        spherize_epsilon=1e-3,
        samples="Metadata_control_type == 'negcon'",
    )

    # Write the normalized profiles back out, one file per plate.
    for plate in normalized_df.Metadata_Plate.unique():
        to_save = normalized_df[normalized_df.Metadata_Plate == plate].reset_index(drop=True)
        to_save.to_csv(f'{dino_dir}/{plate}/{plate}_group_spherized_0.001.csv.gz', compression='gzip', index=False)
