In [1]:
import rdkit
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from molvs import standardize_smiles

In [3]:
# 1. Step: Prerequisite
cols = pd.read_csv("../input/Raw_Image_data.txt", nrows=1, sep='\t').columns.tolist()

# definition of meta and data column names (manually for meta_cols) 
meta_cols = ['Metadata_broad_sample', 'CPD_SMILES', 'Metadata_Plate', 'Metadata_Plate_Map_Name','Metadata_ASSAY_WELL_ROLE','Metadata_mmoles_per_liter']
data_cols = cols[18:]

# callable that can be used as input for usecols argument (s.b.)
use_cols = lambda col: col in meta_cols+data_cols
    
# Load and merge the dataframes in one command
df = pd.merge(right=pd.read_csv("../input/Raw_Image_data.txt", sep='\t', usecols=use_cols), left=pd.read_csv('../input/cp_annotations.csv', usecols=use_cols), how='right', on='Metadata_broad_sample')

KeyboardInterrupt: 

In [3]:
# 2. Step: Center the dataframe
def center_df(df,data_cols):
    
    # split into mock (treated with DMSO) and treated (treated with some small molecule)
    mock = df.query('Metadata_ASSAY_WELL_ROLE == "mock"')
    df = df.query('Metadata_ASSAY_WELL_ROLE == "treated"').dropna(axis=0, subset=['CPD_SMILES'])
    
    # take the platewise mean for mock samples (data cols only)
    mock = pd.concat([mock.loc[:,'Metadata_Plate'], mock.loc[:,data_cols]], axis=1, sort=False)
    mock = mock.groupby('Metadata_Plate').mean().reset_index()
    
    # define new list of names to refer to the names in the merged dataframe
    mock_data_cols = [i+'_mock' for i in data_cols]
    
    ### Subtract the mean of the mock samples platewise from the treated samples
    # merge the treated and mock dataframe on the respective plates
    df = pd.merge(left=df, right=mock, how='left', suffixes=['', '_mock'], on='Metadata_Plate')
    
    # split into data columns for later subtraction and metadata for retrieving all information of the final df 
    treated_meta = df.drop(data_cols+mock_data_cols, axis=1)
    mock         = df.loc[:,mock_data_cols]
    df           = df.loc[:,data_cols]
    
    # subtraction only works if the columns names aswell as the dimensions off the 2 dfs are identical
    mock.columns = mock.columns.str.replace(r'_mock$', '')
    
    # elementwise subtraction of the mock from the treated
    df = df.sub(mock)
    
    # retrieve the metadata to gather all information for the final dataframe
    df = pd.concat([treated_meta, df], axis=1, sort=False)
    
    return df

In [4]:
df = center_df(df,data_cols)

In [5]:
# 3. Step: average concentrations of single concentration compounds
def average_concentrations(centered_df,meta_cols,data_cols):
    
    # generate canonical SMILES for treated samples
    smiles = df.CPD_SMILES.to_list()
    df['CAN_SMILES'] = [Chem.MolToSmiles(Chem.MolFromSmiles(smile),True) for smile in smiles]

    # group the meta data by canonical smiles codes
    smiles_group = df[meta_cols].groupby('CAN_SMILES')

    # initialize empty list to hold smiles codes that present multi concentrations
    smiles_multi_concs = []

    # iterate through the individual groups of smiles_group (second argument; here 'group')
    for _, group in smiles_group:
        # is the number of individual concentrations in the group higher than 1?
        if len(group.Metadata_mmoles_per_liter.value_counts()) > 1:
            # if so: append the canonical smiles to the list multi concentration smiles codes
            smiles_multi_concs.append(group.CAN_SMILES.iloc[0])

    # split the df into compounds with single/multi concentrations using the list as a filter
    df_multi_concs = df[df['CAN_SMILES'].isin(smiles_multi_concs)]
    df_singl_concs = df[~df['CAN_SMILES'].isin(smiles_multi_concs)]

    # split the single_concs df into meta and raw data for stitching the complete information together later
    df_singl_concs_meta = df_singl_concs[meta_cols].drop_duplicates(subset='CAN_SMILES')
    df_singl_concs = pd.concat([df_singl_concs.CAN_SMILES, df_singl_concs[data_cols]], axis=1, sort=False)

    # for the df with the single concentrations use the split-apply-combine method via groupby
    df_singl_concs = df_singl_concs.groupby(['CAN_SMILES']).median().reset_index()

    # merge the single concs meta back together with single conc raw data to get all information together
    df_singl_concs = pd.merge(right=df_singl_concs, left=df_singl_concs_meta, how='left', on='CAN_SMILES')

    # create a column that labels if an entry has single or multi concentration
    df_singl_concs['SINGLE_CONC'] = True
    df_multi_concs['SINGLE_CONC'] = False
    
    # concatenate the multi columns without further processing. this has to be done manually if necessary at all
    df_avr_concs = pd.concat([df_singl_concs,df_multi_concs])

    return df_avr_concs

In [6]:
df = average_concentrations(df,meta_cols,data_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
df.to_csv('cp_center_median.csv')

In [5]:
pd.read_csv('cp_center_median.csv')

Unnamed: 0.1,Unnamed: 0,Metadata_broad_sample,CPD_SMILES,Metadata_Plate,Metadata_Plate_Map_Name,Metadata_ASSAY_WELL_ROLE,Metadata_mmoles_per_liter,CAN_SMILES,Cells_AreaShape_Area,Cells_AreaShape_Center_X,...,Nuclei_Texture_Variance_ER_10_0,Nuclei_Texture_Variance_ER_3_0,Nuclei_Texture_Variance_ER_5_0,Nuclei_Texture_Variance_Mito_10_0,Nuclei_Texture_Variance_Mito_3_0,Nuclei_Texture_Variance_Mito_5_0,Nuclei_Texture_Variance_RNA_10_0,Nuclei_Texture_Variance_RNA_3_0,Nuclei_Texture_Variance_RNA_5_0,SINGLE_CONC
0,0,BRD-K18250272-003-03-7,CCCOc1cc(N)ccc1C(=O)OCCN(CC)CC,24277,H-BIOA-004-3,treated,3.022516,CCCOc1cc(N)ccc1C(=O)OCCN(CC)CC,143.429688,2.789062,...,-0.000870,-0.070603,-0.081174,-0.148791,-0.189952,-0.182775,-0.099114,-0.086938,-0.081457,True
1,1,BRD-K18316707-001-01-9,COc1cc(C)cc(OC)c1[C@@H]1C=C(C)CC[C@H]1C(C)=C,24277,H-BIOA-004-3,treated,5.000000,C=C(C)[C@@H]1CCC(C)=C[C@H]1c1c(OC)cc(C)cc1OC,224.734375,13.925781,...,-0.093223,-0.148464,-0.150041,-0.146006,-0.157577,-0.160493,-0.098040,-0.106515,-0.092425,True
2,2,BRD-K18438502-001-02-6,COc1cc(O)cc(\C=C\c2ccccc2)c1,24277,H-BIOA-004-3,treated,5.000000,COc1cc(O)cc(/C=C/c2ccccc2)c1,-40.601562,-2.425781,...,-0.031940,-0.063221,-0.069118,-0.039471,-0.028921,-0.046624,-0.050277,-0.057803,-0.040747,True
3,3,BRD-K18550767-001-02-8,COc1c(O)cc2C(=O)O[C@H]3[C@@H](O)[C@H](O)[C@@H]...,24277,H-BIOA-004-3,treated,5.000000,COc1c(O)cc2c(c1O)[C@H]1O[C@H](CO)[C@@H](O)[C@H...,204.468750,-0.546875,...,-0.130060,-0.133544,-0.152315,-0.090233,-0.135499,-0.106445,-0.085849,-0.101692,-0.085511,True
4,4,BRD-K18574842-323-03-3,CCOc1ccc2ccccc2c1C(=O)N[C@H]1[C@H]2SC(C)(C)[C@...,24277,H-BIOA-004-3,treated,2.195487,CCOc1ccc2ccccc2c1C(=O)N[C@@H]1C(=O)N2[C@@H](C(...,200.398438,-10.324219,...,-0.111039,-0.127590,-0.123794,-0.096817,-0.125233,-0.118672,-0.058254,-0.084084,-0.072438,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31687,126774,BRD-K31618614-001-02-1,CN1[C@@H]2CC[C@@H](CC(=O)NCC3CCCCC3)O[C@@H]2CO...,26772,H-CBLN-001-4,treated,5.099641,CN1C(=O)c2cc(NC(=O)c3cccc(F)c3)ccc2OC[C@H]2O[C...,1306.312500,-42.171875,...,-0.171040,-0.081307,-0.133505,-0.230991,-0.258775,-0.275599,-0.019076,-0.053118,-0.048084,False
31688,126775,BRD-K31618614-001-01-3,CN1[C@@H]2CC[C@@H](CC(=O)NCC3CCCCC3)O[C@@H]2CO...,26767,H-CBLN-001-4,treated,4.983182,CN1C(=O)c2cc(NC(=O)c3cccc(F)c3)ccc2OC[C@H]2O[C...,1375.382812,-6.507812,...,0.119383,0.154671,0.144301,-0.183637,-0.286036,-0.264595,-0.244280,-0.238312,-0.224176,False
31689,126776,BRD-K31618614-001-01-3,CN1[C@@H]2CC[C@@H](CC(=O)NCC3CCCCC3)O[C@@H]2CO...,26768,H-CBLN-001-4,treated,4.983182,CN1C(=O)c2cc(NC(=O)c3cccc(F)c3)ccc2OC[C@H]2O[C...,1668.093750,-18.796875,...,0.091517,0.090336,0.093177,-0.200287,-0.269464,-0.297880,-0.167939,-0.234359,-0.207291,False
31690,126777,BRD-K31618614-001-01-3,CN1[C@@H]2CC[C@@H](CC(=O)NCC3CCCCC3)O[C@@H]2CO...,26771,H-CBLN-001-4,treated,4.983182,CN1C(=O)c2cc(NC(=O)c3cccc(F)c3)ccc2OC[C@H]2O[C...,1445.031250,55.812500,...,0.233523,0.171183,0.163368,-0.276584,-0.316513,-0.290360,-0.212205,-0.240141,-0.201639,False
