In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

from scipy.stats import ttest_ind

In [2]:
# Read the CSV at `datapath` (a path relative to the notebook's working
# directory) into `cell_df`, the frame the later cells analyse.
datapath = r'./Cellprofiler outputs/test_set_features.csv'
cell_df = pd.read_csv(filepath_or_buffer=datapath)

In [3]:
def get_corrs(df, name, groups):
  """Per-group Pearson correlation between each 'real' column and its counterpart.

  Every column of ``df`` whose name contains ``'real'`` (and none of
  ``'FileName'``, ``'PathName'``, ``'Metadata'``) is paired with the column
  obtained by replacing the first occurrence of ``'real'`` in its name with
  ``name``. The Pearson correlation of each pair is computed separately
  within every group of ``df.groupby(groups)``.

  Parameters
  ----------
  df : pandas.DataFrame
      Table holding the grouping columns and the paired feature columns.
  name : str
      Substitution for ``'real'`` used to build the counterpart column name
      (e.g. ``'prediction'``).
  groups : str or sequence of str
      Column name(s) to group by.

  Returns
  -------
  pandas.DataFrame
      One row per (group, feature) with the grouping columns followed by
      ``'pcc'`` (the coefficient), ``'feature'`` (the 'real' column name),
      ``'feature_cat'`` (text before the first ``'_'`` of the feature name)
      and ``'method'`` (always ``'pearson'``). The index is not reset between
      features, so index labels repeat. An empty DataFrame is returned when
      no column qualifies.

  Raises
  ------
  KeyError
      If a counterpart column is missing from ``df``.
  """
  # Accept a single column name or any sequence; a list is needed below to
  # build the output column labels.
  group_cols = [groups] if isinstance(groups, str) else list(groups)
  experiment_groups = df.groupby(group_cols)

  skipped_tags = ('FileName', 'PathName', 'Metadata')
  pieces = []

  for feature in df.columns:
    if any(tag in feature for tag in skipped_tags):
      continue
    if 'real' not in feature:
      continue

    pred_feature = feature.replace('real', name, 1)

    corr_matrices = experiment_groups[[feature, pred_feature]].corr('pearson')
    # Select the off-diagonal entry by label. A positional pick after
    # unstack() depends on how the column labels happen to be ordered and can
    # land on the self-correlation (always 1.0).
    pcorrelation = corr_matrices.xs(pred_feature, level=-1)[feature].reset_index()

    pcorrelation.columns = group_cols + ['pcc']
    pcorrelation['feature'] = feature
    pcorrelation['feature_cat'] = feature.split('_', 1)[0]
    pcorrelation['method'] = 'pearson'
    pieces.append(pcorrelation)

  # pd.concat raises on an empty list, so keep the empty-frame result.
  if not pieces:
    return pd.DataFrame()
  # Single concat instead of growing a frame inside the loop.
  return pd.concat(pieces, axis=0)

In [4]:
# Grouping columns passed to get_corrs; correlations are computed within each
# combination of their values.
groups = [
    'Metadata_marker',
    'Metadata_fold',
    'Metadata_inputs',
]
# Pair each '...real...' column with its '...prediction...' counterpart.
pred_corrs = get_corrs(df=cell_df, name='prediction', groups=groups)

In [None]:
# Keep only the two input configurations being compared.
tmp = pred_corrs[pred_corrs['Metadata_inputs'].isin(['TD', 'TD_DAPI_phalloidin_Bcatenin'])]

# Median 'pcc' for every (feature category, fold, inputs) combination.
# .median() replaces .agg(np.median): passing the NumPy callable to a groupby
# aggregation is deprecated in recent pandas, which currently dispatches it
# to this same groupby median.
tmp = (
    tmp
    .groupby(['feature_cat', 'Metadata_fold', 'Metadata_inputs'])['pcc']
    .median()
    .reset_index()
)

# One box per feature category, coloured by input configuration.
fig = px.box(
    tmp,
    x='feature_cat',
    y='pcc',
    color='Metadata_inputs',
)

# Draw the mean marker inside each box and hide the individual points.
fig.update_traces(boxmean=True, boxpoints=False)

fig.show()

In [None]:
# For each listed feature category, compare the 'pcc' values of the two input
# configurations with an unequal-variance (Welch) t-test and print the p-value.
# `tmp` is whatever the most recently executed cell assigned to it.
is_td = tmp['Metadata_inputs'] == 'TD'
is_td_stains = tmp['Metadata_inputs'] == 'TD_DAPI_phalloidin_Bcatenin'

for category in ['Intensity', 'RadialDistribution', 'Texture']:
    in_category = tmp['feature_cat'] == category
    td_pcc = tmp.loc[is_td & in_category, 'pcc']
    stains_pcc = tmp.loc[is_td_stains & in_category, 'pcc']

    welch = ttest_ind(td_pcc, stains_pcc, equal_var=False)
    print(f'{category} : {welch.pvalue}')


In [None]:
# Keep only the two input configurations being compared, restricted to the
# single feature 'Intensity_MeanIntensity_real'.
tmp = pred_corrs[
    pred_corrs['Metadata_inputs'].isin(['TD', 'TD_DAPI_phalloidin_Bcatenin'])
    & (pred_corrs['feature'] == 'Intensity_MeanIntensity_real')
]

# Median 'pcc' for every (feature category, marker, fold, inputs) combination.
# .median() replaces .agg(np.median): passing the NumPy callable to a groupby
# aggregation is deprecated in recent pandas, which currently dispatches it
# to this same groupby median.
tmp = (
    tmp
    .groupby(['feature_cat', 'Metadata_marker', 'Metadata_fold', 'Metadata_inputs'])['pcc']
    .median()
    .reset_index()
)

# One box per marker, coloured by input configuration.
fig = px.box(
    tmp,
    x='Metadata_marker',
    y='pcc',
    color='Metadata_inputs',
)

# Draw the mean marker inside each box and hide the individual points.
fig.update_traces(boxmean=True, boxpoints=False)

fig.show()

In [None]:
# For each distinct marker in `tmp`, compare the 'pcc' values of the two input
# configurations with an unequal-variance (Welch) t-test and print the p-value.
# `tmp` is whatever the most recently executed cell assigned to it.
is_td = tmp['Metadata_inputs'] == 'TD'
is_td_stains = tmp['Metadata_inputs'] == 'TD_DAPI_phalloidin_Bcatenin'

for marker in tmp['Metadata_marker'].drop_duplicates():
    for_marker = tmp['Metadata_marker'] == marker
    td_pcc = tmp.loc[is_td & for_marker, 'pcc']
    stains_pcc = tmp.loc[is_td_stains & for_marker, 'pcc']

    welch = ttest_ind(td_pcc, stains_pcc, equal_var=False)
    print(f'{marker} : {welch.pvalue}')