In [55]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.figure_factory as ff

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

from scipy.stats import ttest_ind

In [40]:
def run_classification(df, m, features, input, name, results):
    """Cross-validate a scale -> PCA -> SVM pipeline and record per-fold scores.

    Rows of ``df`` are restricted to those whose ``Metadata_marker`` equals
    ``m`` and whose ``Metadata_inputs`` equals ``input``. The ``condition``
    column of the remaining rows is the classification target.

    For each of 5 stratified, shuffled folds (``random_state=0``) the
    training part is standardised, reduced with ``PCA(n_components='mle')``
    and used to fit a default ``SVC``. The held-out part is transformed with
    the scaler and PCA that were fitted on the training part only.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding ``Metadata_marker``, ``Metadata_inputs``, ``condition``
        and every column listed in ``features``.
    m : str
        Value of ``Metadata_marker`` to select.
    features : list of str
        Column names used as classifier inputs.
    input : str
        Value of ``Metadata_inputs`` to select (the name shadows the builtin
        but is kept because callers rely on the signature).
    name : str
        Experiment label stored with every fold result.
    results : dict of str -> list
        Accumulator with the keys ``fold``, ``f1``, ``precision``, ``recall``,
        ``experiment``, ``inputs`` and ``marker``; one entry is appended to
        each list per fold.

    Returns
    -------
    dict
        The same ``results`` object, mutated in place.

    Notes
    -----
    The classification report is built with three hard-coded target names,
    so the selected rows are expected to contain exactly three conditions.
    """
    cell_data = df[(df['Metadata_marker'] == m) & (df['Metadata_inputs'] == input)]

    features_cell_df = cell_data[features].values
    # Computed once instead of on every fold; the values never change.
    labels = cell_data.condition.astype('category')

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    for fold, (train_index, test_index) in enumerate(skf.split(features_cell_df, labels), start=1):
        X_train = features_cell_df[train_index]
        X_test = features_cell_df[test_index]
        Y_train = labels.values[train_index]
        Y_test = labels.values[test_index]

        scaler = StandardScaler()
        scaled_X_train = scaler.fit_transform(X_train)

        pca = PCA(n_components='mle')
        pca.fit(scaled_X_train)
        pca_X_train = pca.transform(scaled_X_train)

        classifier = SVC()
        classifier.fit(pca_X_train, Y_train)

        # Reuse the statistics learned on the training fold. Re-fitting the
        # scaler on the test fold would standardise it with its own mean and
        # variance, which differs from what the PCA and SVM were trained on.
        scaled_X_test = scaler.transform(X_test)
        pca_X_test = pca.transform(scaled_X_test)

        y_hat = classifier.predict(pca_X_test)

        class_report = classification_report(
            Y_test,
            y_hat,
            target_names=['cond1', 'cond2', 'cond3'],
            output_dict=True,
        )
        weighted = class_report['weighted avg']

        results['fold'].append(fold)
        results['f1'].append(weighted['f1-score'])
        results['precision'].append(weighted['precision'])
        results['recall'].append(weighted['recall'])
        results['experiment'].append(name)
        results['inputs'].append(input)
        results['marker'].append(m)

    return results

In [41]:
# Load the four CellProfiler per-cell feature tables, in this order:
# measured test set, full ExIF, TD-only ExIF and fluorescence-only ExIF.
test_set_df, exif_df, td_exif_df, fluorescence_exif_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'./Cellprofiler outputs/test_set_features.csv',
        r'./Cellprofiler outputs/full_exif_cells.csv',
        r'./Cellprofiler outputs/td_exif_cells.csv',
        r'./Cellprofiler outputs/fluorescence_exif_cells.csv',
    )
)


In [None]:
# Variable markers; each one is classified separately in the cells below.
markers = [
    'CD44',
    'CD44std',
    'CD44v9',
    'Ecadherin',
    'EpCAM',
    'Ncadherin',
    'PTEN',
    'Vimentin',
]

In [68]:
# Read the test-set feature table from disk again, replacing test_set_df.
# NOTE(review): the same file is also loaded in an earlier cell - confirm
# whether this reload is still needed.
test_set_df = pd.read_csv(
    r'./Cellprofiler outputs/test_set_features.csv',
)

In [71]:
# Condition index = ceil(d / 2), where d is the last character of the well
# name read as an integer (1-2 -> 1, 3-4 -> 2, 5-6 -> 3, ...).
# NOTE(review): only the final character is used, so this presumably expects
# single-digit well numbers - confirm against the plate layout.
test_set_df['condition'] = np.ceil(
    test_set_df['Metadata_well'].str[-1].astype(int) / 2
).astype(int)

In [None]:
# histograms of mean intensity per condition

def _plot_intensity_by_condition(df, column, title):
    """Show one density curve of ``df[column]`` per condition (1, 2 and 3).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``condition`` column and the intensity column to plot.
    column : str
        Name of the column whose distribution is drawn.
    title : str
        Figure title, so the many figures produced by this cell can be told
        apart.
    """
    conditions = [1, 2, 3]
    x = [df[df['condition'] == c][column] for c in conditions]
    fig = ff.create_distplot(
        x,
        [f'condition {c}' for c in conditions],
        show_hist=False,
        show_rug=False,
        bin_size=.2,
    )
    fig.update_layout(title=title, xaxis_title=column, yaxis_title='density')
    fig.show()


# real common markers
for m in ['Bcatenin', 'DAPI', 'Phalloidin']:
    _plot_intensity_by_condition(
        test_set_df,
        f'Intensity_MeanIntensity_{m}',
        f'{m} - real common marker',
    )

# real variable markers
for m in markers:
    _plot_intensity_by_condition(
        test_set_df[test_set_df.Metadata_marker == m],
        'Intensity_MeanIntensity_real',
        f'{m} - real variable marker',
    )

# virtual variable markers
for m in markers:
    _plot_intensity_by_condition(
        exif_df,
        f'Intensity_MeanIntensity_{m}',
        f'{m} - virtual variable marker',
    )

In [42]:
# Marker-independent per-cell features: shape descriptors followed by
# neighbourhood descriptors. Order is significant for downstream column
# selection, so it is kept exactly as listed.
context_features = [
    'AreaShape_Area',
    'AreaShape_Compactness',
    'AreaShape_Eccentricity',
    'AreaShape_EquivalentDiameter',
    'AreaShape_Extent',
    'AreaShape_FormFactor',
    'AreaShape_MajorAxisLength',
    'AreaShape_MaxFeretDiameter',
    'AreaShape_MaximumRadius',
    'AreaShape_MeanRadius',
    'AreaShape_MedianRadius',
    'AreaShape_MinFeretDiameter',
    'AreaShape_MinorAxisLength',
    'AreaShape_Perimeter',
    'AreaShape_Solidity',
] + [
    'Neighbors_NumberOfNeighbors_Adjacent',
    'Neighbors_PercentTouching_Adjacent',
]

In [43]:
# Partition the Intensity / RadialDistribution / Texture columns of exif_df:
# columns naming DAPI, Bcatenin or Phalloidin go to general_marker_features,
# every other measurement column goes to variable_predicted_features.
general_marker_features = []
variable_predicted_features = []

for column in exif_df.columns:
    if not column.startswith(('Intensity', 'RadialDistribution', 'Texture')):
        continue

    if any(tag in column for tag in ('DAPI', 'Bcatenin', 'Phalloidin')):
        general_marker_features.append(column)
    else:
        variable_predicted_features.append(column)

In [44]:
# Accumulator filled by run_classification: one list per recorded field,
# each receiving one entry per cross-validation fold.
results = {
    field: []
    for field in ('marker', 'f1', 'inputs', 'precision', 'recall', 'fold', 'experiment')
}

In [46]:
# Baseline: context features plus the common-marker measurements only.
for m in markers:
    results = run_classification(
        df=exif_df,
        m=m,
        features=context_features + general_marker_features,
        input='TD_DAPI_phalloidin_Bcatenin',
        name='common markers',
        results=results,
    )

In [None]:
# Label replacement
# The 'pred' columns do not depend on the marker, so they are collected once.
# Previously they were appended again on every marker iteration, which
# duplicated them in the feature list.
td_common_features = [
    x for x in td_exif_df.columns
    if x.startswith(('Intensity', 'RadialDistribution', 'Texture')) and 'pred' in x
]

for m in markers:
    # Rebuilt for every marker: only the columns of the current marker are
    # used. Previously the list was shared across iterations, so each model
    # also received the columns of all markers processed before it.
    td_var_features = [
        x for x in td_exif_df.columns
        if x.startswith(('Intensity', 'RadialDistribution', 'Texture'))
        and (f'_{m}_' in x or x.endswith(m))
    ]

    results = run_classification(td_exif_df, m, context_features + td_common_features + td_var_features, 'TD', 'TD label replacement', results)


In [92]:
# 4-plex data: measured common markers plus the measured ('real') variable marker
real_features = [
    x for x in test_set_df.columns
    if x.startswith(('Intensity', 'RadialDistribution', 'Texture'))
    and any(tag in x for tag in ('DAPI', 'Phalloidin', 'Bcatenin', 'real'))
]

# The experiment label must be 'real 4-plex': that is the name the box plot
# ordering and the t-test cells look up. With the former label '4-plex' those
# lookups selected no rows.
for m in markers:
    results = run_classification(test_set_df, m, context_features + real_features, 'TD', 'real 4-plex', results)


In [None]:
# TD ExIF: classify using TD-derived features for all markers.
# Measurement columns are selected first, then split into two lists
# (a column may appear in both when it satisfies both conditions).
td_measurement_columns = [
    column for column in td_exif_df.columns
    if column.startswith(('Intensity', 'RadialDistribution', 'Texture'))
]

# Measurement columns that mention none of the three common markers.
td_var_features = [
    column for column in td_measurement_columns
    if not any(tag in column for tag in ('DAPI', 'Bcatenin', 'Phalloidin'))
]

# Measurement columns whose name contains 'pred'.
td_common_features = [
    column for column in td_measurement_columns
    if 'pred' in column
]

for m in ['CD44', 'CD44std', 'CD44v9', 'Ecadherin', 'EpCAM', 'Ncadherin', 'PTEN', 'Vimentin']:
    results = run_classification(
        df=td_exif_df,
        m=m,
        features=context_features + td_common_features + td_var_features,
        input='TD',
        name='TD ExIF',
        results=results,
    )


In [None]:
# Fluorescence-only ExIF: context, common-marker and variable-marker columns
# evaluated on the fluorescence_exif_df table.
for m in ['CD44', 'CD44std', 'CD44v9', 'Ecadherin', 'EpCAM', 'Ncadherin', 'PTEN', 'Vimentin']:
    results = run_classification(
        df=fluorescence_exif_df,
        m=m,
        features=context_features + general_marker_features + variable_predicted_features,
        input='DAPI_phalloidin_Bcatenin',
        name='fluo ExIF',
        results=results,
    )

In [None]:
# Full ExIF: same feature set as the fluorescence-only run, evaluated on
# exif_df with the 'TD_DAPI_phalloidin_Bcatenin' inputs.
for m in ['CD44', 'CD44std', 'CD44v9', 'Ecadherin', 'EpCAM', 'Ncadherin', 'PTEN', 'Vimentin']:
    results = run_classification(
        df=exif_df,
        m=m,
        features=context_features + general_marker_features + variable_predicted_features,
        input='TD_DAPI_phalloidin_Bcatenin',
        name='full ExIF',
        results=results,
    )

In [None]:
# Collapse the per-marker F1 scores to one median per (experiment, fold).
# The string alias 'median' replaces np.median: passing the NumPy callable to
# a groupby aggregation is deprecated in recent pandas and emits a
# FutureWarning, while the alias yields the same values.
results_df = (
    pd.DataFrame.from_dict(results)[['experiment', 'fold', 'f1']]
    .groupby(['experiment', 'fold'])
    .agg('median')
    .reset_index()
)

# One box per experiment, drawn in the order given below.
fig = px.box(
    results_df,
    x='experiment',
    y='f1',
    category_orders={'experiment': ['common markers', 'TD label replacement', 'real 4-plex', 'TD ExIF', 'fluo ExIF', 'full ExIF']}
)

# Also draw the mean inside each box; as the last expression of the cell the
# returned figure is what the notebook displays.
fig.update_traces(boxmean=True)

In [None]:
exp_results_df = results_df

# Welch t-tests (equal_var=False) on the per-fold F1 values of two experiments.
# Each entry: (text printed before the p-value, first experiment, second experiment).
comparisons = [
    ('common vs replacement', 'common markers', 'TD label replacement'),
    ('replacement vs stdIF', 'TD label replacement', 'real 4-plex'),
    ('stdIF vs td_exif', 'real 4-plex', 'TD ExIF'),
    ('relevant_common vs notd_exif', 'common markers', 'fluo ExIF'),
    ('exif vs notd_exif', 'full ExIF', 'fluo ExIF'),
    ('stdIF vs exif', 'real 4-plex', 'full ExIF'),
]

for label, first_experiment, second_experiment in comparisons:
    group1 = exp_results_df[exp_results_df['experiment'] == first_experiment]
    group2 = exp_results_df[exp_results_df['experiment'] == second_experiment]

    t = ttest_ind(group1['f1'], group2['f1'], equal_var=False)
    print(f'{label} {t.pvalue}')
