In [None]:
# Import all relevant packages
import pandas as pd
import numpy as np
from fcsy import DataFrame
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
from sklearn.metrics import adjusted_rand_score
from scipy.stats import gaussian_kde, pearsonr
from utils import load_data, load_Nilsson_data, load_Mosmann_data
%load_ext autoreload
%autoreload 2
plt.style.use('plotstyle.mplstyle')

In [None]:
# Locations of the in-house dataset; fill these in before running the notebook.
gt_path = ""
imputed_path = ""

# Locations of the Nilsson and Mosmann datasets.
nilsson_path = ""
mosmann_path = ""

# Presumably the folder that figures are written to (not used in the cells shown here).
figures_path = ""

In [None]:
# Load the in-house dataset from its ground-truth and imputed paths, then the Nilsson and
# Mosmann datasets via their dedicated loaders from `utils`.
# NOTE(review): the Nilsson and Mosmann loaders are given the same path for both arguments,
# whereas `load_data` receives two different paths — confirm against the loader signatures
# that this is intended.
data = load_data(gt_path, imputed_path)
nilsson = load_Nilsson_data(nilsson_path, nilsson_path)
mosmann = load_Mosmann_data(mosmann_path, mosmann_path)

In [None]:
def calculate_statistics(expression_data,
                         methods=['CyTOFmerge', 'cyCombine', 'Infinicyt'],
                         sample_col='sample_id'):
    """Compare FlowSOM clusterings of ground-truth and imputed cells per method and sample.

    Parameters
    ----------
    expression_data : pandas.DataFrame
        Must contain the columns 'method', `sample_col`, 'imp_state' (1 for imputed rows,
        0 for ground-truth rows), 'fSOM_cluster' and 'fSOM_metacluster'.
    methods : list of str
        Values of the 'method' column to evaluate.
    sample_col : str
        Name of the column that identifies a sample.

    Returns
    -------
    results : pandas.DataFrame
        One row per (method, sample) with the adjusted Rand index between ground-truth and
        imputed labels at cluster level ('cluster_ARI') and metacluster level
        ('metacluster_ARI'), plus the number of ground-truth cells ('n_cells').
    fold_changes : pandas.DataFrame
        One row per (method, sample, metacluster) with the cell count and proportion in the
        ground-truth and imputed data and the relative change of the proportion in percent
        ('perc_change', None when the metacluster is absent from either side).

    Notes
    -----
    No ARI or proportions are computed for the method 'CytoBackBone'; its rows only carry
    'n_cells'. Presumably its cells cannot be paired one-to-one with the ground truth —
    TODO confirm.
    """
    results = []
    fold_changes = []
    for method in methods:
        method_data = expression_data[expression_data['method'] == method]
        # Only visit samples that exist for this method; taking the samples from the whole
        # frame would emit rows with n_cells == 0 for samples the method never processed.
        for sample_id in method_data[sample_col].unique():
            sample_data = method_data[method_data[sample_col] == sample_id]
            if sample_data.empty:
                # Reached only for missing (NaN) sample ids, which never compare equal.
                continue
            imputed_data = sample_data[sample_data['imp_state'] == 1]
            gt_data = sample_data[sample_data['imp_state'] == 0]

            # Calculate ARI
            if method != 'CytoBackBone':
                # Compare like with like: clusters against clusters, metaclusters against
                # metaclusters.
                cluster_ARI = adjusted_rand_score(gt_data['fSOM_cluster'],
                                                  imputed_data['fSOM_cluster'])
                metacluster_ARI = adjusted_rand_score(gt_data['fSOM_metacluster'],
                                                      imputed_data['fSOM_metacluster'])

                # Every metacluster seen on either side, exactly once and in order of first
                # appearance, so that shared metaclusters do not yield duplicate rows.
                metaclusters = pd.unique(pd.concat([gt_data['fSOM_metacluster'],
                                                    imputed_data['fSOM_metacluster']]))
                for cluster in metaclusters:
                    gt_count = int((gt_data['fSOM_metacluster'] == cluster).sum())
                    gt_prop = gt_count / len(gt_data)
                    imp_count = int((imputed_data['fSOM_metacluster'] == cluster).sum())
                    imp_prop = imp_count / len(imputed_data)

                    # A relative change is only defined when both sides contain the cluster.
                    if gt_count > 0 and imp_count > 0:
                        perc_change = ((imp_prop - gt_prop) / gt_prop) * 100
                    else:
                        perc_change = None

                    fold_changes.append({'Method': method,
                                         'Sample': sample_id,
                                         'Metacluster': cluster,
                                         'gt_count': gt_count,
                                         'gt_prop': gt_prop,
                                         'imp_count': imp_count,
                                         'imp_prop': imp_prop,
                                         'perc_change': perc_change})
            else:
                cluster_ARI = None
                metacluster_ARI = None

            results.append({'Method': method,
                            'Sample': sample_id,
                            'cluster_ARI': cluster_ARI,
                            'metacluster_ARI': metacluster_ARI,
                            'n_cells': len(gt_data)})
    results = pd.DataFrame(results)
    fold_changes = pd.DataFrame(fold_changes)
    return results, fold_changes

In [None]:
# Per-sample ARI table and per-metacluster proportion table for the in-house dataset,
# using the default method list of calculate_statistics.
flow_statistics, flow_clusters = calculate_statistics(data)

In [None]:
# Same statistics for the Nilsson and Mosmann datasets, again with the default method list.
nilsson_statistics, nilsson_clusters = calculate_statistics(nilsson)
mosmann_statistics, mosmann_clusters = calculate_statistics(mosmann)

In [None]:
# Channel groups of the in-house panel: two variable marker sets and a third list that is
# presumably the shared backbone ("bb") — TODO confirm.
flow_variable1 = ['FITC-A', 'APC-A', 'BV605-A', 'BV786-A']
flow_variable2 = ['PE-A', 'PE-CF594-A', 'BV711-A', 'PC7-A']
flow_bb = [
    "HV500c-A", "BUV395-A", "PerCP-Cy5-5-A", "BUV737-A",
    "BUV496-A", "BV421-A", "APC-R700-A",
]

# Channel name -> marker name shown in plots
flow_rename = {
    'APC-A': 'KLRG1',
    'BV711-A': 'TIM-3',
    'FITC-A': 'CD57',
    'BV786-A': 'CD27',
    'PE-A': 'CD28',
    'PE-CF594-A': 'CD95',
    'PC7-A': 'TIGIT',
    'BV605-A': 'PD-1',
}

# Marker name -> dataset (1 or 2) in which that marker is imputed
flow_marker_setup = {
    **dict.fromkeys(['CD57', 'KLRG1', 'PD-1', 'CD27'], 2),
    **dict.fromkeys(['CD28', 'CD95', 'TIM-3', 'TIGIT'], 1),
}

In [None]:
# Tag each statistics table with the name of the dataset it was computed from
for dataset_frame, dataset_label in ((flow_statistics, 'In-house MM'),
                                     (nilsson_statistics, 'Nilsson_rare'),
                                     (mosmann_statistics, 'Mosmann_rare')):
    dataset_frame['Dataset'] = dataset_label

In [None]:
# Stack the three per-dataset tables row-wise (original row indices are kept)
all_statistics = pd.concat(
    (flow_statistics, nilsson_statistics, mosmann_statistics),
    axis=0,
)

In [None]:
# Summarise the metacluster ARI per method and dataset as mean and standard deviation.
# Only the ARI column is aggregated: reducing the whole frame would also try to average the
# string-valued 'Sample' column, which raises a TypeError in pandas >= 2.0.
ari_by_group = all_statistics.groupby(['Method', 'Dataset'])[['metacluster_ARI']]
means = ari_by_group.mean()
stdev = ari_by_group.std().rename(columns={'metacluster_ARI': 'metacluster_ARI SD'})

statistics = pd.merge(means.reset_index(), stdev.reset_index(), on=['Method', 'Dataset'])
statistics = statistics[['Method', 'Dataset', 'metacluster_ARI', 'metacluster_ARI SD']]

# Render every group as "mean (SD)", both rounded to two decimals. Groups with a single
# sample have no standard deviation and show "nan" for it.
table = [{'Method': record['Method'],
          'Dataset': record['Dataset'],
          'Mean ARI': str(record['metacluster_ARI']) + ' (' + str(record['metacluster_ARI SD']) + ')'}
         for record in statistics.round(2).to_dict(orient='records')]
statistics = pd.DataFrame(table)

# Supplemental Figure 8

In [None]:
def _density_colours(frame, x, y, n_fit=1000, seed=1):
    """Return a 2-D Gaussian KDE density for every row of `frame`, used as point colour.

    The KDE is fitted on a seeded random subsample of at most `n_fit` rows and then
    evaluated on all rows.
    """
    # Sample both columns from one frame so the (x, y) pairs stay aligned, and cap the
    # sample size so frames with fewer than `n_fit` rows do not raise.
    fit = frame[[x, y]].sample(n=min(n_fit, len(frame)), random_state=seed)
    kernel = gaussian_kde(np.vstack([fit[x], fit[y]]))
    return kernel(np.vstack([frame[x], frame[y]]))


def _plot_cluster_row(fig, gs, row, sample, metacluster, x, y, xlabel, ylabel, ylim, titles=None):
    """Draw one figure row: a metacluster on imputed values (left) and ground truth (right).

    Both panels show the ground-truth cells (imp_state == 0) of `sample` as a density-coloured
    background. The left panel overlays the rows of `metacluster`; the right panel overlays
    the ground-truth rows whose 'original_ID' matches an imputed row of that metacluster.
    """
    cmap = sns.color_palette("Spectral_r", as_cmap=True)
    background = sample[sample['imp_state'] == 0]
    density = _density_colours(background, x, y)  # computed once, shared by both panels

    cluster = sample[sample['fSOM_metacluster'] == metacluster]
    # The imputed cells must come from *this* dataset's metacluster; reusing the selection
    # of another dataset would match ground-truth cells against unrelated IDs.
    imputed = cluster[cluster['imp_state'] == 1]
    gt = background[background['original_ID'].isin(imputed['original_ID'])]

    ax_imputed = fig.add_subplot(gs[row, 0])
    ax_gt = fig.add_subplot(gs[row, 1])
    # NOTE(review): the left panel highlights every row of the metacluster regardless of
    # imp_state — confirm whether it should be restricted to `imputed`.
    for ax, highlighted in ((ax_imputed, cluster), (ax_gt, gt)):
        ax.scatter(background[x], background[y], s=1, c=density, edgecolor='none',
                   rasterized=True, cmap=cmap)
        sns.scatterplot(data=highlighted, x=x, y=y, s=5, color='black', ax=ax)
        ax.set_ylim(*ylim)
        ax.set_xlabel(xlabel)  # after seaborn, which labels the axes with the column names

    ax_imputed.set_ylabel(ylabel)
    # The right panel shares the y-range of the left one, so its y-axis is hidden.
    ax_gt.spines.left.set_visible(False)
    ax_gt.axes.get_yaxis().set_visible(False)

    if titles is not None:
        ax_imputed.set_title(titles[0])
        ax_gt.set_title(titles[1])


fig = plt.figure(figsize=(8, 6))
gs = fig.add_gridspec(nrows=3, ncols=2)

# One row per dataset: Infinicyt-imputed cells, metacluster to highlight, channels and labels.
panels = [
    dict(sample=data[(data['sample_id'] == '22B_021') & (data['method'] == 'Infinicyt')],
         metacluster=40, x='PE-CF594-A', y='APC-A',
         xlabel='CD95', ylabel='KLRG1', ylim=(-2, 6),
         titles=('Imputed', 'Ground truth')),
    dict(sample=nilsson[(nilsson['method'] == 'Infinicyt')],
         metacluster=25, x='DAPI-A', y='PE-A',
         xlabel='CD45RA', ylabel='CD123', ylim=(-1, 6)),
    dict(sample=mosmann[(mosmann['method'] == 'Infinicyt')],
         metacluster=5, x='Violet H 450/50-A', y='Red C 660/20-A',
         xlabel='IL-5', ylabel='TNFa', ylim=(-2, 7.6)),
]
for row, panel in enumerate(panels):
    _plot_cluster_row(fig, gs, row, **panel)

plt.subplots_adjust(wspace=0.05, hspace=0.4)