Author: Tim Mocking

Contact: t.r.mocking@amsterdamumc.nl

In [None]:
# Path where aggregate FCS files are saved
agg_path = ''

# Path where figures are written; the plotting cells below prepend it to
# every file name passed to plt.savefig
figures_path = ''

In [None]:
# Import all relevant packages
import os
import argparse
import pandas as pd
import numpy as np
from fcsy import DataFrame
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.colors import ListedColormap
import seaborn as sns
from statistics import mean, median, stdev
from scipy.stats import iqr
from math import log2
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score
import umap
# Apply the 'plotstyle.mplstyle' style sheet to all figures created after this
# point (Matplotlib resolves the argument as a style name or a file path)
plt.style.use('plotstyle.mplstyle')

# Investigate single-marker gating

In [None]:
order = ['CyTOFmerge', 'CytoBackBone', 'cyCombine', 'Infinicyt']
scores = []

# Define which markers are imputed in which dataset
flow_marker_setup = {'CD57':2, 'KLRG1':2, 'PD-1':2, 'CD27':2,
                     'CD28':1, 'CD95':1, 'TIM-3':1, 'TIGIT':1}

for name in order:
    # For every method except CytoBackBone the ground-truth and imputed rows
    # are brought into the same order (sorted by 'original_ID') so labels can
    # be compared cell by cell. CytoBackBone rows are not aligned here, so only
    # abundances are compared for that method.
    paired = name != 'CytoBackBone'

    # Load the concatenated expression data
    exprs = DataFrame.from_fcs(agg_path + 'Flow_' + name + '_agg.fcs')
    exprs = exprs.reset_index(drop=True)

    # Load the per-cell gate labels: drop the first character of every column
    # name, restore the hyphenated marker names and compare labels as strings
    gates = pd.read_csv(agg_path + 'Flow_' + name + 'SingleGate_labels.csv', index_col=0)
    gates = gates.rename(columns={col: col[1:] for col in gates.columns})
    gates = gates.rename(columns={'PD1':'PD-1', 'TIM3':'TIM-3'})
    gates = gates.astype(str)
    gate_names = gates.columns
    gates = gates.reset_index(drop=True)
    exprs = pd.concat([exprs, gates], axis=1)

    # Loop over files in the expression matrix
    for file in exprs['File'].unique():
        in_file = exprs['File'] == file
        gt_data = exprs[in_file & (exprs['imp_state'] == 0)]
        imp_data = exprs[in_file & (exprs['imp_state'] == 1)]

        # Ensure cell ordering between GT and imputed cells
        if paired:
            assert len(gt_data) == len(imp_data)
            imp_data = imp_data.sort_values('original_ID')
            gt_data = gt_data.sort_values('original_ID')

        for gate in gate_names:
            if paired:
                # Only use cells from the other tube when imputing one marker
                gt_temp = gt_data[gt_data['dataset'] == flow_marker_setup[gate]]
                imp_temp = imp_data[imp_data['dataset'] == flow_marker_setup[gate]]
            else:
                gt_temp, imp_temp = gt_data, imp_data

            # Store counts/proportions (proportion is None when no cells are left)
            gt_count = int((gt_temp[gate] == '1').sum())
            imp_count = int((imp_temp[gate] == '1').sum())
            gt_prop = gt_count / len(gt_temp) if len(gt_temp) else None
            imp_prop = imp_count / len(imp_temp) if len(imp_temp) else None
            if gt_count != 0 and imp_prop is not None:
                perc_change = ((imp_prop - gt_prop) / gt_prop) * 100
            else:
                perc_change = None

            # Classification statistics need aligned cells and at least one
            # ground-truth positive cell; otherwise they stay undefined
            sensitivity = None
            specificity = None
            f1 = None
            if paired and gt_count != 0:
                # Specificity only depends on how the ground-truth negatives
                # were labelled, so it is computed in both cases below
                specificity = recall_score(gt_temp[gate], imp_temp[gate], pos_label='0')
                if imp_count != 0:
                    sensitivity = recall_score(gt_temp[gate], imp_temp[gate], pos_label='1')
                    f1 = f1_score(gt_temp[gate], imp_temp[gate], pos_label='1')
                else:
                    # No cell was gated positive after imputation: every
                    # ground-truth positive is missed
                    sensitivity = 0
                    f1 = 0

            # Save results
            scores.append({'Method': name,
                           'Population': gate,
                           'File': file,
                           'Sensitivity': sensitivity,
                           'Specificity': specificity,
                           'F-score': f1,
                           'Ground-truth count': gt_count,
                           'Ground-truth proportion': gt_prop,
                           'Imputed count': imp_count,
                           'Imputed proportion': imp_prop,
                           '% Change': perc_change})

singlemarker_data = pd.DataFrame(scores)

In [None]:
# Summarise the per-file classification metrics as mean and standard deviation
# (SD) for every marker/method combination
metric_columns = ['Sensitivity', 'Specificity', 'F-score']
# Cast to float so that None entries (metric not computable) become NaN and
# are skipped by mean/std; restricting to the metric columns keeps unrelated
# columns out of the aggregation
grouped = (singlemarker_data
           .astype({column: float for column in metric_columns})
           .groupby(['Population', 'Method'])[metric_columns])
means = grouped.mean()
spread = grouped.std().add_suffix(' SD')
statistics = means.join(spread).reset_index()
statistics = statistics[['Population', 'Method', 'Sensitivity', 'Sensitivity SD',
                         'Specificity', 'Specificity SD', 'F-score', 'F-score SD']]
# No classification metrics are computed for CytoBackBone
statistics = statistics[statistics['Method']!='CytoBackBone']
marker_order = {'APC-A':'KLRG1', 'BV711-A':'TIM-3', 'FITC-A':'CD57',
               'BV786-A':'CD27', 'PE-A':'CD28', 'PE-CF594-A':'CD95',
               'PC7-A':'TIGIT', 'BV605-A':'PD-1'}

# Build the table in marker_order, one "mean (SD)" string per metric
table = []
for marker in marker_order.values():
    temp = statistics[statistics['Population']==marker]
    for record in temp.round(2).to_dict(orient='records'):
        table.append({'Marker': record['Population'],
                      'Method': record['Method'],
                      'Mean sensitivity': f"{record['Sensitivity']} ({record['Sensitivity SD']})",
                      'Mean specificity': f"{record['Specificity']} ({record['Specificity SD']})",
                      'Mean F-score': f"{record['F-score']} ({record['F-score SD']})"})
statistics = pd.DataFrame(table)
# statistics.to_excel('../Tables/SingleMarker_Classification.xlsx')

In [None]:
# Median F-score over all cyCombine rows of singlemarker_data
singlemarker_data.loc[singlemarker_data['Method'] == 'cyCombine', 'F-score'].median()

# Investigate population-level gates

In [None]:
# Prefixes in the labeling
pop_prefixes = [
    'CD27-, TIGIT subset',
    'SCM',
    'TIGIT+, KLRG1+ CM',
    'TIM3 positive EM CD4',
    'CD57, CD28 subset',
    'KLRG1, TIGIT subset',
]

# Suffixes and how they should be renamed (raw label -> display name)
rename = dict([
    ('TIM3 positive EM CD4+', 'TIM-3+ \n CD4+ $T_{EM}$'),
    ('TIM3 positive EM CD4-', 'TIM-3- \n CD4+ $T_{EM}$'),
    ('SCM+', 'CD4+ $T_{SCM}$'),
    ('SCM-', 'CD4+ $T_{N}$'),
    ('TIGIT+, KLRG1+ CM+', 'TIGIT+/KLRG1+ \n CD8+ $T_{CM}$'),
    ('TIGIT+, KLRG1+ CM-', 'Other \n CD8+ $T_{CM}$ cells'),
    ('CD57, CD28 subset-', 'Other $T_{EMRA}$ cells'),
    ('KLRG1, TIGIT subset-', 'Other $T_{EMRA}$ cells'),
    ('CD27-, TIGIT subset+', '$T_{SN}$'),
    ('CD27-, TIGIT subset-', 'Other $T_{EMRA}$ cells'),
])

# Which names form a discrete subset: [positive class, negative class],
# looked up from the display names defined in `rename`
subsets = {
    'TIM-3+ CD4+ $T_{EM}$': [rename['TIM3 positive EM CD4+'],
                             rename['TIM3 positive EM CD4-']],
    'CD4+ $T_{SCM}$': [rename['SCM+'],
                       rename['SCM-']],
    'TIGIT+/KLRG1+ CD8+ $T_{CM}$': [rename['TIGIT+, KLRG1+ CM+'],
                                    rename['TIGIT+, KLRG1+ CM-']],
    'CD8+ $T_{SN}$': [rename['CD27-, TIGIT subset+'],
                      rename['CD27-, TIGIT subset-']],
}

In [None]:
order = ['CyTOFmerge', 'CytoBackBone', 'cyCombine', 'Infinicyt']
scores = []
matrices = {}

# Subsets that are evaluated on cells of dataset 1 only. The names must match
# the keys of `subsets`; a name that is not a key silently disables the filter.
single_tube_subsets = ('TIM-3+ CD4+ $T_{EM}$', 'CD4+ $T_{SCM}$')

# Raw gate label (prefix followed by '+' or '-') -> display name. Not every
# prefix has a '+' entry in `rename`; missing entries are left unchanged.
label_map = {prefix + sign: rename[prefix + sign]
             for prefix in pop_prefixes
             for sign in ('-', '+')
             if prefix + sign in rename}

for name in order:
    print(name)
    # For every method except CytoBackBone the ground-truth and imputed rows
    # are brought into the same order (sorted by 'original_ID') so labels can
    # be compared cell by cell. CytoBackBone rows are not aligned here, so only
    # abundances are compared for that method.
    paired = name != 'CytoBackBone'

    # Store confusion matrix for every method for one file
    matrices[name] = {}

    # Load the concatenated expression data
    exprs = DataFrame.from_fcs(agg_path + 'Flow_' + name + '_agg.fcs')
    exprs = exprs.reset_index(drop=True)
    gates = pd.read_csv(agg_path + 'Flow_' + name + '_labels.csv', index_col=0)
    gates = gates.reset_index(drop=True)
    exprs = pd.concat([exprs, gates], axis=1)

    # Parse and rename populations: keep the last element of the gate path
    exprs['Population'] = [i.split('/')[-1] for i in exprs['V1']]
    exprs['Population'] = exprs['Population'].replace(label_map)

    # Loop over files in the expression matrix
    for file in exprs['File'].unique():
        print(file)
        in_file = exprs['File'] == file
        gt_data = exprs[in_file & (exprs['imp_state'] == 0)]
        imp_data = exprs[in_file & (exprs['imp_state'] == 1)]

        # Ensure cell ordering between GT and imputed cells
        if paired:
            assert len(gt_data) == len(imp_data)
            imp_data = imp_data.sort_values('original_ID')
            gt_data = gt_data.sort_values('original_ID')

        for subset in subsets:
            # Assumes that first item in subset is the "positive" class
            positive = subsets[subset][0]

            if paired:
                # Restrict to the two classes of this subset (one-vs-rest)
                gt_temp = gt_data[gt_data['Population'].isin(subsets[subset])]
                imp_temp = imp_data[imp_data['Population'].isin(subsets[subset])]

                # Only use cells from the other tube when imputing one marker
                if subset in single_tube_subsets:
                    gt_temp = gt_temp[gt_temp['dataset'] == 1]
                    imp_temp = imp_temp[imp_temp['dataset'] == 1]
            else:
                gt_temp, imp_temp = gt_data, imp_data

            # Boolean class membership, kept as separate Series so that no
            # column has to be written onto a filtered slice
            gt_is_pos = gt_temp['Population'] == positive
            imp_is_pos = imp_temp['Population'] == positive

            # Store counts/proportions (proportion is None when no cells are left)
            gt_count = int(gt_is_pos.sum())
            imp_count = int(imp_is_pos.sum())
            gt_prop = gt_count / len(gt_temp) if len(gt_temp) else None
            imp_prop = imp_count / len(imp_temp) if len(imp_temp) else None
            if gt_count != 0 and imp_prop is not None:
                perc_change = ((imp_prop - gt_prop) / gt_prop) * 100
            else:
                perc_change = None

            # Classification statistics need aligned cells and at least one
            # ground-truth positive cell; otherwise they stay undefined
            sensitivity = None
            specificity = None
            f1 = None
            if paired and gt_count != 0:
                # Specificity only depends on how the ground-truth negatives
                # were labelled, so it is computed in both cases below
                specificity = recall_score(gt_is_pos, imp_is_pos, pos_label=False)
                if imp_count != 0:
                    sensitivity = recall_score(gt_is_pos, imp_is_pos, pos_label=True)
                    f1 = f1_score(gt_is_pos, imp_is_pos, pos_label=True)
                else:
                    # No cell was assigned to the positive class after
                    # imputation: every ground-truth positive is missed
                    sensitivity = 0
                    f1 = 0

            # Store confusion matrix for one patient
            if paired and file == 16:
                matrix = confusion_matrix(gt_temp['Population'], imp_temp['Population'],
                                          labels=subsets[subset])
                matrices[name][subset] = pd.DataFrame(matrix, index=subsets[subset],
                                                      columns=subsets[subset])

            # Save results
            scores.append({'Method': name,
                           'Population': subset,
                           'File': file,
                           'Sensitivity': sensitivity,
                           'Specificity': specificity,
                           'F-score': f1,
                           'Ground-truth count': gt_count,
                           'Ground-truth proportion': gt_prop,
                           'Imputed count': imp_count,
                           'Imputed proportion': imp_prop,
                           '% Change': perc_change})

pop_data = pd.DataFrame(scores)

In [None]:
# Smallest ground-truth count among the CD4+ T_SCM rows of pop_data
pop_data.loc[pop_data['Population'] == 'CD4+ $T_{SCM}$', 'Ground-truth count'].min()

In [None]:
# Median ground-truth proportion among the CD4+ T_SCM rows of pop_data
pop_data.loc[pop_data['Population'] == 'CD4+ $T_{SCM}$', 'Ground-truth proportion'].median()

In [None]:
# Largest ground-truth count among the CD4+ T_SCM rows of pop_data
pop_data.loc[pop_data['Population'] == 'CD4+ $T_{SCM}$', 'Ground-truth count'].max()

In [None]:
# Display the current `statistics` table. NOTE(review): which table is shown
# depends on execution order; run top to bottom this is the single-marker
# summary, because the population summary is only built in the next cell.
statistics

In [None]:
# Summarise the per-file classification metrics as mean and standard deviation
# (SD) for every population/method combination
metric_columns = ['Sensitivity', 'Specificity', 'F-score']
# Cast to float so that None entries (metric not computable) become NaN and
# are skipped by mean/std; restricting to the metric columns keeps unrelated
# columns out of the aggregation
grouped = (pop_data
           .astype({column: float for column in metric_columns})
           .groupby(['Population', 'Method'])[metric_columns])
means = grouped.mean()
spread = grouped.std().add_suffix(' SD')
statistics = means.join(spread).reset_index()
statistics = statistics[['Population', 'Method', 'Sensitivity', 'Sensitivity SD',
                         'Specificity', 'Specificity SD', 'F-score', 'F-score SD']]
# No classification metrics are computed for CytoBackBone
statistics = statistics[statistics['Method']!='CytoBackBone']

# Plot-style population name -> plain-text name used in the table
population_order = {'CD4+ $T_{SCM}$': 'CD4+ TSCM',
                    'TIM-3+ CD4+ $T_{EM}$' : 'TIM-3+ CD4+ TEM',
                    'TIGIT+/KLRG1+ CD8+ $T_{CM}$': 'TIGIT+/KLRG1+ CD8+ TCM',
                    'CD8+ $T_{SN}$': 'CD8+ TSN'}

# Build the table in population_order, one "mean (SD)" string per metric
table = []
for pop, display_name in population_order.items():
    # assign() returns a new frame, so nothing is written onto a filtered slice
    temp = statistics[statistics['Population']==pop].assign(Population=display_name)
    for record in temp.round(2).to_dict(orient='records'):
        table.append({'Population': record['Population'],
                      'Method': record['Method'],
                      'Mean sensitivity': f"{record['Sensitivity']} ({record['Sensitivity SD']})",
                      'Mean specificity': f"{record['Specificity']} ({record['Specificity SD']})",
                      'Mean F-score': f"{record['F-score']} ({record['F-score SD']})"})
statistics = pd.DataFrame(table)
# statistics.to_excel('../Tables/Population_Classification.xlsx')

In [None]:
# Median ground-truth count among the TIM-3+ CD4+ T_EM rows of pop_data
pop_data.loc[pop_data['Population'] == 'TIM-3+ CD4+ $T_{EM}$', 'Ground-truth count'].median()

In [None]:
# Median ground-truth count among the CD8+ T_SN rows of pop_data
pop_data.loc[pop_data['Population'] == 'CD8+ $T_{SN}$', 'Ground-truth count'].median()

In [None]:
# Median ground-truth count among the TIGIT+/KLRG1+ CD8+ T_CM rows of pop_data
pop_data.loc[pop_data['Population'] == 'TIGIT+/KLRG1+ CD8+ $T_{CM}$', 'Ground-truth count'].median()

# Unused

In [None]:
# def plot_abundance(dataframe, method, pop, color, plot_xlabel, plot_ylabel, title=None):
#     df = dataframe.copy()
#     print(method, pop)
#     df = df[(df['Method']== method) & (df['Population'] == pop)]
#     melt = pd.melt(df, id_vars=['Method', 'Population', 'File'], 
#                    value_vars=['Ground-truth proportion', 'Imputed proportion'])
#     for file in melt['File'].unique():
#         sns.pointplot(data=melt[melt['File']==file], x='variable', y='value', 
#                       color='black', scale=0.5, ax=ax)
#     sns.boxplot(data=melt, x='variable', y='value', color=color, ax=ax, saturation=1)

#     if title != None:
#         ax.set_title(title, fontweight='bold', fontsize=14)
#     ax.set_xlabel('')
#     if plot_xlabel:
#         ax.set_xticklabels(['Ground-truth', 'Imputed'], rotation=15, ha='right')
#     else:
#         ax.set_xticklabels([])
#     if plot_ylabel:
#         ax.set_ylabel('')
#     else:
#         ax.set_ylabel('')

In [None]:
# order = ['Infinicyt', 'CyTOFmerge',  'CytoBackBone', 'cyCombine']
# palette = ['#f47f2a', '#009cb4', '#ee1d24', '#694893']
# titles = ['TIM-3+ CD4+ 'r'$\bf{T_{EM}}$ cells',
#           'CD4+ 'r'$\bf{T_{SCM}}$ cells',
#           'TIGIT+/KLRG1+ \n CD8+ 'r'$\bf{T_{CM}}$ cells',
#           r'$\bf{T_{SN}}$ cells']

# fig = plt.figure(figsize=(12, 10))
# gs = fig.add_gridspec(nrows=5, ncols=5, width_ratios=[0.1, 0.225, 0.225, 0.225, 0.225])

# subsets = {'None':[],
#            'TIM-3+ CD4+ $T_{EM}$': ['TIM-3+ \n CD4+ $T_{EM}$', 
#                              'TIM-3- \n CD4+ $T_{EM}$'],
#            'CD4+ $T_{SCM}$': ['CD4+ $T_{SCM}$', 
#                               'CD4+ $T_{N}$'],
#            'TIGIT+/KLRG1+ CD8+ $T_{CM}$': ['TIGIT+/KLRG1+ \n CD8+ $T_{CM}$', 
#                              'Other \n CD8+ $T_{CM}$ cells'],
#            'CD8+ $T_{SN}$': ['$T_{SN}$', 
#                              'Other $T_{EMRA}$ cells']}

# for i, method in enumerate(order):
#     for j, pop in enumerate(subsets):
        
#         ax = fig.add_subplot(gs[i, j])
#         # Print row labels
#         if j == 0:
#             ax.set_title(order[i], fontweight='bold', 
#                          fontsize=12, color=palette[i], y=0.5)
#             ax.set_axis_off()
#         else:
#             if i == 0:
#                 title = titles[j-1]
#             else:
#                 title = None
#             if j == 1:
#                 plot_ylabel = True
#             else:
#                 plot_ylabel = False
#             if i == 3:
#                 plot_xlabel = True
#             else:
#                 plot_xlabel = False
#             plot_abundance(data, method, pop, palette[i], 
#                            plot_xlabel=plot_xlabel, plot_ylabel=plot_ylabel, title=title)
# plt.subplots_adjust(wspace=0.8, hspace=0.1)

# Figure 6

In [None]:
from matplotlib.ticker import FuncFormatter

order = ['Infinicyt', 'CyTOFmerge', 'CytoBackBone', 'cyCombine']
palette = ['#f47f2a', '#009cb4', '#ee1d24', '#694893']

fig = plt.figure(figsize=(4, 4))
gs = fig.add_gridspec(nrows=1, ncols=2)

# One boxplot panel per gating level: (data, y-axis label, panel title)
panels = [(singlemarker_data, 'Percentual change in abundance', 'Single marker gate'),
          (pop_data, '', 'T-cell subpopulations')]

for column, (panel_data, y_label, panel_title) in enumerate(panels):
    ax = fig.add_subplot(gs[0, column])
    sns.boxplot(data=panel_data, y='% Change', x='Method', orient='v',
                order=order, palette=palette, saturation=1, ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    ax.set_xlabel('')
    # Both panels share the same y-range and percent-formatted ticks
    ax.set_ylim(-110, 150)
    ax.set_ylabel(y_label)
    ax.yaxis.set_major_formatter(FuncFormatter('{:.0f}%'.format))
    ax.set_title(panel_title, fontweight='bold', y=1.025)

plt.subplots_adjust(wspace=1.1, hspace=0)

# Save the figure as PNG and TIFF
for file_name in ('Figure 6.png', 'Figure 6.tiff'):
    plt.savefig(figures_path + file_name, dpi=300, bbox_inches='tight')

In [None]:
# Median absolute % change over all Infinicyt rows of singlemarker_data
singlemarker_data.loc[singlemarker_data['Method'] == 'Infinicyt', '% Change'].abs().median()

In [None]:
# Median absolute % change over all CyTOFmerge rows of singlemarker_data
singlemarker_data.loc[singlemarker_data['Method'] == 'CyTOFmerge', '% Change'].abs().median()

In [None]:
# Median absolute % change over all CytoBackBone rows of singlemarker_data
singlemarker_data.loc[singlemarker_data['Method'] == 'CytoBackBone', '% Change'].abs().median()

In [None]:
# Median absolute % change over all cyCombine rows of singlemarker_data
singlemarker_data.loc[singlemarker_data['Method'] == 'cyCombine', '% Change'].abs().median()

# Confusion matrices

In [None]:
def plot_matrix(matrix, color, title=None, plot_xlabel=False, plot_ylabel=False, ax=None):
    """Draw an annotated, single-colour confusion-matrix heatmap.

    Parameters
    ----------
    matrix : table of integer counts accepted by ``sns.heatmap`` (annotated
        with ``fmt='d'``); its row labels are shown as ground truth and its
        column labels as imputed classes.
    color : colour specification passed to ``ListedColormap`` for the cells.
    title : optional axes title; omitted when None.
    plot_xlabel : if True, label the x-axis 'Imputed', otherwise clear it.
    plot_ylabel : if True, label the y-axis 'Ground truth', otherwise clear it.
    ax : Matplotlib axes to draw on. Defaults to the current axes
        (``plt.gca()``), which is the axes callers have just created.
    """
    # Resolve the target axes once so the heatmap and all labelling calls are
    # guaranteed to act on the same axes
    if ax is None:
        ax = plt.gca()

    sns.heatmap(matrix, annot=True, fmt='d', linewidths=2, robust=False,
                cbar=False, cmap=ListedColormap(color), annot_kws={'color':'black'},
                rasterized=True, ax=ax)

    if title is not None:
        ax.set_title(title, fontweight='bold', fontsize=14)

    if plot_ylabel:
        ax.set_ylabel('Ground truth', fontweight='bold')
    else:
        ax.set_ylabel('')
    if plot_xlabel:
        ax.set_xlabel('Imputed', fontweight='bold')
    else:
        ax.set_xlabel('')

    # Keep row labels horizontal and tilt column labels slightly
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, ha='right')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=15, ha='right')

In [None]:
order = ['Infinicyt', 'CyTOFmerge','cyCombine']
palette = ['#f47f2a', '#009cb4','#694893']
# Column titles: plain text followed by a bold mathtext subscript
titles = ['TIM-3+ CD4+ 'r'$\bf{T_{EM}}$ cells',
          'CD4+ 'r'$\bf{T_{SCM}}$ cells',
          'TIGIT+/KLRG1+ CD8+ 'r'$\bf{T_{CM}}$ cells',
          r'$\bf{T_{SN}}$ cells']

fig = plt.figure(figsize=(15, 10))
gs = fig.add_gridspec(nrows=4, ncols=5, width_ratios=[0.1, 0.225, 0.225, 0.225, 0.225])

# The leading 'None' entry reserves grid column 0 for the method names, so the
# real subsets line up with grid columns 1-4
subsets = {
    'None': [],
    'TIM-3+ CD4+ $T_{EM}$': ['TIM-3+ \n CD4+ $T_{EM}$', 'TIM-3- \n CD4+ $T_{EM}$'],
    'CD4+ $T_{SCM}$': ['CD4+ $T_{SCM}$', 'CD4+ $T_{N}$'],
    'TIGIT+/KLRG1+ CD8+ $T_{CM}$': ['TIGIT+/KLRG1+ \n CD8+ $T_{CM}$', 'Other \n CD8+ $T_{CM}$ cells'],
    'CD8+ $T_{SN}$': ['$T_{SN}$', 'Other $T_{EMRA}$ cells'],
}

for i, method in enumerate(order):
    for j, pop in enumerate(subsets):
        ax = fig.add_subplot(gs[i, j])

        # Print row labels
        if j == 0:
            ax.set_title(method, fontweight='bold',
                         fontsize=14, color=palette[i], y=0.5)
            ax.set_axis_off()
            continue

        # Titles only on the first row, x-labels only on the last method row,
        # y-labels only on the first matrix column
        plot_matrix(matrices[method][pop], palette[i],
                    title=titles[j - 1] if i == 0 else None,
                    plot_xlabel=(i == 2),
                    plot_ylabel=(j == 1))

plt.subplots_adjust(wspace=0.9, hspace=0.6)

# Save the figure as PNG and TIFF
for file_name in ('Supplemental Figure 7.png', 'Supplemental Figure 7.tiff'):
    plt.savefig(figures_path + file_name, dpi=300, bbox_inches='tight')

In [None]:
# Which names form a discrete subset: subset name -> [positive class, negative class]
subsets = {
    subset_name: [positive_class, negative_class]
    for subset_name, positive_class, negative_class in (
        ('TIM-3+ CD4+ $T_{EM}$', 'TIM-3+ \n CD4+ $T_{EM}$', 'TIM-3- \n CD4+ $T_{EM}$'),
        ('CD4+ $T_{SCM}$', 'CD4+ $T_{SCM}$', 'CD4+ $T_{N}$'),
        ('TIGIT+/KLRG1+ CD8+ $T_{CM}$', 'TIGIT+/KLRG1+ \n CD8+ $T_{CM}$', 'Other \n CD8+ $T_{CM}$ cells'),
        ('CD8+ $T_{SN}$', '$T_{SN}$', 'Other $T_{EMRA}$ cells'),
    )
}

In [None]:
def get_UMAP_data(method, state, backbone, subset='TIGIT+/KLRG1+ CD8+ $T_{CM}$',
                  random_state=42, max_cells=200):
    """Load one method's aggregate data and embed a balanced subsample with UMAP.

    Reads ``Flow_<method>_agg.fcs`` and ``Flow_<method>_labels.csv`` from
    ``agg_path``, keeps the cells with ``imp_state == state`` that belong to
    the two classes of ``subsets[subset]``, draws the same number of cells
    from both classes for every file and fits a 2-D UMAP on the ``backbone``
    columns.

    Parameters
    ----------
    method : method name used in the input file names.
    state : value of the 'imp_state' column to keep.
    backbone : list of column names used as UMAP input.
    subset : key of the module-level ``subsets`` dict; its first entry is the
        positive class and its second entry the negative class.
    random_state : seed for both the per-file subsampling and UMAP, so that
        repeated calls give the same result. Pass None for unseeded sampling.
    max_cells : maximum number of cells drawn per class and file.

    Returns
    -------
    pandas.DataFrame with the sampled cells plus 'UMAP1' and 'UMAP2' columns.

    Raises
    ------
    ValueError
        If no file has at least two cells in each class.
    """
    # Load the concatenated expression data
    exprs = DataFrame.from_fcs(agg_path + 'Flow_' + method + '_agg.fcs')
    exprs = exprs.reset_index(drop=True)
    gates = pd.read_csv(agg_path + 'Flow_' + method + '_labels.csv', index_col=0)
    gates = gates.reset_index(drop=True)
    exprs = pd.concat([exprs, gates], axis=1)

    if method == 'cyCombine':
        # Drop cyCombine cells whose fSOM_metacluster is -1; copy so that the
        # column assignment below does not write onto a filtered slice
        exprs['cyCombine_NA'] = exprs['fSOM_metacluster'] == -1
        exprs = exprs[~exprs['cyCombine_NA']].copy()

    # Parse and rename populations: keep the last element of the gate path and
    # map raw labels (prefix + '+'/'-') to display names where `rename` has one
    label_map = {prefix + sign: rename[prefix + sign]
                 for prefix in pop_prefixes
                 for sign in ('-', '+')
                 if prefix + sign in rename}
    exprs['Population'] = [i.split('/')[-1] for i in exprs['V1']]
    exprs['Population'] = exprs['Population'].replace(label_map)

    positive, negative = subsets[subset][0], subsets[subset][1]
    exprs = exprs[exprs['imp_state'] == state]
    exprs = exprs[exprs['Population'].isin(subsets[subset])]

    # One shared generator keeps the whole sampling sequence reproducible
    rng = np.random.RandomState(random_state)
    aggs = []
    for file in exprs['File'].unique():
        pt = exprs[exprs['File'] == file]
        pos = pt[pt['Population'] == positive]
        neg = pt[pt['Population'] == negative]

        # Skip files in which either class has fewer than two cells
        if len(pos) < 2 or len(neg) < 2:
            continue

        # Balance both classes at the size of the smaller one, capped at max_cells
        n_cells = min(len(pos), len(neg), max_cells)
        if len(pos) > n_cells:
            pos = pos.sample(n=n_cells, random_state=rng)
        if len(neg) > n_cells:
            neg = neg.sample(n=n_cells, random_state=rng)
        aggs.append(pd.concat([pos, neg]))

    if not aggs:
        raise ValueError('No file contains at least two cells of each class '
                         'for subset ' + repr(subset) + ' (method ' + repr(method) + ')')
    combined = pd.concat(aggs)

    # Fit the embedding and attach it as UMAP1/UMAP2 columns aligned on the index
    reducer = umap.UMAP(n_components=2, random_state=random_state, low_memory=False)
    coords = reducer.fit_transform(combined[backbone])
    embedding = pd.DataFrame(coords, index=combined.index,
                             columns=['UMAP' + str(k + 1) for k in range(coords.shape[1])])
    return pd.concat([combined, embedding], axis=1)

In [None]:
# Channels used as UMAP input for the ground-truth embedding
flow_bb = [
    "HV500c-A",
    "BUV395-A",
    "PerCP-Cy5-5-A",
    "BUV737-A",
    "BUV496-A",
    "BV421-A",
    "APC-R700-A",
]
# Same channels extended with 'APC-A' and 'PC7-A' for the imputed embeddings
full_bb = [*flow_bb, 'APC-A', 'PC7-A']

In [None]:
# Ground-truth cells (state 0) are embedded on flow_bb only
gt = get_UMAP_data(method='CyTOFmerge', state=0, backbone=flow_bb)

# Imputed cells (state 1) of every method are embedded on full_bb
cytofmerge, cytobackbone, cycombine, infinicyt = (
    get_UMAP_data(method=method_name, state=1, backbone=full_bb)
    for method_name in ('CyTOFmerge', 'CytoBackBone', 'cyCombine', 'Infinicyt')
)

In [None]:
def plot_umap(dataframe, s=10, ax=None):
    """Scatter the UMAP1/UMAP2 columns of ``dataframe`` coloured by 'Population'.

    Parameters
    ----------
    dataframe : table with 'UMAP1', 'UMAP2' and 'Population' columns.
    s : marker size passed to ``sns.scatterplot``.
    ax : Matplotlib axes to draw on. Defaults to the current axes
        (``plt.gca()``), which is the axes callers have just created.

    Notes
    -----
    The colour mapping is read from the module-level ``hue_order`` and
    ``palette`` variables, which must be defined before calling.
    """
    if ax is None:
        ax = plt.gca()
    sns.scatterplot(data=dataframe, x='UMAP1', y='UMAP2', hue='Population', hue_order=hue_order,
                    palette=palette, legend=False, s=s, ax=ax, rasterized=True)
    # Hide the axis ticks
    ax.set_xticks([])
    ax.set_yticks([])

In [None]:
# Define how the different channels should be renamed in plots
# (channel name -> marker name)
flow_rename = {
    # Channels that are also listed in flow_bb
    **{'HV500c-A': 'CD45',
       'BUV395-A': 'CD3',
       'PerCP-Cy5-5-A': 'CD14',
       'BUV737-A': 'CD4',
       'BUV496-A': 'CD8',
       'BV421-A': 'CCR7',
       'APC-R700-A': 'CD45RA'},
    # Remaining channels
    **{'APC-A': 'KLRG1',
       'BV711-A': 'TIM-3',
       'FITC-A': 'CD57',
       'BV786-A': 'CD27',
       'PE-A': 'CD28',
       'PE-CF594-A': 'CD95',
       'PC7-A': 'TIGIT',
       'BV605-A': 'PD-1'},
}

In [None]:
fig = plt.figure(figsize=(8, 8))
gs = fig.add_gridspec(nrows=3, ncols=3, width_ratios=[0.5, 0.25, 0.25], height_ratios=[0.3, 0.3, 0.35])

# Colours and class order read by plot_umap and by the boxplot below
palette = ['#ee1d24', '#003a44']
hue_order = ['TIGIT+/KLRG1+ \n CD8+ $T_{CM}$', 'Other \n CD8+ $T_{CM}$ cells']

# Large panel (top two rows, first column): ground-truth UMAP
ax = fig.add_subplot(gs[:2, :1])
ax.set_title('Backbone UMAP', fontweight='bold')
plot_umap(gt, s=20)

# Hand-built legend, one round marker per class in the palette colours
legend_labels = ['TIGIT+/KLRG1+ CD8+ $T_{CM}$', 'Other CD8+ $T_{CM}$ cells']
legend_elements = [Line2D([0], [0], label=legend_label,
                          marker='o', color='w', markerfacecolor=face_color, markersize=10)
                   for legend_label, face_color in zip(legend_labels, palette)]
ax.legend(handles=legend_elements, bbox_to_anchor=(1.9, 1.2), facecolor='white', framealpha=1, ncol=2)

# Bottom row: expression of the flow_bb channels per class
ax = fig.add_subplot(gs[2, :3])
melt = pd.melt(gt, id_vars=['Population'], value_vars=flow_bb)
sns.boxplot(data=melt, x='variable', y='value', hue='Population', hue_order=hue_order,
            palette=palette, ax=ax)
labels = [item.get_text() for item in ax.get_xticklabels()]
ax.set_ylabel('Expression value')
ax.set_xlabel('')
# Show marker names instead of channel names on the x-axis
ax.set_xticklabels([flow_rename[channel] for channel in labels])
ax.legend([], frameon=False)

# Small panels: one imputed-data UMAP per method at (grid row, grid column)
method_panels = [((0, 1), 'Infinicyt', infinicyt),
                 ((0, 2), 'CyTOFmerge', cytofmerge),
                 ((1, 1), 'CytoBackBone', cytobackbone),
                 ((1, 2), 'cyCombine', cycombine)]
for (grid_row, grid_col), panel_title, panel_data in method_panels:
    ax = fig.add_subplot(gs[grid_row, grid_col])
    ax.set_title(panel_title, fontweight='bold')
    plot_umap(panel_data)

plt.subplots_adjust(wspace=0.5, hspace=0.4)

# Save the figure as PNG and TIFF
for file_name in ('Figure 7.png', 'Figure 7.tiff'):
    plt.savefig(figures_path + file_name, dpi=300, bbox_inches='tight')