In [None]:
# Import all relevant packages
import os
import argparse
import pandas as pd
import numpy as np
from fcsy import DataFrame
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.colors import ListedColormap
import seaborn as sns
from statistics import mean, median, stdev
from scipy.stats import iqr
from math import log2
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score
import umap
plt.style.use('plotstyle.mplstyle')

In [None]:
# Locations of the input files. Each value is prepended verbatim to the file
# name when loading, so a directory must end with a path separator; an empty
# string resolves the files relative to the current working directory.
imputed_path = ''   # prefix for the '001_<method>_exprs.fcs' files
labeling_path = ''  # prefix for the '001_<method>_labels.csv' files

In [None]:
# Gate-name prefixes found in the labeling. The raw label of a cell is one of
# these prefixes followed by '+' (inside the gate) or '-' (outside the gate).
pop_prefixes = [
    'CD27-, TIGIT subset',
    'SCM',
    'TIGIT+, KLRG1+ CM',
    'TIM3 positive EM CD4',
    'CD57, CD28 subset',
    'KLRG1, TIGIT subset',
]

# Raw label (prefix + sign) -> display label. The display labels contain
# mathtext and explicit line breaks (presumably for matplotlib tick labels).
# 'CD57, CD28 subset' and 'KLRG1, TIGIT subset' only have a '-' entry, so
# their '+' labels are left as they are.
rename = {
    'TIM3 positive EM CD4+': 'TIM-3+ \n CD4+ $T_{EM}$',
    'TIM3 positive EM CD4-': 'TIM-3- \n CD4+ $T_{EM}$',
    'SCM+': 'CD4+ $T_{SCM}$',
    'SCM-': 'CD4+ $T_{N}$',
    'TIGIT+, KLRG1+ CM+': 'TIGIT+/KLRG1+ \n CD8+ $T_{CM}$',
    'TIGIT+, KLRG1+ CM-': 'Other \n CD8+ $T_{CM}$ cells',
    'CD57, CD28 subset-': 'Other $T_{EMRA}$ cells',
    'KLRG1, TIGIT subset-': 'Other $T_{EMRA}$ cells',
    'CD27-, TIGIT subset+': '$T_{SN}$',
    'CD27-, TIGIT subset-': 'Other $T_{EMRA}$ cells',
}

# Display labels that together form one discrete subset. The first entry of
# each list is the population of interest, the second its complement.
subsets = {
    'TIM-3+ CD4+ $T_{EM}$': [
        'TIM-3+ \n CD4+ $T_{EM}$',
        'TIM-3- \n CD4+ $T_{EM}$',
    ],
    'CD4+ $T_{SCM}$': [
        'CD4+ $T_{SCM}$',
        'CD4+ $T_{N}$',
    ],
    'TIGIT+/KLRG1+ CD8+ $T_{CM}$': [
        'TIGIT+/KLRG1+ \n CD8+ $T_{CM}$',
        'Other \n CD8+ $T_{CM}$ cells',
    ],
    'CD8+ $T_{SN}$': [
        '$T_{SN}$',
        'Other $T_{EMRA}$ cells',
    ],
}

In [None]:
# Methods to evaluate, in the order their results are appended to `scores`.
order = ['CyTOFmerge', 'CytoBackBone', 'cyCombine', 'Infinicyt']
scores = []

for name in order:
    print(name)
    # Load the concatenated expression data and the per-cell gate labels.
    # Both are re-indexed from 0 so they can be joined by row position.
    exprs = DataFrame.from_fcs(imputed_path + '001_' + name + '_exprs.fcs')
    exprs = exprs.reset_index(drop=True)
    gates = pd.read_csv(labeling_path + '001_' + name + '_labels.csv', index_col=0)
    gates = gates.reset_index(drop=True)
    # A positional join with unequal row counts would silently pad the
    # shorter table with NaN, so fail loudly instead.
    if len(exprs) != len(gates):
        raise ValueError(
            f"{name}: expression data has {len(exprs)} rows but the labels "
            f"have {len(gates)}; the files cannot be joined row by row"
        )
    exprs = pd.concat([exprs, gates], axis=1)

    # Parse and rename populations: keep the last component of the gate path
    # in 'V1', then translate the raw labels to their display names.
    exprs['Population'] = [i.split('/')[-1] for i in exprs['V1']]
    for prefix in pop_prefixes:
        # Every prefix must have a '-' entry (a missing one raises KeyError).
        exprs['Population'] = exprs['Population'].replace(prefix + '-', rename[prefix + '-'])
        # The '+' entry is optional; labels without one are left unchanged.
        if prefix + '+' in rename:
            exprs['Population'] = exprs['Population'].replace(prefix + '+', rename[prefix + '+'])

    # Split on 'imp_state': rows with 0 are used as ground truth, rows with 1
    # as imputed data.
    gt_data = exprs[exprs['imp_state'] == 0]
    imp_data = exprs[exprs['imp_state'] == 1]
    gt_total = len(gt_data)
    imp_total = len(imp_data)

    for subset in subsets:
        # The first label of each subset is the population being counted.
        target = subsets[subset][0]

        # Store counts/proportions. Proportions are relative to all rows of
        # the respective partition and NaN when that partition is empty.
        gt_count = int((gt_data['Population'] == target).sum())
        gt_prop = gt_count / gt_total if gt_total else np.nan
        imp_count = int((imp_data['Population'] == target).sum())
        imp_prop = imp_count / imp_total if imp_total else np.nan

        # The relative change is undefined without ground-truth cells; NaN
        # (rather than None) keeps the '% Change' column numeric.
        if gt_count != 0:
            perc_change = ((imp_prop - gt_prop) / gt_prop) * 100
        else:
            perc_change = np.nan

        # Save results
        result = {'Method': name,
                  'Population': subset,
                  'Ground-truth count': gt_count,
                  'Ground-truth proportion': gt_prop,
                  'Ground-truth abundance': gt_prop * 100,
                  'Imputed count': imp_count,
                  'Imputed proportion': imp_prop,
                  'Imputed abundance': imp_prop * 100,
                  '% Change': perc_change}
        scores.append(result)

# One row per (method, subset) combination.
statistics = pd.DataFrame(scores)

In [None]:
# Mean absolute percentage change per method. Work on a copy so that the
# signed values in `statistics` are left untouched.
absolute = statistics.copy()
absolute['% Change'] = absolute['% Change'].abs()
absolute.groupby('Method')[['% Change']].mean()