In [1]:
import os
import sys
import warnings
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pbmc_helper import load_dex_genes

# Extend the module search path so the project-local imports that follow can be resolved.
# '..' is a relative entry and therefore depends on the kernel's working directory.
# NOTE(review): the absolute entry is specific to one machine; the `data.load_data` import
# below presumably relies on it -- adjust it before running the notebook elsewhere.
sys.path.append('..')
sys.path.append('/Users/lciernik/Documents/TUB/projects/ans_scoring/ANS_supplementary_information')
from data.load_data import load_datasets

from score_with_all_methods import (
    score_signatures_with_all_methods,
    label_assignment_from_scores,
    get_lbl_assignment_performance,
    get_information_from_scores,
    remove_overlapping_signature_genes,
    get_violin_all_methods,
    prepare_data_for_violin_plot,
    save_close_or_show,
    plot_confusion_matrix
)

# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter('ignore', FutureWarning)

# Global matplotlib defaults for every figure created in this notebook.
plt.rcParams.update({
    'pdf.fonttype': 42,  # embed fonts as TrueType (Type 42) in PDF output
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 10,
})

Storage paths and global variables

In [2]:
# Run configuration: toggles plus the output directory derived from them.

# Controls whether the signatures are passed through `remove_overlapping_signature_genes`
# further below, and which results sub-folder is used.
remove_overlapping_genes = True

# Passed to `save_close_or_show`; also guards the directory creation and the metrics CSV export.
# NOTE(review): the stored cell outputs report saved files, so they were presumably produced
# by a run with SAVE = True -- re-run to refresh them.
SAVE = False

# Base output directory. Defaults to the original author's local folder, but can be redirected
# without editing the notebook by setting the ANS_STORING_PATH environment variable.
storing_path = Path(os.environ.get(
    'ANS_STORING_PATH',
    '/Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/b_mono_nk',
))

# Keep the results of the two signature variants in separate sub-folders.
if remove_overlapping_genes:
    storing_path = storing_path / 'signatures_without_overlapping'
else:
    storing_path = storing_path / 'signatures_with_overlapping'

# Only touch the filesystem when results are actually going to be written.
if SAVE:
    storing_path.mkdir(parents=True, exist_ok=True)

Loading data

In [3]:
# Load the dataset registered under the key 'pbmc_b_mono_nk' via the project helper.
# The trailing bare expression displays the object's summary as the cell output.
adata = load_datasets('pbmc_b_mono_nk')
adata

AnnData object with n_obs × n_vars = 68574 × 12127
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'nCount_SCT', 'nFeature_SCT', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier'
    var: 'gene_names', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'log1p'
    obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap'
    layers: 'counts'

In [4]:
# Names of the `adata.obs` columns used as ground-truth label and as sample identifier.
y_true_col, sample_col = 'celltype.l1', 'orig.ident'

In [5]:
# Number of cells per ground-truth class, ordered by label rather than by frequency.
adata.obs[y_true_col].value_counts().sort_index()

B       10613
Mono    43553
NK      14408
Name: celltype.l1, dtype: int64

Computing dimensionality reduction

In [6]:
# Disabled: the dimensionality reduction is not recomputed in this notebook.
# NOTE(review): `sc` (presumably scanpy) is not imported in the import cell, so re-enabling
# these lines also needs that import; the loaded object already lists 'X_pca' and 'X_umap'
# in `obsm`.
# sc.tl.pca(adata)
# sc.pp.neighbors(adata)
# sc.tl.umap(adata)

Loading signatures

In [7]:
# Load the table of differentially expressed (DEX) genes via the project helper, with
# filtering enabled and the p-value / log2 fold-change thresholds set to 0.01 / 0.5.
# NOTE(review): how the thresholds are applied is defined in `pbmc_helper.load_dex_genes`,
# which is not visible here.
DE_of_celltypes = load_dex_genes(filter_genes=True, threshold_pval=0.01, threshold_log2fc=0.5)

Shape DEX genes BEFORE filtering (11811, 4)
Shape DEX genes AFTER filtering (6288, 4)


In [8]:
# Map every coarse cell type (the ground-truth column) to the list of 'celltype.l3' subtypes
# observed within it. Grouping by `y_true_col` instead of repeating the literal column name
# keeps the signature names tied to the labels used for evaluation further below.
subtypes_per_cell_type = adata.obs.groupby(y_true_col)['celltype.l3'].apply(lambda x: list(x.unique()))

In [9]:
# Display the cell type -> subtypes mapping as a plain dict for inspection.
subtypes_per_cell_type.to_dict()

{'B': ['B naive kappa',
  'B memory kappa',
  'B naive lambda',
  'B memory lambda',
  'B intermediate kappa',
  'B intermediate lambda',
  'Plasma',
  'Plasmablast'],
 'Mono': ['CD14 Mono', 'CD16 Mono'],
 'NK': ['NK_2', 'NK_4', 'NK_1', 'NK_3', 'NK_CD56bright', 'NK Proliferating']}

In [9]:
# One signature per coarse cell type: the sorted, de-duplicated union of the genes
# listed in the DEX table for any of that cell type's subtypes.
signatures = {
    cell_type: sorted(set(DE_of_celltypes.loc[DE_of_celltypes['Cell Type'].isin(subtypes), 'Gene'].tolist()))
    for cell_type, subtypes in subtypes_per_cell_type.items()
}

In [10]:
# Optionally pass the signatures through `remove_overlapping_signature_genes`
# (controlled by the configuration flag).
# NOTE(review): going by its name and the 'Removed {...}' cell output, the helper drops genes
# that occur in more than one signature -- confirm in `score_with_all_methods`.
if remove_overlapping_genes:
    signatures = remove_overlapping_signature_genes(signatures)

Removed {'HSP90AA1', 'CARHSP1', 'HLA-DRB5', 'NUCKS1', 'RRM1', 'BLOC1S1', 'SSRP1', 'CENPE', 'CKS1B', 'H2AFV', 'HLA-DMA', 'STK17A', 'CX3CR1', 'RAP1B', 'CXXC5', 'TSC22D3', 'LSM3', 'CBX5', 'LAPTM5', 'MZT2B', 'COX8A', 'MCM7', 'PCNA', 'RAN', 'TPM3', 'HMGB1', 'TK1', 'SNRNP25', 'LY6E', 'HIST1H1B', 'PARK7', 'RHOC', 'TUBB', 'ATP6V0B', 'SNRPB', 'RHOQ', 'MTPN', 'CENPM', 'HIST1H1D', 'HLA-DRA', 'TOP2A', 'PLEK', 'PDIA3', 'C12orf75', 'SRSF2', 'GSTP1', 'RNASEH2C', 'SEC11A', 'FCER1G', 'CYBA', 'MCM4', 'MCM3', 'HMGN1', 'HIST1H4C', 'ANXA2', 'UTRN', 'FCGR3A', 'IDH2', 'SLC25A5', 'ARPC2', 'HNRNPA3', 'HMGN3', 'STMN1', 'BIRC5', 'PSME2', 'SMC2', 'MAD2L1', 'SIVA1', 'LDHA', 'PSMA4', 'PHF19', 'PARP1', 'PLAC8', 'ATP1B3', 'H2AFZ', 'LCP1', 'CD74', 'CLIC1', 'PRKDC', 'SRSF10', 'SPN', 'ALYREF', 'DBI', 'HLA-DPA1', 'SERBP1', 'GAPDH', 'CHCHD2', 'ACTB', 'TYMS', 'NUSAP1', 'CD63', 'PSMD8', 'EBP', 'NOP56', 'RRM2', 'POU2F2', 'COX5A', 'SRSF3', 'SMAP2', 'GMNN', 'IFITM2', 'ANP32B', 'SERP1', 'RBX1', 'UBE2S', 'DUT', 'MSN', 'CLSPN', '

In [11]:
# Fixed order of the signature labels; used below as hue order of the violin plots and as
# label names for the confusion matrices.
order_signatures = ['B', 'Mono', 'NK']

Scoring signatures

In [12]:
# Score every signature with all scoring methods provided by the project helper.
# `adata` is re-bound to the returned object; `score_cols` is iterated by the later cells
# as a mapping of method name -> score columns.
score_cols, adata = score_signatures_with_all_methods(adata, signatures, verbose=True)

Scoring B with adjusted_neighborhood_scoring
Scoring B with seurat_scoring
Scoring B with seurat_ag_scoring
Scoring B with seurat_lvg_scoring
Scoring B with scanpy_scoring
Scoring B with jasmine_scoring
Scoring B with jasmine_scoring
Scoring B with ucell_scoring
Scoring Mono with adjusted_neighborhood_scoring
Scoring Mono with seurat_scoring
Scoring Mono with seurat_ag_scoring
Scoring Mono with seurat_lvg_scoring
Scoring Mono with scanpy_scoring
Scoring Mono with jasmine_scoring
Scoring Mono with jasmine_scoring
Scoring Mono with ucell_scoring
Scoring NK with adjusted_neighborhood_scoring
Scoring NK with seurat_scoring
Scoring NK with seurat_ag_scoring
Scoring NK with seurat_lvg_scoring
Scoring NK with scanpy_scoring
Scoring NK with jasmine_scoring
Scoring NK with jasmine_scoring
Scoring NK with ucell_scoring


Label assignment

In [13]:
# Derive one predicted-label column per scoring method from that method's score columns.
all_cols = []    # every score column followed by the method's label column, in method order
label_cols = {}  # method name -> name of its predicted-label column
for method_name, method_scores in score_cols.items():
    adata, assigned_col = label_assignment_from_scores(
        adata,
        method_name,
        method_scores,
        include_undefined=False,
    )
    label_cols[method_name] = assigned_col
    all_cols.extend(method_scores + [assigned_col])

Visualizing results

In [14]:
# Disabled: UMAP overview of all score / label columns next to the annotation columns.
# NOTE(review): needs `sc` (presumably scanpy), which the import cell does not import.
# fig = sc.pl.umap(adata, color=all_cols + [sample_col, y_true_col, 'celltype.l2', 'celltype.l3'],
#                  ncols=len(signatures) + 1, return_fig=True)
# save_close_or_show(fig, SAVE, storing_path / 'umap.pdf')

In [15]:
# Collect the scores of all methods together with the ground-truth labels into the frame
# that `get_violin_all_methods` consumes in the next cell.
df_melted = prepare_data_for_violin_plot(adata, y_true_col, score_cols)

In [26]:
# Violin plots of the scores of all methods, with hues ordered by `order_signatures`.
fig = get_violin_all_methods(
    df_melted,
    y_true_col,
    hue_order=order_signatures,
    textwrap_width=7,
    height=1.8,
    aspect=1,
    sharey=True,
)
save_close_or_show(fig, SAVE, storing_path / "violin_all_methods.pdf")

Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/b_mono_nk/signatures_without_overlapping/violin_all_methods.pdf.


In [17]:
# Disabled: one violin plot per scoring method.
# NOTE(review): `get_violin` is not among the names imported from `score_with_all_methods`
# in the import cell, so re-enabling this loop also needs that import.
# for method_name, method_scores in score_cols.items():
#     df = adata.obs.loc[:, method_scores + [y_true_col]]
#     fig = get_violin(df, method_scores, y_true_col)
#     plt.title(f"{method_name}")
#     save_close_or_show(fig, SAVE, storing_path / f'violin_{method_name}.pdf')

Computing label assignment performance

In [18]:
# Evaluate every scoring method: performance of the assigned labels against the ground truth,
# plus the mean / standard deviation of the `nfold` values returned by
# `get_information_from_scores`. One confusion-matrix figure is produced per method.
nfold = 10
metrics = defaultdict(dict)

for method_name, method_scores in score_cols.items():
    # Compare the method's predicted-label column with the ground-truth column.
    conf_mat, bal_acc, f1_val = get_lbl_assignment_performance(
        adata,
        y_true_col=y_true_col,
        y_pred_col=label_cols[method_name],
        label_names=order_signatures,
    )

    # NOTE(review): presumably the per-fold balanced accuracies of a logistic regression on
    # the raw scores (going by the metric key names) -- confirm in `score_with_all_methods`.
    cv_scores = get_information_from_scores(adata, y_true_col=y_true_col, scores=method_scores, nfold=nfold)

    method_metrics = {
        'conf_mat': conf_mat,
        'balanced_accuracy': bal_acc,
        'f1_score': f1_val,
    }
    method_metrics[f'logreg_balanced_accuracy_{nfold}cv'] = np.mean(cv_scores)
    method_metrics[f'logreg_balanced_accuracy_{nfold}cv_std'] = np.std(cv_scores)
    metrics[method_name] = method_metrics

    fig = plot_confusion_matrix(
        conf_mat,
        order_signatures,
        method_name,
        figsize=(2.5, 2.5),
        textwrap_width=7,
        xrotation=45,
        cbar=False,
    )
    save_close_or_show(fig, SAVE, storing_path / f'conf_mat_{method_name}.pdf')

Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/b_mono_nk/signatures_without_overlapping/conf_mat_ANS.pdf.
Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/b_mono_nk/signatures_without_overlapping/conf_mat_Seurat.pdf.
Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/b_mono_nk/signatures_without_overlapping/conf_mat_Seurat_AG.pdf.
Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/b_mono_nk/signatures_without_overlapping/conf_mat_Seurat_LVG.pdf.
Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/b_mono_nk/signatures_without_overlapping/conf_mat_Scanpy.pdf.
Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/b_mono_nk/signatures_without_overlapping/conf_mat_Jasmine_LH.pdf.
Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/b_mono_nk/signatures_without_overlappin

In [19]:
# Tabulate the results: one column per scoring method, one row per metric.
metrics_df = pd.DataFrame(metrics)

Saving performance metrics

In [20]:
# Export the metrics table into the results folder, only when saving is enabled.
# NOTE(review): the stored cell output shows a "Saved metrics" message although the
# configuration cell sets SAVE = False, so the output presumably stems from an earlier run
# with SAVE = True -- re-run the notebook to refresh it.
if SAVE:
    metrics_df.to_csv(storing_path / 'metrics.csv')
    print(f"Saved metrics to {storing_path / 'metrics.csv'}.")

Saved metrics to /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/b_mono_nk/signatures_without_overlapping/metrics.csv.
