In [1]:
import sys
import warnings
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pbmc_helper import load_dex_genes

sys.path.append('..')
sys.path.append('/Users/lciernik/Documents/TUB/projects/ans_scoring/ANS_supplementary_information')
from data.load_data import load_datasets

from score_with_all_methods import (
    score_signatures_with_all_methods,
    label_assignment_from_scores,
    get_lbl_assignment_performance,
    get_information_from_scores,
    remove_overlapping_signature_genes,
    get_violin_all_methods,
    prepare_data_for_violin_plot,
    save_close_or_show,
    plot_confusion_matrix
)

# Keep cell outputs readable by hiding FutureWarning messages.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Figure defaults for exported PDFs: fonttype 42 embeds TrueType fonts, which keeps
# text editable in vector-graphics editors.
plt.rcParams.update(
    {
        'pdf.fonttype': 42,
        'font.family': 'sans-serif',
        'font.sans-serif': 'Arial',
        'font.size': 10,
    }
)

Storage paths and global variables

In [2]:
# When True, remove_overlapping_signature_genes is applied to the signatures before scoring.
remove_overlapping_genes = False

# Passed to save_close_or_show and checked before writing the metrics CSV.
SAVE = False

# Results are kept in a sub-folder named after the signature variant that was scored.
signature_variant = (
    'signatures_without_overlapping' if remove_overlapping_genes else 'signatures_with_overlapping'
)
storing_path = (
    Path('/Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/cd8_subtypes')
    / signature_variant
)

# Only touch the file system when results are actually going to be written.
if SAVE:
    storing_path.mkdir(parents=True, exist_ok=True)

Loading data

In [3]:
# Load the dataset registered under the key 'pbmc_cd8_subtypes' via the project helper.
adata = load_datasets('pbmc_cd8_subtypes')
# Bare expression: display the object summary as the cell output.
adata

AnnData object with n_obs × n_vars = 22815 × 11147
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'nCount_SCT', 'nFeature_SCT', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'outlier', 'mt_outlier'
    var: 'gene_names', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'log1p'
    obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap'
    layers: 'counts'

In [4]:
# adata.obs column used as ground-truth label throughout the evaluation below.
y_true_col = 'celltype.l2'
# adata.obs column identifying the sample; only referenced by the disabled UMAP cell.
sample_col = 'orig.ident'

In [5]:
# Number of cells per ground-truth label, ordered by label name.
adata.obs[y_true_col].value_counts().sort_index()

CD8 Naive            10236
CD8 Proliferating       38
CD8 TCM               2708
CD8 TEM               9833
Name: celltype.l2, dtype: int64

Computing dimensionality reduction

In [6]:
# Disabled: PCA / neighbors / UMAP are not recomputed in this notebook.
# NOTE(review): `sc` (scanpy) is not imported above; import it before re-enabling these lines.
# sc.tl.pca(adata)
# sc.pp.neighbors(adata)
# sc.tl.umap(adata)

Load signatures

In [7]:
# Load the differential-expression gene table; the 'Cell Type' and 'Gene' columns are used below.
# The p-value / log2FC thresholds are applied inside load_dex_genes (pbmc_helper) -- TODO confirm
# their exact semantics there.
DE_of_celltypes = load_dex_genes(filter_genes=True, threshold_pval=0.01, threshold_log2fc=0.5)

Shape DEX genes BEFORE filtering (11811, 4)
Shape DEX genes AFTER filtering (6288, 4)


In [8]:
# Map every ground-truth cell type to the list of fine-grained 'celltype.l3' subtypes found in it.
# - Group by `y_true_col` rather than a second hard-coded 'celltype.l2' literal, so the signatures
#   always follow the same annotation level that is used for evaluation.
# - observed=True restricts the result to groups that actually occur; for categorical columns with
#   unused categories this avoids NaN entries that would break the later `.isin(subtypes)` lookup.
subtypes_per_cell_type = (
    adata.obs
    .groupby(y_true_col, observed=True)['celltype.l3']
    .apply(lambda x: list(x.unique()))
)

In [9]:
# Show the cell type -> subtypes mapping as a plain dict for inspection.
subtypes_per_cell_type.to_dict()

{'CD8 Naive': ['CD8 Naive', 'CD8 Naive_2'],
 'CD8 Proliferating': ['CD8 Proliferating'],
 'CD8 TCM': ['CD8 TCM_1', 'CD8 TCM_3', 'CD8 TCM_2'],
 'CD8 TEM': ['CD8 TEM_2',
  'CD8 TEM_1',
  'CD8 TEM_4',
  'CD8 TEM_5',
  'CD8 TEM_6',
  'CD8 TEM_3']}

In [9]:
# Build one signature per cell type: the sorted, de-duplicated union of the genes listed in the
# DE table for any of the cell type's subtypes.
signatures = {}
for cell_type, subtypes in subtypes_per_cell_type.items():
    is_subtype_row = DE_of_celltypes['Cell Type'].isin(subtypes)
    subtype_genes = DE_of_celltypes.loc[is_subtype_row, 'Gene'].tolist()
    signatures[cell_type] = sorted(set(subtype_genes))

In [10]:
# Report how many genes each signature contains.
for cell_type, genes in signatures.items():
    print(cell_type, len(genes))

CD8 Naive 100
CD8 Proliferating 292
CD8 TCM 57
CD8 TEM 106


In [11]:
# Optional preprocessing controlled by the global toggle: replace the signatures with the result
# of remove_overlapping_signature_genes (project helper; presumably drops genes shared between
# signatures -- confirm in score_with_all_methods).
if remove_overlapping_genes:
    signatures = remove_overlapping_signature_genes(signatures)

In [12]:
# Fixed display order of the signatures; used as hue order in the violin plot and as label
# order for the confusion matrices.
order_signatures = ['CD8 Naive', 'CD8 TCM', 'CD8 TEM', 'CD8 Proliferating']

Scoring signatures

In [13]:
# Score every signature with every scoring method. `score_cols` is iterated below as a mapping
# from method name to that method's score column names; `adata` is rebound to the returned object.
score_cols, adata = score_signatures_with_all_methods(adata, signatures, verbose=True)

Scoring CD8 Naive with adjusted_neighborhood_scoring
Scoring CD8 Naive with seurat_scoring
Scoring CD8 Naive with seurat_ag_scoring
Scoring CD8 Naive with seurat_lvg_scoring
Scoring CD8 Naive with scanpy_scoring
Scoring CD8 Naive with jasmine_scoring
Scoring CD8 Naive with jasmine_scoring
Scoring CD8 Naive with ucell_scoring
Scoring CD8 Proliferating with adjusted_neighborhood_scoring
Scoring CD8 Proliferating with seurat_scoring
Scoring CD8 Proliferating with seurat_ag_scoring
Scoring CD8 Proliferating with seurat_lvg_scoring
Scoring CD8 Proliferating with scanpy_scoring
Scoring CD8 Proliferating with jasmine_scoring
Scoring CD8 Proliferating with jasmine_scoring
Scoring CD8 Proliferating with ucell_scoring
Scoring CD8 TCM with adjusted_neighborhood_scoring
Scoring CD8 TCM with seurat_scoring
Scoring CD8 TCM with seurat_ag_scoring
Scoring CD8 TCM with seurat_lvg_scoring
Scoring CD8 TCM with scanpy_scoring
Scoring CD8 TCM with jasmine_scoring
Scoring CD8 TCM with jasmine_scoring
Scorin

Label assignment

In [14]:
# For every scoring method, derive a label column from its scores. Remember the label column per
# method and collect the names of all score and label columns.
label_cols = {}
all_cols = []
for method_name, method_scores in score_cols.items():
    adata, predicted_col = label_assignment_from_scores(
        adata,
        method_name,
        method_scores,
        include_undefined=False,
    )
    label_cols[method_name] = predicted_col
    all_cols.extend(method_scores + [predicted_col])

Visualizing results

In [15]:
### UMAP
# Disabled UMAP overview of all score/label columns plus the annotation columns.
# NOTE(review): `sc` (scanpy) is not imported above; import it before re-enabling these lines.
# fig = sc.pl.umap(adata, color=all_cols + [sample_col, y_true_col, 'celltype.l1', 'celltype.l3'],
#                  ncols=len(signatures) + 1, return_fig=True)
# save_close_or_show(fig, SAVE, storing_path / 'umap.pdf')

In [16]:
# Prepare the score columns together with the ground-truth labels for plotting
# (presumably a long-format table, judging by the name -- confirm in the helper).
df_melted = prepare_data_for_violin_plot(adata, y_true_col, score_cols)

In [17]:
### Combined violin plots
# One figure for all scoring methods; the hue order follows order_signatures.
fig = get_violin_all_methods(
    df_melted,
    y_true_col,
    height=1.95,
    aspect=1.1,
    hue_order=order_signatures,
    textwrap_width=6,
)
save_close_or_show(fig, SAVE, storing_path / "violin_all_methods.pdf")

Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/cd8_subtypes/signatures_with_overlapping/violin_all_methods.pdf.


In [18]:
### Single violin plots
# Disabled: one violin plot per scoring method.
# NOTE(review): `get_violin` is not among the names imported above; import it before re-enabling.

# for method_name, method_scores in score_cols.items():
#     df = adata.obs.loc[:, method_scores + [y_true_col]]
#     fig = get_violin(df, method_scores, y_true_col)
#     plt.title(f"{method_name}")
#     save_close_or_show(fig, SAVE, storing_path / f'violin_{method_name}.pdf')

Computing label assignment performance

In [19]:
# Evaluate every scoring method:
#   1. compare its assigned labels with the ground truth (confusion matrix, balanced accuracy, F1),
#   2. collect the values returned by get_information_from_scores for `nfold` folds and summarise
#      them by mean and standard deviation,
#   3. plot the confusion matrix and hand the figure to save_close_or_show.
nfold = 10
metrics = defaultdict(dict)

for method_name, method_scores in score_cols.items():
    conf_mat, bal_acc, f1_val = get_lbl_assignment_performance(
        adata,
        y_true_col=y_true_col,
        y_pred_col=label_cols[method_name],
        label_names=order_signatures,
    )

    cv_scores = get_information_from_scores(
        adata,
        y_true_col=y_true_col,
        scores=method_scores,
        nfold=nfold,
    )
    cv_mean = np.mean(cv_scores)
    cv_std = np.std(cv_scores)

    # Key order is kept as-is: it determines the row order of the metrics table built later.
    metrics[method_name] = {
        'conf_mat': conf_mat,
        'balanced_accuracy': bal_acc,
        'f1_score': f1_val,
        f'logreg_balanced_accuracy_{nfold}cv': cv_mean,
        f'logreg_balanced_accuracy_{nfold}cv_std': cv_std,
    }

    fig = plot_confusion_matrix(
        conf_mat,
        order_signatures,
        method_name,
        figsize=(2.66, 2.66),
        textwrap_width=6,
        xrotation=45,
        cbar=False,
    )
    save_close_or_show(fig, SAVE, storing_path / f'conf_mat_{method_name}.pdf')

Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/cd8_subtypes/signatures_with_overlapping/conf_mat_ANS.pdf.
Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/cd8_subtypes/signatures_with_overlapping/conf_mat_Seurat.pdf.
Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/cd8_subtypes/signatures_with_overlapping/conf_mat_Seurat_AG.pdf.
Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/cd8_subtypes/signatures_with_overlapping/conf_mat_Seurat_LVG.pdf.
Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/cd8_subtypes/signatures_with_overlapping/conf_mat_Scanpy.pdf.
Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/cd8_subtypes/signatures_with_overlapping/conf_mat_Jasmine_LH.pdf.
Saved figure at /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/cd8_subtypes/signatures_with_overlappin

In [20]:
# Dict of dicts -> DataFrame: one column per scoring method, one row per metric name.
metrics_df = pd.DataFrame(metrics)

Saving performance metrics

In [21]:
# Write the metrics table next to the figures, only when saving is enabled.
# NOTE(review): the 'conf_mat' row holds whole confusion-matrix objects; in a CSV they end up as
# their string representation -- confirm this is sufficient for downstream use.
if SAVE:
    metrics_df.to_csv(storing_path / 'metrics.csv')
    print(f"Saved metrics to {storing_path / 'metrics.csv'}.")

Saved metrics to /Users/lciernik/Documents/TUB/projects/ans_scoring/results/citeseq/cd8_subtypes/signatures_with_overlapping/metrics.csv.
