In [None]:
import sys
import warnings
from collections import defaultdict
from pathlib import Path

import decoupler as dc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc

sys.path.append('..')

from score_with_all_methods import (
    score_signatures_with_all_methods,
    label_assignment_from_scores,
    get_lbl_assignment_performance,
    get_information_from_scores,
    remove_overlapping_signature_genes,
    get_violin_all_methods,
    prepare_data_for_violin_plot,
    save_close_or_show,
    plot_confusion_matrix
)

# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter('ignore', FutureWarning)

# Global matplotlib settings shared by every figure created in this notebook.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 10,
})

# Ji et al. 2020 skin dataset
Ji AL, Rubin AJ, Thrane K, Jiang S, Reynolds DL, Meyers RM, Guo MG, George BM, Mollbrink A, Bergenstråhle J, Larsson L, Bai Y, Zhu B, Bhaduri A, Meyers JM, Rovira-Clavé X, Hollmig ST, Aasi SZ, Nolan GP, Lundeberg J, Khavari PA. Multimodal Analysis of Composition and Spatial Architecture in Human Squamous Cell Carcinoma. Cell. 2020 Jul 23;182(2):497-514.e22. doi: 10.1016/j.cell.2020.05.039. Epub 2020 Jun 23. Erratum in: Cell. 2020 Sep 17;182(6):1661-1662. doi: 10.1016/j.cell.2020.08.043. PMID: 32579974; PMCID: PMC7391009.

Storing information and global variables

In [None]:
# base_data_path = Path('/Users/lciernik/Documents/TUB/projects/ans_scoring/data/data_from_florian/')

# Toggle: drop genes shared between signatures before scoring (used further down).
remove_overlapping_genes = True

# Toggle: write figures/metrics to disk instead of only showing them.
SAVE = False

# Results live in a sub-folder that encodes the overlap setting chosen above.
storing_path = Path('/Users/lciernik/Documents/TUB/projects/ans_scoring/results/cancer_datasets/skin') / (
    'signatures_without_overlapping' if remove_overlapping_genes else 'signatures_with_overlapping'
)

# Only touch the file system when something will actually be saved.
if SAVE:
    storing_path.mkdir(parents=True, exist_ok=True)

Loading data

In [None]:
def load_skin(
        counts_path='/Users/lciernik/Documents/TUB/projects/ans_scoring/data/raw_data/Ji_et_al_2020/GSE144236_cSCC_counts.txt',
        metadata_path='/Users/lciernik/Documents/TUB/projects/ans_scoring/data/raw_data/Ji_et_al_2020/GSE144236_patient_metadata_new.txt',
):
    """Load the skin count matrix and attach the per-cell metadata.

    Parameters
    ----------
    counts_path : str or path-like
        Tab-separated count table, read with ``sc.read``. The default is the
        location that was previously hard-coded in this function.
    metadata_path : str or path-like
        Tab-separated metadata table, read with ``pd.read_table``. The default
        is the location that was previously hard-coded in this function.

    Returns
    -------
    The transposed data object with every metadata column added to ``.obs``
    and the first two variables removed.
    """
    adata = sc.read(counts_path, delimiter='\t')
    metadata = pd.read_table(metadata_path, delimiter='\t')

    # The table is transposed right after reading so that the file's columns
    # end up as observations.
    adata = adata.transpose()

    # NOTE(review): assigning a DataFrame column-by-column aligns on the index,
    # so this presumably relies on the metadata index matching adata.obs_names
    # - confirm against the metadata file.
    adata.obs[metadata.columns.to_list()] = metadata.copy()

    # Drop the first two variables.
    # NOTE(review): presumably these are annotation rows of the count file
    # rather than genes - confirm.
    adata = adata[:, 2:].copy()
    return adata


def preprocess(adata):
    """Flag QC gene groups, compute QC metrics, filter genes and log-normalize.

    The object passed in is modified in place and also returned. A copy of
    ``adata.X`` taken before normalization is kept in ``adata.layers["counts"]``.
    """
    gene_names = adata.var_names.str
    qc_masks = {
        # mitochondrial genes, "MT-" for human, "Mt-" for mouse
        "mt": gene_names.startswith("MT-"),
        # ribosomal genes
        "ribo": gene_names.startswith(("RPS", "RPL")),
        # hemoglobin genes
        "hb": gene_names.contains("^HB[^(P)]"),
    }
    for flag, mask in qc_masks.items():
        adata.var[flag] = mask

    sc.pp.calculate_qc_metrics(adata, qc_vars=list(qc_masks), log1p=True, inplace=True)
    sc.pp.filter_genes(adata, min_cells=3)
    # sc.pp.scrublet(adata, batch_key="patient")

    # Keep the unnormalized matrix before X is overwritten below.
    adata.layers["counts"] = adata.X.copy()
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)
    return adata

In [None]:
# Load the counts with the metadata attached. Preprocessing is intentionally not
# run here; it is applied further down, after the tumor-cell filter.
adata = load_skin()
print(adata.shape)
# adata = preprocess(adata)
# print(adata.shape)

In [None]:
# Names of the adata.obs columns used throughout the notebook.
# y_true_col: annotation used as the ground-truth label for filtering, plotting and evaluation.
y_true_col = 'level2_celltype'
# sample_col: only referenced by the commented-out integration/UMAP cells in this notebook.
sample_col = 'patient'

In [None]:
# Filter tumor cells
adata = adata[adata.obs[y_true_col].isin(['TSK', 'Tumor_KC_Basal', 'Tumor_KC_Cyc', 'Tumor_KC_Diff'])].copy()
adata.shape

In [None]:
# Number of cells per label (before renaming), ordered by label name.
adata.obs[y_true_col].value_counts().sort_index()

In [None]:
# Spell out the abbreviated 'Tumor_KC_Cyc' label. The column is converted to
# plain strings for the replacement and back to a categorical afterwards.
adata.obs[y_true_col] = (
    adata.obs[y_true_col]
    .astype(str)
    .replace('Tumor_KC_Cyc', 'Tumor KC Cycling')
    .astype('category')
)

In [None]:
# Replace underscores with spaces in every label that occurs in the column,
# using an explicit old-label -> new-label mapping.
adata.obs[y_true_col] = adata.obs[y_true_col].map(
    {val: val.replace('_', ' ') for val in adata.obs[y_true_col].unique()})

In [None]:
# Number of cells per label after renaming, ordered by label name.
adata.obs[y_true_col].value_counts().sort_index()

In [None]:
# Run QC annotation, gene filtering and log-normalization on the tumor-only subset.
adata = preprocess(adata)
adata.shape

Loading signatures

In [None]:
# Read the gene sets from the GMT file. The result is used below as a table
# with 'source' and 'target' columns.
gmt_file = '/Users/lciernik/Documents/TUB/projects/ans_scoring/data/raw_data/Ji_et_al_2020/gene_sets.gmt'
signatures = dc.read_gmt(gmt_file)

In [None]:
# Collapse the 'source'/'target' table into {source: sorted list of unique targets}.
signatures = (
    signatures
    .groupby('source')['target']
    .apply(lambda targets: sorted(targets.unique()))
    .to_dict()
)
print(signatures.keys())

In [None]:
# Optionally pass the signatures through the overlap-removal helper, as
# selected by the `remove_overlapping_genes` switch in the configuration cell.
if remove_overlapping_genes:
    signatures = remove_overlapping_signature_genes(signatures)

In [None]:
# Use spaces instead of underscores in the signature names, mirroring the
# clean-up applied to the cell labels.
signatures = {
    sig_name.replace('_', ' '): genes
    for sig_name, genes in signatures.items()
}
list(signatures)

In [None]:
# Display/evaluation order of the signatures (hue order in the violin plot,
# label order for the confusion matrices).
# NOTE(review): presumably these names must match both the keys of `signatures`
# and the renamed labels in y_true_col (e.g. 'Tumor KC Cycling') - verify.
order_signatures = ['Tumor KC Basal', 'Tumor KC Cycling', 'Tumor KC Diff', 'TSK']

Computing dimensionality reduction

In [None]:
# sc.tl.pca(adata)
# sce.pp.harmony_integrate(adata, sample_col)
# sc.pp.neighbors(adata, use_rep='X_pca_harmony')
# sc.tl.umap(adata)

Scoring signatures

In [None]:
# Score all signatures with every method. `score_cols` is consumed below as a
# mapping from method name to that method's list of score column names.
score_cols, adata = score_signatures_with_all_methods(adata, signatures)

Label assignment

In [None]:
# For each scoring method, derive a predicted-label column from its scores.
# label_cols: method name -> name of the predicted-label column.
# all_cols: flat list of every score column and label column, in method order.
all_cols, label_cols = [], {}
for method_name, method_scores in score_cols.items():
    adata, new_lbl_col = label_assignment_from_scores(
        adata, method_name, method_scores, include_undefined=False
    )
    label_cols[method_name] = new_lbl_col
    all_cols.extend(method_scores)
    all_cols.append(new_lbl_col)

Visualizing results

In [None]:
# fig = sc.pl.umap(adata, color=all_cols + [sample_col, y_true_col], ncols=len(signatures) + 1, return_fig=True)
# if SAVE:
#     fig.savefig(storing_path / 'umap.png', bbox_inches='tight')
#     fig.savefig(storing_path / 'umap.pdf', bbox_inches='tight')
#     plt.close(fig)
#     print(f"Saved UMAP.")
# else:
#     plt.show(fig)

In [None]:
# Build the table consumed by the combined violin plot below.
df_melted = prepare_data_for_violin_plot(adata, y_true_col, score_cols)

In [None]:
# Sanity check: signature names present in the plotting table.
df_melted['Signature'].unique()

In [None]:
### Combined violin plots
fig = get_violin_all_methods(df_melted, y_true_col, 
                             hue_order=order_signatures, 
                             textwrap_width=9, 
                             sharey=False, 
                             height=1.95, aspect=1.75, 
                             wspace=0.125,
                             legend_bbox_anchor=(1.025, 1),
                             fontsizes=dict(title=12, labels=11, ticks=10, legend=11)
                             )
save_close_or_show(fig, SAVE, storing_path / "violin_all_methods.pdf")

In [None]:
# for method_name, method_scores in score_cols.items():
#     df = adata.obs.loc[:, method_scores + [y_true_col]]
#     fig = get_violin(df, method_scores, y_true_col)
#     plt.title(f"{method_name}")
#     if SAVE:
#         fig.savefig(storing_path / f'violin_{method_name}.png', bbox_inches='tight')
#         fig.savefig(storing_path / f'violin_{method_name}.pdf', bbox_inches='tight')
#         plt.close(fig)
#         print(f"Saved violin plot for {method_name}.")
#     else:
#         plt.show(fig)

Computing label assignment performance

In [None]:
# Per scoring method: compare the assigned labels with the annotation, run the
# n-fold evaluation on the raw scores, and plot the confusion matrix.
metrics = defaultdict(dict)
nfold = 10

for method_name, method_scores in score_cols.items():
    lbl_col = label_cols[method_name]

    # Agreement between predicted labels and the ground-truth annotation.
    conf_mat, bal_acc, f1_val = get_lbl_assignment_performance(
        adata,
        y_true_col=y_true_col,
        y_pred_col=lbl_col,
        label_names=order_signatures,
    )

    # One value per fold; summarized below by mean and standard deviation.
    scores = get_information_from_scores(
        adata, y_true_col=y_true_col, scores=method_scores, nfold=nfold
    )

    metrics[method_name] = dict(
        conf_mat=conf_mat,
        balanced_accuracy=bal_acc,
        f1_score=f1_val,
        **{
            f'logreg_balanced_accuracy_{nfold}cv': np.mean(scores),
            f'logreg_balanced_accuracy_{nfold}cv_std': np.std(scores),
        },
    )

    ## Confusion matrix plot
    fig = plot_confusion_matrix(
        conf_mat,
        order_signatures,
        method_name,
        base_size=0.9,
        textwrap_width=8,
        fontsizes=dict(title=12, labels=11, ticks=10, legend=11),
    )
    save_close_or_show(fig, SAVE, storing_path / f'conf_mat_{method_name}.pdf')

In [None]:
# One column per scoring method, one row per metric name.
metrics_df = pd.DataFrame(metrics)

Saving performance metrics

In [None]:
# Write the metrics table next to the figures, only when saving is enabled.
# NOTE(review): the 'conf_mat' row holds whole confusion-matrix objects, which
# to_csv will write as their string representation - confirm that is intended.
if SAVE:
    metrics_df.to_csv(storing_path / 'metrics.csv')
    print(f"Saved metrics to {storing_path / 'metrics.csv'}.")