In [None]:
import os
import sys
import warnings
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc

sys.path.append('..')

from score_with_all_methods import (
    score_signatures_with_all_methods,
    label_assignment_from_scores,
    get_lbl_assignment_performance,
    get_information_from_scores,
    remove_overlapping_signature_genes,
    get_violin_all_methods,
    prepare_data_for_violin_plot,
    save_close_or_show,
    plot_confusion_matrix
)

# Ignore every FutureWarning raised from here on.
warnings.simplefilter(category=FutureWarning, action='ignore')

# Matplotlib defaults shared by all figures created in this notebook.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 10,
})

# Kim et al. lung dataset
Kim, N., Kim, H.K., Lee, K. et al. Single-cell RNA sequencing demonstrates the molecular and cellular reprogramming of metastatic lung adenocarcinoma. Nat Commun 11, 2285 (2020). https://doi.org/10.1038/s41467-020-16164-1

Input/output paths and global settings

In [None]:
# Root folder of the input data. The cells below read from its `data/`, `labels/`
# and `annotations/` sub-folders. Set the ANS_DATA_PATH environment variable to
# point to another location; the default is the original local path.
base_data_path = Path(os.environ.get(
    'ANS_DATA_PATH',
    '/Users/lciernik/Documents/TUB/projects/ans_scoring/data/data_from_florian/',
))

# When True, the signatures are passed through `remove_overlapping_signature_genes`
# before scoring.
remove_overlapping_genes = True

# SAVE is forwarded to `save_close_or_show` and gates the writing of `metrics.csv`.
SAVE = False

# Root folder for this dataset's results. Set the ANS_LUNG_RESULTS_PATH environment
# variable to point to another location; the default is the original local path.
storing_path = Path(os.environ.get(
    'ANS_LUNG_RESULTS_PATH',
    '/Users/lciernik/Documents/TUB/projects/ans_scoring/results/cancer_datasets/lung',
))

# Keep the results of the two signature variants in separate sub-folders.
storing_path = storing_path / (
    'signatures_without_overlapping'
    if remove_overlapping_genes
    else 'signatures_with_overlapping'
)

# Only touch the file system when results are actually going to be written.
if SAVE:
    storing_path.mkdir(parents=True, exist_ok=True)

Loading data

In [None]:
# Read the dataset from `<base_data_path>/data/kim_lung.h5ad`.
adata = sc.read_h5ad(base_data_path / 'data' / 'kim_lung.h5ad')

In [None]:
# Identifiers of all observations (the index of `adata.obs`), in their current order.
samples_in_adata = adata.obs.index.to_list()

In [None]:
# Load the cell annotation table, replace '_' by '-' in its 'Index' column,
# use that column as row index and keep exactly the rows listed in
# `samples_in_adata` (in that order; a missing identifier raises a KeyError).
cell_labels = (
    pd.read_table(base_data_path / "labels/GSE131907_Lung_Cancer_cell_annotation.txt")
    .assign(Index=lambda tbl: tbl['Index'].str.replace('_', '-'))
    .set_index('Index')
    .loc[samples_in_adata]
)

In [None]:
# Name of the annotation column used as ground-truth label throughout the notebook.
y_true_col = 'Cell_subtype'
# Presumably the obs column identifying the sample of origin; in the visible part of
# this notebook it is only referenced by commented-out code.
sample_col = 'sample'

In [None]:
# Store the lower-cased ground-truth labels in `adata.obs` (pandas aligns the
# assigned Series on the observation index).
adata.obs[y_true_col] = cell_labels[y_true_col].str.lower()

In [None]:
# Keep only the observations whose label starts with 'ts'; observations without a
# label (NaN) are dropped because of `na=False`.
is_ts_cell = adata.obs[y_true_col].str.startswith('ts', na=False).to_numpy(dtype=bool)

# Subset with the boolean mask directly (no round trip through observation names)
# and materialise the result with `.copy()`: plain indexing only returns a view,
# and the later cells add new columns to `adata.obs`, which would otherwise force
# an implicit copy of the view (ImplicitModificationWarning).
adata = adata[is_ts_cell].copy()

In [None]:
# Show how many observations remain per ground-truth label.
adata.obs[y_true_col].value_counts()

Loading signatures

In [None]:
# Each column of `kim_3.csv` is one signature: map the column name to the sorted
# list of its non-missing entries.
signatures = {
    sig_name: sorted(sig_column.dropna().tolist())
    for sig_name, sig_column in pd.read_csv(base_data_path / 'annotations' / 'kim_3.csv').items()
}

In [None]:
# Optional preprocessing, controlled by the `remove_overlapping_genes` flag:
# replace the signatures by the result of `remove_overlapping_signature_genes`.
if remove_overlapping_genes:
    signatures = remove_overlapping_signature_genes(signatures)

In [None]:
# Fixed signature order, reused as hue order of the violin plot and as label order
# for the label-assignment metrics and confusion matrices.
order_signatures = list(signatures.keys())

Computing dimensionality reduction

In [None]:
# sc.tl.pca(adata)
# sce.pp.harmony_integrate(adata, sample_col)
# sc.pp.neighbors(adata, use_rep='X_pca_harmony')
# sc.tl.umap(adata)

Scoring signatures

In [None]:
# Score all signatures with every method provided by the helper module.
# `score_cols` is iterated below as a mapping: method name -> list of score column
# names (presumably columns added to `adata.obs`; confirm in the helper).
score_cols, adata = score_signatures_with_all_methods(adata, signatures)

Label assignment

In [None]:
# Derive one predicted-label column per scoring method.
#   label_cols: method name -> name of the label column returned by the helper
#   all_cols:   flat list with, per method, its score columns followed by its label column
label_cols = {}
all_cols = []
for method_name, method_scores in score_cols.items():
    adata, new_lbl_col = label_assignment_from_scores(
        adata,
        method_name,
        method_scores,
        include_undefined=False,
    )
    label_cols[method_name] = new_lbl_col
    all_cols.extend(method_scores + [new_lbl_col])

Visualizing results

In [None]:
# fig = sc.pl.umap(adata, color=all_cols + [sample_col, y_true_col], ncols=len(signatures) + 1, return_fig=True)
# if SAVE:
#     fig.savefig(storing_path / 'umap.png', bbox_inches='tight')
#     fig.savefig(storing_path / 'umap.pdf', bbox_inches='tight')
#     plt.close(fig)
#     print(f"Saved UMAP.")
# else:
#     plt.show(fig)

In [None]:
# Build the plotting table for the violin plots from the scores of all methods
# (presumably a long-format frame; it exposes a 'Signature' column).
df_melted = prepare_data_for_violin_plot(adata, y_true_col, score_cols)

In [None]:
# Sanity check: list the distinct values of the 'Signature' column.
df_melted['Signature'].unique()

In [None]:
# Combined violin plots: a single figure built from the scores of all methods,
# with the signatures drawn in the fixed `order_signatures` order.
fig = get_violin_all_methods(
    df_melted,
    y_true_col,
    hue_order=order_signatures,
    height=1.95,
    aspect=1.15,
    sharey=False,
    wspace=0.15,
    col_wrap=4,
    legend_bbox_anchor=(1.13, 1),
)

# Hand the figure to the shared helper together with the SAVE flag and the target
# file (presumably it saves and closes when SAVE is True and shows it otherwise).
save_close_or_show(fig, SAVE, storing_path / "violin_all_methods.pdf")

In [None]:
# for method_name, method_scores in score_cols.items():
#     df = adata.obs.loc[:, method_scores + [y_true_col]]
#     fig = get_violin(df, method_scores, y_true_col)
#     plt.title(f"{method_name}")
#     if SAVE:
#         fig.savefig(storing_path / f'violin_{method_name}.png', bbox_inches='tight')
#         fig.savefig(storing_path / f'violin_{method_name}.pdf', bbox_inches='tight')
#         plt.close(fig)
#         print(f"Saved violin plot for {method_name}.")
#     else:
#         plt.show(fig)

Computing label assignment performance

In [None]:
# Evaluate every scoring method and collect its metrics under the method name.
nfold = 10  # number of folds handed to `get_information_from_scores`
metrics = defaultdict(dict)

for method_name, method_scores in score_cols.items():
    lbl_col = label_cols[method_name]

    # Compare the predicted labels of this method with the ground truth.
    conf_mat, bal_acc, f1_val = get_lbl_assignment_performance(
        adata,
        y_true_col=y_true_col,
        y_pred_col=lbl_col,
        label_names=order_signatures,
    )

    # Cross-validated evaluation on the raw scores; judging by the metric keys below
    # this is presumably one balanced accuracy per fold of a logistic regression.
    scores = get_information_from_scores(
        adata,
        y_true_col=y_true_col,
        scores=method_scores,
        nfold=nfold,
    )

    # The defaultdict supplies an empty dict for a new method name; fill it in.
    metrics[method_name].update({
        'conf_mat': conf_mat,
        'balanced_accuracy': bal_acc,
        'f1_score': f1_val,
        f'logreg_balanced_accuracy_{nfold}cv': np.mean(scores),
        f'logreg_balanced_accuracy_{nfold}cv_std': np.std(scores),
    })

    # Confusion matrix plot for this method.
    fig = plot_confusion_matrix(conf_mat, order_signatures, method_name)
    save_close_or_show(fig, SAVE, storing_path / f'conf_mat_{method_name}.pdf')

In [None]:
# Dict of dicts -> table with one column per scoring method and one row per metric.
# The 'conf_mat' row holds the confusion-matrix objects themselves.
metrics_df = pd.DataFrame(metrics)

Saving performance metrics

In [None]:
# Write the metrics table to `<storing_path>/metrics.csv`, but only when saving is enabled.
if SAVE:
    metrics_df.to_csv(storing_path / 'metrics.csv')
    print(f"Saved metrics to {storing_path / 'metrics.csv'}.")