# Comparison scores computed with Python or R: UCell, Jasmine, Seurat
The aim of this notebook is to compare the behaviour of the Python and R implementations of the UCell, Jasmine and Seurat gene signature scoring methods.
We score CRC, ESCC and LUAD with a malignant-cell signature of length 100, i.e., the 100 genes with the smallest adjusted p-value and log2FC > 2. See `scoring_ith_ucell_jasmine_seurat.R` (in the same folder) for details on scoring with R. 

In [None]:
import sys
import scanpy as sc 
import random
import glob
import os
import scipy as sp
import csv

import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

from signaturescoring import score_signature

sys.path.append('../..')
from data.load_data import load_datasets, load_dgex_genes_for_mal_cells
from data.constants import BASE_PATH_EXPERIMENTS, BASE_PATH_DATA

# Scanpy logging verbosity.
# NOTE(review): level 2 presumably includes the `sc.logging.info` messages
# emitted in `score_data` — confirm against the scanpy docs.
sc.settings.verbosity = 2

# Suffix appended to the file names of the plots saved below; the LUAD cells
# are commented out throughout this notebook.
appendix = '_wo_luad'

### Store CRC, ESCC and LUAD datasets as SingleCellExperiments 

In [None]:
# import scanpy
# import anndata2ri
# anndata2ri.activate()
# %load_ext rpy2.ipython

In [None]:
# crc_data = load_datasets('crc')

In [None]:
# %%R -i crc_data
# saveRDS(crc_data, os.path.join(BASE_PATH_DATA, "preprocessed_seurat/pp_crc.rds"))

In [None]:
# del crc_data

In [None]:
# escc_data = load_datasets('escc')

In [None]:
# %%R -i escc_data
# saveRDS(escc_data, os.path.join(BASE_PATH_DATA, "preprocessed_seurat/pp_escc.rds"))

In [None]:
# del escc_data

In [None]:
# luad_data = load_datasets('luad')

In [None]:
# %%R -i luad_data
# saveRDS(luad_data, os.path.join(BASE_PATH_DATA, "preprocessed_seurat/pp_luad.rds"))

In [None]:
# del luad_data

### Score CRC, ESCC and LUAD for malignant signature with 100 genes

In [None]:
# Configuration of the Python scoring runs executed by `score_data`.
# Each entry holds the `scoring_method` name handed to `score_signature` and
# the keyword arguments (`sc_params`) forwarded to it. The `score_name` values
# match the `*_Python` columns that the plotting cells read from `.obs`.
SCORING_METHODS = [
    {
       "scoring_method": "adjusted_neighborhood_scoring",
        "sc_params": {
            "ctrl_size": 100,
            "score_name": "ANS_Python",
        }, 
    },
    {
        "scoring_method": "seurat_scoring",
        "sc_params": {
            "ctrl_size": 100,
            "n_bins": 25,
            "score_name": "Seurat_Python",
        },
    },
    {
        "scoring_method": "scanpy_scoring",
        "sc_params": {
            "ctrl_size": 100,
            "n_bins": 25,
            "score_name": "Scanpy_Python",
        },
    },

    # Jasmine is run twice, once per `score_method` variant.
    {
        "scoring_method": "jasmine_scoring",
        "sc_params": {
            "score_method": 'likelihood',
            "score_name": "Jasmine_LH_Python",
        },
    },
    {
        "scoring_method": "jasmine_scoring",
        "sc_params": {
            "score_method": 'oddsratio',
            "score_name": "Jasmine_OR_Python",
        },
    },
    {
        "scoring_method": "ucell_scoring",
        "sc_params": {
            "score_name": "UCell_Python",
            "maxRank": 1500,
        },
    },
]

In [None]:
def load_adata_gene_list(dataset, sig_len=100):
    """Load a dataset together with its malignant-cell gene signature.

    The table returned by ``load_dgex_genes_for_mal_cells`` is ordered by
    ascending ``padj`` and, within ties, descending ``log2FoldChange``. The
    first ``sig_len`` entries of its ``genes`` column form the signature.

    Parameters
    ----------
    dataset : name passed to ``load_datasets`` and
        ``load_dgex_genes_for_mal_cells`` (e.g. ``'crc'``).
    sig_len : number of genes to keep in the signature.

    Returns
    -------
    tuple ``(adata, gene_list)``.
    """
    adata = load_datasets(dataset)
    dgex = load_dgex_genes_for_mal_cells(dataset)
    ranked = dgex.sort_values(
        by=['padj', 'log2FoldChange'],
        ascending=[True, False],
    )
    top_rows = ranked[0:sig_len]
    signature = top_rows['genes'].tolist()
    return adata, signature
    

def score_data(adata, gene_list):
    """Score ``adata`` for ``gene_list`` with every entry of ``SCORING_METHODS``.

    For each configured method the name is logged through scanpy's logger,
    Python's ``random`` module is re-seeded with 123 (the seed also used in
    the R script) and ``score_signature`` is called with the method's
    ``sc_params`` as keyword arguments.
    """
    for config in SCORING_METHODS:
        method_name = config['scoring_method']
        sc.logging.info(f'Scoring with {method_name}')
        method_kwargs = config['sc_params']
        # Re-seed before every method so each run starts from the same state.
        random.seed(123)
        score_signature(
            method=method_name,
            adata=adata,
            gene_list=gene_list,
            **method_kwargs,
        )

In [None]:
# Load each dataset together with its 100-gene signature (default `sig_len`).
crc_data, crc_gene_list = load_adata_gene_list('crc')
escc_data, escc_gene_list = load_adata_gene_list('escc')
# luad_data, luad_gene_list = load_adata_gene_list('luad')

In [None]:
# Run every method in SCORING_METHODS on CRC.
# NOTE(review): the scores are presumably written to `crc_data.obs` under each `score_name` — confirm.
score_data(crc_data, crc_gene_list)

In [None]:
# Run every method in SCORING_METHODS on ESCC.
# NOTE(review): the scores are presumably written to `escc_data.obs` under each `score_name` — confirm.
score_data(escc_data, escc_gene_list)

In [None]:
# score_data(luad_data, luad_gene_list)

### Import scores computed with R methods

In [None]:
# Root folder of this experiment: the R score CSVs are read from
# `<base_path>/<dataset>/` and the plots are written to `<base_path>/plots/`.
base_path = os.path.join(BASE_PATH_EXPERIMENTS, "construction_scoring_methods")

In [None]:
def get_name(dataset, name):
    """Translate the file name of an R score table into its column label.

    Parameters
    ----------
    dataset : dataset prefix of the file name (e.g. ``'crc'``).
    name : base name of the CSV file, e.g. ``'crc_ucell_scores.csv'``.

    Returns
    -------
    The label used for that method's R scores (e.g. ``'UCell_R'``).

    Raises
    ------
    KeyError if ``name`` is not one of the known score files of ``dataset``.
    """
    file_names = (
        f'{dataset}_ans_scores.csv',
        f'{dataset}_addmodulescore_scores.csv',
        f'{dataset}_jas_lh_scores.csv',
        f'{dataset}_jas_or_scores.csv',
        f'{dataset}_ucell_scores.csv',
    )
    labels = (
        'ANS_R',
        'AddModuleScore_R',
        'Jasmine_LH_R',
        'Jasmine_OR_R',
        'UCell_R',
    )
    return dict(zip(file_names, labels))[name]
    
    

def get_scores(dataset, base_path):
    """Collect all R score tables of one dataset into a single DataFrame.

    Every ``*.csv`` file in ``<base_path>/<dataset>`` is read in sorted path
    order. Its two columns are renamed to ``cell_names`` and the label given
    by ``get_name``; ``cell_names`` then becomes the (unnamed) index. The
    per-file tables are concatenated column-wise.

    Returns
    -------
    pandas.DataFrame with one column per score file.
    """
    pattern = os.path.join(base_path, dataset ,'*.csv')
    tables = []
    for csv_path in sorted(glob.glob(pattern)):
        base_name = os.path.basename(csv_path)
        table = pd.read_csv(csv_path)
        table.columns = ['cell_names', get_name(dataset, base_name)]
        table = table.set_index('cell_names')
        # Drop the index name so it does not show up after concatenation.
        table.index.name = None
        tables.append(table)
    return pd.concat(tables, axis=1)


In [None]:
## CRC 
# Read the R scores for CRC and add them as columns of `crc_data.obs`.
# NOTE(review): this relies on pandas aligning the assignment on the cell-name
# index; cells missing from the R output would end up as NaN — confirm.
crc_R_scores = get_scores('crc', base_path)
crc_data.obs[crc_R_scores.columns] = crc_R_scores

In [None]:
## ESCC 
# Read the R scores for ESCC and add them as columns of `escc_data.obs`.
# NOTE(review): this relies on pandas aligning the assignment on the cell-name
# index; cells missing from the R output would end up as NaN — confirm.
escc_R_scores = get_scores('escc', base_path)
escc_data.obs[escc_R_scores.columns] = escc_R_scores

In [None]:
# ## LUAD 
# luad_R_scores = get_scores('luad', base_path)
# luad_data.obs[luad_R_scores.columns] = luad_R_scores

### Plotting function

In [None]:
def plot_scatter(xaxis, yaxis):
    """Scatter two score columns against each other, one panel per dataset.

    Reads the module-level ``crc_data`` and ``escc_data`` objects, takes the
    columns ``malignant_key``, ``xaxis`` and ``yaxis`` from their ``.obs``
    tables and draws a seaborn ``relplot`` coloured by ``malignant_key``.
    Each panel is annotated with the Pearson correlation of the two columns
    within that dataset.

    Parameters
    ----------
    xaxis : name of the ``.obs`` column shown on the x-axis.
    yaxis : name of the ``.obs`` column shown on the y-axis.

    Returns
    -------
    The matplotlib figure holding the facet grid.
    """
    cols = ['malignant_key', xaxis, yaxis]
    # LUAD is currently excluded; add ('LUAD', luad_data) to include it again.
    datasets = [('CRC', crc_data), ('ESCC', escc_data)]

    frames = []
    for label, adata in datasets:
        frame = adata.obs[cols].copy()
        frame['dataset'] = label
        frames.append(frame)

    df = pd.concat(frames, axis=0)
    # Fix the panel order explicitly: the annotations below are matched to
    # the panels by position.
    g = sns.relplot(
        data=df, x=xaxis, y=yaxis,
        col="dataset", col_order=[label for label, _ in datasets],
        hue="malignant_key",
        kind="scatter"
    )

    for ax, frame in zip(g.axes[0], frames):
        r, _ = sp.stats.pearsonr(x=frame[xaxis], y=frame[yaxis])
        # Use ax.text (not plt.text) so the annotation belongs to the panel
        # it describes rather than to whichever axes happens to be current.
        ax.text(.05, .8, "Pearson's r ={:.3f}".format(r), transform=ax.transAxes)

    return g.fig

In [None]:
# (R column, Python column) pairs to compare; AddModuleScore_R is compared
# against both the Seurat and the Scanpy Python scores.
# NOTE: the name `pairs` is rebound to a flat column list in the
# "Plot all together" section, so run the per-pair plotting cell before that.
pairs = [['UCell_R', 'UCell_Python'],
['Jasmine_LH_R', 'Jasmine_LH_Python'],
['Jasmine_OR_R', 'Jasmine_OR_Python'],
['AddModuleScore_R', 'Seurat_Python'],
['AddModuleScore_R', 'Scanpy_Python'],
['ANS_R', 'ANS_Python'],]

### Plot individual pairs

In [None]:
# Font settings shared by all figures ('pdf.fonttype': 42 embeds TrueType fonts).
plt.rcParams.update({'pdf.fonttype':42, 'font.family':'sans-serif', 'font.sans-serif':'Arial', 'font.size':16})

# Make sure the output folder exists before the first savefig call.
plot_dir = os.path.join(base_path, 'plots')
os.makedirs(plot_dir, exist_ok=True)

for lbls in pairs:
    fig = plot_scatter(*lbls)
    fig.suptitle(f'{lbls[0]} vs. {lbls[1]}', fontsize=18, y=1.075)
    # Strip the language suffix ("_R" / "_Python") to build the file name.
    name_1 = lbls[0].rsplit('_', 1)[0]
    name_2 = lbls[1].rsplit('_', 1)[0]
    # The suptitle sits above the canvas (y > 1); a tight bounding box keeps
    # it in the saved image instead of cropping it.
    fig.savefig(os.path.join(plot_dir, f'scatter_{name_1}_vs_{name_2}{appendix}.png'),
                dpi=300, bbox_inches='tight')
    # plt.show() takes no figure argument; it displays all open figures.
    plt.show()

### Plot all together

In [None]:
# Columns pulled from `.obs` for the combined plot. 'Scanpy_Python' is not
# included here. NOTE: this rebinds `pairs`, which the per-pair plotting cell
# above expects to be a list of [R, Python] pairs.
pairs = ['dataset', 'UCell_R', 'UCell_Python','Jasmine_LH_R', 'Jasmine_LH_Python','Jasmine_OR_R', 'Jasmine_OR_Python','AddModuleScore_R', 'Seurat_Python','ANS_R', 'ANS_Python']

In [None]:
# Tag every cell with its dataset so the combined table can be faceted by it.
crc_data.obs['dataset'] = 'CRC'
escc_data.obs['dataset'] = 'ESCC'
# luad_data.obs['dataset'] = 'LUAD'

# df = pd.concat([crc_data.obs[pairs], 
#                 escc_data.obs[pairs], 
#                 luad_data.obs[pairs]], axis=0)
# Stack the selected columns of both datasets row-wise.
df = pd.concat([crc_data.obs[pairs], 
                escc_data.obs[pairs]], axis=0)
# Expose the R AddModuleScore scores as `Seurat_R` so that splitting the
# column names on the last '_' (done below) pairs them with `Seurat_Python`.
df['Seurat_R'] = df['AddModuleScore_R'].copy()

In [None]:
# The original column is no longer needed once it has been copied to `Seurat_R`.
df.drop(columns='AddModuleScore_R', inplace=True)

In [None]:
# Long format: one row per (cell, dataset, score column). `reset_index`
# exposes the cell names as the column 'index'.
df = df.reset_index().melt(id_vars=['index', 'dataset'],
                     var_name='scoring_method',
                     value_name='score')

In [None]:
# Split e.g. 'Jasmine_LH_Python' on the last '_' into the method
# ('Jasmine_LH') and the language ('Python').
df['method'] = df.scoring_method.apply(lambda x: x.rsplit('_', 1)[0])
df['language'] = df.scoring_method.apply(lambda x: x.rsplit('_', 1)[1])

In [None]:
# Wide format again: one row per (cell, dataset, method) with one score
# column per language.
df = df.pivot(index=['index', 'dataset', 'method'],
         columns=['language'],
         values=['score']).reset_index()

In [None]:
# Replace the column labels produced by the pivot with flat names.
# NOTE(review): assumes the two language columns come out ordered as
# Python, R — confirm.
df.columns = ['sample_id', 'dataset', 'method', 'Python', 'R']
df.index.name = None

In [None]:
# row_order=['CRC', 'ESCC', 'LUAD']
# Layout of the first overview figure: datasets as rows, methods as columns.
# row_order=['CRC', 'ESCC', 'LUAD']
row_order=['CRC', 'ESCC']
col_order=['ANS', 'Seurat', 'Jasmine_LH', 'Jasmine_OR', 'UCell']
# Names of the `df` columns used for faceting.
col="method"
row="dataset"

In [None]:
# Switch seaborn to the plain 'white' style for the overview figures.
sns.set_theme(style='white')

In [None]:
# One scatter panel per (row, col) facet, R score on x and Python score on y.
# Axes are not shared between panels.
g = sns.relplot(data=df, x="R", y="Python",col=col, row=row, 
                row_order=row_order,
                col_order=col_order,
                facet_kws={'sharey': False, 'sharex': False},
                height=2, 
                aspect=1,
                s=30
               )
# Hide ticks and per-panel axis labels; figure-level labels are added below.
g.set(xticks=[], yticks=[], xlabel=None, ylabel=None)
# Remove the default titles
g.set_titles("")

# Set new titles for the columns
for ax, title in zip(g.axes[0,:], col_order):
    ax.set_title(title, fontsize=18)

# Set new titles for the rows (as y-labels on the right of the last column)
for ax, title in zip(g.axes[:,-1], row_order):
    ax.set_ylabel(title, fontsize=18, rotation=0)
    ax.yaxis.set_label_position("right")
    

# Figure-level axis labels shared by all panels.
g.fig.supxlabel("R", y=0.025, fontsize=18)
g.fig.supylabel("Python", x=0.05, fontsize=18)

    
# Annotate every panel with the Pearson correlation between the Python and R
# scores of that facet. The axes are matched to the facet values by position,
# which relies on the row_order/col_order passed to relplot above.
for curr_row,row_name in zip(g.axes,row_order):
    for ax, col_name in zip(curr_row, col_order):
        val = df[(df[row]==row_name)&(df[col]==col_name)]
        r, p = sp.stats.pearsonr(x=val['Python'], y=val['R'])
        # Add the correlation coefficient as text annotation
        ax.annotate(f"R = {r:.3f}", xy=(0.1, 0.8), xycoords='axes fraction',fontsize=14)

g.fig.tight_layout()

# g.fig.savefig(os.path.join(base_path, 'plots', f'all_methods{appendix}.svg'))
g.fig.savefig(os.path.join(base_path, 'plots', f'all_methods{appendix}.png'), dpi=600)

In [None]:
# Layout of the second (vertical) overview figure: rows and columns are
# swapped relative to the first one — methods as rows, datasets as columns.
col_order=['CRC', 'ESCC']
row_order=['ANS', 'Seurat', 'Jasmine_LH', 'Jasmine_OR', 'UCell']
row="method"
col="dataset"

In [None]:
# Collects one dict of correlation statistics per panel in the next cell.
# NOTE: re-run this cell before re-running the plotting cell, otherwise the
# rows are appended a second time.
stat_df = []

In [None]:
import numpy as np 
# Same scatter grid as above with the vertical layout (methods as rows,
# datasets as columns). Axes are not shared between panels.
g = sns.relplot(data=df, x="R", y="Python",col=col, row=row, 
                row_order=row_order,
                col_order=col_order,
                facet_kws={'sharey': False, 'sharex': False},
                height=1.5, 
                aspect=1.5,
                s=30
               )
# Hide ticks and per-panel axis labels; figure-level labels are added below.
g.set(xticks=[], yticks=[], xlabel=None, ylabel=None)
# Remove the default titles
g.set_titles("")

# Set new titles for the columns
for ax, title in zip(g.axes[0,:], col_order):
    ax.set_title(title, fontsize=18)

# Set new titles for the rows (as y-labels on the right of the last column)
for ax, title in zip(g.axes[:,-1], row_order):
    ax.set_ylabel(title, fontsize=18, rotation=0)
    ax.yaxis.set_label_position("right")
    
# Figure-level axis labels shared by all panels.
g.fig.supxlabel("R", y=0.025, fontsize=18)
g.fig.supylabel("Python", x=0.05, fontsize=18)
   
# Annotate every panel with its Pearson correlation and record the statistics
# of each (method, dataset) combination in `stat_df`.
for curr_row,row_name in zip(g.axes,row_order):
    for ax, col_name in zip(curr_row, col_order):
        val = df[(df[row]==row_name)&(df[col]==col_name)]
        r, p = sp.stats.pearsonr(x=val['Python'], y=val['R'])
        # Add the correlation coefficient as text annotation
        ax.annotate(f"R = {r:.3f}", xy=(0.1, 0.8), xycoords='axes fraction',fontsize=14)

        # Sample size and degrees of freedom
        n = len(val['Python'])
        df_stat = n - 2
        
        # Calculate 95% confidence intervals via the Fisher z-transformation:
        # transform r with arctanh, add/subtract 1.96 standard errors
        # (se = 1/sqrt(n-3)) and transform back with tanh.
        z = np.arctanh(r)
        se = 1/np.sqrt(n-3)
        ci_lower = np.tanh(z - 1.96*se)
        ci_upper = np.tanh(z + 1.96*se)

        stat_df.append(
            {
            'Scoring method': row_name,
            'Dataset': col_name,
            'r': r,
            'p': p,
            'df': df_stat,
            'ci_lower': ci_lower,
            'ci_upper': ci_upper,
        }
        )

g.fig.tight_layout()

g.fig.savefig(os.path.join(base_path, 'plots', f'all_methods{appendix}_vertical.png'), dpi=600)

In [None]:
# Turn the collected per-panel dicts into a table. `stat_df` is a DataFrame
# from here on, no longer a list.
stat_df = pd.DataFrame(stat_df)

In [None]:
# Store the statistics next to the plots.
# NOTE(review): unlike the plot file names, this one does not carry `appendix` — confirm that is intended.
stat_df.to_csv(os.path.join(base_path,'plots', 'statistics.csv'))

### Find subsets of cells in CRC, ESCC and LUAD whose UCell scores differ between R and Python

In [None]:
def ucell_diff(df, nr_samples=100, ratio=0.75):
    """Sample cells whose UCell scores agree or disagree between R and Python.

    Cells count as "same" when ``|UCell_R - UCell_Python|`` is exactly 0 and
    as "not same" when it exceeds 0.03; cells in between are never sampled.

    Parameters
    ----------
    df : table with the columns ``UCell_R`` and ``UCell_Python``.
    nr_samples : total number of cells to return.
    ratio : fraction of ``nr_samples`` drawn from the "not same" cells; the
        remainder is drawn from the "same" cells.

    Returns
    -------
    list of index labels: the sampled "same" cells followed by the sampled
    "not same" cells. ``random`` is seeded with 123, so the result is
    reproducible.

    Raises
    ------
    ValueError (from ``random.sample``) if either group holds fewer cells
    than requested.
    """
    abs_diff = (df.UCell_R - df.UCell_Python).abs()
    cells_same = abs_diff[abs_diff == 0].index.tolist()
    cells_not_same = abs_diff[abs_diff > 0.03].index.tolist()
    # Derive the second count by subtraction. Truncating both products with
    # int() loses a sample to floating-point error (10 * (1 - 0.9) is
    # 0.9999999999999998 and truncates to 0).
    nr_not_same = round(nr_samples * ratio)
    nr_same = nr_samples - nr_not_same
    random.seed(123)
    return random.sample(cells_same, nr_same) + random.sample(cells_not_same, nr_not_same)

def store_list_cells(dataset, list_cells, out_dir=None):
    """Write a list of cell names to ``<out_dir>/<dataset>_sample_cells.csv``.

    The cells are written as a single CSV row, and the target path is printed
    afterwards.

    Parameters
    ----------
    dataset : dataset name used as prefix of the file name (e.g. ``'crc'``).
    list_cells : iterable of cell names to store.
    out_dir : target folder; defaults to the module-level ``base_path``.
    """
    if out_dir is None:
        out_dir = base_path
    curr_path = os.path.join(out_dir, f'{dataset}_sample_cells.csv')
    with open(curr_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(list_cells)
    print(f'Storing sample cells in {curr_path}')

In [None]:
# Sample CRC cells with the defaults (100 cells, 75% with differing UCell scores).
crc_sample = ucell_diff(crc_data.obs)

In [None]:
# Sample ESCC cells with the defaults (100 cells, 75% with differing UCell scores).
escc_sample = ucell_diff(escc_data.obs)

In [None]:
# luad_sample = ucell_diff(luad_data.obs)