# Import

In [1]:
import pandas as pd
import numpy as np
from bioreactor.plotting import *
from bioreactor.use_colors import *
from bioreactor.annotation import *
from bioreactor.utils import read_dataset, median_scale
from bioreactor.ssgsea import *
from bioreactor.gsea import *
from bioreactor.expression import  *
from bioreactor.pathway_scoring import *
from bioreactor.survival import * 
from bioreactor.graphs import *

from bioreactor.plotting import *
from bioreactor.use_colors import *
from bioreactor.utils import *
from bioreactor.gsea import *
from bioreactor.mutations import *
from bioreactor.ssgsea import *
from bioreactor.clustering import *
from bioreactor.oncoplot2 import *
from bioreactor.cna import *

from mldeconv.cells_plotting import *
from mldeconv.data_loading import *

from typing import Tuple, List, Optional
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from scipy.stats import spearmanr
import scipy

from collections import defaultdict
from scipy.spatial.distance import pdist, squareform
import itertools as it

def palette_from_series(series, exclude_colors=('black', 'white'), pastel_factor=0, n_attempts=1000, colorblind_type='Deuteranomaly', rng=42, color_format='HEX'):
    '''
    Build a category -> color mapping for the unique values of a pandas Series.

    Colors are produced by ``distinctipy.get_colors``; one color is requested
    per unique value of ``series`` and paired with the values in the order
    returned by ``series.unique()``.

    Parameters:
    series (pandas.Series): A series with categories.
    exclude_colors (sequence or None): Colors (any matplotlib color spec) passed to
        distinctipy as colors to exclude. ``None`` or an empty sequence means no exclusions.
    pastel_factor (float): Forwarded to ``distinctipy.get_colors``.
    n_attempts (int): Forwarded to ``distinctipy.get_colors``.
    colorblind_type (str): Forwarded to ``distinctipy.get_colors``.
    rng (int): Forwarded to ``distinctipy.get_colors`` as its ``rng`` argument.
    color_format (str): Output format:
        'HEX'  - '#rrggbb' strings;
        'RGBA' - the color tuples exactly as returned by distinctipy
                 (no alpha channel is appended here);
        'HSL'  - (hue, saturation, lightness) tuples of floats in [0, 1].

    Returns:
    dict: A dictionary mapping categories to colors.

    Raises:
    ValueError: If ``color_format`` is not one of 'HEX', 'RGBA', 'HSL'.
    '''
    import colorsys

    import distinctipy
    from matplotlib.colors import to_hex, to_rgb

    def _rgb_to_hsl(rgb):
        # colorsys returns (hue, lightness, saturation); reorder to the HSL convention.
        hue, lightness, saturation = colorsys.rgb_to_hls(*rgb[:3])
        return (hue, saturation, lightness)

    formatters = {
        'HEX': to_hex,
        'RGBA': lambda rgb: rgb,
        'HSL': _rgb_to_hsl,
    }

    # Fail fast: reject a bad format before running the (potentially slow) color search.
    if color_format not in formatters:
        raise ValueError("Unsupported color format. Choose 'HEX', 'RGBA', or 'HSL'.")
    format_color = formatters[color_format]

    # Normalise excluded colors to RGB float tuples. The round-trip through hex keeps
    # the 8-bit quantisation the previous implementation applied.
    if exclude_colors is None:
        exclude_colors_rgb = []
    else:
        exclude_colors_rgb = [to_rgb(to_hex(color)) for color in exclude_colors]

    # Get unique categories
    categories = series.unique()
    n_colors = len(categories)
    if n_colors == 0:
        return {}

    # Get distinct colors
    colors = distinctipy.get_colors(
        n_colors=n_colors,
        exclude_colors=exclude_colors_rgb,
        pastel_factor=pastel_factor,
        n_attempts=n_attempts,
        colorblind_type=colorblind_type,
        rng=rng,
    )

    return {category: format_color(color) for category, color in zip(categories, colors)}

In [2]:
import warnings
# Install a global "ignore" filter: every warning raised from here on is silenced
# for the rest of the kernel session.
warnings.filterwarnings('ignore')

In [3]:
# Import `sys` explicitly instead of relying on it leaking in through a wildcard import.
import sys

# Directory added to the module search path.
# NOTE(review): presumably this is where the `data_obtaining` and local `plotting`
# modules imported in the following cell live — confirm.
SIGNATURE_VALIDATION_FUNC_DIR = '/home/jovyan/git/signature_validation/func'

# Guarded so that re-running the cell does not stack duplicate entries on sys.path.
if SIGNATURE_VALIDATION_FUNC_DIR not in sys.path:
    sys.path.append(SIGNATURE_VALIDATION_FUNC_DIR)

In [4]:
from data_obtaining import get_anno, get_expression_table
from plotting import genes_expression_chart_, clustered_heatmap, get_ranks, gene_corr_plot, get_hclust_order

In [5]:
# Enable the autoreload extension in mode 2 so edited modules are re-imported
# automatically before code is executed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Turn off Jedi-based tab completion in IPython.
%config IPCompleter.use_jedi = False

In [6]:
import warnings
# Silence every warning for the rest of the session (same filter as the earlier warnings cell).
warnings.filterwarnings("ignore")
# Inline figures are emitted as PNG.
%config InlineBackend.figure_format = 'png'
# NOTE(review): `plt` and `sns` are not imported explicitly anywhere in the visible
# notebook; presumably they arrive through one of the wildcard imports — confirm.
# Font handling for exported PDF / SVG figures.
plt.rcParams['pdf.fonttype'] = 'truetype'
plt.rcParams['svg.fonttype'] = 'none'
# Default figure resolution.
plt.rcParams['figure.dpi'] = 120
sns.set_style('white')

# Signature validation: Sorted cells and cell lines, public

## Read annotation

In [7]:
# Explicit imports: `os` and `Path` were previously only available if a wildcard import happened to expose them.
import os
from pathlib import Path

# Check which version of the annotation is the newest: entries are sorted by
# modification time, so the oldest directories come first and the newest is last.
SORTED_CELL_DB_ROOT = Path('/uftp2/Databases/sorted_cell_databases/')
paths = sorted(SORTED_CELL_DB_ROOT.iterdir(), key=os.path.getmtime)
paths

[PosixPath('/uftp2/Databases/sorted_cell_databases/jan2022'),
 PosixPath('/uftp2/Databases/sorted_cell_databases/march2022'),
 PosixPath('/uftp2/Databases/sorted_cell_databases/june2022'),
 PosixPath('/uftp2/Databases/sorted_cell_databases/jan2023'),
 PosixPath('/uftp2/Databases/sorted_cell_databases/current_version'),
 PosixPath('/uftp2/Databases/sorted_cell_databases/March2024')]

In [75]:
# Provenance (kept commented out, not meant to be re-run): the lines below show how
# 'cells_clean_annotation_version_3_07_24.tsv.gz' was produced from the
# `current_version` annotation — samples whose index contains 'RNA', 'WES' or
# '210514_NovaA' are dropped and the table is re-indexed by the `Sample` column.
# cells_clean = read_dataset('/uftp2/Databases/sorted_cell_databases/current_version/cells_clean_annotation.tsv')
# cells_clean = cells_clean[(~cells_clean.index.str.contains('RNA'))&(~cells_clean.index.str.contains('WES'))&(~cells_clean.index.str.contains('210514_NovaA'))]
# print(cells_clean.shape)
# public_cells_annot = cells_clean.copy()
# public_cells_annot = public_cells_annot.reset_index(drop=True)
# public_cells_annot.index = public_cells_annot.Sample
# public_cells_annot.to_csv('cells_clean_annotation_version_3_07_24.tsv.gz', sep='\t', compression='gzip')

(10874, 49)

In [79]:
# Load the annotation snapshot stored next to the notebook.
ANNOTATION_SNAPSHOT = 'cells_clean_annotation_version_3_07_24.tsv.gz'
public_cells_annot = read_dataset(ANNOTATION_SNAPSHOT)

In [80]:
# Paper-level category -> fine-grained `Cell_type` labels that collapse into it.
# Grouping by target keeps near-duplicate spellings (e.g. 'Tregs' / 'T_regs',
# 'Th1' / 'Th1_cells') side by side so they are easy to audit.
CELL_TYPE_GROUPS = {
    'CD4_T_cells': [
        'CD4_T_cells', 'CD4_T_helpers', 'Tregs', 'T_regs',
        'Eff_and_transit_memory_CD4_T_cells', 'Follicular_T_helper',
        'CD4_T_cells_PD1_high', 'CD4_T_cells_PD1_low', 'Naive_T_helpers',
        'Th1', 'Th1_cells', 'Th17_cells',
    ],
    'CD8_T_cells': [
        'CD8_T_cells', 'Naive_CD8_T_cells', 'Memory_CD8_T_cells',
        'CD8_T_cells_PD1_high', 'CD8_T_cells_PD1_low',
        'Eff_and_transit_memory_CD8_T_cells', 'Central_memory_CD8_T_cells',
        'Effector_memory_CD8_T_cells', 'Transitional_memory_CD8_T_cells',
    ],
    'CD3_T_cells': ['T_cells'],
    'Other_T_cells': ['MAIT_cells', 'gd_T_cells', 'Central_memory_T_cells'],
    'NK_cells': ['NK_cells'],
    'B_cells': [
        'B_cells', 'Non_plasma_B_cells', 'Naive_B_cells', 'Transitional_B_cells',
        'Breg_cells', 'Memory_B_cells', 'Memory_exhausted_B_cells',
        'Mature-naive_B_cells', 'Plasmablasts', 'Plasma_B_cells',
        'Non-switch_memory_IgM_B_cells',
    ],
    'Dendritic_cells': [
        'Dendritic_cells', 'Monocytic_DC', 'Monocyte_derived_dendritic_cells',
        'Plasmacytoid_dendritic_cells', 'cDC', 'cDC1', 'cDC2', 'moDC',
        'Dendritic_cells_CD141', 'Langerhans_cells',
    ],
    'Macrophages': ['Macrophages', 'Macrophages_M1', 'Macrophages_M2'],
    'Monocytes': ['Monocytes'],
    'Myeloid_cells': ['Myeloid_cells'],
    'MDSC': ['PMN_MDSC', 'M_MDSC'],
    'Granulocytes': ['Granulocytes', 'Neutrophils', 'Eosinophils', 'Basophils'],
    'Platelets': ['Platelets'],
    'Erythrocytes': ['Erythrocytes'],
    'Fibroblasts': [
        'Fibroblasts', 'Smooth_muscle_cells', 'Cardiac_myofibroblasts', 'Fibroblast_line',
    ],
    'Endothelium': ['Endothelium', 'Endothelium_lymph'],
    'Epithelium': ['Epithelium'],
    'Keratinocytes': ['Keratinocytes'],
    'Hepatocytes': ['Hepatocytes', 'Hepatic_progenitor_cells', 'Induced_hepatocytes'],
    'Adipocytes': ['Adipocytes', 'Preadipocytes'],
    'Muscle_cells': ['Skeletal_muscle_cells', 'Cardiac_muscle_cells'],
    'Neural_cells': ['Neurons', 'Astrocytes'],
    'Pancreatic_cells': [
        'Pancreatic_cells', 'Pancreatic_alpha_cells', 'Pancreatic_beta_cells',
    ],
}

# Flat lookup used downstream: fine-grained label -> paper-level category.
mapper = {
    fine_label: paper_label
    for paper_label, fine_labels in CELL_TYPE_GROUPS.items()
    for fine_label in fine_labels
}

# A label listed under two categories would be silently overwritten in the
# comprehension above; make that a hard failure instead.
assert len(mapper) == sum(len(fine_labels) for fine_labels in CELL_TYPE_GROUPS.values()), \
    'a fine-grained cell type is listed under more than one category'

# Labels that still lack a target category (expected to be an empty list).
[i for i, v in mapper.items() if v == '']

[]

In [None]:
# Fetch expressions for the annotated samples via `read_expressions`
# and apply a log2(x + 1) transform in one step.
public_cells_expr = np.log2(read_expressions(public_cells_annot) + 1)

In [None]:
# Keep only cell types represented by more than this many annotated samples.
MIN_SAMPLES_PER_CELL_TYPE = 10

# Count once and reuse (the counts were previously computed twice).
cell_type_counts = public_cells_annot['Cell_type'].value_counts()
frequent_cell_types = cell_type_counts[cell_type_counts > MIN_SAMPLES_PER_CELL_TYPE].index

mask = public_cells_annot['Cell_type'].isin(frequent_cell_types)
# .copy() makes the filtered table an independent frame, so columns added to it
# later are not written into a slice of the unfiltered annotation.
public_cells_annot = public_cells_annot[mask].copy()
public_cells_annot

In [None]:
# Translate every `Cell_type` label to its paper-level category through `mapper`.
# The explicit lookup raises KeyError for a label that `mapper` does not contain.
paper_cell_types = public_cells_annot['Cell_type'].map(lambda cell_type: mapper[cell_type])
public_cells_annot['Cell_type_paper'] = paper_cell_types

# Number of samples per paper-level category.
public_cells_annot['Cell_type_paper'].value_counts()

In [None]:
# Fixed colour per paper-level cell type.
# NOTE(review): presumably frozen from the output of the commented call below so the
# colours stay stable between runs — confirm.
# pal = palette_from_series(public_cells_annot['Cell_type_paper'].dropna())
pal = dict(
    Monocytes='#0080ff',
    Other_T_cells='#e08c07',
    CD3_T_cells='#e805ad',
    Fibroblasts='#10e685',
    Keratinocytes='#0203ce',
    B_cells='#007117',
    Endothelium='#ff80ff',
    CD4_T_cells='#fccd1a',
    Macrophages='#00ffff',
    Hepatocytes='#008080',
    CD8_T_cells='#b808e6',
    Dendritic_cells='#f20d47',
    Granulocytes='#05b3df',
    Pancreatic_cells='#01bc14',
    Neural_cells='#860b6a',
    NK_cells='#f57e7a',
    Epithelium='#e8c7b9',
    Myeloid_cells='#800000',
    MDSC='#171e4c',
    Platelets='#31f10b',
    Erythrocytes='#ff00ff',
    Muscle_cells='#9c8ca7',
    Adipocytes='#ffff00',
)
patch_plot(pal)

In [None]:
# Transpose the expression table, keep the first row for every repeated index label,
# align the rows to the annotation index and drop rows containing missing values.
data = (
    public_cells_expr.T
    .loc[lambda frame: ~frame.index.duplicated()]
    .reindex(public_cells_annot.index)
    .dropna()
)
data

In [None]:
# `pca_plot` of the expression table, labelled by paper-level cell type; saved as Figure S2C.
cell_type_labels = public_cells_annot['Cell_type_paper']
pca_plot(data, cell_type_labels, s=2, palette=pal)
plt.savefig('Figure_S2C.svg', format='svg')

In [None]:
# Number of distinct `Dataset` values among the samples in `data`
# (a missing value counts as one more distinct value).
dataset_labels = public_cells_annot.Dataset
print(dataset_labels.reindex(data.index).nunique(dropna=False))

# Same table labelled by source dataset, without a legend; saved as Figure S2B.
pca_plot(data, dataset_labels.fillna('NA'), s=2, legend=None)
plt.savefig('Figure_S2B.svg', format='svg')

In [None]:
# From here on `pal` holds the colours for the material-type labels.
pal = {
    'NA': '#8000ff',
    'polyA RNA': '#6afdc0',
    'total RNA': '#ff3e1f',
}

# Label samples by material type ('NA' where the annotation is missing); saved as Figure S2D.
material_labels = public_cells_annot.Material_type.fillna('NA')
pca_plot(data, material_labels, s=2, legend='out', palette=pal)
plt.savefig('Figure_S2D.svg', format='svg')

In [None]:
# Select samples whose original `Cell_type` label is exactly 'Macrophages'
# (the paper-level `Cell_type_paper` column is not used here).
mask = public_cells_annot['Cell_type'].eq('Macrophages')

# Material type of the selected samples only; saved as Figure S2E.
macrophage_material = public_cells_annot['Material_type'].fillna('NA').loc[mask]
pca_plot(data, macrophage_material, legend='out', palette=pal)
plt.savefig('Figure_S2E.svg', format='svg')

In [None]:
# Dataset label of every annotation row selected by `mask`.
masked_datasets = public_cells_annot.Dataset.fillna('NA')[mask]

# Shape of the expression table restricted to the selected samples that are really
# present in `data`. (Reindexing `data` onto the masked index would instead create
# all-NaN rows for samples missing from `data` and count them too.)
print(data.loc[data.index.isin(masked_datasets.index)].shape)

# Number of distinct datasets among those samples. Reindexing the masked labels onto
# `data.index` would add NaN for every non-selected sample and inflate the count by
# one, so the masked labels are filtered to the samples in `data` instead.
masked_datasets[masked_datasets.index.isin(data.index)].nunique()

In [None]:
# Samples selected by `mask`, labelled by source dataset, with the legend placed via legend='out'.
macrophage_datasets = public_cells_annot['Dataset'].fillna('NA').loc[mask]
pca_plot(data, macrophage_datasets, legend='out')

In [None]:
# Same plot without a legend, written to 'macrophages_datasets.svg'.
macrophage_datasets = public_cells_annot['Dataset'].fillna('NA').loc[mask]
pca_plot(data, macrophage_datasets, legend=None)
plt.savefig('macrophages_datasets.svg', format='svg')