In [None]:
import sys
from alphaPhosHelperFunctions import *
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from scipy import stats
import plotly.figure_factory as ff
from PeptideCollapse import *
import analytics_core_V04 as ac
import kinase_library as kl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
from scipy import stats
import gseapy as gp
from gseapy import dotplot, barplot, enrichment_map

# Custom functions 

In [96]:
def replace_between_asterics(match):
    """Regex substitution callback: return the first captured group in upper case.

    :param re.Match match: match object whose group 1 holds the text to upper-case
    :return str: upper-cased content of group 1
    """
    captured_text = match.group(1)
    return captured_text.upper()

In [97]:
def set_condition (data, condition_list, collapse_level = 'PG'):
    """
    Assigns condition setup to collapsed pandas dataframe

    The annotation columns are split off, the remaining (quantitative) columns are
    transposed so that every row is one sample, and each sample is labelled with the
    condition whose ID list contains one of the '_'-separated tokens of its column name.

    :param pd.DataFrame data: collapsed dataframe
    :param dict condition_list: condition setup for the experiment with condition groups as key and sample unique IDs as values
    :param str collapse_level: default 'PG' returns protein group level collapse, 'P' return protein level collapse (useful for functional site assignment)
    :return pd.DataFrame data: one row per sample; one column per collapse key plus 'group', 'subject' and 'sample'
    :raises ValueError: if collapse_level is not 'P' or 'PG', or if the number of sample IDs
        recognised in the column names differs from the number of quantitative columns

    Example::
        condition_dataframe = set_condition (data = collapsed_dataframe, condition_list, collapse_level = 'PG')
    """

    # Annotation columns to strip, and the column whose values become the new headers.
    level_setup = {
        'P': (['UPD_seq', 'PTM_localization', 'Protein_name', 'Protein_group', 'Gene_group',
               'PTM_Collapse_key', 'Protein_Collapse_key', 'kinase_sequence'],
              'Protein_Collapse_key'),
        'PG': (['UPD_seq', 'PTM_localization', 'Protein_group', 'Gene_group',
                'PTM_Collapse_key', 'kinase_sequence'],
               'PTM_Collapse_key'),
    }
    if collapse_level not in level_setup:
        raise ValueError(f"collapse_level must be 'P' or 'PG', got {collapse_level!r}")
    index_cols, key_col = level_setup[collapse_level]

    index_df = data[index_cols]
    num_df = data.drop(index_cols, axis = 1).T

    # Every token of a column name that equals a known sample ID contributes one entry,
    # in the same order the conditions and their IDs are listed.
    known_ids = [val for value in condition_list.values() for val in value]
    sample_ids = []
    for name in num_df.index:
        for el in name.split('_'):
            sample_ids.extend(el for val in known_ids if val == el)

    if len(sample_ids) != len(num_df.index):
        raise ValueError(
            f"Recognised {len(sample_ids)} sample IDs in {len(num_df.index)} quantitative columns; "
            "every column name must contain exactly one ID from condition_list"
        )
    num_df.index = sample_ids
    num_df.columns = index_df[key_col]

    per_sample = []
    for key, value in condition_list.items():
        for val in value:
            # .copy() so the label is written to an independent frame, not a view of num_df
            sample_rows = num_df[num_df.index == val].copy()
            sample_rows['group'] = key
            per_sample.append(sample_rows)

    num_df = pd.concat(per_sample)
    num_df['subject'] = num_df.index
    num_df['sample'] = num_df.index
    num_df = num_df.reset_index(drop = True)
    num_df.columns.name = ''
    return(num_df)

In [98]:
class PhosphoAnalysis:
    """Convenience wrapper that collapses a report with PeptideCollapse and then
    reshapes the result into a samples-by-sites table annotated with conditions.

    Attributes set by the methods:
      processor       -- the PeptideCollapse instance doing the collapsing
      collapsed_data  -- result of the last peptideCollapse() call (None before)
      condition_df    -- sample/condition table used by the last assign_condition_setup()
      formatted_data  -- table returned by the last assign_condition_setup()
    """

    def __init__(self):
        self.processor = PeptideCollapse()
        self.condition_df = None
        self.collapsed_data = None
        self.formatted_data = None

    def peptideCollapse(self, data, **kwargs):
        """Run the processor's complete pipeline on `data` and remember the result.

        :param data: input passed on as `data=` to process_complete_pipeline
        :param kwargs: forwarded unchanged to process_complete_pipeline
        :return: the collapsed data (also stored in self.collapsed_data)
        """
        self.collapsed_data = self.processor.process_complete_pipeline(
            data=data,
            **kwargs
        )
        return self.collapsed_data

    def assign_condition_setup(self, condition_df=None):
        """Transpose the quantitative columns to one row per sample and add group labels.

        :param pd.DataFrame condition_df: table with a 'Sample' column (names of the
            quantitative columns in the collapsed data) and a 'Condition' column.
            When None, the table is requested from the processor and every condition
            is asked for interactively via input().
        :return pd.DataFrame: one row per sample, one column per 'PTM_Collapse_key'
            value, plus 'group' (condition), 'sample' ('<group>_<running number within
            the group>') and 'subject' (copy of 'sample')
        :raises ValueError: if peptideCollapse() has not produced data yet
        """
        import warnings
        # NOTE: process-wide filter; FutureWarnings stay silenced after this call returns.
        warnings.filterwarnings('ignore', category=FutureWarning)

        if self.collapsed_data is None:
            raise ValueError(
                "No collapsed data available; run peptideCollapse() before assign_condition_setup()."
            )

        df_copy = self.collapsed_data.copy()

        if condition_df is None:
            condition_df = self.processor.get_precursor_condition_dataset()

            print("Assign conditions to each sample")

            # Assign the whole column at once so the answers line up with the rows
            # whatever index labels condition_df carries.
            condition_df['Condition'] = [
                input(f"Enter condition for '{sample}': ")
                for sample in condition_df['Sample']
            ]

        quant_cols = condition_df['Sample'].unique().tolist()
        meta_cols = [col for col in df_copy.columns if col not in quant_cols]
        quant_df, meta_df = df_copy[quant_cols].T, df_copy[meta_cols].T

        tmp_dict = dict(zip(condition_df['Sample'], condition_df['Condition']))
        quant_df.columns = meta_df.loc["PTM_Collapse_key"]
        quant_df['group'] = quant_df.index.map(tmp_dict)
        quant_df['sample'] = (quant_df['group'] + '_' + (quant_df.groupby('group').cumcount() + 1).astype(str))
        quant_df['subject'] = quant_df['sample']

        self.formatted_data = quant_df
        self.condition_df = condition_df

        return quant_df

In [99]:
def run_kinase_prediction (dataset, kinase_df, group1, group2, kinase_class = "ser_thr",
                           max_missing = 10, pval_thresh = 0.05,
                           kl_method = 'percentile_rank', kl_thresh = 15):
    """
    Run a two-group t-test on the site table and feed the result to kinase_library's
    differential-phosphorylation kinase enrichment.

    :param pd.DataFrame dataset: samples-by-sites table with a 'group' column
        (presumably the output of set_condition / assign_condition_setup -- confirm with caller)
    :param pd.DataFrame kinase_df: table with 'PTM_Collapse_key' and 'kinase_sequence' columns
    :param group1: first 'group' label to compare
    :param group2: second 'group' label to compare
    :param str kinase_class: passed to kinase_enrichment as kin_type
    :param int max_missing: columns with more than this many NaNs (within the two groups) are dropped
    :param float pval_thresh: adjusted p-value threshold given to kl.DiffPhosData
    :param str kl_method: passed to kinase_enrichment as kl_method
    :param kl_thresh: passed to kinase_enrichment as kl_thresh
    :return tuple: (combined_enrichment_results, enrichment object)
    """
    import re
    import kinase_library as kl

    def to_site_id(collapse_key):
        # '<prefix>~<A>_<B>_...' -> '<A>_p<B>'
        parts = collapse_key.split('~')[1].split('_')
        return parts[0] + '_' + 'p' + parts[1]

    ###### Preparing kinase format
    kinase_df_copy = kinase_df.copy()
    # The substitution strips each pair of asterisks around a stretch of text; the final
    # .upper() makes the whole sequence upper case, and underscores are removed.
    kinase_df_copy['kinase_sequence'] = [
        re.sub(r'\*([^*]+)\*', lambda match: match.group(1).upper(), sequence).replace('_', '').upper()
        for sequence in kinase_df_copy['kinase_sequence'].tolist()
    ]
    kinase_df_copy['PTM_Collapse_key'] = kinase_df_copy['PTM_Collapse_key'].apply(to_site_id)

    ###### Differential test between the two groups
    dataset_copy = dataset.copy()
    dataset_copy = dataset_copy[(dataset_copy['group'] == group1) | (dataset_copy['group'] == group2)]
    dataset_copy = dataset_copy.loc[:, dataset_copy.isna().sum() <= max_missing]
    dataset_copy = ac.imputation_normal_distribution(dataset_copy).reset_index()
    ttest_result = ac.run_ttest(dataset_copy, group1, group2)
    ttest_result['identifier'] = ttest_result['identifier'].apply(to_site_id)
    ttest_result = ttest_result.sort_values('identifier')
    # NOTE(review): positional rename -- assumes ac.run_ttest returns exactly these 17
    # columns in this order; confirm against analytics_core_V04.
    ttest_result.columns = ['PTM_Collapse_key', 'T-statistics', 'pvalue', 'mean_group1', 'mean_group2',
        'std(group1)', 'std(group2)', 'log2FC', 'test', 'correction', 'padj',
       'rejected', 'group1', 'group2', 'FC', '-log10 pvalue', 'Method']

    ###### Kinase enrichment
    kinases = kinase_df_copy.merge(ttest_result, on = 'PTM_Collapse_key')
    kinases = kinases[['PTM_Collapse_key','kinase_sequence', 'log2FC', 'T-statistics', 'padj']]
    kinases.columns = ['Phosphosites', 'Sequence', 'logFC', 't', 'adj.P.Val']
    diff_phos = kl.DiffPhosData(kinases, lfc_col='logFC', seq_col='Sequence',
                                pval_col='adj.P.Val', pval_thresh=pval_thresh)
    enrichment = diff_phos.kinase_enrichment(kin_type=kinase_class, kl_method=kl_method, kl_thresh=kl_thresh)
    fin_df = enrichment.combined_enrichment_results
    return fin_df, enrichment

In [100]:
def z_normalize_data(df):
    """Standardize every row of `df` with StandardScaler.

    StandardScaler works column-wise, so the frame is transposed before scaling and
    transposed back afterwards; index and columns of the input are kept.

    :param pd.DataFrame df: numeric table
    :return pd.DataFrame: scaled table with the same shape and labels
    """
    row_scaled_values = StandardScaler().fit_transform(df.T).T
    return pd.DataFrame(row_scaled_values, index=df.index, columns=df.columns)

def perform_hierarchical_clustering(df_norm, method='ward', metric='euclidean'):
    """Build linkage matrices for the columns (samples) and the rows (features).

    For metric='correlation' the pairwise correlation distances are computed with
    pdist and linked with method='average' (the `method` argument is not used in
    that case); otherwise scipy's linkage is called with `method` and `metric`.

    :param pd.DataFrame df_norm: numeric table, rows = features, columns = samples
    :param str method: linkage method for non-correlation metrics
    :param str metric: distance metric
    :return tuple: (linkage_samples, linkage_features)
    """
    def build_linkage(observations):
        if metric == 'correlation':
            correlation_distances = pdist(observations, metric='correlation')
            return linkage(correlation_distances, method='average')
        return linkage(observations, method=method, metric=metric)

    linkage_samples = build_linkage(df_norm.T)
    linkage_features = build_linkage(df_norm)
    return linkage_samples, linkage_features

def plot_clustermap(df_norm, figsize=(12, 10), cmap='rocket', 
                   method='ward', metric='euclidean', n_clusters=6, save_path=None):
    """Draw a clustered heatmap with row-colour annotation of flat feature clusters.

    The linkages are computed once with perform_hierarchical_clustering and handed to
    seaborn, so the dendrograms that are drawn are the same trees the flat clusters
    (and therefore the row colours and the returned tables) are cut from.

    :param pd.DataFrame df_norm: numeric table, rows = features, columns = samples
    :param tuple figsize: figure size passed to sns.clustermap
    :param str cmap: colormap name for the heatmap
    :param str method: linkage method (see perform_hierarchical_clustering)
    :param str metric: distance metric (see perform_hierarchical_clustering)
    :param int n_clusters: maximum number of flat clusters cut from each tree
    :param save_path: when truthy, the figure is written there at 300 dpi
    :return dict: 'sample_clusters' (DataFrame Sample/Cluster), 'feature_clusters'
        (DataFrame Feature/Cluster) and 'clustermap' (the seaborn grid)
    """
    linkage_samples, linkage_features = perform_hierarchical_clustering(
        df_norm, method=method, metric=metric
    )

    sample_clusters = fcluster(linkage_samples, n_clusters, criterion='maxclust')
    feature_clusters = fcluster(linkage_features, n_clusters, criterion='maxclust')

    # One Set3 colour per cluster label (labels start at 1).
    cluster_colors = plt.cm.Set3(np.linspace(0, 1, n_clusters))
    feature_color_array = [cluster_colors[cluster - 1] for cluster in feature_clusters]

    g = sns.clustermap(
        df_norm,
        # Reuse the precomputed trees instead of letting seaborn re-cluster with
        # method/metric, which could disagree with the flat clusters above.
        row_linkage=linkage_features,
        col_linkage=linkage_samples,
        cmap=cmap,
        center=0,
        figsize=figsize,
        cbar_kws={'label': 'Z-score'},
        xticklabels=True,
        yticklabels=False if df_norm.shape[0] > 50 else True,
        dendrogram_ratio=0.15,
        colors_ratio=0.03,
        row_colors=feature_color_array
    )

    if df_norm.shape[0] <= 100:
        # Rows are drawn in dendrogram order, so label positions must be looked up in
        # that order; +0.5 moves from the cell edge to the cell centre.
        displayed_clusters = feature_clusters[g.dendrogram_row.reordered_ind]
        for cluster in np.unique(displayed_clusters):
            displayed_rows = np.where(displayed_clusters == cluster)[0]
            g.ax_row_colors.text(
                0.5, np.mean(displayed_rows) + 0.5, f'C{cluster}',
                ha='center', va='center', fontsize=8, fontweight='bold'
            )

    plt.setp(g.ax_heatmap.get_xticklabels(), rotation=45, ha='right')

    legend_elements = [plt.Rectangle((0,0),1,1, facecolor=cluster_colors[i], 
                                   label=f'Cluster {i+1}') for i in range(n_clusters)]

    g.fig.legend(handles=legend_elements, title='Clusters', 
                bbox_to_anchor=(1.02, 0.8), loc='upper left')

    if save_path:
        # Save the clustermap's own figure rather than whatever pyplot considers current.
        g.fig.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

    cluster_info = {
        'sample_clusters': pd.DataFrame({'Sample': df_norm.columns, 'Cluster': sample_clusters}),
        'feature_clusters': pd.DataFrame({'Feature': df_norm.index, 'Cluster': feature_clusters}),
        'clustermap': g
    }

    return cluster_info

def extract_clusters(linkage_matrix, labels, n_clusters=6, threshold=None):
    
    if threshold:
        clusters = fcluster(linkage_matrix, threshold, criterion='distance')
    else:
        clusters = fcluster(linkage_matrix, n_clusters, criterion='maxclust')
    
    cluster_df = pd.DataFrame({
        'Item': labels,
        'Cluster': clusters
    })
    
    return cluster_df

In [101]:
def prepare_gene_lists_from_clusters(feature_clusters, gene_column, feature_column='Feature'):
    """Collect the unique gene names of every cluster.

    When `gene_column` already exists its values are used as they are. When it is
    missing, it is derived from `feature_column`: the text between '~' and the next
    '_' of each feature name is taken as gene symbol (e.g. 'ACC~GENE_S12' -> 'GENE');
    names without '~' fall back to the first run of an upper-case letter followed by
    upper-case letters/digits. The derivation works on a copy, the caller's frame is
    not modified.

    :param pd.DataFrame feature_clusters: table with a 'Cluster' column
    :param str gene_column: column holding (or to hold) the gene names
    :param str feature_column: column with feature names, only read when
        `gene_column` is missing
    :return dict: {'Cluster_<id>': [unique non-missing gene names in first-seen order]}
    :raises ValueError: if gene_column is None
    :raises KeyError: if neither gene_column nor feature_column is present
    """
    if gene_column is None:
        raise ValueError("gene_column must be a column name, not None")

    if gene_column not in feature_clusters.columns:
        if feature_column not in feature_clusters.columns:
            raise KeyError(
                f"Neither '{gene_column}' nor '{feature_column}' is a column of feature_clusters"
            )

        def parse_gene_symbol(feature_name):
            # Missing values and names without '~' are reported as failed (None).
            if not isinstance(feature_name, str) or '~' not in feature_name:
                return None
            return feature_name.split('~')[1].split('_')[0]

        feature_clusters = feature_clusters.copy()
        feature_clusters[gene_column] = feature_clusters[feature_column].map(parse_gene_symbol)

        for i in range(min(5, len(feature_clusters))):
            original = feature_clusters.iloc[i][feature_column]
            extracted = feature_clusters.iloc[i][gene_column]
            print(f"  {original} -> {extracted}")

        mask = feature_clusters[gene_column].isna()
        failed_extractions = mask.sum()
        if failed_extractions > 0:
            print(f"\nWarning: {failed_extractions} features could not be parsed for gene symbols")
            print("Examples of failed extractions:")
            for example in feature_clusters.loc[mask, feature_column].head(3).tolist():
                print(f"  {example}")

            # astype(str) keeps the .str accessor usable when the failed entries are
            # missing values; their text ('nan', 'None') cannot match the pattern.
            alternative_pattern = (
                feature_clusters.loc[mask, feature_column]
                .astype(str)
                .str.extract(r'([A-Z][A-Z0-9]+)', expand=False)
            )
            feature_clusters.loc[mask, gene_column] = alternative_pattern

            remaining_failed = feature_clusters[gene_column].isna().sum()
            print(f"After alternative extraction: {remaining_failed} still failed")

    cluster_gene_dict = {}

    for cluster_id in sorted(feature_clusters['Cluster'].unique()):
        cluster_values = feature_clusters.loc[
            feature_clusters['Cluster'] == cluster_id, gene_column
        ].tolist()

        # dict.fromkeys de-duplicates while keeping first-seen order, so the lists are
        # identical from run to run.
        genes_in_cluster = list(dict.fromkeys(g for g in cluster_values if pd.notna(g)))

        cluster_gene_dict[f'Cluster_{cluster_id}'] = genes_in_cluster
        print(f"Cluster {cluster_id}: {len(genes_in_cluster)} unique genes")

    return cluster_gene_dict

In [102]:
def run_go_enrichment_per_cluster(cluster_gene_dict, organism='human', 
                                 gene_sets=['GO_Biological_Process_2023',
                                          'GO_Molecular_Function_2023',
                                          'GO_Cellular_Component_2023'],
                                 cutoff=0.05, top_terms=20):
    """
    Run GO enrichment analysis for each cluster
    
    Parameters:
    cluster_gene_dict: dictionary with cluster names as keys and gene lists as values
    organism: organism name ('human', 'mouse', etc.) - lowercase
    gene_sets: list of gene set databases to use
    cutoff: adjusted p-value cutoff; only terms with 'Adjusted P-value' <= cutoff are kept
    top_terms: number of top terms to keep per cluster
    
    Returns:
    enrichment_results: dictionary with enrichment results for each cluster.
        Each value is {'results': DataFrame of the kept terms, 'enrichr_object': enr}.
        Clusters with fewer than 3 genes, without significant terms, or whose
        query raised an exception are left out.
    """
    
    enrichment_results = {}
    
    for cluster_name, gene_list in cluster_gene_dict.items():
        print(f"\nRunning enrichment for {cluster_name} ({len(gene_list)} genes)...")
        
        if len(gene_list) < 3:
            print(f"Skipping {cluster_name}: too few genes ({len(gene_list)})")
            continue
        
        try:
            enr = gp.enrichr(
                gene_list=gene_list,
                gene_sets=gene_sets,
                organism=organism,  # 'human', 'mouse', 'yeast', etc.
                cutoff=cutoff
            )
            
            results_df = enr.results
            
            # gseapy applies `cutoff` to its own figure only and returns every tested
            # term in enr.results, so the significance filter has to be applied here.
            if not results_df.empty:
                results_df = results_df[results_df['Adjusted P-value'] <= cutoff]
            
            if not results_df.empty:
                # Sort by adjusted p-value and keep top terms
                results_df = results_df.sort_values('Adjusted P-value').head(top_terms)
                enrichment_results[cluster_name] = {
                    'results': results_df,
                    'enrichr_object': enr
                }
                print(f"Found {len(results_df)} significant terms for {cluster_name}")
            else:
                print(f"No significant terms found for {cluster_name}")
                
        except Exception as e:
            # Best effort: a failing query for one cluster must not stop the others.
            print(f"Error running enrichment for {cluster_name}: {str(e)}")
            continue
    
    return enrichment_results

In [103]:
def run_gsea_analysis(expression_data, gene_ranking_method='signal_to_noise', 
                     gene_sets=['GO_Biological_Process_2023'], 
                     classes=None, permutation_num=1000):
    """
    Run gseapy's GSEA on an expression table.

    Parameters:
    expression_data: DataFrame with genes as rows, samples as columns
    gene_ranking_method: ranking method handed to gp.gsea as `method`
    gene_sets: gene set databases to use
    classes: sample class labels handed to gp.gsea as `cls`; required
    permutation_num: number of permutations

    Returns:
    gsea_results: the object returned by gp.gsea, or None when `classes` is
        missing or the run raised an exception (the error is printed)
    """
    if classes is None:
        print("Please provide class labels for GSEA analysis")
        return None

    print(f"Running GSEA analysis on {expression_data.shape[0]} genes...")

    gsea_options = dict(
        data=expression_data,
        gene_sets=gene_sets,
        cls=classes,
        method=gene_ranking_method,
        permutation_num=permutation_num,
        outdir=None,  # nothing is written to disk
        seed=42,
        verbose=True,
    )

    try:
        gsea_results = gp.gsea(**gsea_options)
        print(f"GSEA completed. Found {len(gsea_results.res2d)} gene sets.")
    except Exception as e:
        print(f"Error running GSEA: {str(e)}")
        return None
    else:
        return gsea_results

def plot_enrichment_results(enrichment_results, top_n=10, figsize=(12, 8), save_path=None):
    """
    Dot plot of enriched terms per cluster.

    Parameters:
    enrichment_results: dictionary from run_go_enrichment_per_cluster; each value
        must carry a 'results' DataFrame with 'Term', 'Gene_set' and
        'Adjusted P-value' columns
    top_n: number of leading rows of each cluster's table to show
    figsize: figure size
    save_path: when truthy, the figure is written there at 300 dpi

    Prints a message and returns without plotting when there is nothing to show.
    """
    # First top_n rows per cluster, tagged with the cluster name and -log10(adj. p).
    per_cluster_tables = [
        cluster_entry['results'].head(top_n).assign(**{
            'Cluster': cluster_name,
            '-log10(adj_pval)': lambda table: -np.log10(table['Adjusted P-value']),
        })
        for cluster_name, cluster_entry in enrichment_results.items()
    ]

    if not per_cluster_tables:
        print("No enrichment results to plot")
        return

    combined_df = pd.concat(per_cluster_tables, ignore_index=True)

    plt.figure(figsize=figsize)

    # Dot size encodes significance, colour encodes the gene-set library.
    sns.scatterplot(
        data=combined_df,
        x='Cluster',
        y='Term',
        size='-log10(adj_pval)',
        hue='Gene_set',
        sizes=(50, 400),
        alpha=0.7
    )

    plt.title('GO Term Enrichment Across Clusters', fontsize=16, pad=20)
    plt.xlabel('Cluster', fontsize=12)
    plt.ylabel('GO Terms', fontsize=12)
    plt.xticks(rotation=45)

    # Keeps long term labels inside the figure.
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

def create_enrichment_heatmap(enrichment_results, top_n=15, figsize=(14, 10), save_path=None):
    """
    Create a heatmap showing enrichment across clusters
    
    Parameters:
    enrichment_results: enrichment results dictionary; each value must carry a
        'results' DataFrame with 'Term' and 'Adjusted P-value' columns
    top_n: number of leading rows of each cluster's table to include
    figsize: figure size
    save_path: path to save figure
    
    Rows are the terms in the order they are first encountered (cluster by cluster),
    columns are the clusters, cell values are -log10(adjusted p-value). Terms that are
    not among a cluster's rows are masked. Prints a message and returns without
    plotting when there is nothing to show.
    """
    
    cluster_term_pvals = {}
    ordered_terms = {}  # used as an ordered set: keeps first-seen order, unlike set()
    
    for cluster_name, results in enrichment_results.items():
        df = results['results'].head(top_n)
        terms = df['Term'].tolist()
        pvals = df['Adjusted P-value'].tolist()
        
        ordered_terms.update(dict.fromkeys(terms))
        cluster_term_pvals[cluster_name] = dict(zip(terms, pvals))
    
    all_terms = list(ordered_terms)
    clusters = list(enrichment_results.keys())
    
    if not all_terms:
        print("No enrichment results to plot")
        return
    
    # NaN marks "term not present for this cluster"
    matrix = np.full((len(all_terms), len(clusters)), np.nan)
    
    for j, cluster in enumerate(clusters):
        term_pvals = cluster_term_pvals[cluster]
        for i, term in enumerate(all_terms):
            if term in term_pvals:
                pval = term_pvals[term]
                # A p-value of 0 has no finite -log10; it is shown as 10.
                matrix[i, j] = -np.log10(pval) if pval > 0 else 10
    
    plt.figure(figsize=figsize)
    
    # Masked cells are not drawn
    mask = np.isnan(matrix)
    
    sns.heatmap(
        matrix,
        xticklabels=clusters,
        yticklabels=all_terms,
        cmap='Reds',
        mask=mask,
        cbar_kws={'label': '-log10(Adjusted P-value)'},
        square=False
    )
    
    plt.title('GO Term Enrichment Heatmap Across Clusters', fontsize=16, pad=20)
    plt.xlabel('Clusters', fontsize=12)
    plt.ylabel('GO Terms', fontsize=12)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0, fontsize=8)
    
    plt.tight_layout()
    
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    
    plt.show()

In [104]:
def normalize_phospho_median(phospho_df, protein_df, return_non_matched=False):
    """
    Normalize phosphoproteomics data using condition-level protein-specific normalization.

    This function normalizes each phosphosite by subtracting the median abundance of its
    corresponding parent protein within each condition. Samples with the same name are
    treated as replicates of the same condition.

    Parameters
    ----------
    phospho_df : pandas.DataFrame
        Phosphoproteomics dataframe with samples as index and phosphosites as columns.
        Column names should contain protein identifiers (e.g., 'P12345~PROTEIN_S123').
        Values should be log-transformed intensities. Duplicate sample names indicate
        replicates of the same condition.
    protein_df : pandas.DataFrame
        Proteomics dataframe with samples as index and proteins as columns.
        Column names are protein identifiers, several of them may be joined with ';'.
        Values should be log-transformed intensities. Duplicate sample names indicate
        replicates of the same condition.
    return_non_matched : bool, optional
        If True, returns all phosphosites including those without protein matches (unnormalized).
        If False (default), returns only successfully normalized phosphosites.

    Returns
    -------
    dict
        Dictionary containing:
        - 'normalized_phospho': DataFrame with protein-normalized phospho data
          (columns in the order of `phospho_df`)
        - 'condition_protein_medians': DataFrame with median protein values per condition
        - 'phospho_to_protein_mapping': Dict mapping phosphosites to protein IDs
        - 'normalization_success_rate': Float, normalized / total phosphosite-sample values
        - 'common_conditions': Sorted list of conditions present in both datasets
        - 'original_phospho': phospho data restricted to the common conditions (and, unless
          return_non_matched=True, to the normalized phosphosites)
        - 'original_protein': protein data restricted to the common conditions

    Notes
    -----
    - The protein ID of a phosphosite is the text before '~'; names without '~' use the
      text before the first '_'
    - A phosphosite is matched to the first protein column whose ';'-separated IDs
      contain its protein ID
    - A phosphosite counts as successfully normalized when it was normalized in at
      least one condition; conditions where the protein median is missing keep the
      unnormalized values

    Raises
    ------
    ValueError
        If no common conditions are found between the two datasets

    Examples
    --------
    >>> results = normalize_phospho_median(phospho_data, protein_data)
    >>> normalized_data = results['normalized_phospho']
    >>> success_rate = results['normalization_success_rate']
    >>> print(f"Successfully normalized {success_rate:.1%} of phosphosites")
    """
    # Find common conditions
    common_conditions = set(phospho_df.index) & set(protein_df.index)

    if len(common_conditions) == 0:
        raise ValueError("No common conditions found between phospho and protein data!")

    # Sorted once so that iteration and the returned list are the same on every run.
    ordered_conditions = sorted(common_conditions)
    print(f"Found {len(common_conditions)} common conditions: {ordered_conditions}")

    # Filter to common conditions
    phospho_matched = phospho_df.loc[phospho_df.index.isin(common_conditions)]
    protein_matched = protein_df.loc[protein_df.index.isin(common_conditions)]

    print(f"Phospho samples: {len(phospho_matched)}")
    print(f"Protein samples: {len(protein_matched)}")

    def extract_protein_id(phosphosite_name):
        """Extract protein ID from phosphosite name (format: 'A0A087WUL8~NBPF19_S364_M1')"""
        return (
            phosphosite_name.split("~")[0]
            if "~" in phosphosite_name
            else phosphosite_name.split("_")[0]
        )

    # Create phosphosite to protein mapping
    phospho_to_protein = {
        phosphosite: extract_protein_id(phosphosite) for phosphosite in phospho_matched.columns
    }

    print(f"Mapped {len(phospho_to_protein)} phosphosites to proteins")

    # Median of every protein over the replicates of each condition
    condition_protein_medians = protein_matched.groupby(protein_matched.index).median()

    print(f"Calculated protein medians for {len(condition_protein_medians)} conditions")
    print(f"Protein medians shape: {condition_protein_medians.shape}")

    # Protein ID -> first protein column listing it; built once instead of scanning
    # every protein column for every phosphosite and condition.
    protein_column_by_id = {}
    for protein_col in condition_protein_medians.columns:
        for member_id in protein_col.split(";"):
            protein_column_by_id.setdefault(member_id, protein_col)

    # Normalize phospho data
    normalized_phospho = phospho_matched.copy()
    successfully_normalized_phosphosites = set()
    normalization_stats = {
        "total_values": 0,
        "normalized": 0,
        "protein_not_found": 0,
        "missing_protein_values": 0,
    }

    for condition in ordered_conditions:
        condition_mask = phospho_matched.index == condition
        n_replicates = int(condition_mask.sum())
        condition_phospho = phospho_matched.loc[condition_mask]

        for phosphosite in phospho_matched.columns:
            normalization_stats["total_values"] += n_replicates

            protein_col = protein_column_by_id.get(phospho_to_protein[phosphosite])
            if protein_col is None:
                normalization_stats["protein_not_found"] += n_replicates
                continue

            condition_protein_median = condition_protein_medians.loc[condition, protein_col]
            if pd.isna(condition_protein_median):
                normalization_stats["missing_protein_values"] += n_replicates
                continue

            # All replicates of this condition at once; to_numpy() assigns by position,
            # which avoids label alignment on the duplicated sample index.
            normalized_phospho.loc[condition_mask, phosphosite] = (
                condition_phospho[phosphosite] - condition_protein_median
            ).to_numpy()
            normalization_stats["normalized"] += n_replicates
            successfully_normalized_phosphosites.add(phosphosite)

    # Filter to only successfully normalized phosphosites (unless return_non_matched=True)
    if not return_non_matched:
        print("Filtering to only successfully normalized phosphosites...")
        print(f"Original phosphosites: {len(phospho_matched.columns)}")
        print(f"Successfully normalized: {len(successfully_normalized_phosphosites)}")
        print(
            f"Removed: {len(phospho_matched.columns) - len(successfully_normalized_phosphosites)}"
        )

        # Keep the input column order instead of the arbitrary order of a set.
        kept_phosphosites = [
            phosphosite
            for phosphosite in phospho_matched.columns
            if phosphosite in successfully_normalized_phosphosites
        ]
        normalized_phospho = normalized_phospho[kept_phosphosites]

        # Also update the original phospho for consistency
        phospho_matched = phospho_matched[kept_phosphosites]

    # Calculate success rate
    success_rate = (
        normalization_stats["normalized"] / normalization_stats["total_values"]
        if normalization_stats["total_values"] > 0
        else 0
    )

    print("\nNormalization statistics:")
    print(f"  Total phosphosite-sample combinations: {normalization_stats['total_values']:,}")
    print(f"  Successfully normalized: {normalization_stats['normalized']:,} ({success_rate:.1%})")
    print(f"  Protein not found: {normalization_stats['protein_not_found']:,}")
    print(f"  Missing protein values: {normalization_stats['missing_protein_values']:,}")

    return {
        "normalized_phospho": normalized_phospho,
        "condition_protein_medians": condition_protein_medians,
        "phospho_to_protein_mapping": phospho_to_protein,
        "normalization_success_rate": success_rate,
        "common_conditions": ordered_conditions,
        "original_phospho": phospho_matched,
        "original_protein": protein_matched,
    }

# Data upload

In [None]:
# nanoPhos reports for sorted HeLa cells, one tab-separated file per input amount
# (100 / 300 / 500 / 1000 / 2000 / 3000 cells).
# NOTE(review): absolute paths on the W: drive -- the notebook only re-runs on a
# machine where that drive is mapped.
sorted_hela_nanophos_100cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_new_HeLa_sorted_100cells_Report.tsv', sep = '\t')
sorted_hela_nanophos_300cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_new_HeLa_sorted_300cells_Report.tsv', sep = '\t')
sorted_hela_nanophos_500cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_new_HeLa_sorted_500cells_Report.tsv', sep = '\t')
sorted_hela_nanophos_1000cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_new_HeLa_sorted_1000cells_Report.tsv', sep = '\t')
sorted_hela_nanophos_2000cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_new_HeLa_sorted_2000cells_Report.tsv', sep = '\t')
sorted_hela_nanophos_3000cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_new_HeLa_sorted_3000cells_Report.tsv', sep = '\t')

In [106]:
# uPhos reports for sorted HeLa cells, same six input amounts as the nanoPhos set
# (tab-separated, absolute W: drive paths).
sorted_hela_uphos_100cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\uPhos_HeLa_sorted_100cells_Report.tsv', sep = '\t')
sorted_hela_uphos_300cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\uPhos_HeLa_sorted_300cells_Report.tsv', sep = '\t')
sorted_hela_uphos_500cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\uPhos_HeLa_sorted_500cells_Report.tsv', sep = '\t')
sorted_hela_uphos_1000cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\uPhos_HeLa_sorted_1000cells_Report.tsv', sep = '\t')
sorted_hela_uphos_2000cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\uPhos_HeLa_sorted_2000cells_Report.tsv', sep = '\t')
sorted_hela_uphos_3000cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\uPhos_HeLa_sorted_3000cells_Report.tsv', sep = '\t')

In [None]:
# nanoPhos stem-cell reports for 100 / 300 / 500 / 800 / 1000 / 3000 cells
# (note: 800 instead of the 2000-cell point of the HeLa series).
sorted_stemcells_100cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_StemCells_100cells_Report.tsv', sep = '\t')
sorted_stemcells_300cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_StemCells_300cells_Report.tsv', sep = '\t')
sorted_stemcells_500cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_StemCells_500cells_Report.tsv', sep = '\t')
sorted_stemcells_800cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_StemCells_800cells_Report.tsv', sep = '\t')
sorted_stemcells_1000cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_StemCells_1000cells_Report.tsv', sep = '\t')
sorted_stemcells_3000cells = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_StemCells_3000cells_Report.tsv', sep = '\t')

In [None]:
# Combined stem-cell report ("StemCells_RAlin_SL_all"), tab-separated.
df_all = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\StemCells_RAlin_SL_all_Report.tsv', sep = '\t')

In [None]:
# 500-cell stem-cell report labelled "normalized" in the file name, tab-separated.
df_stemcells_500cells_normalized = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure3\nanoPhos_StemCells_500cells_RAlin_SL_normalized_Report.tsv', sep = '\t')

In [None]:
# HeLa reports grouped per workflow, ordered by increasing cell number; the order
# matters because later cells treat the last element as the highest input.
l_hela_sorted = [sorted_hela_nanophos_100cells, sorted_hela_nanophos_300cells, sorted_hela_nanophos_500cells, sorted_hela_nanophos_1000cells, sorted_hela_nanophos_2000cells, sorted_hela_nanophos_3000cells]
l_hela_sorted_uphos = [sorted_hela_uphos_100cells, sorted_hela_uphos_300cells, sorted_hela_uphos_500cells, sorted_hela_uphos_1000cells, sorted_hela_uphos_2000cells, sorted_hela_uphos_3000cells]

In [None]:
# Stem-cell reports ordered by increasing cell number.
l_stemcells = [sorted_stemcells_100cells, sorted_stemcells_300cells, sorted_stemcells_500cells, 
               sorted_stemcells_800cells, sorted_stemcells_1000cells, sorted_stemcells_3000cells]

# Run PeptideCollapse on data

In [None]:
# Collapse every nanoPhos HeLa report with one shared PeptideCollapse instance
# (cutoff 0, human FASTA); the result list keeps the order of l_hela_sorted.
pc = PeptideCollapse()
l_hela_sorted_collapsed = [
    pc.process_complete_pipeline(report, cutoff = 0, fasta_path=r'D:\Projects\Spectral libraries\human.fasta')
    for report in l_hela_sorted
]

In [None]:
# Stem-cell reports collapsed without kinase sequences ...
pc = PeptideCollapse()
l_stemcells_collapsed = [
    pc.process_complete_pipeline(report, cutoff = 0, add_kinase_sequences=False)
    for report in l_stemcells
]

####
# ... and once more with the mouse FASTA and a kinase window size of 7.
pc = PeptideCollapse()
l_stemcells_collapsed_with_fasta = [
    pc.process_complete_pipeline(report, cutoff = 0, fasta_path = r'D:\Projects\Spectral libraries\mouse.fasta', kinase_window_size=7)
    for report in l_stemcells
]

In [None]:
# Same collapse settings as for the nanoPhos HeLa reports, applied to the uPhos set.
pc = PeptideCollapse()
l_hela_sorted_collapsed_uphos = [
    pc.process_complete_pipeline(report, cutoff = 0, fasta_path=r'D:\Projects\Spectral libraries\human.fasta')
    for report in l_hela_sorted_uphos
]

In [None]:
# Collapse the combined stem-cell report (cutoff 0, mouse FASTA); the collapsed table
# is also kept on builder.collapsed_data.
builder = PhosphoAnalysis()
df_all_collapsed = builder.peptideCollapse(df_all, cutoff = 0, fasta_path = r'D:\Projects\Spectral libraries\mouse.fasta')

In [None]:
# Collapse the normalized 500-cell report with cutoff 0.75 (mouse FASTA).
# NOTE: `builder` is rebound here, so any earlier builder object is no longer
# reachable under that name.
builder = PhosphoAnalysis()
df_stemcells_500cells_normalized_collapsed = builder.peptideCollapse(df_stemcells_500cells_normalized, cutoff = 0.75, fasta_path= r'D:\Projects\Spectral libraries\mouse.fasta')

# Figure 3a

In [None]:
# Cumulative bar plot over the nanoPhos HeLa dilution series.
# NOTE(review): the positional 3 is presumably the number of replicates per
# dataset -- confirm against get_cumulative_barplot in alphaPhosHelperFunctions.
figure = get_cumulative_barplot(l_hela_sorted_collapsed, 3, point_size=9, point_color='black')
#figure.write_image(r'D:\figure.pdf')

# Supplementary Figure 2c

In [None]:
# Same cumulative barplot as Figure 3a, drawn for the uphos series.
figure = get_cumulative_barplot(
    l_hela_sorted_collapsed_uphos,
    3,
    point_size=9,
    point_color='black',
)

# Optional export:
#figure.write_image(r'D:\Projects\nanoPhos\figures_upd\figure3\suppl_figure2a.pdf', height = 600, width = 600)

# Figure 3b

In [None]:
# Estimated protein input per dilution, assuming a HeLa cell contains roughly
# 250 pg of total protein: cell count x 0.25 (i.e. the result is in ng).
protein_ng_per_cell = 0.25
estimated_protein_input = [
    n_cells * protein_ng_per_cell
    for n_cells in (100, 300, 500, 1000, 2000, 3000)
]

In [None]:
# Inter-dilution correlation across the HeLa series; the last list entry
# (the 3000-cell sample) is passed as the second argument.
r_squared_sorted_nanophos = calculate_interdilution_correlation(
    l_hela_sorted_collapsed,
    l_hela_sorted_collapsed[-1],
    estimated_protein_input,
)


In [None]:
# Histogram of the R-squared values with a dashed line at their NaN-ignoring median.
r_squared_histogram = go.Histogram(
    x=r_squared_sorted_nanophos,
    # Requested bin count scales with the data: one bin per 20 values.
    nbinsx=len(r_squared_sorted_nanophos) // 20,
    name='PG Number',
    marker={
        'color': 'black',
        'line': {'color': 'black', 'width': 0.3},
    },
    opacity=0.8,
)

fig = go.Figure()
fig.add_trace(r_squared_histogram)

fig.update_layout(
    width=600,
    height=600,
    template='plotly_white',
    xaxis_title='R squared',
    yaxis_title='Count',
    font={'size': 12},
    showlegend=False,
    bargap=0.05,
)

r_squared_median = np.nanmedian(r_squared_sorted_nanophos)
fig.add_vline(x=r_squared_median, line=dict(dash='dash', width=2, color='black'))
# Optional export:
#fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure3\figure3b.pdf', height = 600, width = 600)

# Supplementary Figure 2a

In [None]:
# Per-row coefficient of variation (std / mean) across the first three columns of
# each collapsed HeLa table, computed after raising 2 to the stored values
# (assumes those columns hold log2 replicate intensities -- TODO confirm).
cvs_nanophos = []
for collapsed in l_hela_sorted_collapsed:
    replicate_values = np.power(2, collapsed.iloc[:, :3])

    row_cv = replicate_values.std(axis=1) / replicate_values.mean(axis=1)
    # A CV of exactly 0 is treated as missing.
    row_cv = row_cv.replace(0, np.nan)

    # Keep the CV only for rows where all three columns are present.
    n_present = replicate_values.notna().sum(axis=1)
    row_cv = row_cv.where(n_present >= 3)

    cvs_nanophos.append(row_cv)

In [310]:
# Pool the per-dilution CV values into one flat list (NaNs are kept).
cvs_nanophos_flattened = []
for dilution_cvs in cvs_nanophos:
    cvs_nanophos_flattened.extend(dilution_cvs)

In [311]:
# Median CV over all dilutions pooled together, ignoring NaN entries.
np.nanmedian(cvs_nanophos_flattened)

0.2221857543191797

In [136]:
# Six-step colour ramps, one shade per dilution.
color_sequence_violet = [
    '#C79EEA',
    '#B178E2',
    '#9B52DA',
    '#7E2AC7',
    '#6D25AD',
    '#551D87',
]
color_sequence_red = [
    '#FBA08D',
    '#FA7A61',
    '#F95534',
    '#ED2E07',
    '#CB2706',
    '#9E1E05',
]

In [None]:
# One box per dilution (red ramp), plus a dashed line at the pooled median CV.
# A matching violet box for the uphos CVs (cvs_uphos) was left disabled by the author.
fig = go.Figure(
    data=[
        go.Box(y=dilution_cvs, marker_color=color_sequence_red[i])
        for i, dilution_cvs in enumerate(cvs_nanophos)
    ]
)
fig.update_layout(width=800, height=600, template='none')
fig.add_hline(
    y=np.nanmedian(cvs_nanophos_flattened),
    line=dict(dash='dash', width=2, color='black'),
)
# Optional export:
#fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure3\supplfig2b.pdf', height = 600, width = 600)

# Supplementary Figure 2b

In [None]:
# Tab-separated selectivity tables, one per cell count, loaded in dilution order.
selectivity_paths = (
    r'Z:\Denys_nanoPhos\PRIDE\analysis_data\figure3\selectivity_100cells.tsv',
    r'Z:\Denys_nanoPhos\PRIDE\analysis_data\figure3\selectivity_300cells.tsv',
    r'Z:\Denys_nanoPhos\PRIDE\analysis_data\figure3\selectivity_500cells.tsv',
    r'Z:\Denys_nanoPhos\PRIDE\analysis_data\figure3\selectivity_1000cells.tsv',
    r'Z:\Denys_nanoPhos\PRIDE\analysis_data\figure3\selectivity_2000cells.tsv',
    r'Z:\Denys_nanoPhos\PRIDE\analysis_data\figure3\selectivity_3000cells.tsv',
)

list_selectivity = [pd.read_csv(path, sep='\t') for path in selectivity_paths]

# Keep the individual per-dilution names available as well.
(sel_100cells, sel_300cells, sel_500cells,
 sel_1000cells, sel_2000cells, sel_3000cells) = list_selectivity

In [None]:
# For each selectivity table: index by 'XLabel', then sum every row across its columns.
sel = [
    table.set_index('XLabel').apply(np.sum, axis=1).tolist()
    for table in list_selectivity
]

In [None]:
# Long-format table: one row per summed selectivity value, labelled with its cell count.
# Each label is repeated 3 times, so every table is assumed to contribute exactly
# three rows -- otherwise the two columns differ in length.
id_values = ['100', '300', '500', '1000', '2000', '3000']
id_values1 = np.repeat(id_values, 3).tolist()

flattened = []
for per_dilution in sel:
    flattened.extend(per_dilution)

df_selectivity = pd.DataFrame({'Selectivity': flattened, 'ID': id_values1})

In [150]:
# Strip plot of selectivity per cell count, y axis fixed to 0-100.
fig = px.strip(df_selectivity, y='Selectivity', x='ID', orientation='h')
fig.update_layout(width=600, height=600, template='plotly_white')
fig.update_traces(
    marker={
        'size': 18,
        'color': '#db4c2e',
        'line': {'width': 0.5, 'color': 'black'},
    }
)
fig.update_yaxes(
    range=[0, 100],
    showgrid=True,
    gridwidth=0.1,
    gridcolor='#F3F2F2',
    griddash='solid',
)
# NOTE(review): unlike the other figure cells, this export is active and writes to an
# absolute local path on every run.
fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure3\supplfig2c.pdf', height = 600, width = 600)

# Figure 3c

In [37]:
# For each dilution, count non-missing values in the first three columns of the nanoPhos
# table and of the uphos table at the same list index, then take the element-wise
# ratio nanoPhos / uphos, rounded to 2 decimals.
ratio = []
for i, nanophos_table in enumerate(l_hela_sorted_collapsed):
    uphos_table = l_hela_sorted_collapsed_uphos[i]
    # .to_numpy() so the division is positional and not aligned on column labels.
    tmp1 = nanophos_table.iloc[:, :3].count().to_numpy()
    tmp2 = uphos_table.iloc[:, :3].count().to_numpy()
    ratio.append(np.round(tmp1 / tmp2, 2))

id_values = ['100', '300', '500', '1000', '2000', '3000']
id_values1 = np.repeat(id_values, 3).tolist()

flattened = []
for per_dilution in ratio:
    flattened.extend(per_dilution)

df_ratio = pd.DataFrame({'Ratio': flattened, 'ID': id_values1})

In [None]:
# Strip plot of the nanoPhos / uphos ID ratio per cell count, on a log-scaled y axis.
fig = px.strip(df_ratio, y='Ratio', x='ID', orientation='h', log_y=True)
fig.update_layout(width=600, height=600, template='plotly_white')
fig.update_traces(
    marker={
        'size': 18,
        'color': '#B3321E',
        'line': {'width': 0.5, 'color': 'black'},
    }
)
# NOTE(review): with log_y=True, plotly reads `range` as log10 exponents, so
# [0, 1.1] displays roughly 1 to 12.6 -- confirm this is the intended window.
fig.update_yaxes(
    range=[0, 1.1],
    showgrid=True,
    gridwidth=0.1,
    gridcolor='#F3F2F2',
    griddash='solid',
)
# Optional export:
#fig.write_image(r'D:\figure.pdf')

# Figure 3e

In [269]:
# Sample -> condition table. Labels are '<medium>_<cell count>', three consecutive
# replicates per label, media cycling 2iL, SL, RA, RA24 inside each cell count.
# Assumes the unique R.FileName values of df_all appear in exactly that order
# (72 files) -- verify against the raw file list.
media = ('2iL', 'SL', 'RA', 'RA24')
cell_counts = (100, 300, 500, 800, 1000, 3000)
condition_labels = [
    f'{medium}_{n_cells}'
    for n_cells in cell_counts
    for medium in media
    for _ in range(3)
]

condition_df = pd.DataFrame({
    'Sample': df_all['R.FileName'].unique().tolist(),
    'Condition': condition_labels,
})

In [270]:
# NOTE(review): `builder` is re-created for the 500-cell dataset in a later
# PeptideCollapse cell than the one that collapsed df_all. If PhosphoAnalysis is
# stateful, a top-to-bottom run applies condition_df to the wrong dataset here --
# confirm which builder instance is meant.
df_all_collapsed_cond = builder.assign_condition_setup(condition_df)

# Keep only columns whose fraction of non-missing values is at least 0.64.
valid_fraction = 1 - (df_all_collapsed_cond.isna().sum() / len(df_all_collapsed_cond))
df_all_collapsed_filt = df_all_collapsed_cond.loc[:, valid_fraction >= 0.64]

# Impute, then drop every row whose 'group' label contains 'RA24' before the PCA.
df_all_collapsed_imp = ac.imputation_normal_distribution(df_all_collapsed_filt).reset_index()
is_ra24 = df_all_collapsed_imp['group'].str.contains('RA24')
df_all_collapsed_imp = df_all_collapsed_imp[is_ra24 == False]

pca = ac.run_pca(df_all_collapsed_imp)

In [None]:
# Eighteen colours: six shades x three hues (red, violet, blue), listed shade by shade.
pca_group_colors = [
    '#FBA08D', '#C79EEA', '#9ECCEA',
    '#FA7A61', '#B178E2', '#78B8E2',
    '#F95534', '#9B52DA', '#52A4DA',
    '#ED2E07', '#7E2AC7', '#2A88C7',
    '#CB2706', '#6D25AD', '#2576AD',
    '#9E1E05', '#551D87', '#1D5C87',
]

# Scatter of the first element of the PCA result, coloured by 'group'.
# NOTE(review): plotly express documents `labels` as a dict of column -> display name;
# a plain string is passed here. If the intent was to show the sample name on hover,
# `hover_name` is probably what was meant -- confirm before changing.
fig = px.scatter(
    pca[0][0],
    x='x',
    y='y',
    labels="sample",
    color='group',
    color_discrete_sequence=pca_group_colors,
)
fig.update_layout(width=600, height=600, template='plotly_white')
fig.update_traces(
    marker={
        'size': 21,
        'line': {'width': 1, 'color': 'black'},
    }
)
# Optional export:
#fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure3\fig3f.pdf', height = 600, width = 600)

# Figure 3F

In [185]:
# Number of non-missing entries in every column of the collapsed 500-cell table.
nums = [
    len(df_stemcells_500cells_normalized_collapsed[column].dropna())
    for column in df_stemcells_500cells_normalized_collapsed.columns
]

In [186]:
# Group the per-column counts into the three replicate triplets that are plotted.
# The triplets sit at column positions 0-2, 3-5 and 9-11, the same positions selected
# with iloc[:, [0,1,2,3,4,5,9,10,11]] for Figure 3G and Supplementary Figure 2d.
# The middle slice was previously nums[4:7], which dropped position 3 and pulled in
# position 6 from the excluded group.
nums1 = [nums[0:3], nums[3:6], nums[9:12]]

In [None]:
# One box per replicate group, with every individual count drawn on top of its box.
fig = go.Figure(
    data=[
        go.Box(y=group_counts, boxpoints='all', pointpos=0)
        for group_counts in nums1
    ]
)
fig.update_layout(width=500, height=500, template='none')
fig.update_yaxes(range=[0, 25000])
# Optional export:
#fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure3\fig3i.pdf', height = 500, width = 500)

# Figure 3G

In [191]:
# Columns plotted in Figure 3G: positions 0-5 and 9-11 (positions 6-8 are left out).
selected_positions = [0, 1, 2, 3, 4, 5, 9, 10, 11]
a = df_stemcells_500cells_normalized_collapsed.iloc[:, selected_positions]

In [192]:
# One colour per plotted column: each of the three group colours repeated for its 3 replicates.
colspace = [
    group_color
    for group_color in ('#e93526', '#734b9e', '#2b89c7')
    for _ in range(3)
]

In [None]:
# Horizontal half-violins, one per selected column, coloured by group.
fig = go.Figure(
    data=[
        go.Violin(x=a[column].dropna().tolist(), marker_color=colspace[i])
        for i, column in enumerate(a.columns)
    ]
)

# Dashed reference line at the median of the LAST plotted column only.
# NOTE(review): the original computed this after the loop from the leftover loop
# variable; confirm a single-column median (rather than a pooled one) is intended.
last_column = a.columns[-1]
median_val = a[last_column].dropna().median()
fig.add_vline(x=median_val, line_dash="dash", line_color='darkslategrey', line_width=2, opacity=0.7)

fig.update_layout(width=600, height=500, template='none')
fig.update_traces(orientation='h', side='positive', width=3, points=False, showlegend=False)
# Optional export:
#fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure3\fig3j.pdf', height = 500, width = 600)

# Supplementary Figure 2d

In [219]:
# Stem-cell inputs for this figure: same list as before minus the 500-cell sample.
# NOTE(review): this rebinds `l_stemcells`, so cells run afterwards see five entries.
l_stemcells = [
    sorted_stemcells_100cells,
    sorted_stemcells_300cells,
    sorted_stemcells_800cells,
    sorted_stemcells_1000cells,
    sorted_stemcells_3000cells,
]

In [220]:
# Re-collapse the five remaining stem-cell inputs (add_kinase_sequences=False).
# This overwrites the `l_stemcells_collapsed` built earlier in the notebook.
pc = PeptideCollapse()
l_stemcells_collapsed = [
    pc.process_complete_pipeline(stemcell_input, cutoff=0, add_kinase_sequences=False)
    for stemcell_input in l_stemcells
]

In [221]:
# For every collapsed stem-cell table: count non-missing values in the nine selected
# columns (positions 0-5 and 9-11) and split the counts into three triplets.
nums_grouped = []
for df in l_stemcells_collapsed:
    selected = df.iloc[:, [0, 1, 2, 3, 4, 5, 9, 10, 11]]
    nums = [len(df[column].dropna()) for column in selected.columns]
    nums_grouped.append([nums[0:3], nums[3:6], nums[6:]])


In [228]:
# One colour per replicate group inside each subplot.
colors = [
    '#ED2E07',
    '#7E2AC7',
    '#2A88C7',
]

In [None]:
from plotly.subplots import make_subplots

# 2 x 3 grid with five used panels (the bottom-right slot is left empty).
panel_titles = [f'Condition {i+1}' for i in range(5)]
fig = make_subplots(
    rows=2,
    cols=3,
    subplot_titles=panel_titles,
    specs=[
        [{}, {}, {}],
        [{"colspan": 1}, {"colspan": 1}, None],
    ],
    vertical_spacing=0.15,
    horizontal_spacing=0.1,
)

# Panel position (row, col) for each entry of nums_grouped, in order.
positions = [
    (1, 1), (1, 2), (1, 3),
    (2, 1), (2, 2),
]

for idx, grouped_counts in enumerate(nums_grouped):
    panel_row, panel_col = positions[idx]

    # Y axis upper limit: twice the median of all counts shown in this panel.
    all_data = []
    for replicate_counts in grouped_counts:
        all_data.extend(replicate_counts)
    median_val = np.median(all_data)
    y_max = median_val * 2

    for i, replicate_counts in enumerate(grouped_counts):
        box = go.Box(
            y=replicate_counts,
            boxpoints='all',
            pointpos=0,
            showlegend=False,
            marker_color=colors[i],
            line_color=colors[i],
        )
        fig.add_trace(box, row=panel_row, col=panel_col)

    fig.update_yaxes(range=[0, y_max], row=panel_row, col=panel_col)

fig.update_layout(
    width=1200,
    height=800,
    template='none',
    showlegend=False,
)

# Optional export:
#fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure3\supplfig2e.pdf', height = 800, width = 1200)

# Figure 3H

In [None]:
# Keep only columns in which at least 70% of the values are present (non-missing).
valid_fraction_500 = 1 - (
    df_stemcells_500cells_normalized_collapsed.isna().sum()
    / len(df_stemcells_500cells_normalized_collapsed)
)
df_stemcells_500cells_normalized_collapsed_filt = df_stemcells_500cells_normalized_collapsed.loc[
    :, valid_fraction_500 >= 0.7
]

# Impute the remaining missing values, then expose the index as regular columns.
df_stemcells_500cells_normalized_collapsed_imp = ac.imputation_normal_distribution(
    df_stemcells_500cells_normalized_collapsed_filt
).reset_index()

# Rows whose group is exactly 'RA24' are excluded from this analysis.
not_ra24 = df_stemcells_500cells_normalized_collapsed_imp['group'] != 'RA24'
df_stemcells_500cells_normalized_collapsed_imp = df_stemcells_500cells_normalized_collapsed_imp[not_ra24]


In [None]:
# ANOVA on the imputed 500-cell table (RA24 rows were removed in the previous cell).
# The result's 'rejected' and 'identifier' columns are used below to pick significant sites.
df_anova = ac.run_anova(df_stemcells_500cells_normalized_collapsed_imp) # ANOVA test

In [25]:
# Copy of the imputed table without the 'sample' and 'subject' columns.
df_stemcells_500cells_normalized_collapsed_imp_tmp = (
    df_stemcells_500cells_normalized_collapsed_imp
    .drop(columns=['sample', 'subject'])
)

In [None]:
# Spreadsheet with marker transcription genes for stem-cell differentiation.
# NOTE(review): absolute local path; the file must exist on the machine running this.
marker_genes = pd.read_excel(r'D:\Marker genes.xlsx') #list of marker transcription genes for stem cell differentiation

In [86]:
# Gene names whose significant sites are kept for the Figure 3H heatmap.
to_include = [
    'Pou3f1',
    'Dnmt3a',
    'Dbn1',
    'Krt18',
    'Sox3',
    'Esrrb',
    'Sox2',
    'Otx2',
    'Dnmt3b',
    'Pou5f1',
    'Dnmt3l',
    'Tet2',
    'Nanog',
    'Nes',
]

In [87]:
# Build the z-scored matrix for the Figure 3H heatmap: ANOVA-significant sites that
# belong to one of the genes in `to_include`, with one column per sample.
#
# NOTE: this cell overwrites df_stemcells_500cells_normalized_collapsed_imp with the
# reshaped matrix, so it cannot be re-run without first re-running the imputation
# cell (the 'group' / 'sample' / 'subject' columns no longer exist afterwards).

# Sites flagged as significant by the ANOVA.
anova_df_sig = df_anova[df_anova['rejected'] == True]

# Sample annotation, saved before those columns are dropped.
meta = df_stemcells_500cells_normalized_collapsed_imp[['group', 'sample', 'subject']]

# Samples become columns; the former column labels become the 'PTM_Collapse_key' column.
sites_by_sample = (
    df_stemcells_500cells_normalized_collapsed_imp
    .set_index('sample')
    .drop(['group', 'subject'], axis=1)
    .T
    .reset_index()
)

# Keep ANOVA-significant sites only.
is_significant = sites_by_sample['PTM_Collapse_key'].isin(anova_df_sig['identifier'])
significant_sites = sites_by_sample[is_significant]

# Gene name = text between '~' and the first following '_' in the collapse key.
# It is held in a separate Series and used as a mask, rather than assigned as a column
# on the filtered frame, which avoids pandas' SettingWithCopyWarning.
gene_names = significant_sites['PTM_Collapse_key'].apply(
    lambda key: key.split('~')[1].split('_')[0]
)
df_stemcells_500cells_normalized_collapsed_imp = (
    significant_sites[gene_names.isin(to_include)]
    .set_index('PTM_Collapse_key')
)

df_norm = z_normalize_data(df_stemcells_500cells_normalized_collapsed_imp)
linkage_samples, linkage_features = perform_hierarchical_clustering(
    df_norm, method='ward', metric='euclidean'
)

In [None]:
# Clustered heatmap of the z-scored matrix; the figure is saved to `save_path`
# (absolute local path).
clustermap = plot_clustermap(
    df_norm,
    figsize=(8, 7),
    method='ward',
    metric='euclidean',
    n_clusters=1,
    save_path=r'D:\Projects\nanoPhos\images_export\img10.pdf',
)