In [2]:
import sys
from alphaPhosHelperFunctions import *
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from scipy import stats
import plotly.figure_factory as ff
from PeptideCollapse import *
import analytics_core_V04 as ac
import kinase_library as kl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# Custom functions

In [2]:
def run_kinase_prediction (dataset, kinase_df, group1, group2, kinase_class = "ser_thr"):
    """
    Run a two-group t-test and feed the result into a kinase-library enrichment.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with a 'group' column; only rows whose group equals `group1` or
        `group2` are passed to `ac.run_ttest`. No filtering or imputation is
        done here.
    kinase_df : pandas.DataFrame
        Table with 'kinase_sequence' and 'PTM_Collapse_key' columns. It is
        copied, not modified.
    group1, group2 : str
        Group labels to compare.
    kinase_class : str, optional
        Passed as `kin_type` to `kinase_enrichment` (default "ser_thr").

    Returns
    -------
    tuple
        (`combined_enrichment_results` of the enrichment object, the enrichment
        object itself).

    Notes
    -----
    The t-test result columns are renamed positionally, so this assumes
    `ac.run_ttest` returns exactly 17 columns in the order listed below.
    """
    # `re` is not imported at notebook level, so import it where it is used.
    import re
    import kinase_library as kl

    def _upper_marked(match):
        # Upper-case the residue(s) enclosed in a pair of asterisks.
        return match.group(1).upper()

    def _site_key(collapse_key):
        # '<acc>~<gene>_<site>_...' -> '<gene>_p<site>'
        parts = collapse_key.split('~')[1].split('_')
        return parts[0] + '_' + 'p' + parts[1]

    ###### Preparing kinase format
    kinase_df_copy = kinase_df.copy()
    kinase_df_copy['kinase_sequence'] = [
        re.sub(r'\*([^*]+)\*', _upper_marked, sequence).replace('_', '').upper()
        for sequence in kinase_df_copy['kinase_sequence'].tolist()
    ]
    kinase_df_copy['PTM_Collapse_key'] = kinase_df_copy['PTM_Collapse_key'].apply(_site_key)

    ###### Differential test between the two requested groups
    dataset_copy = dataset.copy()
    dataset_copy = dataset_copy[(dataset_copy['group'] == group1) | (dataset_copy['group'] == group2)]
    ttest_result = ac.run_ttest(dataset_copy, group1, group2)
    ttest_result['identifier'] = ttest_result['identifier'].apply(_site_key)
    ttest_result = ttest_result.sort_values('identifier')
    ttest_result.columns = ['PTM_Collapse_key', 'T-statistics', 'pvalue', 'mean_group1', 'mean_group2',
        'std(group1)', 'std(group2)', 'log2FC', 'test', 'correction', 'padj',
       'rejected', 'group1', 'group2', 'FC', '-log10 pvalue', 'Method']

    # Inner merge: only sites present in both tables reach the enrichment.
    kinases = kinase_df_copy.merge(ttest_result, on = 'PTM_Collapse_key')
    kinases = kinases[['PTM_Collapse_key','kinase_sequence', 'log2FC', 'T-statistics', 'padj']]
    kinases.columns = ['Phosphosites', 'Sequence', 'logFC', 't', 'adj.P.Val']

    test = kl.DiffPhosData(kinases, lfc_col='logFC', seq_col='Sequence', pval_col='adj.P.Val', pval_thresh=0.1)
    kin_type = kinase_class
    method = 'percentile_rank'
    thresh = 15
    test1 = test.kinase_enrichment(kin_type=kin_type, kl_method=method, kl_thresh=thresh)
    fin_df = test1.combined_enrichment_results
    return fin_df, test1

In [3]:
class PhosphoAnalysis:
    """
    Thin wrapper around PeptideCollapse that keeps the collapsed table and
    reshapes it into a samples-by-sites table with group labels.

    Attributes set by the methods:
    - processor: the PeptideCollapse instance used for the pipeline
    - collapsed_data: result of the last `peptideCollapse` call
    - formatted_data / condition_df: results of the last `assign_condition_setup` call
    """

    def __init__(self):
        self.processor = PeptideCollapse()
        self.condition_df = None
        self.collapsed_data = None
        self.formatted_data = None

    def peptideCollapse(self, data, **kwargs):
        """Run `process_complete_pipeline` on `data`, store and return the result.

        All keyword arguments are forwarded unchanged to the pipeline.
        """
        self.collapsed_data = self.processor.process_complete_pipeline(
            data=data,
            **kwargs
        )
        return self.collapsed_data

    def assign_condition_setup(self, condition_df=None):
        """Transpose the collapsed table to samples x sites and attach group labels.

        Parameters
        ----------
        condition_df : pandas.DataFrame, optional
            Table with 'Sample' and 'Condition' columns. When omitted, the table
            is taken from `processor.get_precursor_condition_dataset()` and the
            condition of every sample is requested interactively via `input()`.

        Returns
        -------
        pandas.DataFrame
            One row per sample, one column per 'PTM_Collapse_key', plus
            'group', 'sample' ('<group>_<running number>') and 'subject'.

        Raises
        ------
        ValueError
            If `peptideCollapse` has not been called on this instance yet.
        """
        import warnings
        # Note: this filter is installed globally and stays active after the call.
        warnings.filterwarnings('ignore', category=FutureWarning)

        if self.collapsed_data is None:
            raise ValueError(
                "No collapsed data available: call peptideCollapse() before assign_condition_setup()."
            )

        df_copy = self.collapsed_data.copy()

        if condition_df is None:
            condition_df = self.processor.get_precursor_condition_dataset()

            print("Assign conditions to each sample")

            # Write by the row's own label so a non-default index does not
            # silently append new rows instead of filling existing ones.
            for row_label, sample in condition_df['Sample'].items():
                condition = input(f"Enter condition for '{sample}': ")
                condition_df.loc[row_label, 'Condition'] = condition

        # Sample columns are quantitative; everything else is metadata.
        quant_cols = condition_df['Sample'].unique().tolist()
        meta_cols = [col for col in df_copy.columns if col not in quant_cols]
        quant_df, meta_df = df_copy[quant_cols].T, df_copy[meta_cols].T

        tmp_dict = dict(zip(condition_df['Sample'], condition_df['Condition']))
        quant_df.columns = meta_df.loc["PTM_Collapse_key"]
        quant_df['group'] = quant_df.index.map(tmp_dict)
        quant_df['sample'] = (quant_df['group'] + '_' + (quant_df.groupby('group').cumcount() + 1).astype(str))
        quant_df['subject'] = quant_df['sample']

        self.formatted_data = quant_df
        self.condition_df = condition_df

        return quant_df

In [43]:
def normalize_phospho_median(phospho_df, protein_df, return_non_matched=False):
    """
    Normalize phosphoproteomics data using condition-level protein-specific normalization.

    Each phosphosite is normalized by subtracting the median abundance of its
    parent protein within each condition. Rows sharing the same index label are
    treated as replicates of the same condition.

    Parameters
    ----------
    phospho_df : pandas.DataFrame
        Samples as index, phosphosites as columns. Column names must contain the
        protein identifier (e.g. 'P12345~PROTEIN_S123'). Values are expected to
        be log-transformed, since normalization is a subtraction.
    protein_df : pandas.DataFrame
        Samples as index, proteins (or ';'-separated protein groups) as columns.
        Values are expected to be log-transformed.
    return_non_matched : bool, optional
        If True, all phosphosites are returned, including those that could not be
        normalized (left unchanged). If False (default), only phosphosites that
        were normalized in at least one condition are returned.

    Returns
    -------
    dict
        - 'normalized_phospho': DataFrame with protein-normalized phospho data
          (columns keep the order of `phospho_df`)
        - 'condition_protein_medians': DataFrame with median protein values per condition
        - 'phospho_to_protein_mapping': dict mapping phosphosites to protein IDs
        - 'normalization_success_rate': fraction of site/sample values normalized
        - 'common_conditions': list of conditions present in both datasets
        - 'original_phospho': phospho data restricted to the same rows/columns
        - 'original_protein': protein data restricted to the common conditions

    Notes
    -----
    - The protein ID is the text before '~', or before the first '_' when the
      name has no '~'.
    - A protein ID matches a protein column when it equals one of the column's
      ';'-separated members; the first matching column is used.
    - A site whose protein median is missing in one condition stays unnormalized
      in that condition but is still returned if another condition succeeded.

    Raises
    ------
    ValueError
        If no common conditions are found between the two datasets.

    Examples
    --------
    >>> results = normalize_phospho_median(phospho_data, protein_data)
    >>> normalized_data = results['normalized_phospho']
    >>> success_rate = results['normalization_success_rate']
    >>> print(f"Successfully normalized {success_rate:.1%} of phosphosites")
    """
    # Find common conditions
    common_conditions = set(phospho_df.index) & set(protein_df.index)

    if len(common_conditions) == 0:
        raise ValueError("No common conditions found between phospho and protein data!")

    print(f"Found {len(common_conditions)} common conditions: {sorted(common_conditions)}")

    # Filter to common conditions
    phospho_matched = phospho_df.loc[phospho_df.index.isin(common_conditions)]
    protein_matched = protein_df.loc[protein_df.index.isin(common_conditions)]

    print(f"Phospho samples: {len(phospho_matched)}")
    print(f"Protein samples: {len(protein_matched)}")

    def extract_protein_id(phosphosite_name):
        """Extract protein ID from phosphosite name (format: 'A0A087WUL8~NBPF19_S364_M1')"""
        return (
            phosphosite_name.split("~")[0]
            if "~" in phosphosite_name
            else phosphosite_name.split("_")[0]
        )

    # Create phosphosite to protein mapping
    phospho_to_protein = {
        phosphosite: extract_protein_id(phosphosite) for phosphosite in phospho_matched.columns
    }

    print(f"Mapped {len(phospho_to_protein)} phosphosites to proteins")

    # Calculate condition-level protein medians (one row per condition)
    condition_protein_medians = protein_matched.groupby(protein_matched.index).median()

    print(f"Calculated protein medians for {len(condition_protein_medians)} conditions")
    print(f"Protein medians shape: {condition_protein_medians.shape}")

    # Protein ID -> first protein column listing it. Built once instead of
    # rescanning every protein column for each (condition, phosphosite) pair.
    protein_column_for_id = {}
    for col in condition_protein_medians.columns:
        for member in col.split(";"):
            protein_column_for_id.setdefault(member, col)

    # Normalize phospho data
    normalized_phospho = phospho_matched.copy()
    successfully_normalized_phosphosites = set()
    normalization_stats = {
        "total_values": 0,
        "normalized": 0,
        "protein_not_found": 0,
        "missing_protein_values": 0,
    }

    for condition in common_conditions:
        condition_mask = phospho_matched.index == condition
        n_replicates = int(condition_mask.sum())
        condition_phospho = phospho_matched.loc[condition_mask]

        for phosphosite in phospho_matched.columns:
            normalization_stats["total_values"] += n_replicates

            protein_col = protein_column_for_id.get(phospho_to_protein[phosphosite])
            if protein_col is None:
                normalization_stats["protein_not_found"] += n_replicates
                continue

            condition_protein_median = condition_protein_medians.loc[condition, protein_col]
            if pd.isna(condition_protein_median):
                normalization_stats["missing_protein_values"] += n_replicates
                continue

            # Normalize all replicates of this condition for this phosphosite
            normalized_phospho.loc[condition_mask, phosphosite] = (
                condition_phospho[phosphosite] - condition_protein_median
            )
            normalization_stats["normalized"] += n_replicates
            successfully_normalized_phosphosites.add(phosphosite)

    # Filter to only successfully normalized phosphosites (unless return_non_matched=True)
    if not return_non_matched:
        print("Filtering to only successfully normalized phosphosites...")
        print(f"Original phosphosites: {len(phospho_matched.columns)}")
        print(f"Successfully normalized: {len(successfully_normalized_phosphosites)}")
        print(
            f"Removed: {len(phospho_matched.columns) - len(successfully_normalized_phosphosites)}"
        )

        # Select with a mask over the existing columns so the output keeps the
        # input column order; indexing with list(set) gave an arbitrary order.
        keep_mask = phospho_matched.columns.isin(successfully_normalized_phosphosites)
        normalized_phospho = normalized_phospho.loc[:, keep_mask]

        # Also update the original phospho for consistency
        phospho_matched = phospho_matched.loc[:, keep_mask]

    # Calculate success rate
    success_rate = (
        normalization_stats["normalized"] / normalization_stats["total_values"]
        if normalization_stats["total_values"] > 0
        else 0
    )

    print("\nNormalization statistics:")
    print(f"  Total phosphosite-sample combinations: {normalization_stats['total_values']:,}")
    print(f"  Successfully normalized: {normalization_stats['normalized']:,} ({success_rate:.1%})")
    print(f"  Protein not found: {normalization_stats['protein_not_found']:,}")
    print(f"  Missing protein values: {normalization_stats['missing_protein_values']:,}")

    return {
        "normalized_phospho": normalized_phospho,
        "condition_protein_medians": condition_protein_medians,
        "phospho_to_protein_mapping": phospho_to_protein,
        "normalization_success_rate": success_rate,
        "common_conditions": list(common_conditions),
        "original_phospho": phospho_matched,
        "original_protein": protein_matched,
    }

In [5]:
def replace_between_asterics(match):
    """Return the first captured group of a regex match, upper-cased."""
    captured = match.group(1)
    return captured.upper()

# Data upload

In [6]:
# Dilution-series reports, one tab-separated file per input amount (10-1000 ng).
# The ff_* tables are read from the "FF" files, the ffpe_* tables from the "FFPE" files.
# NOTE(review): paths are absolute and machine-specific (W: drive).
ff_10ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FF_10ng_Report.tsv', sep = '\t')
ff_20ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FF_20ng_Report.tsv', sep = '\t')
ff_50ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FF_50ng_Report.tsv', sep = '\t')
ff_100ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FF_100ng_Report.tsv', sep = '\t')
ff_200ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FF_200ng_Report.tsv', sep = '\t')
ff_500ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FF_500ng_Report.tsv', sep = '\t')
ff_1000ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FF_1000ng_Report.tsv', sep = '\t')

# FFPE counterpart of the series above (same input amounts)

ffpe_10ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FFPE_10ng_Report.tsv', sep = '\t')
ffpe_20ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FFPE_20ng_Report.tsv', sep = '\t')
ffpe_50ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FFPE_50ng_Report.tsv', sep = '\t')
ffpe_100ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FFPE_100ng_Report.tsv', sep = '\t')
ffpe_200ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FFPE_200ng_Report.tsv', sep = '\t')
ffpe_500ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FFPE_500ng_Report.tsv', sep = '\t')
ffpe_1000ng = pd.read_csv(r'W:\User\Denys\SN_diffs\nanoPhos_submission\figure4\nanoPhos_dilser_FFPE_1000ng_Report.tsv', sep = '\t')

In [None]:
# FF-versus-FFPE report at 1000 ng; collapsed further down for Figures 4c and 4d.
compare_1ug = pd.read_csv(r'W:\User\Denys\SN_diffs\FF_versus_FFPE_report_1000ng.tsv', sep = '\t')

In [10]:
# Lung AC phospho report; collapsed below against the human FASTA.
df_phospho = pd.read_csv(r'D:\Projects\nanoPhos\SN_diffs\nanoPhos_lungAC_Report.tsv', sep = '\t')

In [None]:
# Lung AC proteome (DVP) report. Not used in the portion of the notebook shown here.
df_proteome = pd.read_csv(r'D:\Projects\nanoPhos\SN_diffs\DVP_lungAC_Report.tsv',sep = '\t')

In [None]:
# Annotation table; get_annotation() reads its 'UniProt', 'GOCC name' and 'KEGG name' columns.
annotation = pd.read_csv(r'C:\Users\oliinyk\Desktop\mainAnnot.mus_musculus.txt', sep = '\t')

In [7]:
# Reports ordered by increasing input amount; both lists share the order of `ids` (10 ... 1000).
ff_list = [ff_10ng, ff_20ng, ff_50ng, ff_100ng, ff_200ng, ff_500ng, ff_1000ng]
ffpe_list = [ffpe_10ng, ffpe_20ng, ffpe_50ng, ffpe_100ng, ffpe_200ng, ffpe_500ng, ffpe_1000ng]

# Run PeptideCollapse on data 

In [None]:
# Collapse every dilution report with a fresh PeptideCollapse instance,
# alternating FF and FFPE for the same input amount.
ff_list_collapsed = []
ffpe_list_collapsed = []
for position, ff_report in enumerate(ff_list):
    pc = PeptideCollapse()
    ff_collapsed = pc.process_complete_pipeline(
        ff_report, cutoff=0, fasta_path=r'D:\Projects\Spectral libraries\mouse.fasta'
    )
    ff_list_collapsed.append(ff_collapsed)

    pc = PeptideCollapse()
    ffpe_collapsed = pc.process_complete_pipeline(
        ffpe_list[position], cutoff=0, fasta_path=r'D:\Projects\Spectral libraries\mouse.fasta'
    )
    ffpe_list_collapsed.append(ffpe_collapsed)

In [10]:
# Row count of each of the seven collapsed tables, in input-amount order.
nums_ffpe_total = [len(ffpe_list_collapsed[k]) for k in range(7)]
nums_ff_total = [len(ff_list_collapsed[k]) for k in range(7)]
# Input amounts matching the list positions above.
ids = [10, 20, 50, 100, 200, 500, 1000]

In [None]:
# Collapse the 1000 ng FF-vs-FFPE report; `builder` keeps the result in builder.collapsed_data.
builder = PhosphoAnalysis()
compare_1ug_collapsed = builder.peptideCollapse(compare_1ug, cutoff = 0, fasta_path=r'D:\Projects\Spectral libraries\mouse.fasta')

In [None]:
# Rename one run whose file name carries an extra numeric suffix, so it matches the other run names.
df_phospho['R.FileName'] = df_phospho['R.FileName'].replace('20250923_OA4_Evo11_Whisper80_DeOl_lungAC_phosphoDVP_K13_20250924081708', '20250923_OA4_Evo11_Whisper80_DeOl_lungAC_phosphoDVP_K13')
# NOTE(review): this rebinds `builder`; from here on builder.collapsed_data is the
# lung AC table, not the FF-vs-FFPE table collapsed in the previous cell.
builder = PhosphoAnalysis()
df_phospho_collapsed = builder.peptideCollapse(df_phospho, cutoff = 0.75,fasta_path = r'D:\Projects\Spectral libraries\human.fasta')

# Figure 4a

In [None]:
# Figure 4a: one cumulative bar plot per preparation, written as two PDFs (4a1 = FF, 4a2 = FFPE).
# NOTE(review): get_cumulative_barplot is not defined in this notebook; presumably it
# comes from one of the star imports - confirm. The meaning of the positional `3`
# is not visible here (possibly the replicate count).
figure = get_cumulative_barplot(ff_list_collapsed, 3, point_size=9, point_color='black')
figure.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\figure4a1.pdf', height = 600, width = 600)

figure = get_cumulative_barplot(ffpe_list_collapsed, 3, point_size=9, point_color='black')
figure.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\figure4a2.pdf', height = 600, width = 600)

# Supplementary Figures 3a & 3b

In [11]:
def r_squared(y_actual, y_predicted):
    """Coefficient of determination, 1 - SS_res / SS_tot.

    Parameters
    ----------
    y_actual, y_predicted : array-like
        Observed and predicted values of equal length. Plain Python lists are
        accepted; both inputs are converted to float arrays first.

    Returns
    -------
    float
        R-squared. When all observed values are identical SS_tot is 0 and the
        result is not finite (numpy emits a RuntimeWarning).
    """
    # Lists do not support element-wise subtraction, so coerce up front.
    y_actual = np.asarray(y_actual, dtype=float)
    y_predicted = np.asarray(y_predicted, dtype=float)
    ss_res = np.sum((y_actual - y_predicted) ** 2)
    ss_tot = np.sum((y_actual - np.mean(y_actual)) ** 2)
    return 1 - (ss_res / ss_tot)

def linear(x, a, b):
    """Straight-line model for curve fitting: slope `a`, intercept `b`."""
    scaled = a * x
    return scaled + b

def logarithmic(x, a, b):
    """Natural-log model for curve fitting: a * ln(x) + b."""
    log_x = np.log(x)
    return a * log_x + b

# NOTE(review): `models` is never referenced in the visible part of the notebook.
models = []


In [12]:
# Linear fit of FFPE collapsed-table row counts (y) against input amount (x).
x = np.array(ids)
y = np.array(nums_ffpe_total)

popt_lin, pcov_lin = curve_fit(linear, x, y)
a_lin, b_lin = popt_lin

# R-squared of the fit, computed inline (same formula as r_squared()).
y_pred_lin = linear(x, a_lin, b_lin)
ss_res_lin = np.sum((y - y_pred_lin) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
r2_lin = 1 - (ss_res_lin / ss_tot)

# Dense x grid for drawing the fitted line.
x_smooth = np.linspace(min(x), max(x), 200)
y_smooth_lin = linear(x_smooth, a_lin, b_lin)

In [None]:
## Suppl. Figure 3b
# FFPE counts (markers) with the fitted straight line (dashed); the legend entry
# carries the fitted equation and R-squared.
fig1 = go.Figure()

fig1.add_trace(go.Scatter(
    x=x, 
    y=y,
    mode='markers',
    name='Data Points',
    marker=dict(size=10, color='red'),
    hovertemplate='<b>Data Point</b><br>X: %{x}<br>Y: %{y}<extra></extra>'
))
fig1.add_trace(go.Scatter(
    x=x_smooth,
    y=y_smooth_lin,
    mode='lines',
    name=f'Linear Fit<br>y = {a_lin:.1f}x + {b_lin:.1f}<br>R² = {r2_lin:.3f}',
    line=dict(color='blue', width=3, dash='dash'),
    hovertemplate='<b>Linear Curve</b><br>X: %{x:.1f}<br>Y: %{y:.1f}<extra></extra>'
))
# NOTE(review): axis titles are generic placeholders.
fig1.update_layout(
    xaxis_title='X Values',
    yaxis_title='Y Values',
    hovermode='closest',
    template='plotly_white',
    width=800,
    height=600
)
# Export is disabled here, so this cell does not write the PDF.
#fig1.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\suppl_figure3b.pdf', height = 600, width = 600)

In [14]:
# Logarithmic fit of FF collapsed-table row counts against input amount.
popt_log, pcov_log = curve_fit(logarithmic, ids, nums_ff_total)
a, b = popt_log

# R-squared of the fit; `ids` and `nums_ff_total` are lists, the subtraction
# works because y_pred_log is a numpy array.
y_pred_log = logarithmic(ids, a, b)
ss_res = np.sum(( nums_ff_total - y_pred_log) ** 2)
ss_tot = np.sum(( nums_ff_total - np.mean(nums_ff_total)) ** 2)
r2_log = 1 - (ss_res / ss_tot)

# Rebinds `x_smooth` (and `ss_tot` above) from the linear-fit cell.
x_smooth = np.linspace(min(ids), max(ids), 200)
y_smooth_log = logarithmic(x_smooth, a, b)

print(f"Logarithmic fit: y = {a:.3f}ln(x) + {b:.3f}")
print(f"R² = {r2_log:.4f}")

Logarithmic fit: y = 7000.658ln(x) + -17794.079
R² = 0.9606


In [None]:
## Suppl. Figure 3a
# FF counts (markers) with the fitted logarithmic curve (dashed), exported as PDF.
# Reuses the name `fig1` from the linear-fit figure.
fig1 = go.Figure()

fig1.add_trace(go.Scatter(
    x=ids, 
    y=nums_ff_total,
    mode='markers',
    name='Data Points',
    marker=dict(size=10, color='red'),
    hovertemplate='<b>Data Point</b><br>X: %{x}<br>Y: %{y}<extra></extra>'
))
fig1.add_trace(go.Scatter(
    x=x_smooth,
    y=y_smooth_log,
    mode='lines',
    name=f'Logarithmic Fit<br>y = {a:.1f}ln(x) + {b:.1f}<br>R² = {r2_log:.3f}',
    line=dict(color='blue', width=3, dash = 'dash'),
    hovertemplate='<b>Fitted Curve</b><br>X: %{x:.1f}<br>Y: %{y:.1f}<extra></extra>'
))
# NOTE(review): title and axis titles are generic placeholders.
fig1.update_layout(
    title='Logarithmic Curve Fit - Interactive Plot',
    xaxis_title='X Values',
    yaxis_title='Y Values',
    hovermode='closest',
    template='plotly_white',
    width=800,
    height=600
)
# width/height given here override the 800x600 layout for the exported file.
fig1.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\suppl_figure3a.pdf', height = 600, width = 600)

# Figure 4b

In [None]:
# Per-row coefficient of variation (std / mean) for every FF table.
cv_ff = []
for df in ff_list_collapsed:
    # assumes the first three columns are the replicate intensities on a log2 scale - TODO confirm
    linear_values = np.power(2, df.iloc[:,:3])

    n_valid = linear_values.notna().sum(axis=1)
    
    cv = linear_values.std(axis=1) / linear_values.mean(axis=1)
    # A CV of exactly 0 is treated as missing.
    cv = cv.replace(0, np.nan)

    # Keep only rows quantified in all three columns.
    # NOTE(review): the FFPE loop uses `n_valid < 2` instead - confirm which cutoff is intended.
    cv[n_valid < 3] = np.nan
    
    cv_ff.append(cv)

In [10]:
# Per-row coefficient of variation (std / mean) for every FFPE table.
cv_ffpe = []
for df in ffpe_list_collapsed:
    # assumes the first three columns are the replicate intensities on a log2 scale - TODO confirm
    linear_values = np.power(2, df.iloc[:,:3])

    n_valid = linear_values.notna().sum(axis=1)
    
    cv = linear_values.std(axis=1) / linear_values.mean(axis=1)
    # A CV of exactly 0 is treated as missing.
    cv = cv.replace(0, np.nan)

    # Keep rows quantified in at least two columns.
    # NOTE(review): the FF loop requires all three (`n_valid < 3`) - the two
    # preparations are filtered differently; confirm which cutoff is intended.
    cv[n_valid < 2] = np.nan
    
    cv_ffpe.append(cv)

In [14]:
# Pool the per-table CV values into one flat list per preparation.
cv_ff_flat = []
for per_table_cv in cv_ff:
    cv_ff_flat.extend(per_table_cv)

cv_ffpe_flat = []
for per_table_cv in cv_ffpe:
    cv_ffpe_flat.extend(per_table_cv)

In [16]:
# Median CV over all FFPE tables pooled together (NaN entries ignored).
np.nanmedian(cv_ffpe_flat)

0.17118499734001127

In [None]:
# Median CV per FFPE table, in input-amount order. The tables come from separate
# pipeline runs and need not have equal row counts, so they cannot be stacked
# into one 2-D array and reduced along an axis.
[np.nanmedian(per_table_cv) for per_table_cv in cv_ffpe]

In [312]:
# Seven shades per palette, one per input amount, light to dark.
# Figure 4b uses `color_palette` for FF and `color_palette_violet` for FFPE.
color_palette = ['#EEA69B', '#E78373', '#E1604C', '#DB452E', '#B3321E', '#8C2718', '#641C11']
color_palette_violet = ['#DFCBEC', '#CAA9DF', '#B588D3', '#A066C7', '#8B45BA', '#723899', '#592C78']

In [314]:
# Figure 4b: CV box plots, FF and FFPE interleaved per input amount.
# Traces are added without names, so the boxes are distinguished by colour only.
fig = go.Figure()
for i, el in enumerate(cv_ff):
    fig.add_trace(go.Box(y = el, marker_color = color_palette[i]))
    fig.add_trace(go.Box(y = cv_ffpe[i], marker_color = color_palette_violet[i]))
fig.update_layout(width = 800, height = 600, template = 'plotly_white')
fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\figure4b.pdf', height = 600, width = 800)

# Figure 4d

In [None]:

# Use a dedicated PhosphoAnalysis instance for the FF-vs-FFPE 1000 ng comparison.
# The shared name `builder` is rebound to the lung AC dataset in an earlier cell,
# so calling builder.assign_condition_setup() here would reshape the wrong table
# when the notebook is run top to bottom.
builder_1ug = PhosphoAnalysis()
builder_1ug.peptideCollapse(compare_1ug, cutoff = 0, fasta_path=r'D:\Projects\Spectral libraries\mouse.fasta')
# No condition table is passed, so the conditions are requested interactively.
compare_1ug_collapsed_cond = builder_1ug.assign_condition_setup()
# Keep columns with at least 70% non-missing values across samples.
compare_1ug_collapsed_filt = compare_1ug_collapsed_cond.loc[:, (1 - (compare_1ug_collapsed_cond.isna().sum() / len(compare_1ug_collapsed_cond))) >=0.7]
compare_1ug_collapsed_imp = ac.imputation_normal_distribution(compare_1ug_collapsed_filt).reset_index()
pca_tissue = ac.run_pca(compare_1ug_collapsed_imp)

In [111]:
# Second element of the PCA result: axis titles (with explained variance) and the grouping column.
pca_tissue[1]

{'x_title': 'PC1 (0.64)', 'y_title': 'PC2 (0.13)', 'group': 'group'}

In [26]:
# Figure 4d: PCA scatter coloured by group.
# presumably pca_tissue[0][0] is the table of sample coordinates with 'x', 'y' and 'group' columns - verify against ac.run_pca
fig = px.scatter(pca_tissue[0][0], x = 'x', y = 'y', hover_name = 'group',color= 'group', color_discrete_sequence= ['#846db1','#e64126'])
fig.update_layout(width = 500, height = 500, template = 'none')
fig.update_traces(marker=dict(
        size=18,         
        opacity=1,               
        line=dict(
            color='DarkSlateGray',         
            width=1                
        )
    ))
fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\figure4d.pdf', height = 500, width = 500)

# Figure 4c

In [None]:
# Re-collapse the 1000 ng comparison with a fresh processor; this rebinds
# `compare_1ug_collapsed` from the earlier builder-based cell.
pc = PeptideCollapse()
compare_1ug_collapsed = pc.process_complete_pipeline(compare_1ug, cutoff = 0, fasta_path=r'D:\Projects\Spectral libraries\mouse.fasta')
# Drops every row with a missing value in ANY column, metadata columns included.
compare_1ug_collapsed = compare_1ug_collapsed.dropna()
# assumes columns 0-2 are the FFPE replicates and columns 3-5 the FF replicates - TODO confirm against the column order
compare_1ug_collapsed['FFPE_mean'] = compare_1ug_collapsed.iloc[:,[0,1,2]].apply(np.mean, axis = 1)
compare_1ug_collapsed['FF_mean'] = compare_1ug_collapsed.iloc[:,[3,4,5]].apply(np.mean, axis = 1)

In [28]:
# Pearson correlation between the FFPE and FF row means.
pearson_r, pearson_p = stats.pearsonr(
    compare_1ug_collapsed['FFPE_mean'], 
    compare_1ug_collapsed['FF_mean']
)

# Least-squares line FF_mean ~ FFPE_mean; only slope and intercept are kept.
slope, intercept, _, _, _ = stats.linregress(
    compare_1ug_collapsed['FFPE_mean'], 
    compare_1ug_collapsed['FF_mean']
)

# Two end points are enough to draw the regression line in Figure 4c.
x_line = np.array([compare_1ug_collapsed['FFPE_mean'].min(), 
                   compare_1ug_collapsed['FFPE_mean'].max()])
y_line = slope * x_line + intercept

In [29]:
# Spearman rank correlation between the FFPE and FF row means.
spearman_r, spearman_p = stats.spearmanr(
    compare_1ug_collapsed['FFPE_mean'], 
    compare_1ug_collapsed['FF_mean']
)

print(f"Spearman correlation: {spearman_r:.4f}, p-value: {spearman_p:.4e}")

Spearman correlation: 0.7279, p-value: 0.0000e+00


In [30]:
from scipy.stats import pearsonr, spearmanr

In [31]:
# Same computation as pearson_r / pearson_p above; these names are overwritten by the KEGG loop further down.
pearson_coef, pearson_pval = pearsonr(compare_1ug_collapsed['FFPE_mean'], compare_1ug_collapsed['FF_mean'])

In [32]:
# Same computation as spearman_r / spearman_p above; these names are overwritten by the KEGG loop further down.
spearman_coef, spearman_pval = spearmanr(compare_1ug_collapsed['FFPE_mean'], compare_1ug_collapsed['FF_mean'])

In [33]:
# Figure 4c: FFPE mean vs FF mean per row, with the regression line (dashed).
fig = go.Figure()
fig.add_trace(go.Scatter(x = compare_1ug_collapsed['FFPE_mean'], y = compare_1ug_collapsed['FF_mean'], mode = 'markers', showlegend=False))
fig.add_trace(go.Scatter(
    x=x_line, 
    y=y_line, 
    mode='lines',
    line=dict(color='#4A4744', width=3, dash = 'dash'), showlegend=False
))
fig.update_layout(width = 600, height = 600, template = 'none')
# update_traces is applied to both traces; the line trace has no visible markers (mode='lines').
fig.update_traces(marker=dict(
        size=4,                    
        color='grey',         
        opacity=0.2,               
        line=dict(
            color='#0F0E0D',         
            width=0.2                
        )
    ))

fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\figure4c.pdf', height = 600, width = 600)

# Figure 4f

In [34]:
# Figure 4f: distribution plot of the FFPE and FF row means (bin size 0.1, no rug).
# Here `ff` is plotly.figure_factory, not an FF dataset.
fig = ff.create_distplot([compare_1ug_collapsed['FFPE_mean'], compare_1ug_collapsed['FF_mean']], group_labels=['FFPE', 'FF'], show_rug=False, bin_size=.1, colors=['#846db1','#e64126'])
fig.update_traces(opacity=0.7, selector=dict(type='histogram'))
fig.update_layout(width = 600, height = 600, template = 'none')
fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\figure4f.pdf', height = 600, width = 600)

# Figure 4e

In [35]:
def get_annotation(annotation_dataset, main_dataset, go_term):
    """Attach one annotation term per row to `main_dataset`.

    Parameters
    ----------
    annotation_dataset : pandas.DataFrame
        Must contain a 'UniProt' column and the `go_term` column, both holding
        ';'-separated strings.
    main_dataset : pandas.DataFrame
        Must contain a 'Protein_group' column used as the merge key.
    go_term : str
        Name of the annotation column to expand.

    Returns
    -------
    pandas.DataFrame
        Left merge of `main_dataset` with the long-format annotation. Each data
        row is repeated once per distinct term of its protein; rows without an
        annotation get NaN in '<go_term>_exploded'. Neither input is modified.
    """
    new_col_name = go_term + '_exploded'

    annotation_copy = annotation_dataset.copy()
    annotation_copy['Protein_group'] = annotation_copy['UniProt'].str.split(';')
    annotation_copy[new_col_name] = annotation_copy[go_term].str.split(';')

    # One row per (protein, term). The same pair can arise from several
    # annotation rows; without de-duplication the merge would repeat the
    # corresponding data rows and over-weight them downstream.
    annotation_long = (
        annotation_copy[['Protein_group', new_col_name]]
        .explode('Protein_group')
        .explode(new_col_name)
        .drop_duplicates()
    )

    fin_df = main_dataset.copy().merge(annotation_long, on='Protein_group', how='left')

    return fin_df

In [36]:
# Long-format GOCC and KEGG annotation of the FF-vs-FFPE table (one row per protein/term pair).
gocc = get_annotation(annotation, compare_1ug_collapsed, 'GOCC name')
kegg = get_annotation(annotation, compare_1ug_collapsed, 'KEGG name')

In [37]:
# Keep only the two mean columns, the protein key and the exploded term (rebinds gocc / kegg).
gocc = gocc[['FFPE_mean', 'FF_mean', 'Protein_group', 'GOCC name_exploded']]
kegg = kegg[['FFPE_mean', 'FF_mean', 'Protein_group', 'KEGG name_exploded']]

### Getting selected valid pathways

In [38]:
# KEGG pathway names evaluated for Figure 4e; matched by exact string equality against 'KEGG name_exploded'.
pathways_to_include_kegg = ['Wnt signaling pathway',
       'Notch signaling pathway', 'Jak-STAT signaling pathway', 'MAPK signaling pathway', 'mTOR signaling pathway', 'Calcium signaling pathway', 'ErbB signaling pathway', 'Neurotrophin signaling pathway', 'Insulin signaling pathway']

In [39]:
# Pearson and Spearman correlation of FFPE vs FF means, computed per selected KEGG pathway.
pearson_kegg = []
spearman_kegg = []
for el in pathways_to_include_kegg:
    tmp_df = kegg[kegg['KEGG name_exploded'] == el]
    # Overwrites the global pearson_coef / pearson_pval computed earlier for the full table.
    pearson_coef, pearson_pval = pearsonr(tmp_df['FFPE_mean'], tmp_df['FF_mean'])
    pearson_kegg.append(pearson_coef)

    # Likewise overwrites spearman_coef / spearman_pval.
    spearman_coef, spearman_pval = spearmanr(tmp_df['FFPE_mean'], tmp_df['FF_mean'])
    spearman_kegg.append(spearman_coef)

# One row per pathway, in the order of pathways_to_include_kegg.
fin_df = pd.DataFrame({'ID': pathways_to_include_kegg, 'Pearson': pearson_kegg, 'Spearman': spearman_kegg})

In [None]:
# Figure 4e: per-pathway Spearman (x) vs Pearson (y), both axes fixed to [0, 1].
fig = px.scatter(fin_df, x = 'Spearman', y = 'Pearson', hover_name = 'ID')

fig.update_layout(width = 600, height = 600, template = 'none')
fig.update_yaxes(range = [0,1])
fig.update_xaxes(range = [0,1])
fig.update_traces(marker=dict(
        size=18,                    
        color='#EB420C',         
        opacity=1,               
        line=dict(
            color='DarkSlateGray',         
            width=1                
        )
    ))

# Dashed identity line (Pearson == Spearman).
fig.add_shape(
    type='line',
    x0=0, y0=0,
    x1=1, y1=1,
    line=dict(
        color='darkgrey',
        width=2,
        dash='dash'
    )
)
# Export is disabled here, so this cell does not write the PDF.
#fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\figure4e.pdf', height = 600, width = 600)

# Supplementary Figure 3c

In [44]:
# Length of each entry of ff_list_collapsed divided by the length of the
# entry at the same position in ffpe_list_collapsed.
ratio = [
    len(ff_list_collapsed[position]) / len(ffpe_entry)
    for position, ffpe_entry in enumerate(ffpe_list_collapsed)
]

In [46]:
# x-axis categories for the ratio plot, one per input amount.
labels = [f'{amount}ng' for amount in (10, 20, 50, 100, 200, 500, 1000)]

In [54]:
# Ratio per input amount, exported as supplementary figure 3c.
ratio_trace = go.Scatter(
    x=labels,
    y=ratio,
    marker=dict(size=13, color='#312353'),
    line=dict(color='#846db1'),
)
fig = go.Figure(data=[ratio_trace])
fig.update_layout(width=600, height=600, template='plotly_white')
fig.update_yaxes(range=[0, 4.5])
fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\suppl_figure3c.pdf', height = 600, width = 600)

# Figure 4h 

### data preprocessing

In [27]:
# The last six columns of df_phospho_collapsed are set aside as `meta`;
# the cells below treat every column before them as a sample column.
meta = df_phospho_collapsed.iloc[:,-6:]

In [28]:
# Count the non-missing values in every sample column (all columns except the
# last six) and store them as {column name: count}.
labels = list(df_phospho_collapsed.iloc[:, :-6].columns)
nums = [len(df_phospho_collapsed[column].dropna()) for column in labels]
d = dict(zip(labels, nums))

In [29]:
# Lower cut-off on the per-sample count of valid values:
# median of the counts minus 1.5 times their interquartile range.
valid_counts = list(d.values())
threshold = np.nanmedian(valid_counts) - stats.iqr(valid_counts) * 1.5

In [30]:
# Sample columns whose valid-value count reaches the threshold, followed by
# the metadata columns.
columns_to_keep = [column for column in d if d[column] >= threshold]
columns_to_keep.extend(meta.columns)

In [31]:
# Sample columns whose valid-value count falls below the threshold.
# Strict `<` makes this the exact complement of the `>= threshold` rule used
# for columns_to_keep; with `<=` a column sitting exactly on the threshold
# was reported as removed although it is kept.
columns_to_remove = [col for col, count in d.items() if count < threshold]

In [32]:
# Restrict the collapsed phospho table to the retained sample columns plus
# the metadata columns (order as in columns_to_keep).
df_phospho_collapsed_filtered = df_phospho_collapsed[columns_to_keep]

### Fetching legend file for phospho and proteome analysis 

In [33]:
# Plate-layout / sample legend. The cells below read the columns 'sample_type',
# 'patient', 'biological_cohort' and '384_well_position' from it.
# NOTE(review): absolute local path; the notebook only runs on a machine with this file.
legend = pd.read_excel(r'D:\Random\lungAC_main_FFPE_tissue_nPhos_forDenys_plate_layout_20250922.xlsx')

In [34]:
# Recode sample_type to 'AC' (rows labelled 'tumor (adenocarcinoma)') or 'H'
# (every other row), then build sample_ID as <type>_<patient>_<cohort>.
#
# The column is overwritten in place, so the already-recoded value 'AC' is
# accepted as tumor too: otherwise re-running this cell would relabel every
# tumor sample as 'H'.
is_tumor = legend['sample_type'].isin(['tumor (adenocarcinoma)', 'AC'])
legend['sample_type'] = np.where(is_tumor, 'AC', 'H')
legend['sample_ID'] = legend['sample_type'] + '_' + legend['patient'] + '_' + legend['biological_cohort']

In [35]:
# Map every retained sample column to its legend entry. The well position is
# the last '_'-separated token of the column name and is the merge key.
sample_cols = df_phospho_collapsed_filtered.iloc[:, :-6].columns.tolist()
sample_cols_reduced = [column_name.split('_')[-1] for column_name in sample_cols]
sample_cols_df = pd.DataFrame({'Full_name': sample_cols, '384_well_position': sample_cols_reduced})

# Default (inner) merge: columns without a matching legend row are dropped.
sample_cols_df1 = sample_cols_df.merge(legend, on='384_well_position').assign(
    sample_ID_reduced=lambda merged: merged['sample_type'] + '_' + merged['biological_cohort']
)

# Two-column table (sample column name, condition label).
condition_df = sample_cols_df1[['Full_name', 'sample_ID_reduced']].rename(
    columns={'Full_name': 'Sample', 'sample_ID_reduced': 'Condition'}
)

### Setting a condition setup for phosphoproteome dataset, filtering and imputation

In [36]:
# Apply the Sample -> Condition table to the dataset held by `builder`
# (`builder` is created outside this section). The result is used below as a
# samples-by-sites frame with 'group', 'sample' and 'subject' columns.
df_phospho_collapsed1 = builder.assign_condition_setup(condition_df=condition_df)

In [37]:
# Keep only columns in which at least 70 % of the rows hold a value.
missing_fraction = df_phospho_collapsed1.isna().sum() / len(df_phospho_collapsed1)
sufficiently_complete = (1 - missing_fraction) >= 0.7
df_phospho_collapsed_filt = df_phospho_collapsed1.loc[:, sufficiently_complete]

In [38]:
# Fill the remaining missing values with ac.imputation_normal_distribution and
# move the index back into regular columns.
# NOTE(review): presumably draws random values; no seed is set in this
# section, so confirm whether results are reproducible across runs.
df_phospho_collapsed_imp = ac.imputation_normal_distribution(df_phospho_collapsed_filt).reset_index()

In [39]:
# Sanity check: (rows, columns) of the imputed table.
df_phospho_collapsed_imp.shape

(56, 6618)

### Fetching the proteome dataset

In [None]:

# One proteome column name carries an extra '_20250924081708' token after the
# well position; strip it so the well-position parsing below yields 'K13'.
df_proteome = df_proteome.rename(columns={'20250923_OA4_Evo11_Whisper80_DeOl_lungAC_phosphoDVP_K13_20250924081708.raw.PG.Quantity': '20250923_OA4_Evo11_Whisper80_DeOl_lungAC_phosphoDVP_K13.raw.PG.Quantity'})

# Every column after the first is a sample column; the well position is the
# last '_' token up to the first '.'.
sample_cols_prot = df_proteome.iloc[:, 1:].columns.tolist()
sample_cols_prot_reduced = [
    column_name.split('_')[-1].split('.')[0] for column_name in sample_cols_prot
]
sample_cols_prot_df = pd.DataFrame({'Full_name': sample_cols_prot, '384_well_position': sample_cols_prot_reduced})

# Join to the legend, add the <type>_<cohort> label, and keep only samples
# whose sample_ID also occurs in the phospho sample table.
sample_cols_prot_df1 = sample_cols_prot_df.merge(legend, on='384_well_position').assign(
    sample_ID_reduced=lambda merged: merged['sample_type'] + '_' + merged['biological_cohort']
)
in_phospho_set = sample_cols_prot_df1['sample_ID'].isin(sample_cols_df1['sample_ID'])
sample_cols_prot_df1 = sample_cols_prot_df1[in_phospho_set]

### Drop blank samples for proteome dataset

In [41]:
# Remove the three blank runs, relabel the sample columns and log2-transform.
blank_columns = [
    '[61] 20250923_OA4_Evo11_Whisper80_DeOl_lungAC_DVP_O16.raw.PG.Quantity',
    '[62] 20250923_OA4_Evo11_Whisper80_DeOl_lungAC_DVP_O18.raw.PG.Quantity',
    '[63] 20250923_OA4_Evo11_Whisper80_DeOl_lungAC_DVP_O20.raw.PG.Quantity',
]
df_proteome1 = df_proteome.drop(columns=blank_columns)

# Column names are assigned by position: the first column becomes
# 'PG.ProteinGroups', the rest take the labels in sample_cols_prot_df1 order.
# NOTE(review): this relies on the remaining columns and the rows of
# sample_cols_prot_df1 being in the same order and of equal number.
df_proteome1.columns = ['PG.ProteinGroups', *sample_cols_prot_df1['sample_ID_reduced'].tolist()]

# Transpose to samples x protein groups before taking log2.
df_proteome1 = np.log2(df_proteome1.set_index('PG.ProteinGroups').T)

### Normalization of phosphoproteomics dataset based on proteome dataset

In [None]:
# Prepare the imputed phospho table for normalization: drop the 'sample' and
# 'subject' columns and index the rows by 'group'.
df_phospho_collapsed_imp1 = df_phospho_collapsed_imp
df_phospho_collapsed2 = (
    df_phospho_collapsed_imp1
    .drop(columns=['sample', 'subject'])
    .set_index('group')
)
# Normalize the phospho table against the log2 proteome table.
df_phospho_normalized = normalize_phospho_median(df_phospho_collapsed2, df_proteome1)

In [227]:
# Take the 'normalized_phospho' entry of the normalization result, turn its
# index back into a column, and re-attach 'sample' and 'subject' from the
# imputed table (aligned on the row index).
df_phospho_normalized = df_phospho_normalized['normalized_phospho'].reset_index()
for id_column in ('sample', 'subject'):
    df_phospho_normalized[id_column] = df_phospho_collapsed_imp1[id_column]

# Drop rows whose group is 'could_be_blank'.
not_blank = df_phospho_normalized['group'].ne('could_be_blank')
df_phospho_normalized = df_phospho_normalized[not_blank]

In [239]:
# Work on an independent copy: the following cell rewrites the 'group',
# 'sample' and 'subject' columns in place. With a plain alias those edits
# would also change df_phospho_normalized (a filtered frame, which makes
# pandas emit SettingWithCopyWarning).
df_phospho_normalized1 = df_phospho_normalized.copy()

In [240]:
# 'group' keeps only its first '_'-separated token.
df_phospho_normalized1['group'] = df_phospho_normalized1['group'].apply(lambda label: label.split('_')[0])

# 'sample' and 'subject' are shortened to <first token>_<last token>.
for id_column in ('sample', 'subject'):
    df_phospho_normalized1[id_column] = df_phospho_normalized1[id_column].apply(
        lambda label: label.split('_')[0] + '_' + label.split('_')[-1]
    )

In [241]:
# PCA on the normalized phospho table. The result is indexed below as
# pca[0][0] (table with 'x', 'y' and 'group' used for plotting) and
# pca[1] (axis-title metadata).
pca = ac.run_pca(df_phospho_normalized1)

In [254]:
# Show the plot metadata returned by run_pca (axis titles and grouping column).
pca[1]

{'x_title': 'PC1 (0.21)', 'y_title': 'PC2 (0.09)', 'group': 'group'}

In [253]:
# PCA score plot, colored by group (figure 4h).
pca_scores = pca[0][0]

# The original call passed labels="sample". plotly express expects `labels`
# to be a dict of column-name overrides, so the string had no effect.
# Show the sample name on hover instead, when that column is available.
fig = px.scatter(
    pca_scores,
    x='x',
    y='y',
    color='group',
    hover_name='sample' if 'sample' in pca_scores else None,
    color_discrete_sequence=['#6020B3', '#C4291A'],
)
fig.update_layout(width = 600, height = 600, template = 'none')
fig.update_traces(marker=dict(
    size=21,
    line=dict(width=0.2, color='black')
))
fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\figure4h.pdf', height = 600, width = 600)

# Figure 4i

### Changing the condition setup to include all cancer and healthy samples

In [None]:
# Reduce 'group' to its first '_'-separated token so that samples are pooled
# by that prefix only.
# NOTE(review): the same statement already appears in the data-preprocessing
# cells for figure 4h; once applied, the values contain no '_' and this line
# leaves them unchanged.
df_phospho_normalized1['group'] = df_phospho_normalized1['group'].apply(lambda x: x.split('_')[0])

### t-test analysis

In [243]:
# t-test between groups 'AC' and 'H'. The cells below read the result columns
# 'identifier', 'log2FC' and 'padj'.
ttest1 = ac.run_ttest(df_phospho_normalized1, 'AC', 'H')

### Setting up the ID column to distinguish between significant and non-significant sites

In [244]:
# Label every site by direction and effect size. A site is significant when
# padj < 0.05; 0.585 is approximately log2(1.5), i.e. a 1.5-fold change.
# The conditions are evaluated in order and the first match wins, so the
# shared boundaries (log2FC == 0.585, 0, -0.585) resolve exactly as listed.
log2fc = ttest1['log2FC']
significant = ttest1['padj'] < 0.05

regulation_rules = [
    ('upreg', significant & (log2fc >= 0.585)),
    ('almost_upreg', significant & (log2fc <= 0.585) & (log2fc >= 0)),
    ('downreg', significant & (log2fc <= -0.585)),
    ('almost_downreg', significant & (log2fc >= -0.585) & (log2fc <= 0)),
]

# Anything not matched above (including non-significant sites) is 'noreg'.
ttest1['ID'] = np.select(
    [condition for _label, condition in regulation_rules],
    [label for label, _condition in regulation_rules],
    default='noreg',
)

In [None]:
# Number of sites labelled 'upreg' (rows, columns).
ttest1[ttest1['ID'] == 'upreg'].shape

(1179, 18)

In [None]:
# Number of sites labelled 'downreg' (rows, columns).
ttest1[ttest1['ID'] == 'downreg'].shape

### Volcano plot

In [256]:
# Volcano plot of the AC-vs-H t-test (figure 4i).
#
# Colors are bound to the ID labels explicitly. With color_discrete_sequence
# the colors were handed out in the order the labels first appear in ttest1,
# so a different row order would silently recolor the classes.
# NOTE(review): the label -> color pairing below is inferred from the order of
# the original color list (grey, orange, blue, pale orange, pale blue);
# confirm it matches the intended figure.
volcano_colors = {
    'noreg': '#BDBDBD',
    'upreg': '#EE4811',
    'downreg': '#0B6299',
    'almost_upreg': '#F7D291',
    'almost_downreg': '#77CDE6',
}
fig = px.scatter(ttest1, x = 'log2FC', y = -np.log10(ttest1['padj']), color = 'ID', color_discrete_map = volcano_colors)
fig.update_traces(marker=dict(
    size=10,
    line=dict(width=0.5, color='black')
))
# Guide lines: +/-0.585 is approximately log2(1.5); 1.3 is approximately -log10(0.05).
fig.add_vline(x = -0.585, line_dash = 'dash', line_color = 'black')
fig.add_vline(x = 0.585, line_dash = 'dash', line_color = 'black')
fig.add_hline(y = 1.3, line_dash = 'dash', line_color = 'black')
fig.update_xaxes(title = "log2FC (AC vs H)")
fig.update_yaxes(title = '-log10(p-adjusted)')
fig.update_layout(width = 600, height = 600, template = 'plotly_white', showlegend = False)
fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\figure4i.png', height = 600, width = 600, scale = 5)

### Exporting significantly up- and down-regulated sites for EnrichR analysis

In [258]:
# Export the gene names of the 'upreg' sites.
# .copy() detaches the selection from ttest1, so adding the 'Gene' column
# below does not write into a slice (SettingWithCopyWarning).
a = ttest1[ttest1['ID'] == 'upreg'].copy()
# The gene is the text after '~' and before the following '_' in the identifier.
a['Gene'] = a['identifier'].apply(lambda x: x.split('~')[1].split('_')[0])
a['Gene'].to_excel(r'D:\ttest_upreg_gene.xlsx', index = False)

In [259]:
# Export the gene names of the 'downreg' sites.
# .copy() detaches the selection from ttest1, so adding the 'Gene' column
# below does not write into a slice (SettingWithCopyWarning).
a = ttest1[ttest1['ID'] == 'downreg'].copy()
# The gene is the text after '~' and before the following '_' in the identifier.
a['Gene'] = a['identifier'].apply(lambda x: x.split('~')[1].split('_')[0])
a['Gene'].to_excel(r'D:\ttest_downreg_gene.xlsx', index = False)

In [271]:
# Export the gene names of all tested sites as the enrichment background.
# `a` is the same object as ttest1, so the 'Gene' column is added to ttest1.
a = ttest1
a['Gene'] = [
    identifier.split('~')[1].split('_')[0] for identifier in a['identifier']
]
a['Gene'].to_excel(r'D:\ttest_background.xlsx', index = False)

# Figure 4j

In [264]:
# Site key and sequence window: the two columns run_kinase_prediction reads
# from its kinase_df argument.
kin_df = df_phospho_collapsed[['PTM_Collapse_key', 'kinase_sequence']]
# Kinase prediction for AC vs H, with kinase_class left at its default
# ("ser_thr"). Two values are returned; the first is used below as a table
# with 'most_sig_fisher_adj_pval' and 'most_sig_log2_freq_factor' columns.
kinase_df, kinase_class = run_kinase_prediction(df_phospho_normalized1, kin_df, 'AC', 'H')

75 entries were omitted due to even length (no central position).
24 entries were omitted due to invalid central phosphoacceptor.
Use the 'omited_entries' attribute to view dropped enteries due to invalid sequences.

Calculating percentiles for upregulated sites (1964 substrates)
Scoring 1898 ser_thr substrates
Calculating percentile for 1898 ser_thr substrates
100%|██████████| 311/311 [00:02<00:00, 150.02it/s] 
                                                  

Calculating percentiles for downregulated sites (1229 substrates)
Scoring 1185 ser_thr substrates
Calculating percentile for 1185 ser_thr substrates
100%|██████████| 311/311 [00:02<00:00, 153.76it/s] 
                                                  

Calculating percentiles for background (unregulated) sites (4436 substrates)
Scoring 4282 ser_thr substrates
Calculating percentile for 4282 ser_thr substrates
100%|██████████| 311/311 [00:02<00:00, 137.51it/s] 
                                                  


In [267]:
# Label each kinase: significant when -log10(adjusted p) >= 1.3, then 'upreg'
# for a log2 frequency factor >= 0 and 'downreg' for <= 0 (a value of exactly
# 0 matches the first rule). Everything else is 'noreg'.
neg_log10_padj = -np.log10(kinase_df['most_sig_fisher_adj_pval'])
is_significant = neg_log10_padj >= 1.3
log2_freq_factor = kinase_df['most_sig_log2_freq_factor']

kinase_df['ID'] = np.select(
    [
        is_significant & (log2_freq_factor >= 0),
        is_significant & (log2_freq_factor <= 0),
    ],
    ['upreg', 'downreg'],
    default='noreg',
)

### Plot kinase prediction volcano plot

In [None]:
# Kinase enrichment volcano (figure 4j): log2 frequency factor against
# -log10 adjusted p-value.
#
# Colors are bound to the ID labels explicitly. With color_discrete_sequence
# they were assigned in order of first appearance in kinase_df, which depends
# on row order.
# NOTE(review): the label -> color pairing is inferred from the order of the
# original color list (grey, orange, blue); confirm it matches the intended figure.
kinase_colors = {
    'noreg': '#BDBDBD',
    'upreg': '#EE4811',
    'downreg': '#0B6299',
}
fig = px.scatter(kinase_df, y = -np.log10(kinase_df['most_sig_fisher_adj_pval']), x = 'most_sig_log2_freq_factor', color = 'ID', color_discrete_map = kinase_colors)
fig.update_traces(marker=dict(
    size=10,
    line=dict(width=0.5, color='black')
))
# Same 1.3 cut-off that is used to assign the ID labels.
fig.add_hline(y = 1.3, line_dash = 'dash', line_color = 'black')
fig.update_xaxes(title = "log2(FF)")
fig.update_yaxes(title = '-log10(p-adjusted)')
# Last expression returns the figure, so the cell renders it.
fig.update_layout(width = 800, height = 400, template = 'none', showlegend = False)
#fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\figure4j.pdf', height = 400, width = 800)

# Figures 4k and 4l

### Importing EnrichR results for up- and down-regulated sites from volcano plot

In [299]:
# Tab-separated EnrichR result tables; the cells below read their 'Term',
# 'Adjusted P-value' and 'Odds Ratio' columns.
# NOTE(review): three variable names do not match the files they load:
#   gocc_upreg   <- gobp_up.txt
#   kegg_downreg <- bioplanet_down.txt
#   gobp_downreg <- reactome_down.txt
# Confirm which libraries these tables actually come from.
wiki_upreg = pd.read_csv(r'D:\wiki_up.txt', sep = '\t')
gocc_upreg = pd.read_csv(r'D:\gobp_up.txt', sep = '\t')
kegg_downreg = pd.read_csv(r'D:\bioplanet_down.txt', sep = '\t')
gobp_downreg = pd.read_csv(r'D:\reactome_down.txt', sep = '\t')

In [303]:
# Keep only terms with an adjusted p-value <= 0.05.
# .copy() makes each result an independent frame: the 'Odds Ratio' column of
# these tables is overwritten later, and assigning into a filtered slice
# raises SettingWithCopyWarning.
wiki_upreg = wiki_upreg[wiki_upreg['Adjusted P-value']<=0.05].copy()
gocc_upreg = gocc_upreg[gocc_upreg['Adjusted P-value']<=0.05].copy()
kegg_downreg = kegg_downreg[kegg_downreg['Adjusted P-value']<=0.05].copy()
gobp_downreg = gobp_downreg[gobp_downreg['Adjusted P-value']<=0.05].copy()

In [None]:
# Replace 'Odds Ratio' by its log2 in each enrichment table, in place.
# Running this cell a second time would take log2 of already-transformed values.
for enrichment_table in (wiki_upreg, gocc_upreg, kegg_downreg, gobp_downreg):
    enrichment_table['Odds Ratio'] = np.log2(enrichment_table['Odds Ratio'])

### Fetching names of selected significantly enriched pathways for up- and down-regulated sites

In [300]:
# Terms shown in the bar plot for up-regulated sites.
to_include_upreg = [
    'Aerobic Glycolysis WP4629',
    'EGF EGFR Signaling WP437',
    'Lipid Metabolism Pathway WP3965',
    'ATM Signaling In Development And Disease WP3878',
    'Target Of Rapamycin Signaling WP1471',
    'Focal Adhesion WP306',
    'MAPK Signaling WP382',
]
# Terms shown in the bar plot for down-regulated sites.
to_include_downreg = [
    'Apoptotic Execution Phase',
    'Signaling by Rho GTPases',
    'Membrane Trafficking',
    'Tight junction',
    'Spliceosome',
]


### Plotting upregulated pathways (Figure 4k)

In [None]:
# Bar plot of the selected up-regulated terms, ordered by 'Odds Ratio' and
# colored by adjusted p-value (figure 4k).
# NOTE(review): if the log2 transform of 'Odds Ratio' has been applied, the
# x-axis shows log2 values under the title 'Odds Ratio'; confirm the label.
purple_scale = [
    [0, '#5a4a6f'],
    [0.25, '#9970ab'],
    [0.5, '#c994c7'],
    [0.75, '#d4b9da'],
    [1, '#e0d0e8'],
]
is_selected_term = wiki_upreg['Term'].isin(to_include_upreg)
wiki_upreg = wiki_upreg[is_selected_term].sort_values('Odds Ratio', ascending=True)

fig = px.bar(
    wiki_upreg,
    y='Term',
    x='Odds Ratio',
    color='Adjusted P-value',
    color_continuous_scale=purple_scale,
)
# Last expression returns the figure, so the cell renders it.
fig.update_layout(width = 600, height = 600, template = 'none')
#fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\figure4k.pdf', height = 600, width = 600)

### Plotting downregulated pathways (Figure 4l)

In [None]:
# Restrict both down-regulated enrichment tables to the selected terms, sort
# each by 'Odds Ratio', then stack them and sort the combined table again.
gobp_selected = gobp_downreg['Term'].isin(to_include_downreg)
gobp_downreg = gobp_downreg[gobp_selected].sort_values('Odds Ratio', ascending=True)
kegg_selected = kegg_downreg['Term'].isin(to_include_downreg)
kegg_downreg = kegg_downreg[kegg_selected].sort_values('Odds Ratio', ascending=True)
downreg = pd.concat([gobp_downreg, kegg_downreg], axis = 0).sort_values('Odds Ratio', ascending=True)

# Bar plot of the selected down-regulated terms, colored by adjusted p-value
# (figure 4l).
# NOTE(review): if the log2 transform of 'Odds Ratio' has been applied, the
# x-axis shows log2 values under the title 'Odds Ratio'; confirm the label.
purple_scale = [
    [0, '#5a4a6f'],
    [0.25, '#9970ab'],
    [0.5, '#c994c7'],
    [0.75, '#d4b9da'],
    [1, '#e0d0e8'],
]
fig = px.bar(
    downreg,
    y='Term',
    x='Odds Ratio',
    color='Adjusted P-value',
    color_continuous_scale=purple_scale,
)
# Last expression returns the figure, so the cell renders it.
fig.update_layout(width = 700, height = 400, template = 'none')
#fig.write_image(r'D:\Projects\nanoPhos\figures_upd\figure4\figure4l.pdf', height = 600, width = 600)