# Run statistical tests between peptide groups to see whether their properties differ significantly

In [1]:
import pandas as pd
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import os
import sys

# --- Configuration ---
# Base directories for input data and for analysis results.
DATA_DIR = "/projectnb/cancergrp/Philipp/data/"
RESULTS_DIR = "/projectnb/cancergrp/Philipp/results/RITA_peptides"

# Input table read by the analysis and CSV file its results are written to.
COMPREHENSIVE_TABLE_PATH = os.path.join(
    RESULTS_DIR,
    "comprehensive_peptide_properties_and_metadata_with_RSA.csv",
)
STAT_TEST_RESULTS_PATH = os.path.join(
    RESULTS_DIR,
    "statistical_test_results.csv",
)

# Peptide property columns that are compared between groups.
PEPTIDE_PROPERTY_COLUMNS = [
    'Disorder_perc',
    'Helix_perc',
    'Sheet_perc',
    'Coil_perc',
    'Buried_perc',
    'Exposed_perc',
]

# --- Main Script for Statistical Testing ---
def perform_statistical_tests():
    """Compare peptide properties between experiment groups.

    Loads the table at ``COMPREHENSIVE_TABLE_PATH``, splits it into groups
    using the ``sig`` and ``log2FoldChange`` columns, runs a two-sided
    Mann-Whitney U test for every column in ``PEPTIDE_PROPERTY_COLUMNS`` on
    each configured pair of groups, applies Benjamini-Hochberg FDR
    correction across all successfully computed p-values, prints the result
    table and writes it to ``STAT_TEST_RESULTS_PATH``.

    Returns:
        None. Returns early, without writing a file, when no test could
        be run.

    Raises:
        SystemExit: With status 1 if the input table does not exist.
    """
    print("\n--- Starting Statistical Analysis ---")

    # 1. Load the comprehensive peptide data
    if not os.path.exists(COMPREHENSIVE_TABLE_PATH):
        print(f"Error: Comprehensive peptide table not found at {COMPREHENSIVE_TABLE_PATH}")
        sys.exit(1)

    comprehensive_df = pd.read_csv(COMPREHENSIVE_TABLE_PATH)
    print(f"Loaded comprehensive data with {len(comprehensive_df)} peptides.")
    print(f"Columns available: {comprehensive_df.columns.tolist()}")

    # Ensure log2FoldChange is numeric before filtering; unparseable values
    # become NaN and therefore match neither the > 0 nor the < 0 filter.
    comprehensive_df['log2FoldChange'] = pd.to_numeric(comprehensive_df['log2FoldChange'], errors='coerce')

    # 2. Define the groups
    sig = comprehensive_df['sig']
    fold_change = comprehensive_df['log2FoldChange']
    is_significant = sig == 'Yes'

    group_data = {
        # All peptides loaded into comprehensive_df
        'Full_Library_VT_VP': comprehensive_df,
        # Peptides with a 'sig' value (used in the experiment)
        'Experiment_Used_VT_VP': comprehensive_df[sig.notna()],
        # Peptides whose 'sig' value is NaN (not used in the experiment)
        'Experiment_Not_Used_VT_VP': comprehensive_df[sig.isna()],
        # Significant / non-significant peptides
        'Experiment_Significant_VT_VP': comprehensive_df[is_significant],
        'Experiment_NonSignificant_VT_VP': comprehensive_df[sig == 'No'],
        # Significant peptides split by direction of change
        'Experiment_Upregulated_VT_VP': comprehensive_df[is_significant & (fold_change > 0)],
        'Experiment_Downregulated_VT_VP': comprehensive_df[is_significant & (fold_change < 0)],
    }

    # Report sizes of groups
    print("\nGroup Sizes:")
    for name, df in group_data.items():
        print(f"  {name}: {len(df)} peptides")
        if df.empty:
            print(f"    Warning: {name} is empty. This group cannot be used in comparisons.")

    # Define the specific pairwise comparisons to perform.
    # Each tuple is (Group_A_name, Group_B_name).
    # NOTE(review): several pairs compare a subset against the full library
    # that contains it, so the two samples overlap and are not independent;
    # interpret those p-values with care.
    comparisons_to_run = [
        ('Experiment_Significant_VT_VP', 'Full_Library_VT_VP'),
        ('Experiment_NonSignificant_VT_VP', 'Full_Library_VT_VP'),
        ('Experiment_Upregulated_VT_VP', 'Experiment_Downregulated_VT_VP'),
        ('Experiment_Upregulated_VT_VP', 'Full_Library_VT_VP'),
        ('Experiment_Downregulated_VT_VP', 'Full_Library_VT_VP'),
        ('Experiment_Significant_VT_VP', 'Experiment_NonSignificant_VT_VP'),
        ('Experiment_Used_VT_VP', 'Experiment_Not_Used_VT_VP'),
    ]

    results = []

    print("\nPerforming Mann-Whitney U tests...")
    for group_name_a, group_name_b in comparisons_to_run:
        df_a = group_data.get(group_name_a)
        df_b = group_data.get(group_name_b)

        if df_a is None or df_b is None:
            print(f"Skipping comparison {group_name_a} vs {group_name_b}: One or both groups not found.")
            continue
        if df_a.empty or df_b.empty:
            print(f"Skipping comparison {group_name_a} vs {group_name_b}: One or both groups are empty ({len(df_a)} vs {len(df_b)} peptides).")
            continue

        for prop_col in PEPTIDE_PROPERTY_COLUMNS:
            data_a = df_a[prop_col].dropna()  # Drop NA values for the test
            data_b = df_b[prop_col].dropna()

            # Require at least 2 observations per group before testing.
            if len(data_a) < 2 or len(data_b) < 2:
                print(f"Skipping {prop_col} for {group_name_a} vs {group_name_b}: Insufficient data ({len(data_a)} vs {len(data_b)} observations).")
                continue

            try:
                statistic, p_value = mannwhitneyu(data_a, data_b, alternative='two-sided')
            except ValueError as e:
                print(f"Error running Mann-Whitney U test for {prop_col} between {group_name_a} and {group_name_b}: {e}")
                print(f"Data A (first 5): {data_a.head().tolist()}")
                print(f"Data B (first 5): {data_b.head().tolist()}")
                # Keep the failed test in the table, but with NaN values so
                # it is excluded from the FDR correction below instead of
                # being counted as an extra hypothesis with p = 1.0.
                statistic, p_value = float("nan"), float("nan")

            results.append({
                'Comparison': f"{group_name_a} vs {group_name_b}",
                'Property': prop_col,
                'Statistic': statistic,
                'P_Value': p_value
            })

    if not results:
        print("No statistical tests could be performed. Please check your data and group definitions.")
        return

    results_df = pd.DataFrame(results)

    # 3. Apply Multiple Hypothesis Correction (Benjamini-Hochberg FDR)
    # Only rows with a real p-value take part in the correction; rows for
    # failed tests keep a NaN adjusted p-value and are never significant.
    results_df['Adjusted_P_Value'] = float("nan")
    results_df['Significant_FDR_0.05'] = False
    has_p_value = results_df['P_Value'].notna()

    if has_p_value.any():
        # multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf)
        _, pvals_corrected, _, _ = multipletests(
            results_df.loc[has_p_value, 'P_Value'].to_numpy(dtype=float),
            alpha=0.05,
            method='fdr_bh',
        )
        results_df.loc[has_p_value, 'Adjusted_P_Value'] = pvals_corrected
        results_df.loc[has_p_value, 'Significant_FDR_0.05'] = pvals_corrected < 0.05
    else:
        print("Warning: No valid p-values to apply multiple hypothesis correction.")

    # Sort results for better readability (NaN adjusted p-values sort last)
    results_df = results_df.sort_values(by=['Property', 'Adjusted_P_Value']).reset_index(drop=True)

    # 4. Report results
    print("\n--- Statistical Test Results ---")
    print(results_df)

    # Save results to a CSV file
    results_df.to_csv(STAT_TEST_RESULTS_PATH, index=False)
    print(f"\nStatistical test results saved to: {STAT_TEST_RESULTS_PATH}")
    print("\n--- Statistical Analysis Complete ---")

# Entry point: run the structural-property analysis only when this code is
# executed directly (__name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    perform_statistical_tests()


--- Starting Statistical Analysis ---
Loaded comprehensive data with 200 peptides.
Columns available: ['identifier', 'Aminoacids', 'NCBI_id', 'Disorder_perc', 'Helix_perc', 'Sheet_perc', 'Coil_perc', 'Buried_perc', 'Exposed_perc', 'sig', 'log2FoldChange', 'padj']

Group Sizes:
  Full_Library_VT_VP: 200 peptides
  Experiment_Used_VT_VP: 191 peptides
  Experiment_Not_Used_VT_VP: 9 peptides
  Experiment_Significant_VT_VP: 1 peptides
  Experiment_NonSignificant_VT_VP: 190 peptides
  Experiment_Upregulated_VT_VP: 1 peptides
  Experiment_Downregulated_VT_VP: 0 peptides

Performing Mann-Whitney U tests...
Skipping Disorder_perc for Experiment_Significant_VT_VP vs Full_Library_VT_VP: Insufficient data (1 vs 200 observations).
Skipping Helix_perc for Experiment_Significant_VT_VP vs Full_Library_VT_VP: Insufficient data (1 vs 200 observations).
Skipping Sheet_perc for Experiment_Significant_VT_VP vs Full_Library_VT_VP: Insufficient data (1 vs 200 observations).
Skipping Coil_perc for Experiment

In [4]:
import pandas as pd
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
import os
import sys

# --- Configuration ---
# Base directories for input data and for analysis results.
DATA_DIR = "/projectnb/cancergrp/Philipp/data/"
RESULTS_DIR = "/projectnb/cancergrp/Philipp/results/RITA_peptides"

# --- Per-peptide amino acid composition table (input) and results file (output) ---
COMPREHENSIVE_TABLE_PATH_AA = os.path.join(
    RESULTS_DIR,
    "comprehensive_peptide_amino_acid_composition_and_metadata.csv",
)
STAT_TEST_RESULTS_PATH_AA = os.path.join(
    RESULTS_DIR,
    "statistical_test_results_amino_acids.csv",
)

# --- One-letter codes of the 20 standard amino acids, used as the columns to test ---
AMINO_ACIDS = sorted('ACDEFGHIKLMNPQRSTVWY')


# --- Main Script for Statistical Testing ---
def perform_statistical_tests_amino_acids():
    """Compare amino acid compositions between experiment groups.

    Loads the table at ``COMPREHENSIVE_TABLE_PATH_AA``, splits it into
    groups using the ``sig`` and ``log2FoldChange`` columns, runs a
    two-sided Mann-Whitney U test for every column named in ``AMINO_ACIDS``
    on each configured pair of groups, applies Benjamini-Hochberg FDR
    correction across all successfully computed p-values, prints the result
    table and writes it to ``STAT_TEST_RESULTS_PATH_AA``.

    Returns:
        None. Returns early, without writing a file, when no test could
        be run.

    Raises:
        SystemExit: With status 1 if the input table does not exist.
    """
    print("\n--- Starting Statistical Analysis for Amino Acid Compositions ---")

    # 1. Load the comprehensive peptide data with amino acid compositions
    if not os.path.exists(COMPREHENSIVE_TABLE_PATH_AA):
        print(f"Error: Comprehensive amino acid composition table not found at {COMPREHENSIVE_TABLE_PATH_AA}")
        sys.exit(1)

    comprehensive_df_aa = pd.read_csv(COMPREHENSIVE_TABLE_PATH_AA)
    print(f"Loaded comprehensive amino acid data with {len(comprehensive_df_aa)} peptides.")
    print(f"Columns available: {comprehensive_df_aa.columns.tolist()}")

    # Ensure log2FoldChange is numeric before filtering; unparseable values
    # become NaN and therefore match neither the > 0 nor the < 0 filter.
    comprehensive_df_aa['log2FoldChange'] = pd.to_numeric(comprehensive_df_aa['log2FoldChange'], errors='coerce')

    # 2. Define the groups
    sig = comprehensive_df_aa['sig']
    fold_change = comprehensive_df_aa['log2FoldChange']
    is_significant = sig == 'Yes'

    group_data_aa = {
        # All peptides loaded into comprehensive_df_aa
        'Full_Library_VT_VP': comprehensive_df_aa,
        # Peptides with a 'sig' value (used in the experiment)
        'Experiment_Used_VT_VP': comprehensive_df_aa[sig.notna()],
        # Peptides whose 'sig' value is NaN (not used in the experiment)
        'Experiment_Not_Used_VT_VP': comprehensive_df_aa[sig.isna()],
        # Significant / non-significant peptides
        'Experiment_Significant_VT_VP': comprehensive_df_aa[is_significant],
        'Experiment_NonSignificant_VT_VP': comprehensive_df_aa[sig == 'No'],
        # Significant peptides split by direction of change
        'Experiment_Upregulated_VT_VP': comprehensive_df_aa[is_significant & (fold_change > 0)],
        'Experiment_Downregulated_VT_VP': comprehensive_df_aa[is_significant & (fold_change < 0)],
    }

    # Report sizes of groups
    print("\nGroup Sizes:")
    for name, df in group_data_aa.items():
        print(f"  {name}: {len(df)} peptides")
        if df.empty:
            print(f"    Warning: {name} is empty. This group cannot be used in comparisons.")

    # Define the specific pairwise comparisons to perform.
    # Each tuple is (Group_A_name, Group_B_name).
    # NOTE(review): several pairs compare a subset against the full library
    # that contains it, so the two samples overlap and are not independent;
    # interpret those p-values with care.
    comparisons_to_run = [
        ('Experiment_Significant_VT_VP', 'Full_Library_VT_VP'),
        ('Experiment_NonSignificant_VT_VP', 'Full_Library_VT_VP'),
        ('Experiment_Upregulated_VT_VP', 'Experiment_Downregulated_VT_VP'),
        ('Experiment_Upregulated_VT_VP', 'Full_Library_VT_VP'),
        ('Experiment_Downregulated_VT_VP', 'Full_Library_VT_VP'),
        ('Experiment_Significant_VT_VP', 'Experiment_NonSignificant_VT_VP'),
        ('Experiment_Used_VT_VP', 'Experiment_Not_Used_VT_VP'),
    ]

    results = []

    print("\nPerforming Mann-Whitney U tests for amino acid compositions...")
    for group_name_a, group_name_b in comparisons_to_run:
        df_a = group_data_aa.get(group_name_a)
        df_b = group_data_aa.get(group_name_b)

        if df_a is None or df_b is None:
            print(f"Skipping comparison {group_name_a} vs {group_name_b}: One or both groups not found.")
            continue
        if df_a.empty or df_b.empty:
            print(f"Skipping comparison {group_name_a} vs {group_name_b}: One or both groups are empty ({len(df_a)} vs {len(df_b)} peptides).")
            continue

        # Each amino acid column is tested as its own property.
        for aa_prop in AMINO_ACIDS:
            data_a = df_a[aa_prop].dropna()  # Drop NA values for the test
            data_b = df_b[aa_prop].dropna()

            # Require at least 2 observations per group before testing.
            if len(data_a) < 2 or len(data_b) < 2:
                print(f"Skipping {aa_prop} for {group_name_a} vs {group_name_b}: Insufficient data ({len(data_a)} vs {len(data_b)} observations).")
                continue

            try:
                statistic, p_value = mannwhitneyu(data_a, data_b, alternative='two-sided')
            except ValueError as e:
                print(f"Error running Mann-Whitney U test for {aa_prop} between {group_name_a} and {group_name_b}: {e}")
                print(f"Data A (first 5): {data_a.head().tolist()}")
                print(f"Data B (first 5): {data_b.head().tolist()}")
                # Keep the failed test in the table, but with NaN values so
                # it is excluded from the FDR correction below instead of
                # being counted as an extra hypothesis with p = 1.0.
                statistic, p_value = float("nan"), float("nan")

            results.append({
                'Comparison': f"{group_name_a} vs {group_name_b}",
                'Property': aa_prop,
                'Statistic': statistic,
                'P_Value': p_value
            })

    if not results:
        print("No statistical tests could be performed. Please check your data and group definitions.")
        return

    results_df = pd.DataFrame(results)

    # 3. Apply Multiple Hypothesis Correction (Benjamini-Hochberg FDR)
    # Only rows with a real p-value take part in the correction; rows for
    # failed tests keep a NaN adjusted p-value and are never significant.
    results_df['Adjusted_P_Value'] = float("nan")
    results_df['Significant_FDR_0.05'] = False
    has_p_value = results_df['P_Value'].notna()

    if has_p_value.any():
        # multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf)
        _, pvals_corrected, _, _ = multipletests(
            results_df.loc[has_p_value, 'P_Value'].to_numpy(dtype=float),
            alpha=0.05,
            method='fdr_bh',
        )
        results_df.loc[has_p_value, 'Adjusted_P_Value'] = pvals_corrected
        results_df.loc[has_p_value, 'Significant_FDR_0.05'] = pvals_corrected < 0.05
    else:
        print("Warning: No valid p-values to apply multiple hypothesis correction.")

    # Sort results for better readability (NaN adjusted p-values sort last)
    results_df = results_df.sort_values(by=['Property', 'Adjusted_P_Value']).reset_index(drop=True)

    # 4. Report results
    print("\n--- Amino Acid Composition Statistical Test Results ---")
    print(results_df)

    # Save results to a CSV file
    results_df.to_csv(STAT_TEST_RESULTS_PATH_AA, index=False)
    print(f"\nAmino Acid Composition statistical test results saved to: {STAT_TEST_RESULTS_PATH_AA}")
    print("\n--- Amino Acid Composition Statistical Analysis Complete ---")

# Entry point: run the amino acid composition analysis only when this code is
# executed directly (__name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    perform_statistical_tests_amino_acids()


--- Starting Statistical Analysis for Amino Acid Compositions ---
Loaded comprehensive amino acid data with 28112 peptides.
Columns available: ['identifier', 'Aminoacids', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'sig', 'log2FoldChange', 'padj']

Group Sizes:
  Full_Library_VT_VP: 28112 peptides
  Experiment_Used_VT_VP: 25734 peptides
  Experiment_Not_Used_VT_VP: 2378 peptides
  Experiment_Significant_VT_VP: 498 peptides
  Experiment_NonSignificant_VT_VP: 25236 peptides
  Experiment_Upregulated_VT_VP: 186 peptides
  Experiment_Downregulated_VT_VP: 312 peptides

Performing Mann-Whitney U tests for amino acid compositions...

--- Amino Acid Composition Statistical Test Results ---
                                            Comparison Property    Statistic  \
0    Experiment_Used_VT_VP vs Experiment_Not_Used_V...        A   27901857.5   
1    Experiment_Significant_VT_VP vs Full_Library_V...        A    6386840.0   
2    Experim