## Differential Expression Analysis for Protein Biomarker Discovery

Differences in protein abundance between pairs of clinical groups were evaluated by using nested generalised linear models, in which for each individual protein feature, we assessed if its addition to a base model containing age and gender contributed to model fit. This approach entails a nested F-test equivalent to the two-sided regression tests. For each pairwise comparison, multiplicity was taken into account by controlling the False Discovery Rate (FDR) at q ≤ 0.05 based on the number of features analysed.

Input:
- olink.xlsx: Protein expression data with columns:
  * SampleID: Unique sample identifier
  * Group: Clinical group classification
  * SubGroup: Clinical subgroup
  * Strain: Sample strain type
  * age at LP: Age at lumbar puncture
  * Sex: Patient sex
  * [Protein Names]: NPX values for each protein

Output:
- differential.csv: Differential expression results including:
  * Protein: Protein identifier
  * Group comparisons
  * P-values
  * Q-values (FDR corrected)
  * Log2 fold changes
  * Beta coefficients
- Volcano plots for each comparison
- Venn diagram of overlapping differential proteins
- Trajectory plots for key biomarkers

Analysis Steps:
1. Data preprocessing and formatting
2. Differential expression using nested linear models
3. Multiple testing correction
4. Visualization of results
5. Identification of subtype-specific biomarkers

In [1]:
# General utilities
import os
import warnings
import statistics as stat
import numpy as np
import pandas as pd
import itertools
import statsmodels.api as sm
from scipy import stats
from itertools import combinations
from statsmodels.formula.api import logit
from scipy.stats import chi2
from statsmodels.stats.multitest import multipletests
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels import api as sm
from statsmodels.tools.tools import add_constant

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text
from matplotlib_venn import venn3

# Ignore warnings
warnings.filterwarnings("ignore")

In [2]:
# Data, figure and results locations, all anchored at the parent of the
# current working directory
project_root = os.path.dirname(os.getcwd())
data_path = f"{project_root}/data/"
figure_path = f"{project_root}/figures/differential"
results_path = f"{project_root}/data/results/differential"

In [3]:
# Load the curated Olink workbook from the data directory
olink_file = data_path + 'curated/olink.xlsx'
df = pd.read_excel(olink_file)

In [5]:
# Columns that are not used by the differential analysis
columns_to_drop = ['Codon 129', 'onset-LP', 'onset-death', 'LP-death', 'NP_subtype']

# Sample-level columns that are kept as identifiers; every remaining column
# is melted into an (Assay, NPX) pair
id_columns = ['SampleID', 'Group', 'SubGroup', 'Strain', 'age at LP', 'Sex']

# Long format: one row per sample and assay, indexed by SampleID
df_pivoted = (
    df.drop(columns=columns_to_drop)
    .melt(id_vars=id_columns, var_name='Assay', value_name='NPX')
    .set_index('SampleID')
)

# Work on an independent frame whose age column has a formula-friendly name
df = df_pivoted.rename(columns={"age at LP": "age_at_LP"})

## Differential expression analysis

In [6]:
# Function for pairwise comparisons
def perform_pairwise_comparisons(df, group_col, results, threshold=0.05):
    """Test every protein for a group effect in each pair of groups.

    For each pair of distinct non-missing values in ``df[group_col]`` and each
    value of ``df['Assay']``, two OLS models are fitted on the same rows:
    ``NPX ~ age_at_LP + Sex`` and that model plus a 0/1 indicator of the
    group (first group of the pair = 1, second = 0). The two fits are compared
    with ``anova_lm`` and the p-value of that comparison is recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with columns ``Assay``, ``NPX``, ``age_at_LP``,
        ``Sex`` and ``group_col``. It is not modified.
    group_col : str
        Name of the grouping column; it is also inserted into the model
        formula, so it must be a valid formula term.
    results : list
        Mutated in place: one dict is appended per protein and comparison
        that could be fitted.
    threshold : float
        Unused; kept so existing callers keep working.

    Returns
    -------
    None

    Notes
    -----
    A protein is skipped for a comparison when, after dropping rows with a
    missing model variable, fewer than two rows remain, only one of the two
    groups is left, or the full model has no residual degrees of freedom.
    In those cases the F-test is undefined.
    """
    model_columns = ['NPX', 'age_at_LP', 'Sex', group_col]
    reduced_formula = "NPX ~ age_at_LP + Sex"
    full_formula = f"NPX ~ age_at_LP + Sex + {group_col}"

    unique_groups = df[group_col].dropna().unique()

    for g1, g2 in itertools.combinations(unique_groups, 2):
        # Work on a copy so the 0/1 encoding never writes into a slice of df
        subset = df[df[group_col].isin([g1, g2])].copy()
        subset[group_col] = subset[group_col].map({g1: 1, g2: 0})

        for protein in subset['Assay'].unique():
            protein_subset = subset[subset['Assay'] == protein]

            # Fit both models on exactly the same complete rows
            complete = protein_subset.dropna(subset=model_columns)
            if len(complete) < 2 or complete[group_col].nunique() < 2:
                # Not enough data, or one group vanished: nothing to compare
                continue

            # Reduced model (covariates only) and full model (plus group)
            model1 = ols(reduced_formula, data=complete).fit()
            model2 = ols(full_formula, data=complete).fit()

            if model2.df_resid <= 0:
                # Saturated fit: the F-test would only produce NaN
                continue

            # Nested-model F-test; the second row holds the comparison
            anova_results = anova_lm(model1, model2)
            p_value = anova_results['Pr(>F)'].iloc[1]

            # Difference of mean NPX, first group minus second group, over all
            # non-missing NPX values of the protein (presumably NPX is on a
            # log2 scale, which is why this is reported as a log2 fold change)
            is_first_group = protein_subset[group_col] == 1
            vals1 = protein_subset.loc[is_first_group, 'NPX']
            vals2 = protein_subset.loc[~is_first_group, 'NPX']
            log2_fold_change = vals1.mean() - vals2.mean()

            # Coefficient of the group indicator in the full model
            beta_coefficient = model2.params.get(group_col, np.nan)

            results.append({
                "Protein": protein,
                "Group_Col": group_col,
                "Group1": g1,
                "Group2": g2,
                "Group1_vs_Group2": f"{g1} vs {g2}",
                "F_P_Value": p_value,
                "Log2_Fold_Change": log2_fold_change,
                "Beta_Coefficient": beta_coefficient
            })

# List to collect all results
results = []

# Perform comparisons for each categorical column
for group_col in ['Group', 'SubGroup', 'Strain']:
    perform_pairwise_comparisons(df, group_col, results)


def _bh_q_values(p_values):
    """Return Benjamini-Hochberg q-values for one set of p-values.

    Missing p-values are left as NaN and are not counted as tests.
    """
    p = np.asarray(p_values, dtype=float)
    q = np.full(p.shape, np.nan)
    tested = ~np.isnan(p)
    if tested.any():
        q[tested] = multipletests(p[tested], method='fdr_bh')[1]
    return pd.Series(q, index=p_values.index)


# FDR correction, applied separately within each pairwise comparison so the
# number of tests equals the number of proteins analysed in that comparison
# (as stated in the method description), not the pooled total
results_df = pd.DataFrame(results)
results_df['Q_Value'] = (
    results_df
    .groupby(['Group_Col', 'Group1', 'Group2'], sort=False)['F_P_Value']
    .transform(_bh_q_values)
)
results_df['Significant'] = results_df['Q_Value'] < 0.05

# Save the results (create the results directory on a fresh checkout)
os.makedirs(results_path, exist_ok=True)
output_file = os.path.join(results_path, "differential.csv")
results_df.to_csv(output_file, index=False)
#print(f"Results exported to: {output_file}")

## Visualise differentially expressed proteins among subgroups

In [7]:
# Reload the differential expression table written to the results directory
differential_csv = results_path + '/differential.csv'
results_df = pd.read_csv(differential_csv)

### Volcano plots

In [None]:
# Function for the volcano plot using beta coefficient
def create_volcano_plot_beta(results, title, figure_path, threshold=0.05):
    """Save a volcano plot of beta coefficients against -log10 Q-values.

    Args:
        results: DataFrame with 'Beta_Coefficient', 'Q_Value' and 'Protein'
            columns.
        title: Text used as the plot title.
        figure_path: Full path of the image file to write; its parent
            directory is created when missing.
        threshold: Q-value cut-off, drawn as a horizontal line and used to
            pick the proteins that get a text label.
    """
    plt.figure(figsize=(6, 8))

    # Every protein of the comparison
    plt.scatter(
        results['Beta_Coefficient'],
        -np.log10(results['Q_Value']),
        alpha=0.5, color='#87cefa', label="Not Significant",
    )

    # At most 25 proteins below the threshold, smallest Q-values first
    below_threshold = results['Q_Value'] < threshold
    top_hits = results[below_threshold].sort_values(by='Q_Value').head(25)

    # Drawn a second time with the same colour and transparency
    plt.scatter(
        top_hits['Beta_Coefficient'],
        -np.log10(top_hits['Q_Value']),
        color='#87cefa', alpha=0.5, label="Significant",
    )

    # Reference lines: Q-value cut-off and zero effect
    plt.axhline(-np.log10(threshold), color='darkred', linestyle='--', label=f'P-value threshold={threshold}')
    plt.axvline(0, color='darkred', linestyle='--', label='Beta Coefficient = 0')

    # Protein names, nudged slightly up and to the right of their points
    texts = []
    label_rows = zip(top_hits['Beta_Coefficient'], top_hits['Q_Value'], top_hits['Protein'])
    for beta, q_value, protein in label_rows:
        texts.append(
            plt.text(
                beta + 0.05,
                -np.log10(q_value) + 0.05,
                protein,
                fontsize=8,
                color='black',
                ha='left', va='bottom',
            )
        )

    # Let adjustText spread the labels so they do not overlap
    adjust_text(
        texts,
        only_move={'points': 'xy', 'texts': 'xy'},
        expand_text=(1.2, 1.2),
        force_text=0.05,
        lim=200,
    )

    plt.xlabel('Beta Coefficient')
    plt.ylabel('-Log10 Q-value')
    plt.title(f"{title}")

    # Write the figure and release it
    plt.tight_layout()
    os.makedirs(os.path.dirname(figure_path), exist_ok=True)
    plt.savefig(figure_path, dpi=1200, bbox_inches='tight')
    plt.close()

# Directory that receives the volcano plots
figure_path = os.path.dirname(os.getcwd()) + '/figures/differential'

# Turn a plot title into a file-name stem
def clean_filename(title):
    """Return ``title`` with every colon and space replaced by an underscore."""
    return title.translate(str.maketrans({":": "_", " ": "_"}))

# One beta-coefficient volcano plot per grouping column and pair of groups
for group_col in results_df['Group_Col'].unique():
    subset_group_col = results_df[results_df['Group_Col'] == group_col]
    group_pairs = subset_group_col[['Group1', 'Group2']].drop_duplicates().values

    for group1, group2 in group_pairs:
        # Rows belonging to this particular comparison
        in_pair = (subset_group_col['Group1'] == group1) & (subset_group_col['Group2'] == group2)
        subset = subset_group_col[in_pair]
        if subset.empty:
            continue

        title = f"{group1} vs {group2}"

        # Figures go directly into the base figure directory
        output_directory = figure_path
        os.makedirs(output_directory, exist_ok=True)

        # File name is the cleaned title plus a suffix identifying the plot type
        plot_path = os.path.join(output_directory, f"{clean_filename(title)}_beta.png")

        create_volcano_plot_beta(subset, title, plot_path)
        #print(f"Saved: {plot_path}")


In [13]:
# Function to create volcano plot using Log2 Fold Change
def create_volcano_plot_fold_change(results, title, figure_path, threshold=0.05):
    """Save a volcano plot of log2 fold changes against -log10 Q-values.

    Args:
        results: DataFrame with 'Log2_Fold_Change', 'Q_Value' and 'Protein'
            columns.
        title: Text used as the plot title.
        figure_path: Full path of the image file to write; its parent
            directory is created when missing.
        threshold: Q-value cut-off. Proteins below it whose absolute fold
            change exceeds 1 are redrawn and labelled.
    """
    plt.figure(figsize=(6, 8))

    # Every protein of the comparison
    plt.scatter(
        results['Log2_Fold_Change'],
        -np.log10(results['Q_Value']),
        alpha=0.5, color='lightblue', label="Not Significant",
    )

    # Proteins passing both the Q-value and the fold-change cut-offs
    passes_q = results['Q_Value'] < threshold
    passes_fold_change = results['Log2_Fold_Change'].abs() > 1
    significant = results[passes_q & passes_fold_change]
    plt.scatter(
        significant['Log2_Fold_Change'],
        -np.log10(significant['Q_Value']),
        color='lightblue', label="Significant",
    )

    # Cut-off lines
    plt.axhline(-np.log10(threshold), color='darkred', linestyle='--', label=f'P-value threshold={threshold}')
    plt.axvline(-1, color='darkred', linestyle='--', label='Fold Change Threshold = -1')
    plt.axvline(1, color='darkred', linestyle='--', label='Fold Change Threshold = 1')

    # Protein names, placed just right of their points
    texts = []
    label_rows = zip(significant['Log2_Fold_Change'], significant['Q_Value'], significant['Protein'])
    for fold_change, q_value, protein in label_rows:
        texts.append(
            plt.text(
                fold_change + 0.15,
                -np.log10(q_value) + 0.05,
                protein,
                fontsize=10,
                color='black',
                ha='left', va='center',
            )
        )

    # Let adjustText spread the labels so they do not overlap
    adjust_text(
        texts,
        only_move={'points': 'xy', 'texts': 'xy'},
        expand_text=(1.2, 1.2),
        force_text=0.05,
        lim=200,
    )

    plt.xlabel('Log2 Fold Change', fontsize=14)
    plt.ylabel('-Log10 Q-value', fontsize=14)
    plt.title(f"{title}", fontsize=16, fontweight='bold')

    # Write the figure and release it
    plt.tight_layout()
    os.makedirs(os.path.dirname(figure_path), exist_ok=True)
    plt.savefig(figure_path, dpi=1200, bbox_inches='tight')
    plt.close()

# Directory that receives the volcano plots
figure_path = os.path.dirname(os.getcwd()) + '/figures/differential'

# Turn a plot title into a file-name stem
def clean_filename(title):
    """Return ``title`` with colons and spaces swapped for underscores."""
    return "".join("_" if char in (":", " ") else char for char in title)

# One fold-change volcano plot per grouping column and pair of groups
for group_col in results_df['Group_Col'].unique():
    subset_group_col = results_df[results_df['Group_Col'] == group_col]
    group_pairs = subset_group_col[['Group1', 'Group2']].drop_duplicates().values

    for group1, group2 in group_pairs:
        # Rows belonging to this particular comparison
        same_first = subset_group_col['Group1'] == group1
        same_second = subset_group_col['Group2'] == group2
        subset = subset_group_col[same_first & same_second]
        if subset.empty:
            continue

        title = f"{group1} vs {group2}"

        # Figures go directly into the base figure directory
        output_directory = figure_path
        os.makedirs(output_directory, exist_ok=True)

        # File name is the cleaned title plus a suffix identifying the plot type
        plot_path = os.path.join(output_directory, f"{clean_filename(title)}_fold_change.png")

        create_volcano_plot_fold_change(subset, title, plot_path)
        #print(f"Saved: {plot_path}")


### Venn diagram and biomarker trajectories

Biomarkers distinguishing all three subtypes

In [None]:
# Keep only the rows flagged as significant
df_significant = results_df[results_df['Significant'] == True]

# The three between-subtype comparisons shown in the diagram
group1_vs_group2 = ['MV2K vs MM(V)1', 'MV2K vs VV2', 'VV2 vs MM(V)1']
filtered_df = df_significant[df_significant['Group1_vs_Group2'].isin(group1_vs_group2)]

# Significant proteins of each comparison
proteins_dict = {
    'MV2K_vs_MM(V)1': set(filtered_df[filtered_df['Group1_vs_Group2'] == 'MV2K vs MM(V)1']['Protein']),
    'MV2K_vs_VV2': set(filtered_df[filtered_df['Group1_vs_Group2'] == 'MV2K vs VV2']['Protein']),
    'VV2_vs_MM(V)1': set(filtered_df[filtered_df['Group1_vs_Group2'] == 'VV2 vs MM(V)1']['Protein'])
}
set_mv2k_mm = proteins_dict['MV2K_vs_MM(V)1']
set_mv2k_vv = proteins_dict['MV2K_vs_VV2']
set_vv_mm = proteins_dict['VV2_vs_MM(V)1']

# Number of distinct proteins significant in at least one of the comparisons
total_proteins = len(set(filtered_df['Protein']))

# Size of each of the seven mutually exclusive Venn regions
only_mv2k_mm = len(set_mv2k_mm - set_mv2k_vv - set_vv_mm)
only_mv2k_vv = len(set_mv2k_vv - set_mv2k_mm - set_vv_mm)
only_vv_mm = len(set_vv_mm - set_mv2k_mm - set_mv2k_vv)
mv2k_mm_vv = len((set_mv2k_mm & set_mv2k_vv) - set_vv_mm)
mv2k_mm_vv_only = len((set_mv2k_mm & set_vv_mm) - set_mv2k_vv)
# Pairwise-only region: the proteins shared by all three sets must be
# excluded here, otherwise they are counted in both '011' and '111'
mv2k_vv_vv_mm = len((set_mv2k_vv & set_vv_mm) - set_mv2k_mm)
mv2k_mm_vv_all = len(set_mv2k_mm & set_mv2k_vv & set_vv_mm)

# Define path
figure_path = os.path.join(os.path.dirname(os.getcwd()), 'figures', 'differential')
os.makedirs(figure_path, exist_ok=True)

# Start from a clean slate
plt.close('all')

# Create new figure
fig = plt.figure(figsize=(10, 10))

# Create Venn diagram; venn3 expects the region sizes in the order
# (100, 010, 110, 001, 101, 011, 111)
venn = venn3(
    subsets=(
        only_mv2k_mm,
        only_mv2k_vv,
        mv2k_mm_vv,
        only_vv_mm,
        mv2k_mm_vv_only,
        mv2k_vv_vv_mm,
        mv2k_mm_vv_all
    ),
    set_labels=('MV2K vs MM(V)1', 'MV2K vs VV2', 'VV2 vs MM(V)1')
)

# Fill colour of each region
color_map = {
    '100': '#FF9999',
    '010': '#99CCFF',
    '001': '#FFCC66',
    '110': '#66FF66',
    '101': '#FF66CC',
    '011': '#CC99FF',
    '111': '#FFFF00'
}

for region_id, region_color in color_map.items():
    patch = venn.get_patch_by_id(region_id)
    if patch is not None:
        patch.set_fc(region_color)
        patch.set_alpha(0.4)
        patch.set_ec('black')
        patch.set_linewidth(1.5)

# Write "count (percentage of all significant proteins)" into a region label
def update_label(label_id, value, total_proteins):
    label = venn.get_label_by_id(label_id)
    if label is not None:
        # With no significant protein at all every share is reported as 0
        percent = 100 * value / total_proteins if total_proteins else 0.0
        label.set_text(f"{value}\n({percent:.1f}%)")

# Add percentages
update_label('100', only_mv2k_mm, total_proteins)
update_label('010', only_mv2k_vv, total_proteins)
update_label('001', only_vv_mm, total_proteins)
update_label('110', mv2k_mm_vv, total_proteins)
update_label('101', mv2k_mm_vv_only, total_proteins)
update_label('011', mv2k_vv_vv_mm, total_proteins)
update_label('111', mv2k_mm_vv_all, total_proteins)

# Same font for region labels and set labels; either list may hold None
for label in list(venn.subset_labels) + list(venn.set_labels):
    if label is not None:
        label.set_fontsize(14)
        label.set_fontweight('bold')
        label.set_color('black')

# Save plot
output_path = os.path.join(figure_path, 'venn_subtypes.png')
plt.savefig(output_path, dpi=1200, bbox_inches='tight', facecolor='white')

plt.show()

plt.close()

In [32]:
# Proteins significant in all three comparisons; sorted because set order
# differs between interpreter runs and would reshuffle the CSV each time
proteins_intersection = sorted(
    proteins_dict['MV2K_vs_MM(V)1'] & proteins_dict['MV2K_vs_VV2'] & proteins_dict['VV2_vs_MM(V)1']
)

# Create a DataFrame
proteins_df = pd.DataFrame(proteins_intersection, columns=["Protein"])

# Save the results
output_file = os.path.join(results_path, "proteins_intersection.csv")
proteins_df.to_csv(output_file, index=False)
#print(f"Results exported to: {output_file}")


In [None]:
# Biomarker trajectory plot: one line per protein across CTRL and the subtypes
def plot_protein_log2fc(protein_list, title, df, output_path=None, figsize=(10, 6), title_fontsize=16, axis_fontsize=12):
    """Plot each protein's log2 fold change versus CTRL across the subtypes.

    Args:
        protein_list: Protein names; each becomes one line in the plot.
        title: Plot title.
        df: DataFrame with 'Protein', 'Group1_vs_Group2' and
            'Log2_Fold_Change' columns.
        output_path: When truthy, the figure is saved to this path.
        figsize: Figure size handed to matplotlib.
        title_fontsize: Font size of the title.
        axis_fontsize: Font size of the axis labels and x tick labels.

    The figure is closed before returning.
    """
    plt.figure(figsize=figsize)

    # X-axis order and the comparison that supplies each subtype's value
    groups = ['CTRL', 'MM(V)1', 'VV2', 'MV2K']
    comparisons = {
        'MM(V)1': 'MM(V)1 vs CTRL',
        'VV2': 'VV2 vs CTRL',
        'MV2K': 'MV2K vs CTRL'
    }

    def mean_fold_change(protein, comparison):
        # Mean over the matching rows, or None when there is no such row
        rows = (df['Protein'] == protein) & (df['Group1_vs_Group2'] == comparison)
        values = df.loc[rows, 'Log2_Fold_Change']
        return None if values.empty else values.mean()

    for protein in protein_list:
        # CTRL is the reference and therefore sits at 0
        trajectory = [
            0 if group == 'CTRL' else mean_fold_change(protein, comparisons[group])
            for group in groups
        ]
        plt.plot(groups, trajectory, marker='o', label=protein, alpha=0.6)

    # Reference line at no change
    plt.axhline(0, color='black', linestyle='--', linewidth=1)

    # Title, axis labels and legend placed right of the axes
    plt.title(title, fontsize=title_fontsize)
    plt.xlabel('', fontsize=axis_fontsize)
    plt.ylabel('Log2 Fold Change', fontsize=axis_fontsize)
    plt.xticks(rotation=45, fontsize=axis_fontsize)
    plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left', fontsize=12)

    plt.tight_layout()

    if output_path:
        plt.savefig(output_path, dpi=1200, bbox_inches='tight', facecolor='white')

    plt.close()

# Proteins significant in all three between-subtype comparisons, sorted so
# the legend order is the same on every run
proteins_intersection = sorted(
    proteins_dict['MV2K_vs_MM(V)1'] & proteins_dict['MV2K_vs_VV2'] & proteins_dict['VV2_vs_MM(V)1']
)

# Create a Dataframe
proteins_df = pd.DataFrame(proteins_intersection, columns=["Protein"])

# Result rows of the selected proteins
merged_df = results_df[results_df['Protein'].isin(proteins_df['Protein'])]

# The count in the title follows the data instead of being hard-coded
plot_title = f'Biomarkers differentiating all three subgroups (n={len(proteins_intersection)})'

# plot_protein_log2fc closes its figure before returning, so a later
# plt.savefig would write an empty image. Both output files are therefore
# produced by the plotting function itself.
for figure_name in ('protein_log2fc.png', '20_overlapping.png'):
    output_path = os.path.join(figure_path, figure_name)
    plot_protein_log2fc(
        proteins_intersection,
        plot_title,
        merged_df,
        output_path=output_path,
        title_fontsize=16,
        axis_fontsize=12
    )

#### Venn diagram and trajectories of biomarkers specific to each subtype

In [None]:
def create_protein_venn_diagram(results_df, figure_path, filename='venn_vs_ctrl.png'):
    """Draw, save and show a Venn diagram of proteins significant versus CTRL.

    Args:
        results_df: DataFrame with 'Significant', 'Group1_vs_Group2' and
            'Protein' columns.
        figure_path: Directory for the image; created when missing.
        filename: Name of the image file inside ``figure_path``.

    Returns:
        dict mapping 'MM(V)1_vs_CTRL', 'VV2_vs_CTRL' and 'MV2K_vs_CTRL' to
        the set of proteins significant in that comparison.
    """
    # Significant rows of the three versus-control comparisons
    df_significant = results_df[results_df['Significant'] == True]
    group_comparisons = ['MM(V)1 vs CTRL', 'VV2 vs CTRL', 'MV2K vs CTRL']
    filtered_df = df_significant[df_significant['Group1_vs_Group2'].isin(group_comparisons)]

    def significant_in(comparison):
        # Proteins significant in one comparison
        return set(filtered_df.loc[filtered_df['Group1_vs_Group2'] == comparison, 'Protein'])

    proteins_dict = {
        'MM(V)1_vs_CTRL': significant_in('MM(V)1 vs CTRL'),
        'VV2_vs_CTRL': significant_in('VV2 vs CTRL'),
        'MV2K_vs_CTRL': significant_in('MV2K vs CTRL')
    }
    mm = proteins_dict['MM(V)1_vs_CTRL']
    vv = proteins_dict['VV2_vs_CTRL']
    mv2k = proteins_dict['MV2K_vs_CTRL']

    # Distinct proteins significant in at least one comparison
    total_proteins = len(set(filtered_df['Protein']))

    # Sizes of the seven mutually exclusive regions, keyed by region id and
    # listed in the order venn3 expects: 100, 010, 110, 001, 101, 011, 111
    region_counts = {
        '100': len(mm - vv - mv2k),
        '010': len(vv - mm - mv2k),
        '110': len((mm & vv) - mv2k),
        '001': len(mv2k - mm - vv),
        '101': len((mm & mv2k) - vv),
        '011': len((vv & mv2k) - mm),
        '111': len(mm & vv & mv2k),
    }

    plt.figure(figsize=(10, 10))
    venn = venn3(
        subsets=tuple(region_counts.values()),
        set_labels=('MM(V)1 vs CTRL', 'VV2 vs CTRL', 'MV2K vs CTRL')
    )

    # Region labels: count plus share of all significant proteins
    for region_id, count in region_counts.items():
        label = venn.get_label_by_id(region_id)
        if label is not None:
            # Nothing significant at all: report 0% instead of dividing by zero
            percent = 100 * count / total_proteins if total_proteins else 0.0
            label.set_text(f"{count}\n({percent:.1f}%)")

    # Region colours and outlines
    color_map = {
        '100': '#FF9999',
        '010': '#99CCFF',
        '001': '#FFCC66',
        '110': '#66FF66',
        '101': '#FF66CC',
        '011': '#CC99FF',
        '111': '#FFFF00'
    }
    for region_id, region_color in color_map.items():
        patch = venn.get_patch_by_id(region_id)
        if patch is not None:
            patch.set_facecolor(region_color)
            patch.set_alpha(0.5)
            patch.set_edgecolor('black')
            patch.set_linewidth(2)

    # Same font for set labels and region labels; either list may hold None
    for label in list(venn.set_labels) + list(venn.subset_labels):
        if label is not None:
            label.set_fontsize(14)
            label.set_fontweight('bold')
            label.set_color('black')

    plt.title('', fontsize=16)

    # Save, display and release the figure
    os.makedirs(figure_path, exist_ok=True)
    output_path = os.path.join(figure_path, filename)
    plt.savefig(output_path, dpi=1200, bbox_inches='tight', facecolor='white')
    plt.show()
    plt.close()

    return proteins_dict

# Draw the versus-control Venn diagram and keep the per-comparison protein sets
parent_dir = os.path.dirname(os.getcwd())
figure_path = os.path.join(parent_dir, 'figures', 'differential')
proteins_dict = create_protein_venn_diagram(results_df=results_df, figure_path=figure_path)

In [None]:
# Proteins significant versus CTRL in exactly one subtype
unique_mm_ctrl = proteins_dict['MM(V)1_vs_CTRL'] - proteins_dict['VV2_vs_CTRL'] - proteins_dict['MV2K_vs_CTRL']
unique_vv_ctrl = proteins_dict['VV2_vs_CTRL'] - proteins_dict['MM(V)1_vs_CTRL'] - proteins_dict['MV2K_vs_CTRL']
unique_mv2k_ctrl = proteins_dict['MV2K_vs_CTRL'] - proteins_dict['MM(V)1_vs_CTRL'] - proteins_dict['VV2_vs_CTRL']

# Store them as sorted lists: set order changes between interpreter runs,
# which would reshuffle the printed lists, the exported CSV and plot legends
unique_proteins_dict = {
    'Biomarkers altered in MM(V)1 vs CTRL': sorted(unique_mm_ctrl),
    'Biomarkers altered in VV2 vs CTRL': sorted(unique_vv_ctrl),
    'Biomarkers altered in MV2K vs CTRL': sorted(unique_mv2k_ctrl)
}

# Print the lists
for subgroup, proteins in unique_proteins_dict.items():
    print(f"{subgroup} ({len(proteins)} proteine):")
    print(", ".join(proteins))
    print()

# Prepare lists for plots
unique_proteins_mm = unique_proteins_dict['Biomarkers altered in MM(V)1 vs CTRL']
unique_proteins_vv = unique_proteins_dict['Biomarkers altered in VV2 vs CTRL']
unique_proteins_mv = unique_proteins_dict['Biomarkers altered in MV2K vs CTRL']

def export_unique_proteins_to_csv(unique_proteins_dict, results_path, filename='unique_proteins.csv'):
    """Write one CSV row per (group, protein) pair.

    Args:
        unique_proteins_dict: Mapping of a group label to its proteins.
        results_path: Directory the file is written into.
        filename: Name of the CSV file.

    Each row holds the group label, the protein and the number of proteins
    listed for that group.
    """
    rows = [
        {
            'Group': group,
            'Protein': protein,
            'Total_Proteins_In_Group': len(proteins),
        }
        for group, proteins in unique_proteins_dict.items()
        for protein in proteins
    ]
    pd.DataFrame(rows).to_csv(os.path.join(results_path, filename), index=False)

# Write the subtype-specific protein lists to the results directory
export_unique_proteins_to_csv(
    unique_proteins_dict=unique_proteins_dict,
    results_path=results_path,
)

In [None]:
# Trajectory plot that is saved under a file name derived from its title
def plot_protein_log2fc(protein_list, title, df, figure_path, figsize=(10, 5), title_fontsize=16, axis_fontsize=12):
    """Plot each protein's log2 fold change versus CTRL and save the figure.

    Args:
        protein_list: Protein names; each becomes one line in the plot.
        title: Plot title; with spaces replaced by underscores it is also the
            stem of the PNG file name.
        df: DataFrame with 'Protein', 'Group1_vs_Group2' and
            'Log2_Fold_Change' columns.
        figure_path: Directory the PNG is written into; created when missing.
        figsize: Figure size handed to matplotlib.
        title_fontsize: Font size of the title.
        axis_fontsize: Font size of the axis labels and x tick labels.

    The figure is left open after saving.
    """
    plt.figure(figsize=figsize)

    # X-axis order and the comparison that supplies each subtype's value
    groups = ['CTRL', 'MM(V)1', 'VV2', 'MV2K']
    comparisons = {
        'MM(V)1': 'MM(V)1 vs CTRL',
        'VV2': 'VV2 vs CTRL',
        'MV2K': 'MV2K vs CTRL'
    }

    for protein in protein_list:
        protein_rows = df['Protein'] == protein
        protein_data = []
        for group in groups:
            if group == 'CTRL':
                # Reference group: fold change of 0 by definition
                protein_data.append(0)
                continue
            comparison_rows = df['Group1_vs_Group2'] == comparisons[group]
            group_data = df[protein_rows & comparison_rows]['Log2_Fold_Change']
            # None marks a subtype without any matching result row
            protein_data.append(None if group_data.empty else group_data.mean())

        plt.plot(groups, protein_data, marker='o', label=protein, alpha=0.6)

    # Reference line at no change
    plt.axhline(0, color='black', linestyle='--', linewidth=1)

    plt.title(title, fontsize=title_fontsize)
    plt.xlabel('', fontsize=axis_fontsize)
    plt.ylabel('Log2 Fold Change', fontsize=axis_fontsize)
    plt.xticks(rotation=45, fontsize=axis_fontsize)

    # Two-column legend just right of the axes
    plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left', fontsize=12, ncol=2)

    plt.tight_layout()

    # Save next to the other figures; bbox_inches='tight' keeps the legend
    os.makedirs(figure_path, exist_ok=True)
    file_path = os.path.join(figure_path, f"{title.replace(' ', '_')}.png")
    plt.savefig(file_path, bbox_inches='tight')
    
figure_path = os.path.dirname(os.getcwd()) + '/figures/differential'  # Path to save the figures

# One trajectory plot per subtype: proteins, title and any extra plot options
subtype_panels = [
    (unique_proteins_mm, 'MM(V)1', {}),
    (unique_proteins_vv, 'VV2', {'figsize': (10, 8)}),
    (unique_proteins_mv, 'MV2K', {}),
]

for subtype_proteins, subtype, extra_options in subtype_panels:
    plot_protein_log2fc(subtype_proteins, subtype, results_df, figure_path,
                        title_fontsize=16, axis_fontsize=14, **extra_options)

    # The function leaves its figure open; save it again at high resolution,
    # display it, then release it
    output_path = os.path.join(figure_path, f'{subtype}.png')
    plt.savefig(output_path, dpi=1200, bbox_inches='tight', facecolor='white')
    plt.show()
    plt.close()
