# Subanalysis related to patients with a neuropathological diagnosis
==================================================================================================

This notebook performs UMAP and hierarchical clustering considering only patients with a neuropathological diagnosis. It also performs a differential expression analysis between the MM(V)1 and MM(V)1+2C sCJD subtypes and plots ROC curves for the top differentially expressed biomarkers.

Input:
------
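- olink.xlsx: Protein expression data with columns:
  * SampleID: Unique sample identifier
  * Group: Clinical group classification
  * SubGroup: Clinical subgroup
  * Strain: Sample strain type
  * NP_subtype: Neuropathological subtype label used to select and colour patients (e.g. MM(V)1, MM(V)1+2C, VV2, MV2K, MV2K+2C; "probable" and CTRL entries are excluded)
  * age at LP: Age at lumbar puncture
  * Sex: Patient sex
  * Codon 129, onset-LP, onset-death, LP-death: Additional clinical columns (dropped before the analyses)
  * [Protein Names]: NPX values for each protein
- feature_importance_rankings.csv: Ranked list of proteins (column `Feature`); the first 20 rows are used as the top 20 proteins for sCJD subtype classification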
- differential.csv: Differential expression results including:
  * Protein: Protein identifier
  * Group comparisons
  * P-values
  * Q-values (FDR corrected)
  * Log2 fold changes
  * Beta coefficients

Output:
-------
- UMAP visualisation of all proteomic data for each subtype
- UMAP visualisation restricted to the top 20 proteins for sCJD subtype classification
- Heatmaps showing the hierarchical clustering 
- mixed.xlsx: Differential expression results including:
  * Protein: Protein identifier
  * Group comparisons
  * P-values
  * Q-values (FDR corrected)
  * Log2 fold changes
  * Beta coefficients
- Volcano plot to show the results of the differential expression analysis
- ROC curves of top differentially expressed proteins in distinguishing MM(V)1 vs MM(V)1+2C

Analysis Steps:
---------------
Details regarding the analysis steps are provided in the respective notebooks as follows: 
- UMAP: 01_demographics.ipynb
- Hierarchical clustering: 03_hierarchical_clustering.ipynb
- Differential expression analysis: 02_differential.ipynb

In [1]:
# General utilities
import os
import itertools
import numpy as np
import pandas as pd

# Statistical analysis
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multitest import multipletests

# Machine learning & data preprocessing
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.utils import resample

# Dimensionality reduction & clustering
from umap import UMAP
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from adjustText import adjust_text

# ROC curve & AUC
from sklearn.metrics import roc_curve, auc

In [11]:
# Project folders, all resolved relative to the parent of the working directory
project_root = os.path.dirname(os.getcwd())
data_path = f"{project_root}/data/"  # keeps its trailing separator
figure_path = f"{project_root}/figures/mixed_subtypes"
results_path = f"{project_root}/data/results/differential"

## Perform UMAP on proteomic data of patients with a neuropathological diagnosis

In [None]:
# Import data
# data_path already ends with a separator, so the sub-path is appended
# without a leading slash (avoids '.../data//curated/...').
df = pd.read_excel(data_path + 'curated/olink.xlsx')

# Clinical/demographic columns that are not protein measurements
columns_to_drop = ['age at LP', 'Sex', 'Codon 129',
                   'onset-LP', 'onset-death', 'LP-death', 'Group', 'Strain', 'SubGroup']

df = df.drop(columns=columns_to_drop)

# Keep only the neuropathologically confirmed subtypes. An explicit allow-list
# (the same one used for the hierarchical clustering below) also removes rows
# whose NP_subtype is missing; a "not in [probable..., CTRL]" filter would keep
# them and the colour lookup in plot_umap would then fail.
confirmed_subtypes = ['MM(V)1', 'MM(V)1+2C', 'VV2', 'MV2K', 'MV2K+2C']
df = df[df['NP_subtype'].isin(confirmed_subtypes)]

In [None]:
# Missing-value report for the filtered table
missing = df.isna()

# Overall count
total_nan = missing.sum().sum()
print(f"Total number of NaN values in the dataset: {total_nan}")

# Per-column counts, restricted to the columns that actually contain NaN
nan_per_column = missing.sum()
columns_with_nan = nan_per_column[nan_per_column > 0]

if columns_with_nan.empty:
    print("\nNo columns contain NaN values")
else:
    print("\nColumns with NaN values:")
    print(columns_with_nan)

    # Share of missing entries relative to the number of rows
    nan_percentage = columns_with_nan / len(df) * 100
    print("\nPercentage of NaN values per column:")
    print(nan_percentage)

# Rows that are entirely missing
rows_all_nan = df[missing.all(axis=1)]
print(f"\nNumber of rows with all NaN values: {len(rows_all_nan)}")

# Rows with at least one missing entry
rows_with_nan = df[missing.any(axis=1)]
print(f"Number of rows containing at least one NaN value: {len(rows_with_nan)}")

if not rows_with_nan.empty:
    print("\nSample IDs of rows with NaN values:")
    print(rows_with_nan['SampleID'].tolist())

### Perform UMAP on all proteomic data

In [None]:
def plot_umap(df, figure_path, *, filename='umap_NP_cjd_subtype.png',
              legend_loc='lower left', legend_anchor=(0.0005, 0.0005)):
    """Embed the samples with UMAP and save a scatter plot coloured by subtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'SampleID' and 'NP_subtype'; every other column is used
        as a feature (standardised with StandardScaler before the embedding).
    figure_path : str
        Directory the PNG is written to; created if it does not exist.
    filename : str, keyword-only
        Name of the PNG file inside ``figure_path``.
    legend_loc, legend_anchor : keyword-only
        Passed to ``Axes.legend`` as ``loc`` and ``bbox_to_anchor``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure, already closed in pyplot; it can still be displayed, e.g.
        as the last expression of a notebook cell.
    """
    # Colour per neuropathological subtype
    colors = {
        'MV2K': '#2ecc71',       # green
        'MV2K+2C': '#27ae60',    # dark green
        'VV2': '#e74c3c',        # red
        'MM(V)1': '#3498db',     # blue
        'MM(V)1+2C': '#f39c12',  # orange
    }
    # Labels without a dedicated colour are drawn in grey instead of raising
    # a KeyError in the plotting loop.
    fallback_color = '#7f8c8d'

    # Missing labels get an explicit name so those samples stay visible
    subtypes = df['NP_subtype'].fillna('Unlabelled')

    # Feature matrix: everything except the identifier and the label
    X = df.drop(columns=['SampleID', 'NP_subtype'])
    X_scaled = StandardScaler().fit_transform(X)

    # Fixed random_state keeps the layout reproducible between runs
    umap_embedding = UMAP(
        n_neighbors=15,
        min_dist=0.1,
        n_components=2,
        random_state=42
    ).fit_transform(X_scaled)

    fig, ax = plt.subplots(figsize=(8, 6))

    # One scatter call per subtype so that each gets its own legend entry
    for subtype in subtypes.unique():
        mask = (subtypes == subtype).to_numpy()
        ax.scatter(
            umap_embedding[mask, 0],
            umap_embedding[mask, 1],
            color=colors.get(subtype, fallback_color),
            label=f"{subtype} (n={int(mask.sum())})",
            alpha=0.7
        )

    # Use the Axes/Figure objects directly rather than pyplot's implicit
    # "current figure", so the function does not depend on global state.
    ax.set_title('UMAP Analysis by sCJD Subtype', pad=20, fontsize=14)
    ax.set_xlabel('UMAP1', fontsize=12)
    ax.set_ylabel('UMAP2', fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.legend(bbox_to_anchor=legend_anchor, loc=legend_loc, frameon=True, framealpha=0.8)
    fig.subplots_adjust(right=0.85)

    # Make sure the target directory exists before writing
    os.makedirs(figure_path, exist_ok=True)
    fig.savefig(os.path.join(figure_path, filename), dpi=300, bbox_inches='tight')
    plt.close(fig)

    return fig

# Generate plot
# plot_umap writes the PNG under figure_path and returns the Figure object.
fig = plot_umap(df, figure_path)
# Bare expression as the last line of the cell: presumably there so the notebook
# displays the figure, since plot_umap closes it in pyplot before returning.
fig

### Perform UMAP on top 20 proteins for subtype classification

In [None]:
# Import feature importance from ML results
# Reload the full Olink table: `df` was overwritten with a reduced copy above.
# data_path already ends with a separator, so sub-paths are appended without a
# leading slash (avoids '.../data//curated/...').
df = pd.read_excel(data_path + 'curated/olink.xlsx')

feature_importance_rankings = pd.read_csv(data_path + 'results/feature_importance_rankings.csv')
# First 20 rows of the ranking — assumes the CSV is sorted by importance (TODO confirm)
top_biomarkers = list(feature_importance_rankings['Feature'].head(20))

# Select the columns
columns_to_select = top_biomarkers + ['SampleID', 'NP_subtype']
df = df[columns_to_select]

# Keep only the neuropathologically confirmed subtypes. An explicit allow-list
# (the same one used for the hierarchical clustering below) also removes rows
# whose NP_subtype is missing; a "not in [probable..., CTRL]" filter would keep
# them and the colour lookup in plot_umap would then fail.
confirmed_subtypes = ['MM(V)1', 'MM(V)1+2C', 'VV2', 'MV2K', 'MV2K+2C']
df = df[df['NP_subtype'].isin(confirmed_subtypes)]

In [None]:
def plot_umap(df, figure_path, *, filename='umap_NP_cjd_subtype_top20_subtype.png',
              legend_loc='upper left', legend_anchor=(0.0005, 0.9995)):
    """Embed the samples with UMAP and save a scatter plot coloured by subtype.

    This definition replaces any earlier ``plot_umap`` in the session; its
    defaults write the top-20-protein figure with the legend in the upper left.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'SampleID' and 'NP_subtype'; every other column is used
        as a feature (standardised with StandardScaler before the embedding).
    figure_path : str
        Directory the PNG is written to; created if it does not exist.
    filename : str, keyword-only
        Name of the PNG file inside ``figure_path``.
    legend_loc, legend_anchor : keyword-only
        Passed to ``Axes.legend`` as ``loc`` and ``bbox_to_anchor``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure, already closed in pyplot; it can still be displayed, e.g.
        as the last expression of a notebook cell.
    """
    # Colour per neuropathological subtype
    colors = {
        'MV2K': '#2ecc71',       # green
        'MV2K+2C': '#27ae60',    # dark green
        'VV2': '#e74c3c',        # red
        'MM(V)1': '#3498db',     # blue
        'MM(V)1+2C': '#f39c12',  # orange
    }
    # Labels without a dedicated colour are drawn in grey instead of raising
    # a KeyError in the plotting loop.
    fallback_color = '#7f8c8d'

    # Missing labels get an explicit name so those samples stay visible
    subtypes = df['NP_subtype'].fillna('Unlabelled')

    # Feature matrix: everything except the identifier and the label
    X = df.drop(columns=['SampleID', 'NP_subtype'])
    X_scaled = StandardScaler().fit_transform(X)

    # Fixed random_state keeps the layout reproducible between runs
    umap_embedding = UMAP(
        n_neighbors=15,
        min_dist=0.1,
        n_components=2,
        random_state=42
    ).fit_transform(X_scaled)

    fig, ax = plt.subplots(figsize=(8, 6))

    # One scatter call per subtype so that each gets its own legend entry
    for subtype in subtypes.unique():
        mask = (subtypes == subtype).to_numpy()
        ax.scatter(
            umap_embedding[mask, 0],
            umap_embedding[mask, 1],
            color=colors.get(subtype, fallback_color),
            label=f"{subtype} (n={int(mask.sum())})",
            alpha=0.7
        )

    # Use the Axes/Figure objects directly rather than pyplot's implicit
    # "current figure", so the function does not depend on global state.
    ax.set_title('UMAP Analysis by sCJD Subtype', pad=20, fontsize=14)
    ax.set_xlabel('UMAP1', fontsize=12)
    ax.set_ylabel('UMAP2', fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.legend(bbox_to_anchor=legend_anchor, loc=legend_loc, frameon=True, framealpha=0.8)
    fig.subplots_adjust(right=0.85)

    # Make sure the target directory exists before writing
    os.makedirs(figure_path, exist_ok=True)
    fig.savefig(os.path.join(figure_path, filename), dpi=300, bbox_inches='tight')
    plt.close(fig)

    return fig

# Generate plot
# plot_umap writes the PNG under figure_path and returns the Figure object.
fig = plot_umap(df, figure_path)
# Bare expression as the last line of the cell: presumably there so the notebook
# displays the figure, since plot_umap closes it in pyplot before returning.
fig

## Perform Hierarchical clustering including only patients with a neuropathological diagnosis

In [None]:
# Load clean data
def load_and_clean_data():
    """Read the differential-expression results and the Olink table.

    Reads ``differential.csv`` from ``results_path`` and ``curated/olink.xlsx``
    from ``data_path`` (both module-level variables). The Olink table is
    stripped of its clinical columns and restricted to the five confirmed
    NP_subtype labels.

    Returns
    -------
    tuple
        ``(df_results, df_olink)``.
    """
    clinical_columns = ['SubGroup', 'Group', 'Strain', 'age at LP', 'Sex',
                        'Codon 129', 'onset-LP', 'onset-death', 'LP-death']
    confirmed_subtypes = ['MM(V)1', 'MM(V)1+2C', 'VV2', 'MV2K', 'MV2K+2C']

    df_results = pd.read_csv(results_path + '/differential.csv')

    olink = pd.read_excel(data_path + 'curated/olink.xlsx').drop(columns=clinical_columns)
    has_confirmed_subtype = olink['NP_subtype'].isin(confirmed_subtypes)
    df_olink = olink[has_confirmed_subtype]

    return df_results, df_olink

# Filter significant
def filter_significant_results(df_results, comparisons, top_n=50):
    """Select the proteins to cluster and the significant rows per comparison.

    Parameters
    ----------
    df_results : pandas.DataFrame
        Needs the columns 'Protein', 'Q_Value', 'Significant' and
        'Group1_vs_Group2'.
    comparisons : iterable of str
        Values of 'Group1_vs_Group2' to consider.
    top_n : int, default 50
        Number of lowest-Q_Value significant proteins taken from each
        comparison.

    Returns
    -------
    proteins_to_include : numpy.ndarray
        Unique protein names, i.e. the union of the per-comparison top hits in
        order of first appearance. Empty when there is nothing to select.
    filtered_dfs : dict[str, pandas.DataFrame]
        For each comparison, its significant rows restricted to
        ``proteins_to_include``. A protein that made the top list through
        another comparison is kept here as well, as long as it is significant
        in this one.
    """
    comparisons = list(comparisons)

    # `.eq(True)` (rather than using the column as a mask) also works when the
    # column is not strictly boolean, e.g. object dtype with missing entries.
    significant = df_results[df_results['Significant'].eq(True)]
    significant = significant[significant['Group1_vs_Group2'].isin(comparisons)]

    # Split the significant rows once and reuse the pieces for both outputs
    per_comparison = {
        comp: significant[significant['Group1_vs_Group2'] == comp]
        for comp in comparisons
    }

    top_hits = [
        frame.nsmallest(top_n, 'Q_Value')['Protein']
        for frame in per_comparison.values()
    ]
    if top_hits:
        proteins_to_include = pd.concat(top_hits).unique()
    else:
        # pd.concat raises on an empty list; return an empty selection instead
        proteins_to_include = np.array([], dtype=object)

    filtered_dfs = {
        comp: frame[frame['Protein'].isin(proteins_to_include)]
        for comp, frame in per_comparison.items()
    }

    return proteins_to_include, filtered_dfs

# Clustering
def perform_clustering(protein_data_normalized):
    """Build average-linkage trees for the proteins and for the samples.

    For each axis the Spearman correlation matrix is turned into a
    dissimilarity matrix (1 - rho); ``pdist`` is then applied to that matrix
    and the result is passed to ``scipy.cluster.hierarchy.linkage``.

    NOTE(review): ``pdist`` computes Euclidean distances between the *rows* of
    the dissimilarity matrix; it does not convert the matrix to condensed form
    (that would be ``squareform``). Confirm this is the intended metric.

    Parameters
    ----------
    protein_data_normalized : pandas.DataFrame
        Samples in rows, proteins in columns.

    Returns
    -------
    tuple
        ``(protein_linkage, sample_linkage)``.
    """
    def _average_linkage(frame):
        # Correlation is taken between the columns of `frame`
        dissimilarity = 1 - frame.corr(method='spearman')
        return sch.linkage(pdist(dissimilarity.to_numpy()), method='average')

    protein_linkage = _average_linkage(protein_data_normalized)
    # Transposing puts the samples in the columns
    sample_linkage = _average_linkage(protein_data_normalized.T)

    return protein_linkage, sample_linkage

# Plot Heatmap
def plot_heatmap(protein_data, protein_data_normalized, protein_linkage, sample_linkage, 
                 filtered_dfs, proteins_to_include, comparisons, df_olink, figure_path):
    """Draw and save the clustered heatmap of the selected proteins.

    The heatmap shows ``protein_data_normalized`` transposed (proteins as rows,
    samples as columns) and reuses the precomputed linkages for both
    dendrograms. One colour strip per comparison marks each protein's Q-value
    class, and one colour strip marks each sample's NP_subtype.

    Parameters
    ----------
    protein_data : DataFrame indexed by SampleID; its index and columns define
        the order of the sample and protein colour strips.
    protein_data_normalized : DataFrame plotted in the heatmap.
        # assumes same index/columns as protein_data — TODO confirm
    protein_linkage, sample_linkage : linkage matrices for rows and columns.
    filtered_dfs : dict mapping each comparison to a DataFrame with at least
        'Protein' and 'Q_Value'.
    proteins_to_include : iterable of protein names to colour.
    comparisons : list of comparison labels (keys of ``filtered_dfs``).
    df_olink : DataFrame with 'SampleID' and 'NP_subtype'.
    figure_path : directory where ``heatmap_NP.png`` is written.

    Returns None; the figure is saved at 1200 dpi and then shown.
    """
    # Map colors for proteins
    protein_colors = {comp: {} for comp in comparisons}
    for comp in comparisons:
        for protein in proteins_to_include:
            # Smallest Q-value of this protein in this comparison; NaN when the
            # protein has no row there (min over an empty selection)
            q_value = filtered_dfs[comp].loc[filtered_dfs[comp]['Protein'] == protein, 'Q_Value'].min()

            # Color coding based on significance levels
            # (a NaN q_value fails both comparisons and falls through to 'gainsboro')
            if q_value < 0.01:
                protein_colors[comp][protein] = 'deepskyblue'
            elif q_value < 0.05:
                protein_colors[comp][protein] = 'lightblue'
            else:
                protein_colors[comp][protein] = 'gainsboro'

    # One column of colours per comparison, rows in protein_data column order;
    # 'lightgray' is only used for columns absent from proteins_to_include
    protein_colors_combined = pd.DataFrame({
        comp: [protein_colors[comp].get(protein, 'lightgray') for protein in protein_data.columns]
        for comp in comparisons
    })

    # Define color mapping for NP_subtype (one colour per sample, in protein_data index order)
    group_colors = {'MM(V)1': 'yellow', 'MM(V)1+2C': 'coral', 'VV2': 'purple', 'MV2K': 'green', 'MV2K+2C': 'teal'}
    col_colors = df_olink.set_index('SampleID').loc[protein_data.index, 'NP_subtype'].map(group_colors).values

    # Plot the heatmap
    g = sns.clustermap(
        protein_data_normalized.T,
        row_linkage=protein_linkage,
        col_linkage=sample_linkage,
        cmap='bwr',
        col_colors=col_colors,
        xticklabels=False,
        yticklabels=False,  # To display protein labels, use yticklabels=protein_data.columns
        figsize=(12, 8),  # Increase figure size for better spacing
        vmin=-3,
        vmax=3,
        row_colors=protein_colors_combined.values.T,
        cbar_pos=(1.001, 0.3, 0.02, 0.3),
        dendrogram_ratio=(0.05, 0.1)
    )

    # Remove axis labels
    g.ax_heatmap.set_xlabel('')
    g.ax_heatmap.set_ylabel('')

    # Add legend for NP_subtype colors (handles and labels both follow group_colors order)
    handles_group = [Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10)
                     for color in group_colors.values()]
    labels_group = list(group_colors.keys())

    g.ax_heatmap.legend(handles=handles_group, labels=labels_group, loc='upper left', title='Diagnostic Group',
                        bbox_to_anchor=(1.001, 1.0), ncol=1)

    # Create a second legend for significance levels
    handles_qvalue = [
        Patch(color='deepskyblue', label='q < 0.01'),
        Patch(color='lightblue', label='q < 0.05'),
        Patch(color='gainsboro', label='Non-significant')
    ]

    # The second legend goes on whichever axes pyplot considers current after
    # clustermap. NOTE(review): the anchor values look hand-tuned for that
    # axes — confirm before changing the figure layout.
    ax = plt.gca()
    ax.legend(handles=handles_qvalue, title='Significance of DEA', loc='upper center',
              bbox_to_anchor=(-40.55, -0.85), ncol=3)  # Positioned below the heatmap

    # Function to add labels to comparisons
    # Places rotated text at (index - 50, -vertical_spacing) on the given axes
    def add_comparison_label(ax, index, comparison, vertical_spacing=10.3):
        ax.text(index - 50, -vertical_spacing, comparison, ha='center', va='center', fontsize=10, color='black',
                rotation=90, bbox=dict(facecolor='white', edgecolor='None', boxstyle='round,pad=0.5'))

    # Add labels for each comparison with custom positioning
    # NOTE(review): the per-comparison offsets appear hand-tuned to line up with
    # the row colour strips — confirm when comparisons or figure size change.
    for i, comp in enumerate(comparisons):
        if comp == "VV2 vs MM(V)1":
            add_comparison_label(g.ax_heatmap, i + 45, comp, vertical_spacing=9)
        elif comp == "MV2K vs VV2":
            add_comparison_label(g.ax_heatmap, i + 47, comp, vertical_spacing=9)
        elif comp == "MV2K vs MM(V)1":
            add_comparison_label(g.ax_heatmap, i + 46, comp, vertical_spacing=9)
        else:
            add_comparison_label(g.ax_heatmap, i, comp)

    # Save the figure
    output_file = os.path.join(figure_path, "heatmap_NP.png")
    plt.savefig(output_file, dpi=1200, bbox_inches='tight')  

    plt.show()

# Data norm
def preprocess_and_normalize(protein_data, proteins_to_include):
    """Scale each sample by its row total, then z-score every protein column.

    Parameters
    ----------
    protein_data : pandas.DataFrame
        Samples in rows, proteins in columns.
    proteins_to_include
        Accepted for interface compatibility; not used in the body.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``protein_data``.
    """
    # Step 1: each value as a fraction of its sample's total
    relative = protein_data.div(protein_data.sum(axis=1), axis=0)

    # Step 2: centre and scale every column (pandas default sample std)
    column_means = relative.mean(axis=0)
    column_sds = relative.std(axis=0)
    return relative.sub(column_means, axis='columns').div(column_sds, axis='columns')

def main():
    """Run the hierarchical-clustering pipeline: load, select, normalise, cluster, plot."""
    # Pairwise comparisons whose top proteins drive the heatmap
    comparisons = ['VV2 vs MM(V)1', 'MV2K vs MM(V)1', 'MV2K vs VV2']

    df_results, df_olink = load_and_clean_data()
    proteins_to_include, filtered_dfs = filter_significant_results(df_results, comparisons)

    # Sample-by-protein matrix (before normalisation), limited to the selected proteins
    expression = df_olink.set_index('SampleID').drop(columns=['NP_subtype'])
    protein_data = expression[proteins_to_include]

    protein_data_normalized = preprocess_and_normalize(protein_data, proteins_to_include)
    linkages = perform_clustering(protein_data_normalized)  # (proteins, samples)

    plot_heatmap(
        protein_data,
        protein_data_normalized,
        *linkages,
        filtered_dfs,
        proteins_to_include,
        comparisons,
        df_olink,
        figure_path,
    )


if __name__ == "__main__":
    main()

## Perform Differential Expression Analysis between MM(V)1 and MM(V)1+2C subgroups

In [5]:
# Import Olink Data
# Reload the complete table: earlier cells reassigned `df` to reduced copies.
df = pd.read_excel(data_path + 'curated/olink.xlsx')

In [6]:
# Columns not needed for the differential expression model
columns_to_drop = [
    'Codon 129',
    'onset-LP', 'onset-death', 'LP-death', 'Group', 'SubGroup', 'Strain'
]

# Long format: one row per (sample, assay) with the covariates repeated,
# indexed by SampleID
df_pivoted = (
    df.drop(columns=columns_to_drop)
    .melt(
        id_vars=['SampleID', 'NP_subtype', 'age at LP', 'Sex'],
        var_name='Assay',
        value_name='NPX',
    )
    .set_index('SampleID')
)

# Formula-friendly covariate name (no spaces); rename returns a new frame,
# so df_pivoted keeps the original column name
df = df_pivoted.rename(columns={"age at LP": "age_at_LP"})

In [None]:
# Function for the specific pairwise comparison
def perform_differential_expression(df, results, threshold=0.05):
    """Test every assay for a difference between MM(V)1 and MM(V)1+2C.

    For each assay two OLS models are fitted, one with the covariates only and
    one that also contains the group term; they are compared with ``anova_lm``.
    One dict per assay is appended to ``results``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with the columns 'NP_subtype', 'Assay', 'NPX',
        'age_at_LP' and 'Sex'.
    results : list
        Output list, extended in place.
    threshold : float
        Not used in the body.

    Returns
    -------
    None
    """
    group1, group2 = "MM(V)1", "MM(V)1+2C"

    # Nested models: the second adds the group term to the first
    reduced_formula = "NPX ~ age_at_LP + Sex"
    full_formula = "NPX ~ age_at_LP + Sex + NP_subtype"

    # Restrict to the two groups and code them numerically (group1 -> 1, group2 -> 0)
    in_comparison = df["NP_subtype"].isin([group1, group2])
    subset = df[in_comparison].copy()
    subset["NP_subtype"] = subset["NP_subtype"].map({group1: 1, group2: 0})

    for protein in subset["Assay"].unique():
        protein_subset = subset[subset["Assay"] == protein]

        # Skip if insufficient data
        if len(protein_subset) < 2:
            continue

        reduced_fit = ols(reduced_formula, data=protein_subset).fit()
        full_fit = ols(full_formula, data=protein_subset).fit()

        # Second row of the ANOVA table holds the test of the added term
        p_value = anova_lm(reduced_fit, full_fit)["Pr(>F)"][1]

        # Difference of the group means of NPX, stored as the log2 fold change
        vals1 = protein_subset.loc[protein_subset["NP_subtype"] == 1, "NPX"]
        vals2 = protein_subset.loc[protein_subset["NP_subtype"] == 0, "NPX"]
        if vals1.empty or vals2.empty:
            log2_fold_change = np.nan
        else:
            log2_fold_change = vals1.mean() - vals2.mean()

        # Coefficient of the group term in the full model
        beta_coefficient = full_fit.params.get("NP_subtype", np.nan)

        record = {
            "Protein": protein,
            "Group_Col": "NP_subtype",
            "Group1": group1,
            "Group2": group2,
            "Group1_vs_Group2": f"{group1} vs {group2}",
            "F_P_Value": p_value,
            "Log2_Fold_Change": log2_fold_change,
            "Beta_Coefficient": beta_coefficient,
        }
        results.append(record)

# Run the comparison; the function fills `results` in place
results = []
perform_differential_expression(df, results)

# One row per assay
results_df = pd.DataFrame(results)

# Benjamini-Hochberg FDR correction; the second element returned by
# multipletests holds the corrected p-values
fdr_output = multipletests(results_df["F_P_Value"], method="fdr_bh")
results_df["Q_Value"] = fdr_output[1]
results_df["Significant"] = results_df["Q_Value"] < 0.05

# Write the table next to the other differential-expression results
output_file = os.path.join(results_path, "mixed.xlsx")
results_df.to_excel(output_file, index=False)

In [8]:
# Import dataframe
# Read back mixed.xlsx from results_path (the file written by the analysis cell above).
results_df = pd.read_excel(results_path + '/mixed.xlsx')

In [None]:
# Function for creating the volcano plot based on Beta Coefficient
def create_volcano_plot_beta(results, title, figure_path, threshold=0.05):
    """Save a volcano plot of beta coefficient against -log10(Q-value).

    Parameters
    ----------
    results : pandas.DataFrame
        Needs the columns 'Beta_Coefficient', 'Q_Value' and 'Protein'.
    title : str
        Plot title.
    figure_path : str
        Full path of the image file to write (not a directory). The parent
        directory is created if needed.
    threshold : float, default 0.05
        Q-value cutoff: drawn as a horizontal line and used to choose which
        points are labelled (the 25 lowest Q-values below the cutoff).

    Returns
    -------
    None
    """
    neg_log_q = -np.log10(results['Q_Value'])

    plt.figure(figsize=(6, 8))

    # All proteins
    plt.scatter(results['Beta_Coefficient'],
                neg_log_q,
                alpha=0.5, color='#87cefa', label="Not Significant")  # Light Sky Blue

    # The 25 most significant proteins below the cutoff
    significant_sorted = (
        results[results['Q_Value'] < threshold]
        .sort_values(by='Q_Value')
        .head(25)
    )

    # Drawn again on top of the first layer (same colour, higher alpha)
    plt.scatter(significant_sorted['Beta_Coefficient'],
                -np.log10(significant_sorted['Q_Value']),
                color='#87cefa', alpha=0.7, label="Significant")

    # The y-axis shows Q-values, so the cutoff line is labelled as such
    plt.axhline(-np.log10(threshold), color='darkred', linestyle='--',
                label=f'Q-value threshold={threshold}')

    # Vertical reference line at Beta Coefficient = 0
    plt.axvline(0, color='darkred', linestyle='--', label='Beta Coefficient = 0')

    # Headroom above the highest point. Only finite values are considered:
    # a NaN or zero Q-value would otherwise give a NaN/inf axis limit.
    finite_neg_log_q = neg_log_q[np.isfinite(neg_log_q)]
    if finite_neg_log_q.empty:
        y_top = -np.log10(threshold)
    else:
        y_top = finite_neg_log_q.max()
    plt.ylim(0, y_top + 1)

    # Protein names next to the highlighted points, slightly offset
    texts = []
    for _, row in significant_sorted.iterrows():
        text = plt.text(row['Beta_Coefficient'] + 0.05,
                        -np.log10(row['Q_Value']) + 0.05,
                        row['Protein'],
                        fontsize=8,
                        color='black',
                        ha='left', va='bottom')
        texts.append(text)

    # Adjust text to avoid overlap using adjustText
    adjust_text(texts,
                only_move={'points': 'xy', 'texts': 'xy'},
                expand_text=(1.3, 1.3),  # More space for text expansion
                force_text=0.05,
                lim=200)

    # Axis labels and title
    plt.xlabel('Beta Coefficient')
    plt.ylabel('-Log10 Q-value')
    plt.title(f"{title}")

    plt.tight_layout()

    # Create the output directory when the path has one; os.makedirs('')
    # would raise for a bare file name.
    output_dir = os.path.dirname(figure_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    plt.savefig(figure_path, dpi=1200, bbox_inches='tight')
    plt.close()

# Function to clean up the title for a valid file name
def clean_filename(title):
    """Return ``title`` with every colon and every space replaced by an underscore."""
    replacements = str.maketrans({":": "_", " ": "_"})
    return title.translate(replacements)

# Rows of the MM(V)1 vs MM(V)1+2C comparison
is_target_pair = (results_df['Group1'] == "MM(V)1") & (results_df['Group2'] == "MM(V)1+2C")
subset = results_df[is_target_pair]

if subset.empty:
    print("No data available for MM(V)1 vs MM(V)1+2C comparison.")
else:
    # The title doubles as the base of the output file name
    title = "MM(V)1 vs MM(V)1+2C"
    cleaned_title = clean_filename(title)

    filename = cleaned_title + "2.png"
    plot_path = os.path.join(figure_path, filename)

    # Volcano plot based on the beta coefficient
    create_volcano_plot_beta(subset, title, plot_path)
    print(f"Saved: {plot_path}")
