# Hierarchical clustering
=====================================

This notebook performs hierarchical clustering based on the abundance levels of the top 50 significantly differentially expressed proteins from each comparison in two sets of three pairwise comparisons: 
1. MM(V)1 vs CTRL, VV2 vs CTRL, MV2K vs CTRL
2. MM(V)1 vs VV2, MM(V)1 vs MV2K, VV2 vs MV2K  

Specifically, protein abundances are first normalized for total protein count per sample then scaled. Spearman correlations between both the samples and the proteins are calculated and used as a distance measure for clustering. Results are visualized through heatmaps. 

Input:
------
- olink.xlsx: Protein expression data with columns:
  * SampleID: Unique sample identifier
  * Group: Clinical group classification
  * SubGroup: Clinical subgroup
  * Strain: Sample strain type
  * age at LP: Age at lumbar puncture
  * Sex: Patient sex
  * [Protein Names]: NPX values for each protein
- differential.csv: Differential expression results including:
  * Protein: Protein identifier
  * Group comparisons
  * P-values
  * Q-values (FDR corrected)
  * Log2 fold changes
  * Beta coefficients

Output:
-------
- Heatmaps showing the hierarchical clustering of the samples based on the abundance levels of the top 50  significantly differentially expressed proteins resulting from 3 sets of pairwise comparisons. The samples are represented on the x-axis (MM(V)1 in yellow, VV2 in purple, MV2K in green). Normalized protein abundance levels are represented on the y-axis. Q-values (from the differential expression analysis) for each pairwise comparison are annotated on the left of the figure (Q<0.01 in blue, Q<0.05 in light blue, non-significant in grey).

Analysis Steps:
---------------
1. Data Preprocessing:
   - Imports protein expression data and differential expression analysis results
   - Drops unnecessary columns
   - Selects the top 50 differentially expressed proteins for the 3 sets of pairwise comparisons 
   - Normalizes protein abundance levels

2. Statistical Analysis:
   - Calculates Spearman correlations 
   - Performs hierarchical clustering

3. Visualizations:
   - Plots heatmaps:
     * Maps colors for proteins and sCJD subtypes
     * Plots heatmap
     * Adds legends for sCJD subtype and significance levels
     * Adds tags for pairwise comparisons

In [18]:
# General utilities 
import pandas as pd
import os
import scipy.cluster.hierarchy as sch
import numpy as np
from scipy.spatial.distance import pdist

# Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

In [19]:
# Build the data, figure and results locations from the parent directory
# of the current working directory.
data_path, figure_path, results_path = (
    os.path.dirname(os.getcwd()) + suffix
    for suffix in ('/data/', '/figures/hierarchical_clustering', '/data/results/')
)

### Plot heatmap for the inter-subtype pairwise comparisons (MM(V)1 vs VV2, VV2 vs MV2K, MM(V)1 vs MV2K)

In [None]:
def load_and_clean_data():
    """Read the differential-expression results and the Olink table.

    The Olink table is stripped of its clinical/metadata columns and
    restricted to rows whose 'SubGroup' is 'MM(V)1', 'VV2' or 'MV2K'.

    Returns
    -------
    tuple
        ``(df_results, df_olink)``: the differential results as read from
        disk and the cleaned Olink table.
    """
    differential = pd.read_csv(results_path + 'differential/differential.csv')
    olink = pd.read_excel(data_path + 'curated/olink.xlsx')

    # Columns that are not needed for clustering
    metadata_columns = [
        'Group', 'Strain', 'age at LP', 'Sex', 'Codon 129', 'onset-LP',
        'onset-death', 'LP-death', '14-3-3 ELISA', 't-tau', 'NfL', 'NP_subtype',
    ]
    subtypes = ['MM(V)1', 'VV2', 'MV2K']

    olink = olink.drop(columns=metadata_columns)
    in_subtypes = olink['SubGroup'].isin(subtypes)

    return differential, olink[in_subtypes]

def filter_significant_results(df_results, comparisons, top_n=50):
    """Select the most significant proteins for a set of pairwise comparisons.

    Parameters
    ----------
    df_results : pandas.DataFrame
        Differential expression results. The columns 'Significant',
        'Group1_vs_Group2', 'Protein' and 'Q_Value' are read.
    comparisons : iterable of str
        Values of 'Group1_vs_Group2' to consider.
    top_n : int, default 50
        Number of significant proteins with the smallest 'Q_Value' kept
        per comparison.

    Returns
    -------
    proteins_to_include : numpy.ndarray
        Unique proteins selected across all comparisons, in order of first
        appearance (comparison order, then ascending Q-value).
    filtered_dfs : dict
        Maps each comparison to the significant rows of ``df_results`` for
        that comparison, restricted to ``proteins_to_include``.
    """
    comparisons = list(comparisons)

    # Compute the significant subset once and reuse it for every comparison.
    significant = df_results[df_results['Significant'].eq(True)]

    top_hits = [
        significant[significant['Group1_vs_Group2'] == comp]
        .nsmallest(top_n, 'Q_Value')['Protein']
        for comp in comparisons
    ]

    if top_hits:
        proteins_to_include = pd.concat(top_hits).unique()
    else:
        # pd.concat raises on an empty list; return an empty selection instead.
        proteins_to_include = significant['Protein'].iloc[:0].unique()

    # Significant rows per comparison, limited to the selected proteins
    selected = significant['Protein'].isin(proteins_to_include)
    filtered_dfs = {
        comp: significant[(significant['Group1_vs_Group2'] == comp) & selected]
        for comp in comparisons
    }

    return proteins_to_include, filtered_dfs

def _condensed_spearman_distance(frame):
    """Return the condensed ``1 - Spearman rho`` distances between the columns of *frame*."""
    # Local import: squareform is not among the file-level imports.
    from scipy.spatial.distance import squareform

    distance = 1.0 - frame.corr(method='spearman').to_numpy()

    # Remove floating-point asymmetry and force an exactly zero diagonal so
    # the matrix is a well-formed distance matrix before condensing it.
    distance = (distance + distance.T) / 2.0
    np.fill_diagonal(distance, 0.0)

    return squareform(distance, checks=False)


def perform_clustering(protein_data_normalized):
    """Cluster proteins and samples using ``1 - Spearman rho`` as the distance.

    The square ``1 - rho`` matrix is converted to condensed form and passed
    directly to average-linkage clustering. It must not be passed through
    ``pdist``, which would treat each row of the distance matrix as an
    observation vector and cluster on Euclidean distances between those rows.

    Parameters
    ----------
    protein_data_normalized : pandas.DataFrame
        Normalized abundances with samples as rows and proteins as columns.

    Returns
    -------
    tuple
        ``(protein_linkage, sample_linkage)``: SciPy linkage matrices for
        the columns (proteins) and the rows (samples) respectively.
    """
    protein_condensed = _condensed_spearman_distance(protein_data_normalized)
    sample_condensed = _condensed_spearman_distance(protein_data_normalized.T)

    protein_linkage = sch.linkage(protein_condensed, method='average')
    sample_linkage = sch.linkage(sample_condensed, method='average')

    return protein_linkage, sample_linkage

def _q_value_color(q_value):
    """Map a Q-value to the row-annotation colour used in the heatmap."""
    if q_value < 0.01:
        return 'deepskyblue'
    if q_value < 0.05:
        return 'lightblue'
    # Also reached when q_value is NaN (no row for the protein in that
    # comparison), because every comparison against NaN is False.
    return 'gainsboro'


def plot_heatmap(protein_data, protein_data_normalized, protein_linkage, sample_linkage,
                 filtered_dfs, proteins_to_include, comparisons, df_olink, figure_path,
                 output_name="heatmap1.png"):
    """Draw the clustered heatmap, annotate it and save it to disk.

    Parameters
    ----------
    protein_data : pandas.DataFrame
        Un-normalized abundances; its index (sample IDs) and columns
        (proteins) define the annotation order.
    protein_data_normalized : pandas.DataFrame
        Normalized abundances; plotted transposed (proteins as rows).
    protein_linkage, sample_linkage : array-like
        Precomputed linkages for the heatmap rows and columns.
    filtered_dfs : dict
        Maps each comparison to a table with 'Protein' and 'Q_Value' columns.
    proteins_to_include : iterable
        Proteins for which a significance colour is computed.
    comparisons : list of str
        Comparisons shown as row-colour annotation columns.
    df_olink : pandas.DataFrame
        Table with 'SampleID' and 'SubGroup' columns, used for sample colours.
    figure_path : str
        Output directory; created if it does not exist.
    output_name : str, default "heatmap1.png"
        File name of the saved figure inside ``figure_path``.
    """
    # Map colors for proteins: one {protein: colour} dict per comparison,
    # based on the smallest Q-value of the protein in that comparison.
    protein_colors = {comp: {} for comp in comparisons}
    for comp in comparisons:
        comp_results = filtered_dfs[comp]
        for protein in proteins_to_include:
            q_value = comp_results.loc[comp_results['Protein'] == protein, 'Q_Value'].min()
            protein_colors[comp][protein] = _q_value_color(q_value)

    protein_colors_combined = pd.DataFrame({
        comp: [protein_colors[comp].get(protein, 'lightgray') for protein in protein_data.columns]
        for comp in comparisons
    })

    # Define color mapping for SubGroup
    group_colors = {'MM(V)1': 'yellow', 'VV2': 'purple', 'MV2K': 'green'}
    col_colors = df_olink.set_index('SampleID').loc[protein_data.index, 'SubGroup'].map(group_colors).values

    # Plot the heatmap
    g = sns.clustermap(
        protein_data_normalized.T,
        row_linkage=protein_linkage,
        col_linkage=sample_linkage,
        cmap='bwr',
        col_colors=col_colors,
        xticklabels=False,
        yticklabels=False,  # To display protein labels, use yticklabels=protein_data.columns
        figsize=(12, 8),  # Increase figure size for better spacing
        vmin=-3,
        vmax=3,
        row_colors=protein_colors_combined.values.T,
        cbar_pos=(1.001, 0.3, 0.02, 0.3),
        dendrogram_ratio=(0.05, 0.1)
    )

    # Remove axis labels
    g.ax_heatmap.set_xlabel('')
    g.ax_heatmap.set_ylabel('')

    # Add legend for SubGroup colors
    handles_group = [Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10)
                     for color in group_colors.values()]
    labels_group = list(group_colors.keys())

    g.ax_heatmap.legend(handles=handles_group, labels=labels_group, loc='upper left', title='Diagnostic Group',
                        bbox_to_anchor=(1.001, 1.0), ncol=1)

    # Create a second legend for significance levels
    handles_qvalue = [
        Patch(color='deepskyblue', label='q < 0.01'),
        Patch(color='lightblue', label='q < 0.05'),
        Patch(color='gainsboro', label='Non-significant')
    ]

    # NOTE(review): the anchor below is expressed in the coordinates of the
    # current axes, presumably the narrow colourbar axes - confirm before
    # changing either the axes or the offsets.
    ax = plt.gca()
    ax.legend(handles=handles_qvalue, title='Significance of DEA', loc='upper center',
              bbox_to_anchor=(-40.55, -0.85), ncol=3)  # Positioned below the heatmap

    # Function to add labels to comparisons
    def add_comparison_label(ax, index, comparison, vertical_spacing=10.3):
        ax.text(index - 50, -vertical_spacing, comparison, ha='center', va='center', fontsize=10, color='black',
                rotation=90, bbox=dict(facecolor='white', edgecolor='None', boxstyle='round,pad=0.5'))

    # Hand-tuned horizontal offsets for the known comparisons; any other
    # comparison falls back to the default placement.
    label_offsets = {
        "VV2 vs MM(V)1": 39,
        "MV2K vs VV2": 46,
        "MV2K vs MM(V)1": 42.5,
    }
    for i, comp in enumerate(comparisons):
        if comp in label_offsets:
            add_comparison_label(g.ax_heatmap, i + label_offsets[comp], comp, vertical_spacing=9)
        else:
            add_comparison_label(g.ax_heatmap, i, comp)

    # Save the figure; make sure the target directory exists first so that
    # savefig does not fail on a fresh checkout.
    os.makedirs(figure_path, exist_ok=True)
    output_file = os.path.join(figure_path, output_name)
    plt.savefig(output_file, dpi=1200, bbox_inches='tight')

    plt.show()

def preprocess_and_normalize(protein_data, proteins_to_include):
    """Scale each sample by its total abundance, then z-score each protein.

    No filtering is done here: ``proteins_to_include`` is accepted but not
    used, so ``protein_data`` must already hold only the wanted columns.

    Parameters
    ----------
    protein_data : pandas.DataFrame
        Abundances with samples as rows and proteins as columns.
    proteins_to_include : iterable
        Unused.

    Returns
    -------
    pandas.DataFrame
        Same shape as ``protein_data``; each column has its mean subtracted
        and is divided by its standard deviation after the per-row scaling.
    """
    # Step 1: divide every row (sample) by its row sum
    row_totals = protein_data.sum(axis=1)
    scaled = protein_data.div(row_totals, axis=0)

    # Step 2: column-wise (per-protein) z-score
    column_means = scaled.mean(axis=0)
    column_stds = scaled.std(axis=0)
    centered = scaled - column_means

    return centered / column_stds

def main():
    """Run the pipeline for the inter-subtype comparisons: load, select, normalize, cluster, plot."""
    # Comparisons shown in the heatmap (also used to pick the proteins)
    comparisons = ['VV2 vs MM(V)1', 'MV2K vs MM(V)1', 'MV2K vs VV2']

    df_results, df_olink = load_and_clean_data()
    proteins_to_include, filtered_dfs = filter_significant_results(df_results, comparisons)

    # Sample x protein matrix before normalization, limited to the selected proteins
    abundance = df_olink.set_index('SampleID').drop(columns=['SubGroup'])
    protein_data = abundance[proteins_to_include]

    protein_data_normalized = preprocess_and_normalize(protein_data, proteins_to_include)
    protein_linkage, sample_linkage = perform_clustering(protein_data_normalized)

    plot_heatmap(
        protein_data,
        protein_data_normalized,
        protein_linkage,
        sample_linkage,
        filtered_dfs,
        proteins_to_include,
        comparisons,
        df_olink,
        figure_path,
    )

# Run the script
if __name__ == "__main__":
    main()

### Plot heatmap for the subtype vs controls pairwise comparisons (MM(V)1 vs CTRL, VV2 vs CTRL, MV2K vs CTRL)

In [None]:
def load_and_clean_data():
    """Read the differential-expression results and the Olink table.

    The Olink table is stripped of the listed clinical/metadata columns and
    restricted to rows whose 'SubGroup' is 'MM(V)1', 'VV2' or 'MV2K'.

    Returns
    -------
    tuple
        ``(df_results, df_olink)``: the differential results as read from
        disk and the cleaned Olink table.
    """
    differential = pd.read_csv(results_path + 'differential/differential.csv')
    olink = pd.read_excel(data_path + 'curated/olink.xlsx')

    # Columns that are not needed for clustering
    metadata_columns = [
        'Group', 'Strain', 'age at LP', 'Sex', 'Codon 129', 'onset-LP',
        'onset-death', 'LP-death', '14-3-3 ELISA', 't-tau', 'NfL',
    ]
    subtypes = ['MM(V)1', 'VV2', 'MV2K']

    olink = olink.drop(columns=metadata_columns)
    in_subtypes = olink['SubGroup'].isin(subtypes)

    return differential, olink[in_subtypes]

def filter_significant_results(df_results, comparisons, top_n=50):
    """Select the most significant proteins for a set of pairwise comparisons.

    Parameters
    ----------
    df_results : pandas.DataFrame
        Differential expression results. The columns 'Significant',
        'Group1_vs_Group2', 'Protein' and 'Q_Value' are read.
    comparisons : iterable of str
        Values of 'Group1_vs_Group2' to consider.
    top_n : int, default 50
        Number of significant proteins with the smallest 'Q_Value' kept
        per comparison.

    Returns
    -------
    proteins_to_include : numpy.ndarray
        Unique proteins selected across all comparisons, in order of first
        appearance (comparison order, then ascending Q-value).
    filtered_dfs : dict
        Maps each comparison to the significant rows of ``df_results`` for
        that comparison, restricted to ``proteins_to_include``.
    """
    comparisons = list(comparisons)

    # Compute the significant subset once and reuse it for every comparison.
    significant = df_results[df_results['Significant'].eq(True)]

    top_hits = [
        significant[significant['Group1_vs_Group2'] == comp]
        .nsmallest(top_n, 'Q_Value')['Protein']
        for comp in comparisons
    ]

    if top_hits:
        proteins_to_include = pd.concat(top_hits).unique()
    else:
        # pd.concat raises on an empty list; return an empty selection instead.
        proteins_to_include = significant['Protein'].iloc[:0].unique()

    # Significant rows per comparison, limited to the selected proteins
    selected = significant['Protein'].isin(proteins_to_include)
    filtered_dfs = {
        comp: significant[(significant['Group1_vs_Group2'] == comp) & selected]
        for comp in comparisons
    }

    return proteins_to_include, filtered_dfs

def _condensed_spearman_distance(frame):
    """Return the condensed ``1 - Spearman rho`` distances between the columns of *frame*."""
    # Local import: squareform is not among the file-level imports.
    from scipy.spatial.distance import squareform

    distance = 1.0 - frame.corr(method='spearman').to_numpy()

    # Remove floating-point asymmetry and force an exactly zero diagonal so
    # the matrix is a well-formed distance matrix before condensing it.
    distance = (distance + distance.T) / 2.0
    np.fill_diagonal(distance, 0.0)

    return squareform(distance, checks=False)


def perform_clustering(protein_data_normalized):
    """Cluster proteins and samples using ``1 - Spearman rho`` as the distance.

    The square ``1 - rho`` matrix is converted to condensed form and passed
    directly to average-linkage clustering. It must not be passed through
    ``pdist``, which would treat each row of the distance matrix as an
    observation vector and cluster on Euclidean distances between those rows.

    Parameters
    ----------
    protein_data_normalized : pandas.DataFrame
        Normalized abundances with samples as rows and proteins as columns.

    Returns
    -------
    tuple
        ``(protein_linkage, sample_linkage)``: SciPy linkage matrices for
        the columns (proteins) and the rows (samples) respectively.
    """
    protein_condensed = _condensed_spearman_distance(protein_data_normalized)
    sample_condensed = _condensed_spearman_distance(protein_data_normalized.T)

    protein_linkage = sch.linkage(protein_condensed, method='average')
    sample_linkage = sch.linkage(sample_condensed, method='average')

    return protein_linkage, sample_linkage


def _q_value_color(q_value):
    """Map a Q-value to the row-annotation colour used in the heatmap."""
    if q_value < 0.01:
        return 'deepskyblue'
    if q_value < 0.05:
        return 'lightblue'
    # Also reached when q_value is NaN (no row for the protein in that
    # comparison), because every comparison against NaN is False.
    return 'gainsboro'


def plot_heatmap(protein_data, protein_data_normalized, protein_linkage, sample_linkage,
                 filtered_dfs, proteins_to_include, comparisons, df_olink, figure_path,
                 output_name="heatmap2.png"):
    """Draw the clustered heatmap, annotate it and save it to disk.

    Parameters
    ----------
    protein_data : pandas.DataFrame
        Un-normalized abundances; its index (sample IDs) and columns
        (proteins) define the annotation order.
    protein_data_normalized : pandas.DataFrame
        Normalized abundances; plotted transposed (proteins as rows).
    protein_linkage, sample_linkage : array-like
        Precomputed linkages for the heatmap rows and columns.
    filtered_dfs : dict
        Maps each comparison to a table with 'Protein' and 'Q_Value' columns.
    proteins_to_include : iterable
        Proteins for which a significance colour is computed.
    comparisons : list of str
        Comparisons shown as row-colour annotation columns.
    df_olink : pandas.DataFrame
        Table with 'SampleID' and 'SubGroup' columns, used for sample colours.
    figure_path : str
        Output directory; created if it does not exist.
    output_name : str, default "heatmap2.png"
        File name of the saved figure inside ``figure_path``.
    """
    # Map colors for proteins: one {protein: colour} dict per comparison,
    # based on the smallest Q-value of the protein in that comparison.
    protein_colors = {comp: {} for comp in comparisons}
    for comp in comparisons:
        comp_results = filtered_dfs[comp]
        for protein in proteins_to_include:
            q_value = comp_results.loc[comp_results['Protein'] == protein, 'Q_Value'].min()
            protein_colors[comp][protein] = _q_value_color(q_value)

    protein_colors_combined = pd.DataFrame({
        comp: [protein_colors[comp].get(protein, 'lightgray') for protein in protein_data.columns]
        for comp in comparisons
    })

    # Define color mapping for SubGroup
    group_colors = {'MM(V)1': 'yellow', 'VV2': 'purple', 'MV2K': 'green'}
    col_colors = df_olink.set_index('SampleID').loc[protein_data.index, 'SubGroup'].map(group_colors).values

    # Plot the heatmap
    g = sns.clustermap(
        protein_data_normalized.T,
        row_linkage=protein_linkage,
        col_linkage=sample_linkage,
        cmap='bwr',
        col_colors=col_colors,
        xticklabels=False,
        yticklabels=False,  # To display protein labels, use yticklabels=protein_data.columns
        figsize=(12, 8),  # Increase figure size for better spacing
        vmin=-3,
        vmax=3,
        row_colors=protein_colors_combined.values.T,
        cbar_pos=(1.001, 0.3, 0.02, 0.3),
        dendrogram_ratio=(0.05, 0.1)
    )

    # Remove axis labels
    g.ax_heatmap.set_xlabel('')
    g.ax_heatmap.set_ylabel('')

    # Add legend for SubGroup colors
    handles_group = [Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10)
                     for color in group_colors.values()]
    labels_group = list(group_colors.keys())

    g.ax_heatmap.legend(handles=handles_group, labels=labels_group, loc='upper left', title='Diagnostic Group',
                        bbox_to_anchor=(1.001, 1.0), ncol=1)

    # Create a second legend for significance levels
    handles_qvalue = [
        Patch(color='deepskyblue', label='q < 0.01'),
        Patch(color='lightblue', label='q < 0.05'),
        Patch(color='gainsboro', label='Non-significant')
    ]

    # NOTE(review): the anchor below is expressed in the coordinates of the
    # current axes, presumably the narrow colourbar axes - confirm before
    # changing either the axes or the offsets.
    ax = plt.gca()
    ax.legend(handles=handles_qvalue, title='Significance of DEA', loc='upper center',
              bbox_to_anchor=(-40.55, -0.85), ncol=3)  # Positioned below the heatmap

    # Function to add labels to comparisons
    def add_comparison_label(ax, index, comparison, vertical_spacing=10.3):
        ax.text(index - 50, -vertical_spacing, comparison, ha='center', va='center', fontsize=10, color='black',
                rotation=90, bbox=dict(facecolor='white', edgecolor='None', boxstyle='round,pad=0.5'))

    # Hand-tuned horizontal offsets for the known comparisons; any other
    # comparison falls back to the default placement.
    label_offsets = {
        "MM(V)1 vs CTRL": 39,
        "MV2K vs CTRL": 46,
        "VV2 vs CTRL": 42.5,
    }
    for i, comp in enumerate(comparisons):
        if comp in label_offsets:
            add_comparison_label(g.ax_heatmap, i + label_offsets[comp], comp, vertical_spacing=7.5)
        else:
            add_comparison_label(g.ax_heatmap, i, comp)

    # Save the figure; make sure the target directory exists first so that
    # savefig does not fail on a fresh checkout.
    os.makedirs(figure_path, exist_ok=True)
    output_file = os.path.join(figure_path, output_name)
    plt.savefig(output_file, dpi=1200, bbox_inches='tight')

    plt.show()

def preprocess_and_normalize(protein_data, proteins_to_include):
    """Scale each sample by its total abundance, then z-score each protein.

    No filtering is done here: ``proteins_to_include`` is accepted but not
    used, so ``protein_data`` must already hold only the wanted columns.

    Parameters
    ----------
    protein_data : pandas.DataFrame
        Abundances with samples as rows and proteins as columns.
    proteins_to_include : iterable
        Unused.

    Returns
    -------
    pandas.DataFrame
        Same shape as ``protein_data``; each column has its mean subtracted
        and is divided by its standard deviation after the per-row scaling.
    """
    # Step 1: divide every row (sample) by its row sum
    row_totals = protein_data.sum(axis=1)
    scaled = protein_data.div(row_totals, axis=0)

    # Step 2: column-wise (per-protein) z-score
    column_means = scaled.mean(axis=0)
    column_stds = scaled.std(axis=0)
    centered = scaled - column_means

    return centered / column_stds

def main():
    """Run the pipeline for the subtype-vs-control comparisons: load, select, normalize, cluster, plot."""
    # Comparisons shown in the heatmap (also used to pick the proteins)
    comparisons = ['MM(V)1 vs CTRL', 'VV2 vs CTRL', 'MV2K vs CTRL']

    df_results, df_olink = load_and_clean_data()
    proteins_to_include, filtered_dfs = filter_significant_results(df_results, comparisons)

    # Sample x protein matrix before normalization, limited to the selected proteins
    abundance = df_olink.set_index('SampleID').drop(columns=['SubGroup'])
    protein_data = abundance[proteins_to_include]

    protein_data_normalized = preprocess_and_normalize(protein_data, proteins_to_include)
    protein_linkage, sample_linkage = perform_clustering(protein_data_normalized)

    plot_heatmap(
        protein_data,
        protein_data_normalized,
        protein_linkage,
        sample_linkage,
        filtered_dfs,
        proteins_to_include,
        comparisons,
        df_olink,
        figure_path,
    )

if __name__ == "__main__":
    main()