# Missing proteomic data imputation and UMAP analyses
==================================================================

This notebook imputes missing proteomic data and performs dimensionality reduction analyses (UMAP).

Input:
------------------
- olink_raw.xlsx: Protein expression data with columns:
  * SampleID: Unique sample identifier
  * Group: Clinical group classification
  * SubGroup: Clinical subgroup
  * Strain: Sample strain type
  * age at LP: Age at lumbar puncture
  * Sex: Patient sex
  * [Protein Names]: NPX values for each protein
- feature_importance_rankings.csv: List of the top 20 proteins useful for sCJD subtypes classification

Output:
-------
- olink.xlsx: Protein expression data (with imputed proteomic data) with columns:
  * SampleID: Unique sample identifier
  * Group: Clinical group classification
  * SubGroup: Clinical subgroup
  * Strain: Sample strain type
  * age at LP: Age at lumbar puncture
  * Sex: Patient sex
  * [Protein Names]: NPX values for each protein
- Three UMAP plots of the Olink measurements:
  * UMAP plot of all proteomic data
  * UMAP plot of proteomic data most useful for sCJD subtypes classification
  * UMAP plot of proteomic data most useful for sCJD subtypes prognostication

Analysis Components:
------------------
- KNN imputation
- UMAP visualisation of Olink PEA measurements for each subtype and survival range

In [1]:
# Standard imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Scikit-learn imports for preprocessing and imputation
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

# UMAP
from umap import UMAP

In [2]:
# Define paths relative to the parent of the current working directory.
# os.path.join keeps the separators consistent on every OS and avoids a
# doubled slash when the parent directory is the filesystem root.
project_root = os.path.dirname(os.getcwd())
data_path = os.path.join(project_root, 'data')
figure_path = os.path.join(project_root, 'figures', 'UMAP')

# Perform KNN imputation on missing proteomic data

In [3]:
# Load the raw Olink table
df = pd.read_excel(data_path + '/raw/olink_raw.xlsx')

# Identifier / clinical columns that are kept out of the KNN imputation
excluded_columns_general = [
    'SampleID',
    'Group',
    'SubGroup',
    'Strain',
    'age at LP',
    'Sex',
    'Codon 129',
    'onset-LP',
    'onset-death',
    'LP-death',
    'NP_subtype',
]

def perform_imputation(df, excluded_columns_general, n_neighbors=3):
    """Fill missing values with KNN imputation, leaving excluded columns untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; a copy is returned.
    excluded_columns_general : collection of str
        Column names that are not passed to the imputer and are returned as-is.
    n_neighbors : int, default 3
        Number of neighbours handed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every non-excluded column is replaced by the
        imputer output.

    Notes
    -----
    Every non-excluded column is passed to the imputer, so those columns are
    presumably all numeric -- verify against the input file.
    """
    # Everything that is not explicitly excluded goes through the imputer
    columns_to_impute = [col for col in df.columns if col not in excluded_columns_general]

    # Work on a copy so the caller's frame stays untouched
    df_imputed = df.copy()

    # Nothing to impute (all columns excluded, or an empty frame): return the
    # copy instead of fitting the imputer on an empty matrix, which raises.
    if not columns_to_impute or df.empty:
        return df_imputed

    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_imputed[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])

    return df_imputed

# Define the output file path
output_path_excel = os.path.join(data_path, 'curated', 'olink.xlsx')

# Make sure the destination folder exists; writing the Excel file does not create it
os.makedirs(os.path.dirname(output_path_excel), exist_ok=True)

# Call the imputation function
df_imputed = perform_imputation(df, excluded_columns_general)

# Save the imputed DataFrame as an Excel file
df_imputed.to_excel(output_path_excel, index=False)

print(f"Imputed data saved as Excel file: {output_path_excel}")

Imputed data saved as Excel file: c:\Users\gbent\Documents\GitHub\sCJD-subtypes/data\curated\olink.xlsx


# UMAP analyses

### UMAP plot of all proteomic data

In [None]:
# Reload the imputed dataset from the curated folder
df = pd.read_excel(data_path + '/curated/olink.xlsx')

# Clinical / metadata columns removed before the UMAP;
# SampleID, SubGroup and the remaining columns are kept
columns_to_drop = [
    'age at LP',
    'Sex',
    'Codon 129',
    'onset-LP',
    'onset-death',
    'LP-death',
    'Group',
    'Strain',
    'NP_subtype',
]

df = df.drop(columns_to_drop, axis=1)

In [None]:
# Boolean mask of missing cells, shared by every check below
nan_mask = df.isna()

# Missing values per column, and their grand total
nan_per_column = nan_mask.sum()
total_nan = nan_per_column.sum()
print(f"Total number of NaN values in the dataset: {total_nan}")

# Only the columns that actually contain missing values
columns_with_nan = nan_per_column[nan_per_column > 0]

if len(columns_with_nan) == 0:
    print("\nNo columns contain NaN values")
else:
    print("\nColumns with NaN values:")
    print(columns_with_nan)

    # Share of missing values in each affected column, in percent
    nan_percentage = (columns_with_nan / len(df)) * 100
    print("\nPercentage of NaN values per column:")
    print(nan_percentage)

# Rows in which every value is missing
rows_all_nan = df[nan_mask.all(axis=1)]
print(f"\nNumber of rows with all NaN values: {len(rows_all_nan)}")

# Rows in which at least one value is missing
rows_with_nan = df[nan_mask.any(axis=1)]
print(f"Number of rows containing at least one NaN value: {len(rows_with_nan)}")

if len(rows_with_nan):
    print("\nSample IDs of rows with NaN values:")
    print(rows_with_nan['SampleID'].tolist())

In [None]:
def plot_umap(df, figure_path):
    """Embed the measurements with UMAP and plot them coloured by sCJD subtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'SampleID' and 'SubGroup'. Every other column is
        standardised and passed to UMAP (assumed numeric and free of NaN --
        confirm upstream).
    figure_path : str
        Directory in which 'umap_cjd_subtype.png' is written. It is created
        if it does not exist.

    Returns
    -------
    matplotlib.figure.Figure
        The figure. It is closed in pyplot before being returned, so the
        caller displays it by evaluating the returned object.
    """
    # Ensure the output directory exists before saving
    os.makedirs(figure_path, exist_ok=True)

    fig, ax = plt.subplots(figsize=(8, 6))

    # Define matching colors
    colors = {
        'MV2K': '#2ecc71',    # green
        'VV2': '#e74c3c',     # red
        'MM(V)1': '#3498db',  # blue
        'CTRL': '#333c42',    # grey
    }

    # Prepare data: everything except the identifier and the label is a feature
    X = df.drop(columns=['SampleID', 'SubGroup'])

    # Scale the data
    X_scaled = StandardScaler().fit_transform(X)

    # Perform UMAP (fixed seed for a reproducible layout)
    umap_embedding = UMAP(
        n_neighbors=15,
        min_dist=0.1,
        n_components=2,
        random_state=42
    ).fit_transform(X_scaled)

    # Plot UMAP results for each subtype
    for subtype in df['SubGroup'].unique():
        # Positional boolean mask, so a non-default DataFrame index cannot
        # interfere with indexing into the NumPy embedding
        mask = (df['SubGroup'] == subtype).to_numpy()
        ax.scatter(
            umap_embedding[mask, 0],
            umap_embedding[mask, 1],
            c=colors.get(subtype, '#000000'),  # default to black for unknown subtypes
            label=f"{subtype} (n={mask.sum()})",
            alpha=0.7
        )

    # Customize plot
    ax.set_title('UMAP Analysis by sCJD Subtype', pad=20, fontsize=14)
    ax.set_xlabel('UMAP1', fontsize=12)
    ax.set_ylabel('UMAP2', fontsize=12)
    ax.grid(True, alpha=0.3)

    # Move legend
    ax.legend(bbox_to_anchor=(0.05, 0.3), loc='upper left', frameon=True, framealpha=0.8)

    # Adjust layout
    fig.subplots_adjust(right=0.85)

    # Save, then close this specific figure so it is not left open in pyplot
    fig.savefig(os.path.join(figure_path, 'umap_cjd_subtype.png'), dpi=300, bbox_inches='tight')
    plt.close(fig)

    return fig

# Generate plot; the bare `fig` on the last line makes the notebook render
# the returned figure as the cell output
fig = plot_umap(df, figure_path)
fig

### UMAP plot of proteomic data most useful for sCJD subtypes classification

In [79]:
# Reload the imputed data and read the feature ranking from the results folder
df = pd.read_excel(data_path + '/curated/olink.xlsx')
feature_importance_rankings = pd.read_csv(data_path + '/results/feature_importance_rankings.csv')

# First 20 entries of the 'Feature' column
top_biomarkers = list(feature_importance_rankings['Feature'].iloc[:20])

# Keep those features plus the identifier, the subtype label and the LP-death value
columns_to_select = [*top_biomarkers, 'SampleID', 'SubGroup', 'LP-death']
df = df[columns_to_select]

In [80]:
def categorize_survival(value):
    """Map an 'LP-death' value onto a survival bracket label.

    Missing values give 'CTRL or Unknown'; otherwise the label is
    '> 8 months' (value > 8), '2-8 months' (value > 2) or '≤ 2 months'.
    """
    if pd.isna(value):
        return 'CTRL or Unknown'
    if value > 8:
        return '> 8 months'
    if value > 2:
        return '2-8 months'
    return '≤ 2 months'

# Add the new column: one survival bracket label per row, derived from 'LP-death'
# NOTE(review): `df` is re-read from disk at the start of the next section and no
# plotting call sits in between, so this frame is never plotted when the cells
# are run top to bottom -- confirm whether a plot call is missing here.
df['Survival'] = df['LP-death'].apply(categorize_survival)

### UMAP plot of proteomic data most useful for sCJD subtypes prognostication

In [76]:
# Reload the imputed Olink data (no feature-importance file is read in this cell)
df = pd.read_excel(data_path + '/curated/olink.xlsx')
# Disabled filter on 'LP-death', kept for reference:
#df = df[df['LP-death'] < 20]
def categorize_survival(value):
    """Return the survival bracket label for an 'LP-death' value.

    Missing values are labelled 'CTRL or Unknown'. Otherwise the first
    threshold that the value strictly exceeds decides the label, and anything
    at or below the lowest threshold is '≤ 2 months'.
    """
    if pd.isna(value):
        return 'CTRL or Unknown'

    # Thresholds are checked from the largest to the smallest
    for lower_bound, label in ((8, '> 8 months'), (2, '2-8 months')):
        if value > lower_bound:
            return label

    return '≤ 2 months'

# Derive the survival bracket of every sample from its 'LP-death' value
df['Survival'] = df['LP-death'].apply(categorize_survival)

# Hard-coded panel of ten proteins used for this UMAP
top_biomarkers = [
    'METAP1D', 'IL8', 'WASF1', 'SRPK2', 'EIF4G1',
    'RBKS', 'MAPT', 'PAG1', 'PRDX3', 'CAMKK1',
]

# Keep the panel plus the identifier and the two grouping columns
columns_to_select = [*top_biomarkers, 'SampleID', 'SubGroup', 'Survival']
df = df[columns_to_select]

In [None]:


def _scatter_by_category(ax, embedding, labels, palette):
    """Draw one scatter series per distinct value of ``labels`` on ``ax``."""
    for category in labels.unique():
        # Positional boolean mask into the NumPy embedding
        mask = (labels == category).to_numpy()
        ax.scatter(
            embedding[mask, 0],
            embedding[mask, 1],
            c=palette.get(category, '#000000'),  # default to black if the category is missing
            label=f"{category} (n={mask.sum()})",
            alpha=0.7,
            s=100
        )


def plot_umap(df, figure_path, filename='umap_cjd_subtype_20_biomarkers.png'):
    """Plot a UMAP embedding twice: coloured by sCJD subtype and by survival group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'SubGroup' and 'Survival' (used for colouring).
        'SampleID', 'SubGroup', 'LP-death' and 'Survival' are removed before
        the embedding if present; every remaining column is standardised and
        passed to UMAP (assumed numeric and free of NaN -- confirm upstream).
    figure_path : str
        Output directory; created if it does not exist.
    filename : str, default 'umap_cjd_subtype_20_biomarkers.png'
        Name of the PNG written inside ``figure_path``.

    Returns
    -------
    matplotlib.figure.Figure
        The two-panel figure (left: subtype, right: survival group).
    """
    # Ensure the output directory exists
    os.makedirs(figure_path, exist_ok=True)

    # Colours for the sCJD subtypes
    subtype_colors = {
        'MV2K': '#2ecc71',    # green
        'VV2': '#e74c3c',     # red
        'MM(V)1': '#3498db',  # blue
        'CTRL': '#333c42',    # grey
    }

    # Colours for the survival groups
    survival_colors = {
        '≤ 2 months': '#cb181d',       # darkest red (shortest survival)
        '2-8 months': '#fb6a4a',       # medium red
        '> 8 months': '#fcae91',       # light red (longest survival)
        'CTRL or Unknown': '#999999',  # grey
    }

    # Prepare data (drop identifier / label columns). errors='ignore' is needed
    # because callers may pass a frame without 'LP-death' (only the derived
    # 'Survival' label); a plain drop would raise KeyError in that case.
    X = df.drop(columns=['SampleID', 'SubGroup', 'LP-death', 'Survival'], errors='ignore')

    # Scale the data
    X_scaled = StandardScaler().fit_transform(X)

    # Perform UMAP (fixed seed for a reproducible layout)
    umap_embedding = UMAP(
        n_neighbors=15,
        min_dist=0.1,
        n_components=2,
        random_state=42
    ).fit_transform(X_scaled)

    # Create figure with two subplots side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Both panels share the same embedding and styling; only the colouring
    # column, palette and title differ
    panels = (
        (ax1, 'SubGroup', subtype_colors, 'UMAP by sCJD Subtype'),
        (ax2, 'Survival', survival_colors, 'UMAP by Survival Risk Groups'),
    )
    for ax, column, palette, title in panels:
        _scatter_by_category(ax, umap_embedding, df[column], palette)
        ax.set_title(title, pad=20, fontsize=14)
        ax.set_xlabel('UMAP1', fontsize=12)
        ax.set_ylabel('UMAP2', fontsize=12)
        ax.grid(True, alpha=0.3)
        ax.legend(bbox_to_anchor=(0.01, 0.99), loc='upper left', frameon=True, framealpha=0.8, fontsize=14)

    # Adjust layout
    fig.tight_layout(w_pad=4)

    # Save figure
    save_path = os.path.join(figure_path, filename)
    fig.savefig(save_path, dpi=300, bbox_inches='tight')

    return fig

# Generate plot: plot_umap writes the PNG to figure_path and returns the figure;
# plt.show() then renders it in the notebook
fig = plot_umap(df, figure_path)
plt.show()