# Machine Learning Classification Analysis for Olink Data

This notebook performs a machine learning analysis on Olink protein expression data using Random Forest classification. The analysis compares three different approaches:
1. Random Forest with all features
2. Random Forest with selected features (top 50)
3. Random Forest with shuffled labels (control model)

## Analysis Steps
1. **Data Preprocessing**
   - Load Olink protein expression data
   - Remove unnecessary columns
   - Filter out control samples (rows where `SubGroup` is `CTRL`)
   
2. **Model Training**
   - Implement 5-fold cross-validation
   - Train models with different feature sets
   - Generate performance metrics
   
3. **Evaluation**
   - Calculate standard metrics (accuracy, precision, recall, F1, ROC-AUC)
   - Generate confusion matrices
   - Analyze feature importance using SHAP
   - Compare model performances

## Expected Outputs
- Performance metrics for each model
- Confusion matrices
- Feature importance plots
- SHAP analysis visualisations

In [1]:
# General imports
import os
import warnings
import statistics as stat
import numpy as np
import pandas as pd
from scipy import stats

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

# Machine learning libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

# Metrics
from sklearn.metrics import (confusion_matrix, classification_report, 
                             accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score)

# SHAP for interpretability
import shap

# Ignore warnings
warnings.filterwarnings("ignore")

# Enable inline plotting for Jupyter
%matplotlib inline

In [2]:
# Data and figure directories are siblings of the current working directory,
# i.e. they live in the parent folder of wherever the notebook is run from.
project_root = os.path.dirname(os.getcwd())
data_path = f"{project_root}/data"
figure_path = f"{project_root}/figures"

In [4]:
# Columns that are not used by the classification analysis
columns_to_drop = [
    'Codon 129', 'SampleID', 'Group', 'Strain', 'age at LP', 'Sex',
    'onset-LP', 'onset-death', 'LP-death', 'NP_subtype'
]

# Read the curated Olink workbook and discard the unused columns in one step
df = pd.read_excel(f"{data_path}/curated/olink.xlsx").drop(columns=columns_to_drop)

# Keep every sample except the controls
df = df.loc[df['SubGroup'] != 'CTRL']

### Function for 5-fold cross-validation

In [None]:
# Target is the SubGroup label; every remaining column is a feature
y = df['SubGroup']
X = df.drop(columns=['SubGroup'])

# Stratified 5-fold splitter with a fixed seed so the folds are reproducible
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
def train_and_evaluate_with_plots(X, y, model, cv, k=None, plot_confusion=False, save_path=None, model_name=""):
    """Cross-validate ``model`` on ``(X, y)`` and summarise its performance.

    For every split produced by ``cv.split(X, y)`` the model is refitted on the
    training part and scored on the held-out part. Optional univariate feature
    selection is fitted on the training part only, so the test part never
    influences which features are kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Class labels, positionally aligned with ``X``.
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is
        refitted in place on every fold.
    cv : splitter
        Object exposing ``split(X, y)`` that yields positional index arrays.
    k : int or None, default None
        If given, keep the ``k`` best features per fold using
        ``SelectKBest(f_classif)``.
    plot_confusion : bool, default False
        If True, save a heatmap of the pooled confusion matrix as a PNG.
    save_path : str or None, default None
        Base directory for the figure; the file is written to
        ``<save_path>/ml_all/confusion_matrix_<model_name>.png``. When None,
        the module-level ``figure_path`` is used.
    model_name : str, default ""
        Label used in the plot title and in the output file name.

    Returns
    -------
    dict
        ``'Accuracy'``, ``'Precision'``, ``'Recall'``, ``'F1 Score'`` and
        ``'ROC-AUC'`` each map to ``(mean, std)`` over the folds;
        ``'Confusion Matrix'`` maps to the pooled confusion matrix with each
        row (actual class) expressed as percentages.
    """
    classes = sorted(y.unique())
    n_classes = len(classes)

    accuracies, precisions, recalls, f1_scores, roc_aucs = [], [], [], [], []
    # Float accumulator so the counts can be normalised in place afterwards.
    pooled_confusion = np.zeros((n_classes, n_classes))

    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        if k is not None:
            # Fit the selector on the training fold only to avoid leaking
            # information from the held-out samples into feature selection.
            selector = SelectKBest(f_classif, k=k)
            X_train = selector.fit_transform(X_train, y_train)
            X_test = selector.transform(X_test)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)

        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, average='weighted'))
        recalls.append(recall_score(y_test, y_pred, average='weighted'))
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

        y_test_bin = label_binarize(y_test, classes=classes)
        if n_classes == 2:
            # With two classes label_binarize returns a single indicator column
            # (1 for the second sorted class) while predict_proba returns two
            # columns, so score the probability of that second class directly.
            roc_auc = roc_auc_score(y_test_bin.ravel(), y_pred_proba[:, 1])
        else:
            roc_auc = roc_auc_score(y_test_bin, y_pred_proba, average='macro', multi_class='ovr')
        roc_aucs.append(roc_auc)

        pooled_confusion += confusion_matrix(y_test, y_pred, labels=classes)

    # Counts pooled over all folds, converted to row-wise percentages so each
    # row shows how the samples of one actual class were predicted.
    pooled_confusion /= pooled_confusion.sum(axis=1, keepdims=True)
    pooled_confusion *= 100

    if plot_confusion:
        # Honour the caller-supplied directory; fall back to the notebook-wide
        # figure directory when none is given.
        output_root = figure_path if save_path is None else save_path
        output_dir = f"{output_root}/ml_all"
        os.makedirs(output_dir, exist_ok=True)

        plt.figure(figsize=(8, 6))
        sns.heatmap(pooled_confusion, annot=True, fmt='.2f', cmap='Blues',
                    xticklabels=classes, yticklabels=classes)
        plt.title(f'Confusion Matrix (Percentage) - {model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.savefig(f"{output_dir}/confusion_matrix_{model_name}.png", dpi=1200, bbox_inches='tight')
        plt.close()

    return {
        'Accuracy': (np.mean(accuracies), np.std(accuracies)),
        'Precision': (np.mean(precisions), np.std(precisions)),
        'Recall': (np.mean(recalls), np.std(recalls)),
        'F1 Score': (np.mean(f1_scores), np.std(f1_scores)),
        'ROC-AUC': (np.mean(roc_aucs), np.std(roc_aucs)),
        'Confusion Matrix': pooled_confusion
    }

def plot_metrics_comparison(metrics_original, metrics_reduced, metrics_random):
    """Plot a grouped bar chart comparing the three models and save it.

    Each argument is a mapping from metric name to a ``(mean, std)`` pair.
    The bars show the means with the standard deviations as error bars, one
    group per metric and one bar per model. The figure is written to
    ``<figure_path>/ml_all/model_performance.png`` and then displayed.
    """
    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC']
    labelled_results = [
        ('RF all features', metrics_original),
        ('RF selected features', metrics_reduced),
        ('Random RF', metrics_random),
    ]

    fig, ax = plt.subplots(figsize=(7, 5))
    group_positions = np.arange(len(metric_names))
    bar_width = 0.25
    palette = sns.color_palette("Set2", 3)

    # One bar series per model, each shifted right by one bar width
    for slot, ((label, results), bar_color) in enumerate(zip(labelled_results, palette)):
        mean_values = [results[name][0] for name in metric_names]
        std_values = [results[name][1] for name in metric_names]
        ax.bar(
            group_positions + slot * bar_width,
            mean_values,
            bar_width,
            label=label,
            yerr=std_values,
            capsize=5,
            color=bar_color,
        )

    ax.set_ylabel('Score', fontsize=12)
    ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
    # Centre each tick under the middle bar of its three-bar group
    ax.set_xticks(group_positions + bar_width)
    ax.set_xticklabels(metric_names, fontsize=12)
    ax.legend(title='Models', fontsize=10, title_fontsize='12', loc='lower left')
    ax.grid(True, which='both', axis='y', linestyle='--', linewidth=0.7, alpha=0.7)

    sns.despine()
    plt.tight_layout()
    plt.savefig(figure_path + '/ml_all/model_performance.png', dpi=1200, bbox_inches='tight')
    plt.show()

# Base folder handed to the evaluation function for saving plots
save_path = figure_path

# One stratified 5-fold splitter (fixed seed) shared by all three experiments
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Three identically configured forests, one per experiment
rf_original = RandomForestClassifier(random_state=42)
rf_reduced = RandomForestClassifier(random_state=42)
rf_random = RandomForestClassifier(random_state=42)

# Control labels: the same label values in a permuted order, re-indexed from 0
y_shuffled = y.sample(frac=1, random_state=42).reset_index(drop=True)

# Options common to every evaluation run
shared_kwargs = dict(plot_confusion=True, save_path=save_path)

# Model 1: all features
metrics_original = train_and_evaluate_with_plots(
    X, y, rf_original, cv, model_name="RF_original", **shared_kwargs
)

# Model 2: top 50 features selected within each fold
metrics_reduced = train_and_evaluate_with_plots(
    X, y, rf_reduced, cv, k=50, model_name="RF_reduced", **shared_kwargs
)

# Model 3: all features, shuffled labels (control)
metrics_random = train_and_evaluate_with_plots(
    X, y_shuffled, rf_random, cv, model_name="RF_random", **shared_kwargs
)

# Side-by-side comparison of the three result sets
plot_metrics_comparison(metrics_original, metrics_reduced, metrics_random)