In [None]:
# Imports ||||Run on Restart|||

from utils import get_embeddings_from_state_dict, seed_everything

import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Statistics Silo

This is where you pipe in the data generated by the HBBS or ABBS to compare results and get statistics.

In [None]:
# --- Parameters to set before running the script ---

# Initial seed that keeps the entire sequence of experiments reproducible
BASE_RANDOM_STATE = 42

# Relative path to the directory containing the models
DIR = "../Embedding_experiments"

# Name of the experiment to test
EXPERIMENT = "kingdom_emb1"

# Number of runs the experiment consists of
NUM_RUNS = 10

# Run used by t-SNE and the other analyses that look at a single run
RUN_NUMBER = 7

# Whether the embeddings are normalized before t-SNE
NORMALIZE = True


## Euclidean distance analysis
Created on a whim. 
This allows you to compare Euclidean distances between embeddings. (That annoying Copilot autocomplete thing suggested it below code I was writing for the t-SNE analysis; I got intrigued and let it finish the code. After editing the code to be actually useful, I did some research, and it seems like this analysis could actually be useful for my project.)

### Simple framework for tests

In [None]:
# Simple comparison framework to compare the embeddings of different models.
# Works on the single run selected by RUN_NUMBER; no normalization is applied here.

# Construct paths to the relevant state dictionaries
baseline_trainable_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_trainable_run_{RUN_NUMBER}_state_dict.pth"
untrained_baseline_trainable_path = f"{DIR}/untrained_models_{EXPERIMENT}/baseline_trainable_run_{RUN_NUMBER}_state_dict.pth"
baseline_fixed_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_fixed_run_{RUN_NUMBER}_state_dict.pth"
untrained_baseline_fixed_path = f"{DIR}/untrained_models_{EXPERIMENT}/baseline_fixed_run_{RUN_NUMBER}_state_dict.pth"

# Load the embeddings from the state dictionaries
baseline_emb = get_embeddings_from_state_dict(baseline_trainable_path)
untrained_baseline_emb = get_embeddings_from_state_dict(untrained_baseline_trainable_path)
fixed_baseline_emb = get_embeddings_from_state_dict(baseline_fixed_path)
untrained_fixed_baseline_emb2 = get_embeddings_from_state_dict(untrained_baseline_fixed_path)

# Calculate the Euclidean distance between the embeddings.
# np.linalg.norm without an axis reduces over every element of the difference,
# so each result is one scalar per pair of embedding matrices.
# Each variable is named after the pair it actually compares:
#   fixed_trained          -> trained trainable baseline vs. saved fixed baseline
#   trainable_untrained    -> trained trainable baseline vs. its untrained checkpoint
#   fixed_untrained        -> saved fixed baseline vs. its untrained checkpoint
fixed_trained_euclidean_distance = np.linalg.norm(baseline_emb - fixed_baseline_emb)
trainable_untrained_euclidean_distance = np.linalg.norm(baseline_emb - untrained_baseline_emb)
fixed_untrained_euclidean_distance = np.linalg.norm(fixed_baseline_emb - untrained_fixed_baseline_emb2)


# Print the Euclidean distance values
print(f"Euclidean distance between baseline trainable and fixed embeddings: {fixed_trained_euclidean_distance}")
print(f"Euclidean distance between baseline trainable and untrained baseline embeddings: {trainable_untrained_euclidean_distance}")
print(f"Euclidean distance between fixed baseline and untrained fixed baseline embeddings: {fixed_untrained_euclidean_distance}")

Reproducability\untrained_models_kingdom_test1\baseline_fixed_run_2_state_dict.pth

### Proper analytical framework running on all runs

In [None]:
# --- Generate a dictionary of lists containing the results of the Euclidean distance analysis of all embeddings of the experiment specified above ---

# Set the random seed for reproducibility
seed_everything(BASE_RANDOM_STATE, silent=True)


def _scale_by_max_row_norm(emb):
    """Divide `emb` by its largest row-wise L2 norm.

    This is one global scale factor per embedding matrix (the longest row ends
    up with length 1); it is NOT a per-row L2 normalization.
    """
    return emb / np.max(np.linalg.norm(emb, axis=1, keepdims=True))


# Initialize a dictionary to store the results (one list entry per run)
Euclidian_results = {"baseline_euclidean_distance": [],
                    "glylm_euclidean_distance": [],
                    "baseline_glylm_euclidean_distance": [],
                    "baseline_runs_euclidean_distance": []}

print(f"Calculating the Euclidean distance of the embeddings of the experiment {EXPERIMENT} with {NUM_RUNS} runs...")
print()

for runs in range(NUM_RUNS):
    # --- load the embeddings ---
    # Run files are numbered from 1; `other_run` is the next run, wrapping back to 1
    # after the last one (with NUM_RUNS == 1 a run is compared with itself).
    run_number = runs + 1
    other_run = runs + 2 if runs < NUM_RUNS - 1 else 1

    # Construct paths to the relevant state dictionaries
    baseline_trainable_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_trainable_run_{run_number}_state_dict.pth"
    # Untrained checkpoint of the same trainable baseline: the reference for "fixed to trainable"
    baseline_untrained_path = f"{DIR}/untrained_models_{EXPERIMENT}/baseline_trainable_run_{run_number}_state_dict.pth"
    # Trained baseline of a different run: the reference for the run-to-run comparison
    baseline_trainable_comp_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_trainable_run_{other_run}_state_dict.pth"
    infused_trainable_path = f"{DIR}/saved_models_{EXPERIMENT}/infused_trainable_run_{run_number}_state_dict.pth"
    infused_fixed_path = f"{DIR}/saved_models_{EXPERIMENT}/infused_fixed_run_{run_number}_state_dict.pth"

    # Load the embeddings and rescale each matrix by its maximum row norm
    baseline_emb = _scale_by_max_row_norm(get_embeddings_from_state_dict(baseline_trainable_path))
    baseline_untrained_emb = _scale_by_max_row_norm(get_embeddings_from_state_dict(baseline_untrained_path))
    baseline_comp_emb = _scale_by_max_row_norm(get_embeddings_from_state_dict(baseline_trainable_comp_path))
    glylm_emb = _scale_by_max_row_norm(get_embeddings_from_state_dict(infused_fixed_path))
    trained_glylm_emb = _scale_by_max_row_norm(get_embeddings_from_state_dict(infused_trainable_path))

    # Calculate the Euclidean distance between the embeddings
    # Baseline before vs. after training (same run)
    baseline_euclidean_dist = np.linalg.norm(baseline_emb - baseline_untrained_emb)
    # GlyLM fixed vs. GlyLM after training (same run)
    glylm_euclidean_dist = np.linalg.norm(glylm_emb - trained_glylm_emb)
    # NOTE(review): this compares the trained baseline with the *fixed* GlyLM embeddings,
    # although the printed label below says "[trainable]" - confirm which one is intended.
    baseline_glylm_euclidean_dist = np.linalg.norm(baseline_emb - glylm_emb)
    # Calculate the Euclidean distance between the trained baseline embeddings of two runs
    baseline_runs_euclidean_dist = np.linalg.norm(baseline_emb - baseline_comp_emb)

    # add the euclidean distance of the current run to the list of results
    Euclidian_results["baseline_euclidean_distance"].append(baseline_euclidean_dist)
    Euclidian_results["glylm_euclidean_distance"].append(glylm_euclidean_dist)
    Euclidian_results["baseline_glylm_euclidean_distance"].append(baseline_glylm_euclidean_dist)
    Euclidian_results["baseline_runs_euclidean_distance"].append(baseline_runs_euclidean_dist)

# --- Calculate the average of the euclidean distances ---
baseline_euclidean_dist_avg = np.mean(Euclidian_results["baseline_euclidean_distance"])
glylm_euclidean_dist_avg = np.mean(Euclidian_results["glylm_euclidean_distance"])
baseline_glylm_euclidean_dist_avg = np.mean(Euclidian_results["baseline_glylm_euclidean_distance"])
baseline_runs_euclidean_dist_avg = np.mean(Euclidian_results["baseline_runs_euclidean_distance"])

# Calculate the standard deviation of the euclidean distances
# (np.std default, i.e. population standard deviation over the runs)
baseline_euclidean_dist_std = np.std(Euclidian_results["baseline_euclidean_distance"])
glylm_euclidean_dist_std = np.std(Euclidian_results["glylm_euclidean_distance"])
baseline_glylm_euclidean_dist_std = np.std(Euclidian_results["baseline_glylm_euclidean_distance"])
baseline_runs_euclidean_dist_std = np.std(Euclidian_results["baseline_runs_euclidean_distance"])

# Print the average euclidean distance values with their standard deviation;
# every line reports the metric its label names.
print(f"Average Baseline (fixed to trainable) Euclidean Distance: {baseline_euclidean_dist_avg:.3f} ± {baseline_euclidean_dist_std:.3f}")
print(f"Average GlyLM (fixed to trainable) Euclidean Distance: {glylm_euclidean_dist_avg:.3f} ± {glylm_euclidean_dist_std:.3f}")
print(f"Average Baseline-GlyLM [trainable] Euclidean Distance: {baseline_glylm_euclidean_dist_avg:.3f} ± {baseline_glylm_euclidean_dist_std:.3f}")
print(f"Average Baseline [trainable] (comparing runs) Euclidean Distance: {baseline_runs_euclidean_dist_avg:.3f} ± {baseline_runs_euclidean_dist_std:.3f}")



#### Before  normalization
```
Calculating the Euclidean distance of the embeddings of the experiment Kingdom1 with 10 runs...

Average Baseline (fixed to trainable) Euclidean Distance: 1281.91 ± 0.83 <-- ignore this, that comparison was flawed
Average GlyLM (fixed to trainable) Euclidean Distance: 30.51 ± 1.66
Average Baseline-GlyLM [trainable] Euclidean Distance: 1217.21 ± 0.54
Average Baseline [trainable] (comparing runs) Euclidean Distance: 1282.16 ± 1.21
```
#### After normalization
```
Calculating the Euclidean distance of the embeddings of the experiment Kingdom1 with 10 runs...

Average Baseline (fixed to trainable) Euclidean Distance needs to be calculated with the untrained baseline
Average GlyLM (fixed to trainable) Euclidean Distance: 5.23 ± 0.99
Average Baseline-GlyLM [trainable] Euclidean Distance: 61.78 ± 1.04
Average Baseline [trainable] (comparing runs) Euclidean Distance: 60.21 ± 1.00
```

## t-SNE Analysis

In [None]:
import numpy as np
from typing import List, Tuple

def prepare_tsne_data(embedding_arrays: List[np.ndarray], 
                      embedding_names: List[str],
                      normalize: bool = True
                      ) -> Tuple[np.ndarray, List[str]]:
    """
    Prepares embedding data for t-SNE visualization by normalizing and concatenating.

    Parameters
    ----------
    embedding_arrays : List[np.ndarray]
        A list of NumPy arrays, where each array contains a set of embeddings
        (e.g., [baseline_embs, raw_glm_embs, infused_embs]).
        All arrays in the list must have the same number of rows (glycowords)
        and same number of columns (embedding dimensions).
        The list and its arrays are not modified.
    embedding_names : List[str]
        A list of string names corresponding to each array in `embedding_arrays`.
        These names will be used as labels in the t-SNE plot legend.
    normalize : bool, optional
        If True, each embedding array is divided by its own largest row-wise
        L2 norm before concatenation. This is a single scale factor per array
        (the longest row gets length 1), not a per-row normalization.
        Default is True.

    Returns
    -------
    Tuple[np.ndarray, List[str]]
        A tuple containing:
        - tsne_embeddings (np.ndarray): All input embeddings (scaled if
          `normalize` is True), vertically stacked in input order.
        - tsne_labels (List[str]): One label per row of `tsne_embeddings`;
          each name is repeated once for every row of its array.

    Raises
    ------
    ValueError
        If the number of embedding arrays does not match the number of names,
        if no arrays are provided, if the arrays differ in their number of
        rows or columns, or if an array to be normalized contains only zeros.
    Exception
        If an error occurs while computing the normalization factor.
    """
    if len(embedding_arrays) != len(embedding_names):
        raise ValueError("Number of embedding arrays must match number of embedding names.")
    if not embedding_arrays: # Handle empty input list
        raise ValueError("No embedding arrays provided.")

    # The first array defines the reference shape every other array must match
    reference_shape = embedding_arrays[0].shape
    num_glycowords = reference_shape[0]

    # Collect into a fresh list so the name of the list being iterated is never rebound
    prepared_arrays = []
    for arr in embedding_arrays:
        if arr.shape[0] != num_glycowords:
            raise ValueError("All embedding arrays must have the same number of rows (glycowords).")
        if arr.shape[1:] != reference_shape[1:]:
            raise ValueError("All embedding arrays must have the same number of columns (embedding dimensions).")
        if normalize:
            try:
                max_row_norm = np.max(np.linalg.norm(arr, axis=1, keepdims=True))
            except Exception as e:
                raise Exception(f"Error normalizing array: {e}") from e
            # Dividing by zero would not raise; it would silently yield NaN/inf rows
            if max_row_norm == 0:
                raise ValueError("Cannot normalize an embedding array that contains only zeros.")
            arr = arr / max_row_norm
        prepared_arrays.append(arr)

    # Concatenate all prepared arrays vertically
    tsne_embeddings = np.concatenate(prepared_arrays, axis=0)
    
    # Create the combined list of labels: each name repeated once per row of its array
    tsne_labels = [name for name in embedding_names for _ in range(num_glycowords)]

    return tsne_embeddings, tsne_labels

In [None]:
# --- Loading the embeddings of the specified run number for t-SNE ---

# Construct paths to the relevant state dictionaries for the specified run number
baseline_trained_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_trainable_run_{RUN_NUMBER}_state_dict.pth"
# Despite the "fixed" in its name, this is the untrained checkpoint of the trainable baseline
baseline_fixed_path = f"{DIR}/untrained_models_{EXPERIMENT}/baseline_trainable_run_{RUN_NUMBER}_state_dict.pth"
infused_trained_path = f"{DIR}/saved_models_{EXPERIMENT}/infused_trainable_run_{RUN_NUMBER}_state_dict.pth"
infused_fixed_path = f"{DIR}/saved_models_{EXPERIMENT}/infused_fixed_run_{RUN_NUMBER}_state_dict.pth"

# comparing different runs of the same model:
# take the next run and wrap around to run 1 after the last run (runs are numbered 1..NUM_RUNS)
other_run = RUN_NUMBER % NUM_RUNS + 1

baseline_trained_comp_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_trainable_run_{other_run}_state_dict.pth"
baseline_fixed_comp_path = f"{DIR}/untrained_models_{EXPERIMENT}/baseline_trainable_run_{other_run}_state_dict.pth"
infused_trained_comp_path = f"{DIR}/saved_models_{EXPERIMENT}/infused_trainable_run_{other_run}_state_dict.pth"

# Load the embeddings from the state dictionaries
baseline_trained_emb = get_embeddings_from_state_dict(baseline_trained_path)
baseline_fixed_emb = get_embeddings_from_state_dict(baseline_fixed_path)
infused_trained_emb = get_embeddings_from_state_dict(infused_trained_path)
infused_fixed_emb = get_embeddings_from_state_dict(infused_fixed_path)

baseline_trained_comp_emb = get_embeddings_from_state_dict(baseline_trained_comp_path)
baseline_fixed_comp_emb = get_embeddings_from_state_dict(baseline_fixed_comp_path)
infused_trained_comp_emb = get_embeddings_from_state_dict(infused_trained_comp_path)

# --- Construct a list of dictionaries, one per t-SNE target, holding the embedding arrays and labels for prepare_tsne_data ---
all_tsne_embeddings = [

# All embeddings for t-SNE
{'name' : 'all', 'embeddings' : [baseline_trained_emb, baseline_fixed_emb, infused_trained_emb, infused_fixed_emb],
'labels' : ["Baseline (trained)", "Baseline (original)", "GlyLM (trained)", "GlyLM (original)"]},
# Baseline comparison before and after training
{'name' : 'baseline', 'embeddings' : [baseline_trained_emb, baseline_fixed_emb],
'labels' : ["Baseline (trained)", "Baseline (original)"]},
# Infused comparison before and after training
{'name' : 'infused', 'embeddings' : [infused_trained_emb, infused_fixed_emb],
'labels' : ["Infused (trained)", "Infused (original)"]},
# Baseline vs Infused comparison
{'name' : 'baseline_infused', 'embeddings' : [baseline_trained_emb, infused_trained_emb],
'labels' : ["Baseline (trained)", "Infused (trained)"]},
# Just the Baseline embeddings
{'name' : 'base', 'embeddings' : [baseline_trained_emb],
'labels' : ["Baseline embeddings after training"]},
# Just the Infused embeddings
{'name' : 'GlyLM', 'embeddings' : [infused_trained_emb],
'labels' : ["GlyLM embeddings"]},
# Baseline comparison between two runs (both trained) - labels identify the run, not the training state
{'name' : 'baseline_comp', 'embeddings' : [baseline_trained_comp_emb, baseline_trained_emb],
'labels' : [f"Baseline (trained, run {other_run})", f"Baseline (trained, run {RUN_NUMBER})"]},
# Infused comparison between two runs (both trained)
{'name' : 'infused_comp', 'embeddings' : [infused_trained_comp_emb, infused_trained_emb],
'labels' : [f"Infused (trained, run {other_run})", f"Infused (trained, run {RUN_NUMBER})"]},
# Untrained baseline comparison between two runs
{'name' : 'baseline_fixed_comp', 'embeddings' : [baseline_fixed_comp_emb, baseline_fixed_emb],
'labels' : [f"Baseline (original, run {other_run})", f"Baseline (original, run {RUN_NUMBER})"]},
# Add more comparisons as needed
]   


# Prepare the data for t-SNE: each entry's list of arrays is replaced by one stacked
# array, and its list of names by one label per row of that array
for tsne_target in all_tsne_embeddings:
    tsne_target['embeddings'], tsne_target['labels'] = prepare_tsne_data(
        tsne_target['embeddings'], tsne_target['labels'], normalize=NORMALIZE
    )


In [None]:
# --- t-SNE analysis ---
# possible tsne_target_ids: baseline_infused, infused, baseline, base, GlyLM, all 
# Comparison between runs of the same model: baseline_comp, infused_comp, baseline_fixed_comp

tsne_target_id = "baseline"

# Pick the prepared entry whose name matches; the for/else raises when nothing matched
for tsne_target in all_tsne_embeddings:
    if tsne_target['name'] == tsne_target_id:
        tsne_embeddings = tsne_target['embeddings']
        tsne_labels = tsne_target['labels']
        break
else:
    # Report the requested id, not the loop variable (which holds the last entry's whole dict)
    raise ValueError(f"Target '{tsne_target_id}' not found in the list of embeddings.")



# Initialize t-SNE
tsne = TSNE(n_components=2, 
            random_state=BASE_RANDOM_STATE, 
            perplexity=30, 
            learning_rate=200,
            max_iter=1000)
# Experiment with perplexity, learning_rate, and max_iter later for optimal visualization.
# (create a function to do this automatically)

# Fit the t-SNE model to the embeddings and get their 2-D coordinates
tsne_coords = tsne.fit_transform(tsne_embeddings)

In [None]:
# Plot t-SNE

# Put the t-SNE coordinates and the label of each point into one DataFrame
plot_df = pd.DataFrame(tsne_coords, columns=['x', 'y']).assign(
    **{'Embedding Source': tsne_labels}
)

# One figure with a single axes; adjust figure size as needed
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=plot_df,
    x='x', y='y',
    hue='Embedding Source',  # one colour per label in tsne_labels
    palette='tab10',         # other options: 'viridis', 'Paired', ...
    legend='full',
    alpha=0.7,               # transparency so overlapping points stay visible
    s=5,                     # marker size
    ax=ax,
)
ax.set_title('t-SNE Visualization of Glycan Embeddings')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.grid(True, linestyle='--', alpha=0.6)  # subtle dashed grid
plt.show()