In [33]:
# Imports ||||Run on Restart|||

from utils import get_embeddings_from_state_dict, seed_everything

import os
import numpy as np

# Statistics Silo

This is where you pipe in the data generated by the HBBS or ABBS to compare results and get statistics.

In [38]:
# --- Parameters to set before running the notebook ---
# Every analysis cell below reads these constants; change them here, then re-run the cells.

BASE_RANDOM_STATE = 42  # Initial seed for reproducibility of the entire sequence of experiments
DIR = "../Reproducability" # Path to the directory containing the models, relative to this notebook
EXPERIMENT = "kingdom_test1" # Name of the experiment to test; selects the saved_models_<EXPERIMENT> / untrained_models_<EXPERIMENT> folders under DIR
NUM_RUNS = 3 # Number of runs of the experiment; the multi-run analysis reads run files numbered 1..NUM_RUNS
RUN_NUMBER = 2 # Run whose files are loaded by the t-SNE and other analyses that take a single run


## Euclidean distance analysis
Created on a whim.
This lets you compare the Euclidean distances between embeddings. (Copilot's autocomplete suggested it below the code I was writing for the t-SNE analysis; I got intrigued and let it finish the code. After editing the code to be actually useful, I did some research, and it seems this analysis could genuinely be useful for my project.)

In [39]:
# Simple comparison framework to compare the embeddings of different models.
# Works on a single run (RUN_NUMBER) of EXPERIMENT and prints three distances:
#   1. trained trainable baseline  vs. trained fixed baseline
#   2. trained trainable baseline  vs. its untrained counterpart
#   3. trained fixed baseline      vs. its untrained counterpart

# Construct paths to the relevant state dictionaries
baseline_trainable_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_trainable_run_{RUN_NUMBER}_state_dict.pth"
untrained_baseline_trainable_path = f"{DIR}/untrained_models_{EXPERIMENT}/baseline_trainable_run_{RUN_NUMBER}_state_dict.pth"
baseline_fixed_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_fixed_run_{RUN_NUMBER}_state_dict.pth"
untrained_baseline_fixed_path = f"{DIR}/untrained_models_{EXPERIMENT}/baseline_fixed_run_{RUN_NUMBER}_state_dict.pth"

# Load the embeddings from the state dictionaries
baseline_emb = get_embeddings_from_state_dict(baseline_trainable_path)
untrained_baseline_emb = get_embeddings_from_state_dict(untrained_baseline_trainable_path)
fixed_baseline_emb = get_embeddings_from_state_dict(baseline_fixed_path)
untrained_fixed_baseline_emb = get_embeddings_from_state_dict(untrained_baseline_fixed_path)

# Calculate the Euclidean distance between the embeddings.
# Each result name states which pair it measures: "trainable_untrained" is the trainable
# baseline against its untrained checkpoint, "fixed_untrained" is the fixed baseline
# against its untrained checkpoint (these two were previously stored under each other's name).
fixed_trained_euclidean_distance = np.linalg.norm(baseline_emb - fixed_baseline_emb)
trainable_untrained_euclidean_distance = np.linalg.norm(baseline_emb - untrained_baseline_emb)
fixed_untrained_euclidean_distance = np.linalg.norm(fixed_baseline_emb - untrained_fixed_baseline_emb)


# Print the Euclidean distance values
print(f"Euclidean distance between baseline trainable and fixed embeddings: {fixed_trained_euclidean_distance}")
print(f"Euclidean distance between baseline trainable and untrained baseline embeddings: {trainable_untrained_euclidean_distance}")
print(f"Euclidean distance between fixed baseline and untrained fixed baseline embeddings: {fixed_untrained_euclidean_distance}")

Euclidean distance between baseline trainable and fixed embeddings: 1282.095458984375
Euclidean distance between baseline trainable and untrained baseline embeddings: 15.252898216247559
Euclidean distance between fixed baseline and untrained fixed baseline embeddings: 0.0


Reproducability\untrained_models_kingdom_test1\baseline_fixed_run_2_state_dict.pth

In [None]:
# --- Per-run Euclidean distances between the embeddings of the experiment specified above,
# --- collected in a dictionary of lists and summarised as mean ± standard deviation ---

# Seed through the project helper so the cell starts from the same random state every time
seed_everything(BASE_RANDOM_STATE, silent=True)


# Result container: one list per comparison, receiving one distance per run
Euclidian_results = {
    metric: []
    for metric in (
        "baseline_euclidean_distance",
        "glylm_euclidean_distance",
        "baseline_glylm_euclidean_distance",
        "baseline_runs_euclidean_distance",
    )
}

print(f"Calculating the Euclidean distance of the embeddings of the experiment {EXPERIMENT} with {NUM_RUNS} runs...")
print()

for runs in range(NUM_RUNS):
    # `runs` is the 0-based loop index; the files on disk are numbered from 1 (hence runs+1).
    # Partner for the run-vs-run comparison: the following run, wrapping back to run 1
    # after the last one (with NUM_RUNS == 1 a run is therefore compared with itself).
    other_run = (runs + 1) % NUM_RUNS + 1

    # Trainable baseline of this run
    baseline_trainable_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_trainable_run_{runs+1}_state_dict.pth"
    baseline_emb = get_embeddings_from_state_dict(baseline_trainable_path)

    # Trainable baseline of the partner run
    baseline_trainable_comp_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_trainable_run_{other_run}_state_dict.pth"
    baseline_comp_emb = get_embeddings_from_state_dict(baseline_trainable_comp_path)

    # Fixed baseline of this run
    baseline_fixed_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_fixed_run_{runs+1}_state_dict.pth"
    fixed_baseline_emb = get_embeddings_from_state_dict(baseline_fixed_path)

    # Infused model, fixed and trainable variants of this run
    infused_fixed_path = f"{DIR}/saved_models_{EXPERIMENT}/infused_fixed_run_{runs+1}_state_dict.pth"
    glylm_emb = get_embeddings_from_state_dict(infused_fixed_path)
    infused_trainable_path = f"{DIR}/saved_models_{EXPERIMENT}/infused_trainable_run_{runs+1}_state_dict.pth"
    trained_glylm_emb = get_embeddings_from_state_dict(infused_trainable_path)

    # All four distances of this run are computed first, then recorded together
    run_distances = {
        # trainable baseline vs. fixed baseline
        "baseline_euclidean_distance": np.linalg.norm(baseline_emb - fixed_baseline_emb),
        # fixed infused vs. trainable infused
        "glylm_euclidean_distance": np.linalg.norm(glylm_emb - trained_glylm_emb),
        # NOTE(review): this compares the trainable baseline with the *fixed* infused embeddings,
        # while the summary line below is labelled "[trainable]" — confirm which pairing is intended.
        "baseline_glylm_euclidean_distance": np.linalg.norm(baseline_emb - glylm_emb),
        # trainable baseline of this run vs. trainable baseline of the partner run
        "baseline_runs_euclidean_distance": np.linalg.norm(baseline_emb - baseline_comp_emb),
    }
    for metric, dist in run_distances.items():
        Euclidian_results[metric].append(dist)

# --- Mean and standard deviation over the runs, one pair per comparison ---
# np.std is called with its defaults (ddof=0, i.e. the population standard deviation).
baseline_euclidean_dist_avg = np.mean(Euclidian_results["baseline_euclidean_distance"])
baseline_euclidean_dist_std = np.std(Euclidian_results["baseline_euclidean_distance"])

glylm_euclidean_dist_avg = np.mean(Euclidian_results["glylm_euclidean_distance"])
glylm_euclidean_dist_std = np.std(Euclidian_results["glylm_euclidean_distance"])

baseline_glylm_euclidean_dist_avg = np.mean(Euclidian_results["baseline_glylm_euclidean_distance"])
baseline_glylm_euclidean_dist_std = np.std(Euclidian_results["baseline_glylm_euclidean_distance"])

baseline_runs_euclidean_dist_avg = np.mean(Euclidian_results["baseline_runs_euclidean_distance"])
baseline_runs_euclidean_dist_std = np.std(Euclidian_results["baseline_runs_euclidean_distance"])

# Report every comparison as "mean ± std", rounded to two decimals
print(f"Average Baseline (fixed to trainable) Euclidean Distance: {baseline_euclidean_dist_avg:.2f} ± {baseline_euclidean_dist_std:.2f}")
print(f"Average GlyLM (fixed to trainable) Euclidean Distance: {glylm_euclidean_dist_avg:.2f} ± {glylm_euclidean_dist_std:.2f}")
print(f"Average Baseline-GlyLM [trainable] Euclidean Distance: {baseline_glylm_euclidean_dist_avg:.2f} ± {baseline_glylm_euclidean_dist_std:.2f}")
print(f"Average Baseline [trainable] (comparing runs) Euclidean Distance: {baseline_runs_euclidean_dist_avg:.2f} ± {baseline_runs_euclidean_dist_std:.2f}")



Calculating the Euclidean distance of the embeddings of the experiment Kingdom1 with 10 runs...

Average Baseline (fixed to trainable) Euclidean Distance: 1281.91 ± 0.83
Average GlyLM (fixed to trainable) Euclidean Distance: 30.37 ± 1.78
Average Baseline-GlyLM [trainable] Euclidean Distance: 1217.21 ± 0.54
Average Baseline [trainable] (comparing runs) Euclidean Distance: 1282.16 ± 1.21


Calculating the Euclidean distance of the embeddings of the experiment Kingdom1 with 10 runs...

Average Baseline (fixed to trainable) Euclidean Distance: 1281.91 ± 0.83
Average GlyLM (fixed to trainable) Euclidean Distance: 30.51 ± 1.66
Average Baseline-GlyLM [trainable] Euclidean Distance: 1217.21 ± 0.54
Average Baseline [trainable] (comparing runs) Euclidean Distance: 1282.16 ± 1.21

## t-SNE Analysis

In [None]:
# --- Load the embeddings of run RUN_NUMBER that the t-SNE analysis works on ---
# Each model variant gets its state-dict path built and its embeddings loaded right away.

# Baseline, trainable variant
baseline_trainable_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_trainable_run_{RUN_NUMBER}_state_dict.pth"
baseline_emb = get_embeddings_from_state_dict(baseline_trainable_path)

# Baseline, fixed variant
baseline_fixed_path = f"{DIR}/saved_models_{EXPERIMENT}/baseline_fixed_run_{RUN_NUMBER}_state_dict.pth"
fixed_baseline_emb = get_embeddings_from_state_dict(baseline_fixed_path)

# Infused model, fixed variant
infused_fixed_path = f"{DIR}/saved_models_{EXPERIMENT}/infused_fixed_run_{RUN_NUMBER}_state_dict.pth"
glylm_emb = get_embeddings_from_state_dict(infused_fixed_path)

# Infused model, trainable variant
infused_trainable_path = f"{DIR}/saved_models_{EXPERIMENT}/infused_trainable_run_{RUN_NUMBER}_state_dict.pth"
trained_glylm_emb = get_embeddings_from_state_dict(infused_trainable_path)