# Imports

In [1]:
from __future__ import annotations
from typing import Tuple, List, Dict, Optional, Any

import numpy as np
import torch
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, DownloadMode
import random
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import tqdm

from helper_utils.enum_keys import (
    FPKey,
    ModelKey,
    QuantStyle,
    MiscPrompts,
    Contexts,
    Texts
)

from PTQ.bitlinear_wrapper_class import BitLinear
from PTQ.apply_ptq import applyPTQ
from PTQ.olmo_act_fns import patch_olmo_mlp
import helper_utils.utils as utils
from helper_utils.models_loader import load_4bit_auto, load_8bit_auto
from mech_interp_utils.utils_main.src.transformer_utils import (
    logit_lens,
    activation_lens,
    dictionary_learning,
    chatbot_analysis
)

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
torch.manual_seed(0)
original = torch.randn(512) * 0.5  # Original activations

def quantize_dequantize(tensor, scale_value):
    scale = max(scale_value, 1e-8)
    qmin, qmax = -127, 127
    tensor_int = (tensor / scale).round().clamp(qmin, qmax).to(torch.int8)
    tensor_dequant = tensor_int.float() * scale
    return tensor_int, tensor_dequant

# Quantize with different scales
_, dequant_1e2 = quantize_dequantize(original, 1e-2)
_, dequant_1e5 = quantize_dequantize(original, 1e-5)

# L2 distance
print("L2 Distance (scale=1e-2):", torch.norm(original - dequant_1e2).item())
print("L2 Distance (scale=1e-5):", torch.norm(original - dequant_1e5).item())

# Plot histograms + KDEs
plt.figure(figsize=(14, 6))
sns.histplot(original.numpy(),bel='Original', kde=True, stat="count", bins=50, color='black', alpha=0.5)
sns.histplot(dequant_1e2.numpy(), label='Dequant (scale=1e-2)', kde=True, stat="count", bins=50, color='red', alpha=0.5)
sns.histplot(dequant_1e5.numpy(), label='Dequant (scale=1e-5)', kde=True, stat="count", bins=50, color='blue', alpha=0.5)

plt.title("Histogram (Count) + KDE of Quantized vs Original Activations")
plt.xlabel("Activation Value")
plt.ylabel("Count")
plt.ylim(0, 40)  
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [2]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

### Datasets for calibrating activations and analysis

In [None]:
filepath = r'D:\ThesisData\wikitext'

destination_path = str(Path(filepath))
dataset = load_dataset(
    'wikitext', 'wikitext-103-raw-v1',
    split={
        'train': 'train[:200]',
    },
    cache_dir=destination_path,
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    keep_in_memory=True
)

In [None]:
train_texts = dataset['train']

In [None]:
calibration_texts = [t for t in dataset['train']["text"] if isinstance(t, str) and t.strip()]
#calibration_texts = [t for t in sub_txts["text"] if isinstance(t, str) and t.strip()]

In [None]:
sub_txts = train_texts.take(200)

#### GSM8K (Math) "gsm8k"
#### LogiQA (Logic & Reasoning): "logiq"

In [3]:
filepath = r'D:\ThesisData\nq'

destination_path = str(Path(filepath))
nq_dataset = load_dataset(
    'sentence-transformers/natural-questions',
    split={
        'train': 'train[:20]'
    },
    cache_dir=destination_path,
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    keep_in_memory=True
)

In [None]:
nq_dataset

In [None]:
nq_queries= nq_dataset['train']['query']

In [None]:
nq_queries

In [4]:
nq_answers = nq_dataset['train']['answer']

In [5]:
nq_answers

["Richmond Football Club Richmond began 2017 with 5 straight wins, a feat it had not achieved since 1995. A series of close losses hampered the Tigers throughout the middle of the season, including a 5-point loss to the Western Bulldogs, 2-point loss to Fremantle, and a 3-point loss to the Giants. Richmond ended the season strongly with convincing victories over Fremantle and St Kilda in the final two rounds, elevating the club to 3rd on the ladder. Richmond's first final of the season against the Cats at the MCG attracted a record qualifying final crowd of 95,028; the Tigers won by 51 points. Having advanced to the first preliminary finals for the first time since 2001, Richmond defeated Greater Western Sydney by 36 points in front of a crowd of 94,258 to progress to the Grand Final against Adelaide, their first Grand Final appearance since 1982. The attendance was 100,021, the largest crowd to a grand final since 1986. The Crows led at quarter time and led by as many as 13, but the T

In [None]:
filepath = r'D:\ThesisData\logiqa'

destination_path = str(Path(filepath))
logiqa_dataset = load_dataset(
    'lucasmccabe/logiqa',
    split={
        'train': 'train[:20]'
    },
    cache_dir=destination_path,
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    keep_in_memory=True
)

In [None]:
logiqa_dataset

In [None]:
filepath = r'D:\ThesisData\gsm8k'

destination_path = str(Path(filepath))
gsm8k_dataset = load_dataset(
    'gsm8k', 'main',
    split={
        'train': 'train[:20]'
    },
    cache_dir=destination_path,
    download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS,
    keep_in_memory=True
)

In [None]:
gsm8k_dataset

In [None]:
gsm8k_questions = gsm8k_dataset['train']['question']

In [None]:
gsm8k_answers = gsm8k_dataset['train']['answer']

In [None]:
gsm8k_answers

In [None]:
gsm8k_questions_sae = """
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?
"""

# Models and Tokenizer

In [6]:
def load_test_model(model_path:str, dtype=torch.dtype) -> AutoModelForCausalLM:
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        return_dict=True,
        output_hidden_states=True,
        torch_dtype=dtype,
        low_cpu_mem_usage=True,
        local_files_only=True,
        use_safetensors=True,
        #trust_remote_code=True
    )

    return model

In [None]:
hfbit1_tokenizer = AutoTokenizer.from_pretrained(FPKey.HFBIT1_TOKENIZER.value)

In [None]:
hfbit1_fp32 = load_test_model(FPKey.HFBIT1_8B.value, dtype=torch.float32) # https://huggingface.co/HF1BitLLM/Llama3-8B-1.58-100B-tokens

In [None]:
# Print model layers to inspect their names
for name, module in hfbit1_fp32.named_modules():
    print(f"Layer name: {name}, Module: {module}")


In [7]:
llama8b_tokenizer = AutoTokenizer.from_pretrained(FPKey.LINSTRUCT_TOKENIZER.value)

In [None]:
llama8b_fp32 = load_test_model(FPKey.LINSTRUCT_8B.value, dtype=torch.float32)

In [8]:
# Load the model with 8-bit quantization using BNB
llama8b_bnb8_float32 = AutoModelForCausalLM.from_pretrained(
    ModelKey.LLINSTRUCT8B.value,           # Replace with your actual model
    torch_dtype=torch.float32,     # Specify the dtype (for 8-bit, use uint8)
    #device_map="cpu",           # Automatically map the model to available devices (GPU/CPU)
    load_in_8bit=True,
    return_dict=True,
    output_hidden_states=True,            # Set to True for 8-bit quantization
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:28<00:00,  7.04s/it]


In [None]:
llama8b_bnb8_float32.config

In [9]:
# Load the model with 4-bit quantization using BNB
llama8b_bnb4_float32 = AutoModelForCausalLM.from_pretrained(
    ModelKey.LLINSTRUCT8B.value,           # Replace with your actual model
    torch_dtype=torch.float32,     # This would still use uint8 or another type based on quantization method
    #device_map="auto",           # Automatically map the model to available devices (GPU/CPU)
    load_in_4bit=True,            # Set to True for 4-bit quantization (if available)
    return_dict=True,
    output_hidden_states=True, 
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:16<00:00,  4.09s/it]


In [None]:
llama8b_ptsq_float32 = applyPTQ(
    load_test_model(ModelKey.LLINSTRUCT8B.value, dtype=torch.float32),
    tokenizer=llama8b_tokenizer,
    #calibration_input=None,
    #calibration_input=sub_txts['text'],
    calibration_input=Texts.T1.value,
    mode='1.58bit',
    safer_quant=True,
    q_lmhead=True,
    model_half=False,
    quant_half=False,
    layers_to_quant_weights=QuantStyle.BITNET.value,
    layers_to_quant_activations=QuantStyle.BITNET.value,
    fragile_layers=False,
    act_quant=True,
    act_bits=8,
    torch_backends=False,
    debugging=True,
    plot_debugging=False,
    plot_quantization=False,
    freeze_modules=True
)

In [None]:
hfbit1_tokenizer.vocab_size

In [None]:
llama8b_tokenizer.vocab_size

### allenai/OLMo

In [None]:
olmo1b_tokenizer = AutoTokenizer.from_pretrained(FPKey.OLMO1B_TOKENIZER.value)

In [None]:
olmo2t_tokenizer = AutoTokenizer.from_pretrained(FPKey.OLMO7B2T_TOKENIZER.value)

In [None]:
olmo1b_fp32 = load_test_model(FPKey.OLMO1B_FP.value, dtype=torch.float32)

In [None]:
olmo2t_fp32 = load_test_model(FPKey.OLMO7B2T_FP.value, dtype=torch.float32)

In [None]:
olmo1b_bitnet_fp32_ptsq = applyPTQ(
    load_test_model(FPKey.OLMO1B_FP.value, dtype=torch.float32),
    tokenizer=olmo1b_tokenizer,
    #calibration_input=None,
    #calibration_input=sub_txts['text'],
    calibration_input=Texts.T1.value,
    mode='1.58bit',
    safer_quant=True,
    q_lmhead=True,
    model_half=False,
    quant_half=False,
    layers_to_quant_weights=QuantStyle.BITNET.value,
    layers_to_quant_activations=QuantStyle.BITNET.value,
    fragile_layers=False,
    act_quant=True,
    act_bits=8,
    torch_backends=True,
    debugging=True,
    plot_debugging=False,
    plot_quantization=False,
    freeze_modules=True
)

In [None]:
olmo2t_bitnet_fp32_ptsq = applyPTQ(
    load_test_model(FPKey.OLMO7B2T_FP.value, dtype=torch.float32),
    tokenizer=olmo2t_tokenizer,
    #calibration_input=None,
    #calibration_input=sub_txts['text'],
    calibration_input=Texts.T1.value,
    mode='1.58bit',
    safer_quant=True,
    q_lmhead=True,
    model_half=False,
    quant_half=False,
    layers_to_quant_weights=QuantStyle.BITNET.value,
    layers_to_quant_activations=QuantStyle.BITNET.value,
    fragile_layers=False,
    act_quant=True,
    act_bits=8,
    torch_backends=False,
    debugging=True,
    plot_debugging=False,
    plot_quantization=False,
    freeze_modules=True
)

In [None]:
# Load the model with 8-bit quantization using BNB
olmo1b_bnb8_float32 = AutoModelForCausalLM.from_pretrained(
    FPKey.OLMO1B_FP.value,           # Replace with your actual model
    torch_dtype=torch.float32,     # Specify the dtype (for 8-bit, use uint8)
    #device_map="cpu",           # Automatically map the model to available devices (GPU/CPU)
    load_in_8bit=True,
    return_dict=True,
    output_hidden_states=True,            # Set to True for 8-bit quantization
)

In [None]:
# Load the model with 4-bit quantization using BNB
olmo1b_bnb4_float32 = AutoModelForCausalLM.from_pretrained(
    FPKey.OLMO1B_FP.value,           # Replace with your actual model
    torch_dtype=torch.float32,     # This would still use uint8 or another type based on quantization method
    #device_map="auto",           # Automatically map the model to available devices (GPU/CPU)
    load_in_4bit=True,            # Set to True for 4-bit quantization (if available)
    return_dict=True,
    output_hidden_states=True, 
)

### NousResearch/DeepHermes

In [None]:
dh3b_tokenizer = AutoTokenizer.from_pretrained(FPKey.TOKENIZER_3B.value)

In [None]:
dh8b_tokenizer = AutoTokenizer.from_pretrained(FPKey.TOKENIZER_8B.value)

In [None]:
dh3b_fp32 = load_test_model(FPKey.FP_3B.value, dtype=torch.float32)

In [None]:
dh8b_fp32 = load_test_model(FPKey.FP_8B.value, dtype=torch.float32)

In [None]:
dh3b_bitnet_fp32_ptsq = applyPTQ(
    load_test_model(FPKey.FP_3B.value, dtype=torch.float32),
    tokenizer=dh3b_tokenizer,
    #calibration_input=None,
    #calibration_input=sub_txts['text'],
    calibration_input=Texts.T1.value,
    mode='1.58bit',
    safer_quant=True,
    q_lmhead=True,
    model_half=False,
    quant_half=False,
    layers_to_quant_weights=QuantStyle.BITNET.value,
    layers_to_quant_activations=QuantStyle.BITNET.value,
    fragile_layers=False,
    act_quant=True,
    act_bits=8,
    torch_backends=False,
    debugging=True,
    plot_debugging=False,
    plot_quantization=False,
    freeze_modules=True
)

In [None]:
# Load the model with 8-bit quantization using BNB
dh3b_bnb8_float32 = AutoModelForCausalLM.from_pretrained(
    FPKey.FP_3B.value,           # Replace with your actual model
    torch_dtype=torch.float32,     # Specify the dtype (for 8-bit, use uint8)
    #device_map="cpu",           # Automatically map the model to available devices (GPU/CPU)
    load_in_8bit=True,
    return_dict=True,
    output_hidden_states=True,            # Set to True for 8-bit quantization
)

In [None]:
# Load the model with 4-bit quantization using BNB
dh3b_bnb4_float32 = AutoModelForCausalLM.from_pretrained(
    FPKey.FP_3B.value,           # Replace with your actual model
    torch_dtype=torch.float32,     # This would still use uint8 or another type based on quantization method
    #device_map="auto",           # Automatically map the model to available devices (GPU/CPU)
    load_in_4bit=True,            # Set to True for 4-bit quantization (if available)
    return_dict=True,
    output_hidden_states=True, 
)

In [None]:
# Load the model with 8-bit quantization using BNB
dh8b_bnb8_float32 = AutoModelForCausalLM.from_pretrained(
    FPKey.FP_8B.value,           # Replace with your actual model
    torch_dtype=torch.float32,     # Specify the dtype (for 8-bit, use uint8)
    #device_map="cpu",           # Automatically map the model to available devices (GPU/CPU)
    load_in_8bit=True,
    return_dict=True,
    output_hidden_states=True,            # Set to True for 8-bit quantization
)

In [None]:
# Load the model with 4-bit quantization using BNB
dh8b_bnb4_float32 = AutoModelForCausalLM.from_pretrained(
    FPKey.FP_8B.value,           # Replace with your actual model
    torch_dtype=torch.float32,     # This would still use uint8 or another type based on quantization method
    #device_map="auto",           # Automatically map the model to available devices (GPU/CPU)
    load_in_4bit=True,            # Set to True for 4-bit quantization (if available)
    return_dict=True,
    output_hidden_states=True, 
)

In [None]:
dh8b_bnb4_float32

# Activation Lens and Logit Lens

In [None]:
text_inputs = [
    # Language understanding
    "The quick brown fox jumps over the lazy dog.",
    "Despite the rain, the event continued as planned.",
    
    # Logic/reasoning
    "If all humans are mortal and Socrates is a human, then Socrates is mortal.",
    "Either the lights are off or the power is out. The lights are on, so the power must be out.",

    # Math/numerical
    "The derivative of sin(x) with respect to x is cos(x).",
    "What is the sum of the first 100 natural numbers?",

    # Programming
    "In Python, list comprehensions provide a concise way to create lists.",
    "To define a function in JavaScript, use the 'function' keyword.",

    # Commonsense knowledge
    "You should refrigerate milk after opening it to keep it fresh.",
    "People usually eat breakfast in the morning before starting their day.",

    # Scientific knowledge
    "Water boils at 100 degrees Celsius under standard atmospheric pressure.",
    "Photosynthesis is the process by which plants convert sunlight into chemical energy."
]


In [None]:
gsm8k_questions

In [None]:
gsm8k_answers

In [None]:
nq_queries

In [None]:
texts = ["The quick brown fox jumps over the lazy dog.", "Despite the rain, the event continued as planned."]

In [None]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [11]:
logit_lens.plot_logit_lens_plotly(
    model=llama8b_bnb4_float32,
    tokenizer=llama8b_tokenizer,
    inputs=nq_answers,
    start_ix=0, end_ix=15,
    topk=5,
    plot_topk_lens=False,
    #json_log_path=None,
    json_log_path='logs/nq_answers/llama.8b-bnb4bit.fp32', # 20 samples
    #save_fig_path=None,
    #save_fig_path='Outputs/LogitLens/DH3B/logits_3b_fp32_math.jpg',
    #entropy=True,
    model_precision=None
)

[Debug] Layer names being passed: ['model.layers.0', 'model.layers.1', 'model.layers.2', 'model.layers.3', 'model.layers.4', 'model.layers.5', 'model.layers.6', 'model.layers.7', 'model.layers.8', 'model.layers.9', 'model.layers.10', 'model.layers.11', 'model.layers.12', 'model.layers.13', 'model.layers.14', 'model.layers.15', 'model.layers.16', 'model.layers.17', 'model.layers.18', 'model.layers.19', 'model.layers.20', 'model.layers.21', 'model.layers.22', 'model.layers.23', 'model.layers.24', 'model.layers.25', 'model.layers.26', 'model.layers.27', 'model.layers.28', 'model.layers.29', 'model.layers.30', 'model.layers.31', 'model.embed_tokens']
[Debug] Trying to access layer: model.layers.0
[Debug] Successfully found layer: model.layers.0
[Debug] Trying to access layer: model.layers.1
[Debug] Successfully found layer: model.layers.1
[Debug] Trying to access layer: model.layers.2
[Debug] Successfully found layer: model.layers.2
[Debug] Trying to access layer: model.layers.3
[Debug] Su

In [None]:
dh8b_bnb8_float32

In [None]:
import pandas as pd
import json

json_path = 'logs/gsm8k/llama.8b-1.58.fp32'
# Load your JSON file
with open(json_path, 'r') as f:
    log_data = json.load(f)

# Convert the loaded JSON data to a DataFrame
df = pd.json_normalize(log_data)

# If you want to see the dataframe
print(df)

# Optionally, display it in a Jupyter notebook in a more readable format
df.head()


In [None]:
import pandas as pd
import json
# Load the JSON file and normalize into DataFrame
with open('logs/gsm8k/dh.3b-ptsq.fp32', 'r') as f:
    log_data = json.load(f)
df = pd.json_normalize(log_data)

# Ensure each row's layer_names and entropy have matching length
num_layers = len(df.loc[0, 'layer_names'])
sum_entropy = [0.0] * num_layers
valid_rows = 0

for _, row in df.iterrows():
    layer_names = row['layer_names']
    entropy = row['entropy']
    
    # Validate that both lists are the expected length
    if isinstance(entropy, list) and len(entropy) == num_layers:
        sum_entropy = [s + e for s, e in zip(sum_entropy, entropy)]
        valid_rows += 1

# Compute average
if valid_rows > 0:
    avg_entropy = [e / valid_rows for e in sum_entropy]
    layer_labels = df.loc[0, 'layer_names']

    # Plotting
    plt.figure(figsize=(14, 6))
    plt.plot(layer_labels, avg_entropy, marker='o', linestyle='-', color='b')
    plt.xlabel('Layer Name')
    plt.ylabel('Average Entropy')
    plt.title(f'Average Entropy Across Layers (n = {valid_rows} samples)')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()
else:
    print("No valid rows matched expected layer length.")

In [None]:
df['entropy']

In [None]:
df.columns

In [None]:
df['normalized_entropy']

In [None]:
logit_lens.plot_logit_lens(
    model=llama8b_fp32,
    tokenizer=llama8b_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=15,
    save_fig_path=None,
    #save_fig_path='Outputs/LogitLens/LI8B/probs_hf1bit_fp32_qa_blockstep10.jpg',
    probs=True,
    block_step=5,
)

In [None]:
logit_lens.plot_logit_lens(
    model=hfbit1_fp32,
    tokenizer=hfbit1_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=15,
    save_fig_path=None,
    #save_fig_path='Outputs/LogitLens/LI8B/probs_hf1bit_fp32_qa_blockstep10.jpg',
    #probs=True,
    block_step=5,
)

In [None]:
logit_lens.plot_logit_lens(
    model=dh3b_bitnet_fp32_qlmhead,
    tokenizer=dh3b_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=15,
    save_fig_path=None,
    #save_fig_path='Outputs/LogitLens/LI8B/probs_hf1bit_fp32_qa_blockstep10.jpg',
    entropy=True,
    block_step=5,
)

In [None]:
logit_lens.plot_logit_lens(
    model=dh3b_bitnet_fp32_flmhead,
    tokenizer=dh3b_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=15,
    #save_fig_path=None,
    save_fig_path='Outputs/LogitLens/DH3B/logits_flmhead_fp32_qa.jpg',
    #kl=True,
)

In [None]:
logit_lens.plot_comparing_lens(
    models=(dh3b_fp32, dh3b_bitnet_fp32_qlmhead),
    tokenizer=dh3b_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=6,
    #save_fig_path='Outputs/LogitLens/LI8B/nwd_llama_hf1bit_qa_blockstep10.jpg',
    save_fig_path=None,
    wasserstein=True,
    #top_down=False,
    block_step=5,
)

In [None]:
logit_lens.plot_comparing_lens(
    models=(olmo1b_fp32, olmo1b_bitnet_fp32_qlmhead),
    tokenizer=olmo1b_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=15,
    #save_fig_path='Outputs/LogitLens/OLMo1B/nwd_flmhead_qlmhead_math.jpg',
    #save_fig_path=None,
    wasserstein=True,
    #top_down=False,
    #block_step=3,
)

In [None]:
logit_lens.plot_comparing_lens(
    models=(llama8b_fp32, hfbit1_fp32),
    tokenizer=hfbit1_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=15,
    #save_fig_path='Outputs/LogitLens/LI8B/nwd_instruct_bitnet_fp32_qa.jpg',
    save_fig_path=None,
    wasserstein=True,
    #top_down=False,
)

In [None]:
logit_lens.plot_comparing_lens(
    models=(llama8b_fp32, llama8b_bnb4_fp16),
    tokenizer=llama8b_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=15,
    #save_fig_path='Outputs/LogitLens/DH3B/nwd_3bfp32_ptdq_math.jpg',
    save_fig_path=None,
    wasserstein=True,
    #top_down=False,
)

In [None]:
logit_lens.plot_topk_lens(
    model=olmo1b_fp32,
    tokenizer=olmo1b_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=15,
    topk_n=5,
    #save_fig_path='Outputs/LogitLens/LI8B/topk5logits_bnb4bit_qa.jpg'
    #save_fig_path=None,
    #top_down=False,
)

In [None]:
logit_lens.plot_topk_lens(
    model=olmo1b_bitnet_fp32_qlmhead,
    tokenizer=olmo1b_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=15,
    topk_n=5,
    #save_fig_path='Outputs/LogitLens/LI8B/topk5logits_ptsq_fp32_math.jpg'
    #save_fig_path=None,
    #entropy=True,
    #top_down=False,
)

In [None]:
activation_lens.plot_activation_lens(
    model=olmo1b_fp32,
    tokenizer=olmo1b_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=15,
    metric='norm',
    save_fig_path='Outputs/LogitLens/OLMo1B/actnorm_fp_qa.jpg', #HUSK FOR BITNET hfb1 Q12 qa!
    #save_fig_path=None,
)

In [None]:
activation_lens.plot_activation_lens(
    model=olmo1b_bitnet_fp32_qlmhead,
    tokenizer=olmo1b_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=15,
    metric='norm',
    save_fig_path='Outputs/LogitLens/OLMo1B/actnorm_ptsq_qa.jpg',
    #save_fig_path=None,
)

In [None]:
activation_lens.plot_comparing_act_lens(
    models=(olmo1b_fp32, olmo1b_bitnet_fp32_qlmhead),
    tokenizer=olmo1b_tokenizer,
    input_ids=MiscPrompts.Q12.value,
    start_ix=0, end_ix=15,
    metric='norm',
    metric_name='l2',
    save_fig_path='Outputs/LogitLens/OLMo1B/actnorm_fp_compare_ptsq_qa.jpg',
    #save_fig_path=None,
)

# Dictionary Learning: SAE

In [None]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True


In [None]:
dh3b_bitnet_fp32_ptsq

In [None]:
dictionary_learning.plot_sae_heatmap(
    model=dh3b_fp32,
    tokenizer=dh3b_tokenizer,
    inputs=gsm8k_questions_sae,
    plot_sae=True,
    do_log=True,
    top_k=5,
    tokens_per_row=30,
    target_layers=[2, 9, 16, 20, 26],
    log_path='logs/sae_logs/DH3B/fp',
    log_name='dh.3b-ptsq.fp32',
    fig_path=None,
    deterministic_sae=False
)

In [None]:
dictionary_learning.plot_sae_tokens(
    model=dh3b_fp32,
    tokenizer=dh3b_tokenizer,
    inputs=PARAMS.get('prompt'),
    multi_tokens=False,
    do_log=False,
    target_layers=[5],
    vis_projection=None,
    log_path=None,
    log_name=None,
    fig_path=None
)

In [None]:
dictionary_learning.plot_sae_tokens(
    model=dh3b_fp32,
    tokenizer=dh3b_tokenizer,
    inputs=PARAMS.get('prompt'),
    multi_tokens=True,
    do_log=False,
    target_layers=[5],
    vis_projection=None,
    log_path=None,
    log_name=None,
    fig_path=None
)

In [None]:
dictionary_learning.plot_sae_tokens(
    model=dh3b_bitnet_fp32,
    tokenizer=dh3b_tokenizer,
    inputs=PARAMS.get('prompt'),
    multi_tokens=True,
    do_log=False,
    target_layers=[5],
    vis_projection=None,
    log_path=None,
    log_name=None,
    fig_path=None
)

In [None]:
dictionary_learning.plot_sae_heatmap(
    model=olmo1b_fp32,
    tokenizer=olmo1b_tokenizer,
    inputs=Texts.T1.value,
    do_log=False,
    top_k=5,
    tokens_per_row=30,
    target_layers=[5, 10, 15],
    log_path=None,
    log_name=None,
    fig_path=None,
    deterministic_sae=False
)

In [None]:
dictionary_learning.plot_sae_heatmap(
    model=olmo1b_fp32,
    tokenizer=olmo1b_tokenizer,
    inputs=Texts.T1.value,
    do_log=False,
    top_k=5,
    tokens_per_row=30,
    target_layers=[5, 10, 15],
    log_path=None,
    log_name=None,
    fig_path=None,
    deterministic_sae=True
)

In [None]:
dictionary_learning.plot_sae_heatmap(
    model=olmo2t_fp32,
    tokenizer=olmo2t_tokenizer,
    inputs=Texts.T1.value,
    do_log=False,
    top_k=5,
    tokens_per_row=30,
    target_layers=[5, 10, 15],
    log_path=None,
    log_name=None,
    fig_path=None,
    deterministic_sae=False
)

In [None]:
dictionary_learning.plot_comparing_heatmap(
    models=(dh3b_fp32, dh3b_bitnet_fp32_qlmhead),
    tokenizer=dh3b_tokenizer,
    inputs=Texts.T1.value,
    top_k=5,
    tokens_per_row=30,
    target_layers=[5, 15, 25],
    fig_path=None,
    deterministic_sae=False
)

In [None]:
dictionary_learning.plot_comparing_heatmap(
    models=(dh3b_fp32, dh3b_bitnet_fp32_ptsq),
    tokenizer=dh3b_tokenizer,
    inputs=gsm8k_questions_sae,
    top_k=5,
    tokens_per_row=30,
    target_layers=[2, 9, 16, 20, 26],
    fig_path=None,
    deterministic_sae=False
)

In [None]:
dictionary_learning.plot_comparing_heatmap(
    models=(llama8b_fp32, hfbit1_fp32),
    tokenizer=llama8b_tokenizer,
    inputs=gsm8k_questions_sae,
    top_k=5,
    tokens_per_row=30,
    target_layers=[2, 9, 16, 23, 30],
    fig_path=None
)

# Deep Hermes Chatbot Analysis (template only)

In [None]:
chat_dict = {
    #'dh.3b-llama.fp32': dh3b_fp32,
    #'dh.3b-bnb4bit.fp16': dh3b_bnb4_fp16,
    #'dh.3b-1.58.ptdq': dh3b_bitnet_fp32, 
    #'dh.3b-1.58.ptsq': dh3b_bitnet_fp32,
    #'dh.8b-llama.fp32': dh8b_fp32,
    #'dh.8b-bnb4bit.fp16': dh8b_bnb4_fp16,
    #'dh.8b-1.58.ptdq': dh8b_bitnet_fp32,
    #'dh.8b-1.58.ptsq': dh8b_bitnet_fp32,
    #'llama.8b-instruct.fp32': llama8b_fp32,
    #'llama.8b-bnb4bit.fp16': llama8b_bnb4_fp16,
    #'llama.8b-1.58.fp32': hfbit1_fp32,
    #'llama.8b-1.58.ptdq': llama8b_bitnet_fp32,
    #'llama.8b-1.58.ptsq': llama8b_bitnet_fp32,
}

In [None]:
PARAMS:Dict = {
    'context': Contexts.C1.value,
    'prompt': MiscPrompts.Q2.value,
    'max_new_tokens': 100,
    'temperature': 0.8,
    'repetition_penalty': 1.1,
    'sample': True,
    'device': None
}

In [None]:
chatbot_analysis.run_gsm8k_analysis(
    model=dh3b_bnb4_float32,
    tokenizer=dh3b_tokenizer,
    model_name='dh.3b-bnb4bit.fp32',
    dataset=gsm8k_dataset['train'],
    save_path='logs/gsm8k_logs/DH3B',
    num_samples=10,
    deterministic_backend=False
)

In [None]:
chatbot_analysis.run_chatbot_analysis(
    models=chat_dict,
    tokenizer=dh3b_tokenizer,
    deep_thinking=False,
    full_path='logs/chatbot_logs',
    deterministic_backend=False
)

In [None]:
chatbot_analysis.plot_chatbot_analysis(
    json_logs='logs/gsm8k_logs',
    parallel_plot=True,
    reference_file='logs/gsm8k_logs/llama.8b-1.58.fp32.json'
)

In [None]:
chatbot_analysis.plot_chatbot_analysis(
    json_logs='logs/gsm8k_logs',
    parallel_plot=False,
    reference_file='logs/gsm8k_logs/llama.8b-1.58.fp32.json',
    title="Model Metrics ('What is y if y=2*2-4+(3*2)')"
)

In [None]:
import os
import json
import pandas as pd
import matplotlib.cm as cm
# Path to your results folder
results_dir = "logs/gsm8k_logs"

# Load all JSONs into a DataFrame
all_results = []
for filename in os.listdir(results_dir):
    if filename.endswith(".json"):
        with open(os.path.join(results_dir, filename), 'r') as f:
            data = json.load(f)
            all_results.append(data)

df = pd.DataFrame(all_results)

In [None]:
# Define metrics
metrics = [
    'Perplexity',
    'CPU Usage (%)',
    'RAM Usage (%)',
    'GPU Memory (MB)',
    'Activation Similarity',
    'Latency (s)'
]

# Generate a color map
models = df['Model'].tolist()
num_models = len(models)
colors = cm.get_cmap('tab20c', num_models)  # You can try 'Set2', 'hsv', 'tab10', etc.

model_colors = {model: colors(i) for i, model in enumerate(models)}

# Plot
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.flatten()

for i, metric in enumerate(metrics):
    ax = axes[i]
    for j, model in enumerate(models):
        ax.bar(model, df.loc[j, metric], color=model_colors[model])
    ax.set_title(metric)
    ax.set_xticklabels(models, rotation=45, ha='right')
    ax.grid(True)

# Custom legend
handles = [plt.Rectangle((0,0),1,1, color=model_colors[model]) for model in models]
fig.legend(handles, models, loc='upper center', ncol=num_models)

plt.tight_layout(rect=[0, 0, 1, 0.92])
plt.suptitle("GSM8K for n=10", fontsize=14)
plt.show()

In [None]:
metrics = [
    'Perplexity',
    'CPU Usage (%)',
    'RAM Usage (%)',
    'GPU Memory (MB)',
    'Activation Similarity',
    'Latency (s)',
    "Last Layer Mean Activation",
    "Last Layer Activation Std",
    "Mean Logits",
    "Logit Std",
]

# Generate a color map
models = df['Model'].tolist()
num_models = len(models)
colors = cm.get_cmap('coolwarm', num_models)
model_colors = {model: colors(i) for i, model in enumerate(models)}

# Prepare subplot grid dynamically
n_metrics = len(metrics)
ncols = 5
nrows = int(np.ceil(n_metrics / ncols))

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 10))
axes = axes.flatten()

for i, metric in enumerate(metrics):
    ax = axes[i]
    if metric in df.columns:
        for j, model in enumerate(models):
            ax.bar(model, df.loc[j, metric], color=model_colors[model])
        ax.set_title(metric, fontsize=10)
        ax.set_xticks(range(len(models)))
        ax.set_xticklabels(models, rotation=45, ha='right', fontsize=8)
        ax.grid(True)

# Hide unused axes
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Legend
##handles = [plt.Rectangle((0, 0), 1, 1, color=model_colors[model]) for model in models]
#fig.legend(handles, models, loc='upper center', ncol=min(num_models, 5))

plt.tight_layout(rect=[0, 0, 1, 0.93])
plt.suptitle("Deep Hermes LLaMA 3B & LLaMA Instruct 8B GSM8K (n=10)", fontsize=12)
plt.savefig('Outputs/Report/llama8bdh3b_subplots.png')
plt.show()

In [None]:
metrics = [
    'Perplexity',
    'CPU Usage (%)',
    'RAM Usage (%)',
    'GPU Memory (MB)',
    'Activation Similarity',
    'Latency (s)',
    "Last Layer Mean Activation",
    "Last Layer Activation Std",
    "Mean Logits",
    "Logit Std",
]

# Generate a color map
models = df['Model'].tolist()
num_models = len(models)
colors = cm.get_cmap('coolwarm', num_models)
model_colors = {model: colors(i) for i, model in enumerate(models)}

# Prepare subplot grid dynamically
n_metrics = len(metrics)
ncols = 5
nrows = int(np.ceil(n_metrics / ncols))

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 10))
axes = axes.flatten()

for i, metric in enumerate(metrics):
    ax = axes[i]
    if metric in df.columns:
        values = df[metric].copy()

        # Normalize perplexity via log-scale
        if metric == "Perplexity":
            values = np.log1p(values)  # log1p handles 0 gracefully

        for j, model in enumerate(models):
            ax.bar(model, values[j], color=model_colors[model])
        ax.set_title(metric + (" (log)" if metric == "Perplexity" else ""), fontsize=10)
        ax.set_xticks(range(len(models)))
        ax.set_xticklabels(models, rotation=45, ha='right', fontsize=8)
        ax.grid(True)

# Hide unused axes
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Legend
##handles = [plt.Rectangle((0, 0), 1, 1, color=model_colors[model]) for model in models]
#fig.legend(handles, models, loc='upper center', ncol=min(num_models, 5))

plt.tight_layout(rect=[0, 0, 1, 0.93])
plt.suptitle("Deep Hermes LLaMA 3B & LLaMA Instruct 8B GSM8K (n=10)", fontsize=12)
plt.savefig('Outputs/Report/llama8bdh3b_subplots_normalized_perplexity.png')
plt.show()

In [None]:
import seaborn as sns

# Metrics to include (excluding 'Activation Similarity')
metrics_for_corr = [
    'Perplexity',
    'CPU Usage (%)',
    'RAM Usage (%)',
    'GPU Memory (MB)',
    'Latency (s)',
    "Last Layer Mean Activation",
    "Last Layer Activation Std",
    "Mean Logits",
    "Logit Std",
]

# Compute correlation
corr_matrix = df[metrics_for_corr].corr()

# Plot
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix, 
    annot=True, 
    cmap='coolwarm', 
    fmt=".2f", 
    linewidths=0.5, 
    square=True,
    cbar_kws={"shrink": 0.75}
)
plt.title("Deep Hermes LLaMA 3B & LLaMA Instruct 8B GSM8K Correlation (n=10)", fontsize=12)
plt.tight_layout()
plt.savefig('Outputs/Report/llama8bdh3b_corr_heatmap.png')
plt.show()