In [1]:
import os
os.chdir('/home/smallyan/eval_agent')
print(f"Working directory: {os.getcwd()}")

Working directory: /home/smallyan/eval_agent


# Generalizability Evaluation for InterpDetect

This notebook evaluates the generalizability of the circuit/neuron findings from the InterpDetect repository.

## Evaluation Checklist:
- **GT1**: Generalization to a New Model
- **GT2**: Generalization to New Data  
- **GT3**: Method/Specificity Generalizability

Let's first explore the repository structure to understand the research findings.

In [2]:
# Explore the repository structure
repo_path = '/net/scratch2/smallyan/InterpDetect_eval'

import subprocess
result = subprocess.run(['ls', '-la', repo_path], capture_output=True, text=True)
print("Repository contents:")
print(result.stdout)
print(result.stderr)

Repository contents:
total 1016
drwxrwx---  7 smallyan smallyan     13 Jan  7 14:53 .
drwxrwx--- 37 smallyan smallyan     37 Jan  7 19:41 ..
-rw-rw----  1 smallyan smallyan  10840 Dec 22 21:55 CodeWalkthrough.md
drwxrwx---  4 smallyan smallyan      5 Dec 22 21:55 datasets
-rw-r-----  1 smallyan smallyan 906708 Dec 23 01:40 documentation.pdf
drwxrwx---  3 smallyan smallyan      5 Jan  7 16:17 evaluation
drwxrwx---  7 smallyan smallyan     15 Jan  7 19:50 .git
-rw-rw----  1 smallyan smallyan   5030 Dec 22 21:55 .gitignore
-rw-rw----  1 smallyan smallyan   1066 Dec 22 21:55 LICENSE
-rw-rw----  1 smallyan smallyan   3777 Dec 23 01:52 plan.md
-rw-rw----  1 smallyan smallyan   1048 Dec 22 21:55 requirements.txt
drwxrwx---  6 smallyan smallyan     10 Dec 24 20:47 scripts
drwxrwx---  2 smallyan smallyan      6 Dec 22 21:55 trained_models




In [3]:
# Read the plan to understand the research
with open(f'{repo_path}/plan.md', 'r') as f:
    plan_content = f.read()
print(plan_content)

# Plan
## Objective
Develop a mechanistic interpretability-based hallucination detection method for Retrieval-Augmented Generation (RAG) systems by computing External Context Scores (ECS) across layers and attention heads and Parametric Knowledge Scores (PKS) across layers (FFN), training regression-based classifiers on these signals, and demonstrating generalization from a small proxy model (Qwen3-0.6b) to larger production models (GPT-4.1-mini).

## Hypothesis
1. RAG hallucinations correlate with:  later-layer FFN modules disproportionately inject parametric knowledge into the residual stream while attention heads fail to adequately exploit external context.
2. External Context Score (ECS) and Parametric Knowledge Score (PKS) are correlated with hallucination occurrence and can serve as predictive features for hallucination detection.
3. Mechanistic signals extracted from a small proxy model (0.6b parameters) can generalize to detect hallucinations in responses from larger production

In [4]:
# Explore the scripts directory to understand the implementation
result = subprocess.run(['find', f'{repo_path}/scripts', '-type', 'f', '-name', '*.py'], capture_output=True, text=True)
print("Python scripts:")
print(result.stdout)

# Also list the directories
result2 = subprocess.run(['ls', '-la', f'{repo_path}/scripts'], capture_output=True, text=True)
print("\nScripts directory:")
print(result2.stdout)

Python scripts:
/net/scratch2/smallyan/InterpDetect_eval/scripts/baseline/run_refchecker.py
/net/scratch2/smallyan/InterpDetect_eval/scripts/baseline/run_hf.py
/net/scratch2/smallyan/InterpDetect_eval/scripts/baseline/run_ragas.py
/net/scratch2/smallyan/InterpDetect_eval/scripts/baseline/run_groq.py
/net/scratch2/smallyan/InterpDetect_eval/scripts/baseline/run_trulens.py
/net/scratch2/smallyan/InterpDetect_eval/scripts/baseline/run_gpt.py
/net/scratch2/smallyan/InterpDetect_eval/scripts/predict.py
/net/scratch2/smallyan/InterpDetect_eval/scripts/preprocess/generate_response_hf.py
/net/scratch2/smallyan/InterpDetect_eval/scripts/preprocess/preprocess.py
/net/scratch2/smallyan/InterpDetect_eval/scripts/preprocess/helper.py
/net/scratch2/smallyan/InterpDetect_eval/scripts/preprocess/filter.py
/net/scratch2/smallyan/InterpDetect_eval/scripts/preprocess/generate_response_gpt.py
/net/scratch2/smallyan/InterpDetect_eval/scripts/preprocess/generate_labels.py
/net/scratch2/smallyan/InterpDetect

In [5]:
# Read the compute_scores.py to understand how ECS and PKS are computed
with open(f'{repo_path}/scripts/compute_scores.py', 'r') as f:
    compute_scores_content = f.read()
print(compute_scores_content)

# %%
#!pip install transformer_lens

import torch
from transformers import AutoTokenizer
from transformer_lens import HookedTransformer
import json
from torch.nn import functional as F
from typing import Dict, List, Tuple
import pdb
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import argparse
import sys
import os
import gc
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pointbiserialr

def load_examples(file_path):
    """Load examples from JSONL file"""
    print(f"Loading examples from {file_path}...")
    
    try:
        examples = []
        with open(file_path, 'r') as f:
            for line in f:
                data = json.loads(line)
                examples.append(data)
        
        print(f"Loaded {len(examples)} examples")
        return examples
    except Exception as e:
        print(f"Error loading examples: {e}")
        sys.exit(1)

def setup_models(model_name, hf_

In [6]:
# Let's explore the datasets and trained models
result = subprocess.run(['ls', '-la', f'{repo_path}/datasets'], capture_output=True, text=True)
print("Datasets directory:")
print(result.stdout)

result2 = subprocess.run(['ls', '-la', f'{repo_path}/trained_models'], capture_output=True, text=True)
print("\nTrained models directory:")
print(result2.stdout)

Datasets directory:
total 43
drwxrwx--- 4 smallyan smallyan    5 Dec 22 21:55 .
drwxrwx--- 7 smallyan smallyan   13 Jan  7 14:53 ..
-rw-rw---- 1 smallyan smallyan 9283 Dec 22 21:55 OV_copying_score.json
drwxrwx--- 2 smallyan smallyan    4 Dec 22 21:55 test
drwxrwx--- 2 smallyan smallyan   20 Dec 22 21:55 train


Trained models directory:
total 15236
drwxrwx--- 2 smallyan smallyan        6 Dec 22 21:55 .
drwxrwx--- 7 smallyan smallyan       13 Jan  7 14:53 ..
-rw-rw---- 1 smallyan smallyan    21327 Dec 22 21:55 model_LR_3000.pickle
-rw-rw---- 1 smallyan smallyan   550685 Dec 22 21:55 model_RandomForest_3000.pickle
-rw-rw---- 1 smallyan smallyan 15123341 Dec 22 21:55 model_SVC_3000.pickle
-rw-rw---- 1 smallyan smallyan   275818 Dec 22 21:55 model_XGBoost_3000.pickle



In [7]:
# Explore datasets - train and test
result = subprocess.run(['ls', '-la', f'{repo_path}/datasets/train'], capture_output=True, text=True)
print("Train datasets:")
print(result.stdout)

result2 = subprocess.run(['ls', '-la', f'{repo_path}/datasets/test'], capture_output=True, text=True)
print("\nTest datasets:")
print(result2.stdout)

Train datasets:
total 41710
drwxrwx--- 2 smallyan smallyan      20 Dec 22 21:55 .
drwxrwx--- 4 smallyan smallyan       5 Dec 22 21:55 ..
-rw-rw---- 1 smallyan smallyan 7480050 Dec 22 21:55 train3000_w_chunk_score_part0.json
-rw-rw---- 1 smallyan smallyan 8320169 Dec 22 21:55 train3000_w_chunk_score_part10.json
-rw-rw---- 1 smallyan smallyan 7572237 Dec 22 21:55 train3000_w_chunk_score_part11.json
-rw-rw---- 1 smallyan smallyan 7261920 Dec 22 21:55 train3000_w_chunk_score_part12.json
-rw-rw---- 1 smallyan smallyan 7490607 Dec 22 21:55 train3000_w_chunk_score_part13.json
-rw-rw---- 1 smallyan smallyan 7789460 Dec 22 21:55 train3000_w_chunk_score_part14.json
-rw-rw---- 1 smallyan smallyan 7444065 Dec 22 21:55 train3000_w_chunk_score_part15.json
-rw-rw---- 1 smallyan smallyan 7850314 Dec 22 21:55 train3000_w_chunk_score_part16.json
-rw-rw---- 1 smallyan smallyan 7124779 Dec 22 21:55 train3000_w_chunk_score_part17.json
-rw-rw---- 1 smallyan smallyan 7783995 Dec 22 21:55 train3000_w_chunk_sc

In [8]:
# Read the classifier script to understand how the model is trained
with open(f'{repo_path}/scripts/classifier.py', 'r') as f:
    classifier_content = f.read()
print(classifier_content)

# %%
# !pip install feature_engine
# !pip install xgboost
# !pip install lightgbm
# !pip install optuna
# !pip install --upgrade scikit-learn
# !pip install unidecode

import pandas as pd
import json
import numpy as np
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_auc_score
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
import pickle
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import argparse
import sys

def load_data(folder_path):
    """Load data from JSON files in the specified folder"""
    print(f"Loading data from {folder_path}...")
    
    try:
        response = []
        json_files = glob.glob(os.path.join(folder_path, "*.json"))
        
        i

## Summary of the Research

The repository implements a **mechanistic interpretability-based hallucination detection method** for RAG systems:

### Key Findings:
1. **External Context Score (ECS)**: Computed per attention head by measuring how much the model attends to external context and uses it in responses. Lower ECS correlates with hallucinations.

2. **Parametric Knowledge Score (PKS)**: Computed per FFN layer using Jensen-Shannon divergence between vocabulary distributions before and after the FFN layer. Higher PKS in later layers correlates with hallucinations.

3. **Proxy Model Evaluation**: The method uses Qwen3-0.6B to extract signals but claims to generalize to larger models like GPT-4.1-mini.

### Models Used in Original Work:
- **Signal extraction model**: Qwen3-0.6B
- **Response generation models tested**: Qwen3-0.6B (self-evaluation), GPT-4.1-mini (proxy evaluation)

### Trained Classifiers:
- Logistic Regression, SVC, Random Forest, XGBoost (SVC selected as best)

In [9]:
# Check CUDA availability and setup
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA device count: {torch.cuda.device_count()}")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

CUDA available: True
CUDA device: NVIDIA A40
CUDA device count: 1
Using device: cuda


In [10]:
# Let's look at the test data to understand the data format
import json

with open(f'{repo_path}/datasets/test/test_w_chunk_score_qwen06b.json', 'r') as f:
    test_data_qwen = json.load(f)

print(f"Number of test examples (Qwen): {len(test_data_qwen)}")
print("\nFirst example keys:")
print(test_data_qwen[0].keys())
print("\nFirst example structure:")
for key in ['prompt', 'response', 'scores']:
    if key == 'scores':
        print(f"\n{key}:")
        print(f"  Number of score spans: {len(test_data_qwen[0][key])}")
        print(f"  First score span keys: {test_data_qwen[0][key][0].keys()}")
        print(f"  Hallucination label: {test_data_qwen[0][key][0].get('hallucination_label', 'N/A')}")
    else:
        val = test_data_qwen[0].get(key, 'N/A')
        if isinstance(val, str) and len(val) > 200:
            val = val[:200] + "..."
        print(f"\n{key}: {val}")

Number of test examples (Qwen): 256

First example keys:
dict_keys(['id', 'question', 'documents', 'documents_sentences', 'prompt', 'prompt_spans', 'num_tokens', 'response', 'response_spans', 'labels', 'hallucinated_llama-4-maverick-17b-128e-instruct', 'hallucinated_gpt-oss-120b', 'labels_llama', 'labels_gpt', 'scores'])

First example structure:

prompt: Given the context, please answer the question based on the provided information from the context. Include any reasoning with the answer

Context:Stockholder return performance graph the following grap...

response: The rate of return in Cadence Design Systems Inc. for an investment from 2010 to 2011 can be calculated by comparing the cumulative total return on the investment to the initial investment. Given that...

scores:
  Number of score spans: 5
  First score span keys: dict_keys(['prompt_attention_score', 'r_span', 'hallucination_label', 'parameter_knowledge_scores'])
  Hallucination label: 0


In [11]:
# Load necessary libraries for evaluation
import numpy as np
import pandas as pd
import pickle
import json
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score

# Load the trained SVC model (best performing model according to the plan)
model_path = f'{repo_path}/trained_models/model_SVC_3000.pickle'
with open(model_path, 'rb') as f:
    svc_model = pickle.load(f)

print("Loaded SVC model")
print(f"Model type: {type(svc_model)}")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loaded SVC model
Model type: <class 'sklearn.pipeline.Pipeline'>


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [12]:
# Let's look at what features the model expects
print("Pipeline steps:")
for name, step in svc_model.named_steps.items():
    print(f"  {name}: {type(step)}")

# Get the feature names from the training data format
with open(f'{repo_path}/datasets/train/train3000_w_chunk_score_part0.json', 'r') as f:
    train_sample = json.load(f)

# Extract column names from the training data
ATTENTION_COLS = list(train_sample[0]['scores'][0]['prompt_attention_score'].keys())
PARAMETER_COLS = list(train_sample[0]['scores'][0]['parameter_knowledge_scores'].keys())

print(f"\nNumber of attention columns (ECS features): {len(ATTENTION_COLS)}")
print(f"Number of parameter knowledge columns (PKS features): {len(PARAMETER_COLS)}")
print(f"Total features: {len(ATTENTION_COLS) + len(PARAMETER_COLS)}")

Pipeline steps:
  pipeline: <class 'sklearn.pipeline.Pipeline'>
  svc: <class 'sklearn.svm._classes.SVC'>



Number of attention columns (ECS features): 448
Number of parameter knowledge columns (PKS features): 28
Total features: 476


---
# GT1: Model Generalization Evaluation

The original work uses **Qwen3-0.6B** for signal extraction. To test model generalization, we need to verify if the neuron-level findings (ECS and PKS patterns) are predictable on a **new model**.

**Approach**: We will test using a different model architecture to extract ECS/PKS signals and see if the hallucination patterns still hold.

**New models to try** (not used in original work):
1. Qwen2-0.5B (different model family version)
2. GPT-2 (completely different architecture)
3. Pythia models (different training approach)

In [13]:
# First, let's understand what exactly we need to test for GT1
# The key finding is that ECS (External Context Score) and PKS (Parametric Knowledge Score)
# correlate with hallucination in specific ways:
# - ECS: Negative correlation (lower ECS = more hallucination)
# - PKS: Positive correlation in later layers (higher PKS = more hallucination)

# For GT1, we need to test if these patterns hold when using a DIFFERENT model for signal extraction
# The original uses Qwen3-0.6B which has 28 layers

# Let's check what models TransformerLens supports
from transformer_lens import HookedTransformer

# List available models
try:
    print("Checking TransformerLens available models...")
    # Some common small models in TransformerLens
    available_models = [
        "gpt2-small",  # 12 layers, 12 heads - different architecture
        "pythia-70m",   # EleutherAI - different architecture
        "pythia-160m",  # EleutherAI - different architecture  
    ]
    print("Models we can try:", available_models)
except Exception as e:
    print(f"Error: {e}")

AttributeError: module 'wandb.proto.wandb_internal_pb2' has no attribute 'Result'

In [14]:
# Let's work around the wandb issue by disabling it
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"

# Try importing transformer_lens again
import importlib
import sys

# Remove wandb from cached modules to force reimport
modules_to_remove = [key for key in sys.modules.keys() if 'wandb' in key or 'transformer_lens' in key]
for mod in modules_to_remove:
    del sys.modules[mod]

# Now import without wandb
try:
    from transformer_lens import HookedTransformer
    print("Successfully imported HookedTransformer")
except Exception as e:
    print(f"Still getting error: {e}")
    print("Let's try a different approach using transformers directly")

Still getting error: module 'wandb.proto.wandb_internal_pb2' has no attribute 'Result'
Let's try a different approach using transformers directly


In [15]:
# Let's use transformers directly instead of transformer_lens
# We can implement the ECS and PKS computations using HuggingFace transformers

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from torch.nn import functional as F
from sentence_transformers import SentenceTransformer
import numpy as np

print("Setting up models for GT1 evaluation...")
print(f"Using device: {device}")

# For GT1, we'll use a different model than Qwen3-0.6B
# Let's try GPT-2 which has a different architecture
# GPT-2 small has 12 layers and 12 heads (vs Qwen3-0.6B with 28 layers and 16 heads)

gt1_model_name = "gpt2"  # NOT used in original work
print(f"Loading new model for GT1: {gt1_model_name}")

tokenizer_gt1 = AutoTokenizer.from_pretrained(gt1_model_name)
model_gt1 = AutoModelForCausalLM.from_pretrained(gt1_model_name, output_attentions=True, output_hidden_states=True)
model_gt1 = model_gt1.to(device)
model_gt1.eval()

print(f"Model loaded: {gt1_model_name}")
print(f"Number of layers: {model_gt1.config.n_layer}")
print(f"Number of heads: {model_gt1.config.n_head}")

Setting up models for GT1 evaluation...
Using device: cuda
Loading new model for GT1: gpt2


OSError: PermissionError at /net/projects2/chacha/hub when downloading gpt2. Check cache directory permissions. Common causes: 1) another user is downloading the same model (please wait); 2) a previous download was canceled and the lock file needs manual removal.

In [16]:
# Set a local cache directory that we have permissions for
import os
os.environ['HF_HOME'] = '/home/smallyan/.cache/huggingface'
os.environ['TRANSFORMERS_CACHE'] = '/home/smallyan/.cache/huggingface/transformers'

# Create the directory if it doesn't exist
os.makedirs('/home/smallyan/.cache/huggingface/transformers', exist_ok=True)

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

print("Setting up models for GT1 evaluation...")
print(f"Using device: {device}")

# For GT1, we'll use a different model than Qwen3-0.6B
gt1_model_name = "gpt2"  # NOT used in original work
print(f"Loading new model for GT1: {gt1_model_name}")

tokenizer_gt1 = AutoTokenizer.from_pretrained(gt1_model_name)
model_gt1 = AutoModelForCausalLM.from_pretrained(gt1_model_name, output_attentions=True, output_hidden_states=True)
model_gt1 = model_gt1.to(device)
model_gt1.eval()

print(f"Model loaded: {gt1_model_name}")
print(f"Number of layers: {model_gt1.config.n_layer}")
print(f"Number of heads: {model_gt1.config.n_head}")

Setting up models for GT1 evaluation...
Using device: cuda
Loading new model for GT1: gpt2


OSError: PermissionError at /net/projects2/chacha/hub when downloading gpt2. Check cache directory permissions. Common causes: 1) another user is downloading the same model (please wait); 2) a previous download was canceled and the lock file needs manual removal.

In [17]:
# Need to set it before importing transformers - let's check the huggingface_hub config
import huggingface_hub
print(f"Current HF cache: {huggingface_hub.constants.HF_HUB_CACHE}")

# Let's set it properly
cache_dir = '/home/smallyan/.cache/huggingface'
os.makedirs(cache_dir, exist_ok=True)

# Force the cache directory
from transformers import AutoModelForCausalLM, AutoTokenizer

gt1_model_name = "gpt2"
print(f"Loading new model for GT1: {gt1_model_name}")

tokenizer_gt1 = AutoTokenizer.from_pretrained(gt1_model_name, cache_dir=cache_dir)
model_gt1 = AutoModelForCausalLM.from_pretrained(gt1_model_name, output_attentions=True, output_hidden_states=True, cache_dir=cache_dir)
model_gt1 = model_gt1.to(device)
model_gt1.eval()

print(f"Model loaded: {gt1_model_name}")
print(f"Number of layers: {model_gt1.config.n_layer}")
print(f"Number of heads: {model_gt1.config.n_head}")

Current HF cache: /net/projects2/chacha/hub
Loading new model for GT1: gpt2


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 13] Permission denied: '/net/projects2/chacha/hub'


OSError: PermissionError at /net/projects2/chacha/hub when downloading gpt2. Check cache directory permissions. Common causes: 1) another user is downloading the same model (please wait); 2) a previous download was canceled and the lock file needs manual removal.

In [18]:
# Let's set the HF_HOME env variable properly before any huggingface imports
import os
import sys

# Remove all huggingface-related modules to force fresh import
modules_to_remove = [key for key in sys.modules.keys() if 'transformers' in key or 'huggingface' in key]
for mod in modules_to_remove:
    del sys.modules[mod]

# Set environment variable BEFORE importing
os.environ['HF_HOME'] = '/home/smallyan/.cache/huggingface'
os.environ['TRANSFORMERS_CACHE'] = '/home/smallyan/.cache/huggingface/hub'
os.environ['HF_HUB_CACHE'] = '/home/smallyan/.cache/huggingface/hub'

# Create directories
os.makedirs('/home/smallyan/.cache/huggingface/hub', exist_ok=True)

# Check it's set
print(f"HF_HOME: {os.environ.get('HF_HOME')}")
print(f"TRANSFORMERS_CACHE: {os.environ.get('TRANSFORMERS_CACHE')}")
print(f"HF_HUB_CACHE: {os.environ.get('HF_HUB_CACHE')}")

HF_HOME: /home/smallyan/.cache/huggingface
TRANSFORMERS_CACHE: /home/smallyan/.cache/huggingface/hub
HF_HUB_CACHE: /home/smallyan/.cache/huggingface/hub


In [19]:
# Now import huggingface_hub
import huggingface_hub
print(f"HF cache after import: {huggingface_hub.constants.HF_HUB_CACHE}")

# Import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

cache_dir = '/home/smallyan/.cache/huggingface/hub'

gt1_model_name = "gpt2"
print(f"\nLoading new model for GT1: {gt1_model_name}")

tokenizer_gt1 = AutoTokenizer.from_pretrained(gt1_model_name, cache_dir=cache_dir)
print("Tokenizer loaded successfully!")

model_gt1 = AutoModelForCausalLM.from_pretrained(gt1_model_name, output_attentions=True, output_hidden_states=True, cache_dir=cache_dir)
model_gt1 = model_gt1.to(device)
model_gt1.eval()

print(f"Model loaded: {gt1_model_name}")
print(f"Number of layers: {model_gt1.config.n_layer}")
print(f"Number of heads: {model_gt1.config.n_head}")

AttributeError: No huggingface_hub attribute constants

In [20]:
# Just import and load directly
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

cache_dir = '/home/smallyan/.cache/huggingface/hub'

gt1_model_name = "gpt2"
print(f"Loading new model for GT1: {gt1_model_name}")

tokenizer_gt1 = AutoTokenizer.from_pretrained(gt1_model_name, cache_dir=cache_dir)
print("Tokenizer loaded successfully!")

model_gt1 = AutoModelForCausalLM.from_pretrained(gt1_model_name, output_attentions=True, output_hidden_states=True, cache_dir=cache_dir)
model_gt1 = model_gt1.to(device)
model_gt1.eval()

print(f"Model loaded: {gt1_model_name}")
print(f"Number of layers: {model_gt1.config.n_layer}")
print(f"Number of heads: {model_gt1.config.n_head}")



Loading new model for GT1: gpt2


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizer loaded successfully!


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [21]:
# Check if loading completed
print(f"Model loaded: {gt1_model_name}")
print(f"Number of layers: {model_gt1.config.n_layer}")
print(f"Number of heads: {model_gt1.config.n_head}")

In [22]:
# Check model status
print("Checking model status...")
try:
    print(f"model_gt1 type: {type(model_gt1)}")
    print(f"Config: {model_gt1.config.n_layer} layers, {model_gt1.config.n_head} heads")
except Exception as e:
    print(f"Error: {e}")
    print("Model may not have loaded properly")

In [23]:
print("test")