In [38]:
import pickle 
with open(r'src/Dataloaders/train_loader.pkl', 'rb') as f:
    train_loader = pickle.load(f)

In [39]:
for val in train_loader:
    data = val['input_ids'] 
    break

In [40]:
import os

In [58]:
os.getcwd()

'd:\\DecoderKAN'

In [59]:
cd src

d:\DecoderKAN\src


In [76]:
import sys
import os
import importlib
import tptransformer
importlib.reload(tptransformer)

# Add the src directory to sys.path
src_path = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), 'src'))
sys.path.append(src_path)
print(f"Added {src_path} to sys.path")

import torch
from tptransformer import build_transformer  # Import from src.tptransformer
from transformers import PreTrainedTokenizerFast

# Hyperparameters (must match training)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")
VOCAB_SIZE = 512
MAX_SEQ_LEN = 44
PAD_IDX = 0
DROPOUT = 0.1
FILTER_DIM = 2048
N_LAYERS = 6
N_HEADS = 8
HIDDEN_DIM = 512

# Define HyperParams class (same as training)
class HyperParams:
    def __init__(self):
        self.input_dim = VOCAB_SIZE
        self.filter = FILTER_DIM
        self.n_layers = N_LAYERS
        self.n_heads = N_HEADS
        self.hidden = HIDDEN_DIM
        self.dropout = DROPOUT

params = HyperParams()

# Load tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("../src/tokenizer/QED_TOKENIZER")

# Check vocab size
print(f"Tokenizer vocab size: {len(tokenizer)}")
if len(tokenizer) != VOCAB_SIZE:
    raise ValueError(f"Tokenizer vocab size ({len(tokenizer)}) does not match VOCAB_SIZE ({VOCAB_SIZE}). Please ensure the tokenizer is reverted to 512 tokens.")

# Debug: Inspect the first few tokens in the vocab
vocab = tokenizer.get_vocab()
print(f"First few tokens: {list(vocab.items())[:5]}")
print(f"Does [PAD] exist? {'[PAD]' in vocab}")
print(f"Does [EOS] exist? {'[EOS]' in vocab}")
print(f"Does [BOS] exist? {'[BOS]' in vocab}")
print(f"Does [SEP] exist? {'[SEP]' in vocab}")

# Set special tokens without adding new ones
# PAD token
if tokenizer.pad_token is None:
    if '[PAD]' in tokenizer.get_vocab():
        tokenizer.pad_token = '[PAD]'
    else:
        tokenizer.pad_token = tokenizer.convert_ids_to_tokens(0)

# EOS token
if tokenizer.eos_token is None:
    if '[EOS]' in tokenizer.get_vocab():
        tokenizer.eos_token = '[EOS]'
    elif '[SEP]' in tokenizer.get_vocab():
        tokenizer.eos_token = '[SEP]'  # Use [SEP] as EOS (already set by default)
    else:
        tokenizer.eos_token = tokenizer.convert_ids_to_tokens(1)

# BOS token
if tokenizer.bos_token is None:
    if '[BOS]' in tokenizer.get_vocab():
        tokenizer.bos_token = '[BOS]'
    elif '[CLS]' in tokenizer.get_vocab():
        tokenizer.bos_token = '[CLS]'  # Use [CLS] as BOS if available
    else:
        # Use a different token ID (e.g., 0) that isn't PAD or EOS
        candidate_id = 0
        while candidate_id in [tokenizer.pad_token_id, tokenizer.eos_token_id]:
            candidate_id += 1
        tokenizer.bos_token = tokenizer.convert_ids_to_tokens(candidate_id)

# Update token IDs
PAD_IDX = tokenizer.pad_token_id
EOS_IDX = tokenizer.eos_token_id
SOS_IDX = tokenizer.bos_token_id

print(f"Pad token: {tokenizer.pad_token} (ID: {PAD_IDX})")
print(f"EOS token: {tokenizer.eos_token} (ID: {EOS_IDX})")
print(f"BOS token: {tokenizer.bos_token} (ID: {SOS_IDX})")

# Verify tokens are distinct
if PAD_IDX == EOS_IDX or PAD_IDX == SOS_IDX or EOS_IDX == SOS_IDX:
    raise ValueError("PAD, EOS, and BOS token IDs must be distinct!")

# Initialize model
model = build_transformer(params, pad_idx=PAD_IDX)
checkpoint = torch.load("../transformer_qed_sequence_full.pth", map_location=DEVICE)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(DEVICE)
model.eval()
print(f"Model device: {next(model.parameters()).device}")

# Example input
input_text = "AntiPart e_alpha_[STATE_ID](X) e_del_[STATE_ID](X)^(*) to u_gam_[STATE_ID](X)"
input_ids = tokenizer(input_text, return_tensors="pt", padding="max_length", max_length=MAX_SEQ_LEN, truncation=True)['input_ids'].to(DEVICE)
print(f"Input IDs device: {input_ids.device}")

# Generate prediction
output_ids = model.greedy_inference(model, input_ids, SOS_IDX, EOS_IDX, MAX_SEQ_LEN)
decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(f"Input text: {input_text}")
print(f"Predicted token IDs: {output_ids[0].tolist()}")
print(f"Decoded output: {decoded_output}")

Added d:\DecoderKAN\src to sys.path
Using device: cpu
Tokenizer vocab size: 512
First few tokens: [('105', 286), ('299', 404), ('17', 138), ('166', 449), ('29', 152)]
Does [PAD] exist? True
Does [EOS] exist? False
Does [BOS] exist? False
Does [SEP] exist? True
Pad token: [PAD] (ID: 2)
EOS token: [SEP] (ID: 1)
BOS token: [CLS] (ID: 0)
Model device: cpu
Input IDs device: cpu
Input text: AntiPart e_alpha_[STATE_ID](X) e_del_[STATE_ID](X)^(*) to u_gam_[STATE_ID](X)
Predicted token IDs: [0, 71, 49, 33, 17, 183, 13, 71, 49, 33, 16, 89, 54, 111, 46, 33, 38, 32, 15, 61, 114, 8, 51, 33, 90, 61, 135, 33, 108, 61, 155, 33, 95, 7, 110, 224, 12, 22, 8, 38, 32, 17, 104, 2, 2]
Decoded output: }( p _ 4 67 0 }( p _ 3 )_ v /( m _ e ^ 2 Ġ+ Ġ2 * s _ 12 Ġ+ Ġs _ 22 Ġ+ Ġreg _ prop ) Ġ: Ġ4 / 9 * e ^ 4 *(


In [None]:
data[0].unsqueeze(0)