In [8]:
import nltk
# Report whether NLTK can locate the Punkt resource. Only the lookup itself
# is guarded; the success message is emitted from the else-branch.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    print("‚ö†Ô∏è Punkt tokenizer not found ‚Äì will fall back to regex")
else:
    print("‚úÖ Using NLTK Punkt sentence tokenizer")


‚úÖ Using NLTK Punkt sentence tokenizer


In [10]:
# --- TOKEN LENGTH ANALYZER (Notebook Edition) ---
import pandas as pd
import numpy as np

try:
    import tiktoken
except ImportError:
    raise SystemExit("‚ùå Please install tiktoken first: pip install tiktoken")

# === CONFIG ===
INPUT_PATH = "C:\\MSK_Triage_Chatbot\\MSK_Chat\\MSKArticlesINDEX\\chunks.parquet"   # or "chunks.jsonl"
TOKENIZER_MODEL = "cl100k_base"                  # same default as extractor
TOKEN_LIMIT = 512                                # your embedding cutoff
# ==============

print(f"üîç Loading {INPUT_PATH} ...")

# A ".parquet" suffix selects the parquet reader; any other path is read as
# line-delimited JSON.
df = (
    pd.read_parquet(INPUT_PATH)
    if INPUT_PATH.endswith(".parquet")
    else pd.read_json(INPUT_PATH, lines=True)
)

# The token counts are computed from "embed_text", so stop early without it.
if not ("embed_text" in df.columns):
    raise ValueError("Expected column 'embed_text' not found in dataset.")

print(f"‚úÖ Loaded {len(df)} chunks")

# Tokenizer shared by the counting code that follows
enc = tiktoken.get_encoding(TOKENIZER_MODEL)

# Count tokens for each chunk
def count_tokens(series, encoder=None):
    """Return the number of tokens in each entry of ``series``.

    Parameters
    ----------
    series : pandas.Series
        Text entries to measure. Missing values are replaced by an empty
        string before encoding, so they count as 0 tokens.
    encoder : object, optional
        Any object exposing ``encode(text, disallowed_special=...)`` that
        returns a sized sequence. When omitted, the module-level ``enc``
        tokenizer is used.

    Returns
    -------
    numpy.ndarray
        Integer array with one count per entry, in the order of ``series``.
        Entries that could not be encoded are recorded as 0.

    Warns
    -----
    RuntimeWarning
        If at least one entry could not be encoded, so that the zeros mixed
        into the result do not go unnoticed.
    """
    import warnings

    tokenizer = enc if encoder is None else encoder
    counts = []
    failures = 0
    for text in series.fillna(""):
        try:
            # disallowed_special=() makes text that merely *looks* like a
            # special token (e.g. "<|endoftext|>") encode as ordinary text
            # instead of raising, which previously turned the row into 0.
            counts.append(len(tokenizer.encode(text, disallowed_special=())))
        except Exception:
            # Best effort is kept: one bad row must not abort the analysis,
            # but the failure is tallied and reported below.
            counts.append(0)
            failures += 1

    if failures:
        warnings.warn(
            f"count_tokens: {failures} of {len(counts)} entries could not be "
            "encoded and were counted as 0 tokens",
            RuntimeWarning,
            stacklevel=2,
        )
    return np.array(counts, dtype=int)

# Per-chunk token counts for the text that gets embedded.
tokens = count_tokens(df["embed_text"])

# Every statistic below is undefined for an empty array (max() raises and the
# percentage divides by zero), so fail with a clear message instead.
if tokens.size == 0:
    raise ValueError("No chunks to analyze: the dataset contains zero rows.")

# Basic stats
mean_tokens = tokens.mean()
median_tokens = np.median(tokens)
max_tokens = tokens.max()
p95 = np.percentile(tokens, 95)
over_limit = (tokens > TOKEN_LIMIT).sum()   # strictly above the limit

print("\nüìä Token Length Statistics")
print("---------------------------")
print(f"Mean:     {mean_tokens:.1f}")
print(f"Median:   {median_tokens:.1f}")
print(f"Max:      {max_tokens}")
print(f"95th %:   {p95:.1f}")
print(f"Over {TOKEN_LIMIT} tokens: {over_limit} chunks ({over_limit/len(tokens)*100:.2f}%)")

# Histogram summary
bins = [0,128,256,512,768,1024,2048,4096]
# np.histogram ignores values beyond the last edge and closes only the final
# bin on the right. One extra edge above the largest count keeps every
# labelled bin half-open (matching its "lo–hi-1" label) and collects
# anything at or above bins[-1] in a separate overflow bucket.
edges = bins + [max(bins[-1], int(max_tokens)) + 1]
bucket_counts, _ = np.histogram(tokens, bins=edges)
hist, overflow = bucket_counts[:-1], bucket_counts[-1]
print("\nHistogram (token ranges):")
for i in range(len(bins)-1):
    print(f"{bins[i]:>4}‚Äì{bins[i+1]-1:<4}: {hist[i]}")
if overflow:
    # Only shown when some chunk reaches the top edge, so typical output is
    # unchanged.
    print(f"{bins[-1]:>4}+    : {overflow}")

# Optional: attach back to df if you want to inspect interactively
# NOTE(review): the displayed frame lists token_len ahead of word_len, which
# suggests the loaded file already carries a token_len column; if so, this
# assignment replaces those values with the counts computed here - confirm
# that is intended.
df["token_len"] = tokens
df.head(3)


üîç Loading C:\MSK_Triage_Chatbot\MSK_Chat\MSKArticlesINDEX\chunks.parquet ...
‚úÖ Loaded 1497 chunks

üìä Token Length Statistics
---------------------------
Mean:     268.5
Median:   255.0
Max:      556
95th %:   457.0
Over 512 tokens: 28 chunks (1.87%)

Histogram (token ranges):
   0‚Äì127 : 80
 128‚Äì255 : 675
 256‚Äì511 : 714
 512‚Äì767 : 28
 768‚Äì1023: 0
1024‚Äì2047: 0
2048‚Äì4095: 0


Unnamed: 0,article_id,chunk_id,title,section,chunk_idx,article_seq,embed_text,body,text_with_images,images,source_relpath,token_len,word_len
0,76e01d5cf5021533b0ff288d5690fa0adede76a97988cb...,fa7661033af177f757bef60020b95d71f8f699fbf42e76...,"The association between tinnitus, the neck and...",Main,0,0,"The association between tinnitus, the neck and...",Tinnitus is a common hearing disorder that aff...,"The association between tinnitus, the neck and...",[],mskneurology.com/association-tinnitus-neck-tmj...,233,156
1,76e01d5cf5021533b0ff288d5690fa0adede76a97988cb...,5885a3583eb5286e9b45de9d27eb457e062a791909d9a9...,"The association between tinnitus, the neck and...",Function and anatomy of the ear,0,1,"The association between tinnitus, the neck and...",To understand why the neck and jaw may influen...,"The association between tinnitus, the neck and...",[],mskneurology.com/association-tinnitus-neck-tmj...,289,219
2,76e01d5cf5021533b0ff288d5690fa0adede76a97988cb...,3c0d69ee13ccfcee7f3772d12c1ce1e26789d7621f955e...,"The association between tinnitus, the neck and...",Function and anatomy of the ear,1,2,"The association between tinnitus, the neck and...",The design of the ear's lobes make it perfect ...,"The association between tinnitus, the neck and...",[],mskneurology.com/association-tinnitus-neck-tmj...,525,377
