In [None]:
#!/usr/bin/env python3
# %%
"""
find_odd_chunks.ipynb — Detect citation debris or odd numeric fragments in chunk text.

Usage:
    Place this notebook in the same folder as your chunks.parquet
    (typically MSK_Chat/MSKArticlesINDEX or goldset_review_files).

    It will:
      • Load all chunk metadata
      • Find suspicious numeric sequences (e.g., "2 , 7 , 19 , 21")
      • Rank and display examples interactively
"""

import re
import pandas as pd
from pathlib import Path
from IPython.display import display
import os

# %%
# --- 1. Locate and load the chunks file ---

# Determine a sensible root (script or notebook)
# Resolve the search root: the script's own folder when run as a file.
# Notebook kernels define no __file__, so fall back to the working directory.
try:
    root = Path(__file__).resolve().parent
except NameError:
    root = Path.cwd()

# Relative locations probed under each base folder, in priority order.
names = ["chunks.parquet", "MSKArticlesINDEX/chunks.parquet", "goldset_review_files/chunks.parquet"]

# Allow explicit override via environment variable (full file path or folder).
# `candidates` is always built so the not-found diagnostics below can list
# exactly what was probed, whichever branch ran.
env_path = os.getenv("MSK_CHUNKS_PATH")
if env_path:
    override = Path(env_path)
    if override.is_dir():
        # A directory was provided: look for the common filenames inside it.
        candidates = [override / name for name in names]
    else:
        # Treat the value as the path to the parquet file itself.
        candidates = [override]
else:
    # No override: walk from the root up through every parent folder.
    candidates = [base / name for base in [root, *root.parents] for name in names]

# Accept only a candidate that actually exists. Without this filter the
# override branch would always pick the first name, never probe the nested
# locations, and fail later inside read_parquet instead of reporting here.
chunks_path = next((p for p in candidates if p.exists()), None)

if chunks_path is None:
    print("cwd:", Path.cwd())
    print("Searched locations (first 50):")
    for p in candidates[:50]:
        print(" -", p)
    raise FileNotFoundError(
        "Couldn't find chunks.parquet. Place it in the notebook folder or set MSK_CHUNKS_PATH to the file or folder containing it."
    )

print(f"üìÇ Using chunks file: {chunks_path.resolve()}")
df = pd.read_parquet(chunks_path)
print(f"Loaded {len(df)} chunks with columns: {list(df.columns)}")

# %%
# --- 2. Define "odd numeric pattern" regex ---

# Two alternatives:
#   1. a comma-separated run of integers at the very start of the text,
#      optionally wrapped in parentheses;
#   2. three comma-separated 1-2 digit numbers anywhere in the text.
pattern = re.compile(
    r'(^\s*\(?\d+(?:\s*,\s*\d+){1,}\)?\s*)|(\b\d{1,2}\s*,\s*\d{1,2}\s*,\s*\d{1,2})'
)

def looks_odd(text: str) -> bool:
    """Return True when the start of ``text`` contains an odd numeric run.

    Only the first 80 characters of the whitespace-stripped text are
    inspected. Any non-string value (e.g. ``None`` or a number) yields False.
    """
    if isinstance(text, str):
        # Restrict the search to the leading slice of the stripped text.
        leading = text.strip()[:80]
        return pattern.search(leading) is not None
    return False

# Flag every chunk whose embed_text trips the looks_odd heuristic.
odd_flags = df["embed_text"].apply(looks_odd)
df["is_odd"] = odd_flags

# %%
# --- 3. Count and summarize ---
odd_df = df[df["is_odd"]]
print(f"‚ö†Ô∏è Found {len(odd_df)} suspicious chunks out of {len(df)} total.")

# Show at most 25 flagged rows, restricted to the columns useful for review.
preview_columns = ["section", "source_relpath", "embed_text"]
has_hits = not odd_df.empty
if has_hits:
    preview = odd_df[preview_columns].head(25)
    # pre-wrap keeps the long text cells wrapped instead of on one line.
    styled_preview = preview.style.set_properties(**{"white-space": "pre-wrap"})
    display(styled_preview)

# %%
# --- 4. Optional: export for manual review ---
# Write the flagged rows next to the chunks file, without the index column.
odd_path = chunks_path.parent / "odd_chunks.csv"
odd_df.to_csv(odd_path, index=False)
print(f"üìù Saved detailed list to: {odd_path}")


üìÇ Using chunks file: C:\Users\Draco\OneDrive\Documents\MSK_Triage_Chatbot\MSK_Chat\MSKArticlesINDEX\chunks.parquet
Loaded 1095 chunks with columns: ['article_id', 'chunk_id', 'title', 'section', 'chunk_idx', 'article_seq', 'embed_text', 'body', 'text_with_images', 'images', 'source_relpath', 'token_len', 'word_len']
‚ö†Ô∏è Found 1 suspicious chunks out of 1095 total.


Unnamed: 0,section,source_relpath,embed_text
375,Main,mskneurology.com/how-truly-treat-thoracic-outlet-syndrome/index.html,"How to truly identify and treat thoracic outlet syndrome (TOS) ¬∑ Main 2 , 7 , 19 , 21 However, little agreement exists on which muscles need strengthening and which ones need lengthening. 5 These types of exercises do not detail how they address functional TOS as a result of respiratory alterations and they do not aim to inhibit muscle. 1 , 5 , 19 ‚Äì Robey & Neurogenic thoracic outlet syndrome (NTOS) is an oft-overlooked and obscure cause of shoulder pain that regularly presents to the office of shoulder surgeons and pain specialists. Taking the research above into account, the reader can probably start to understand that it's often very difficult to be properly diagnosed and treated if one has thoracic outlet syndrome. This article will shed light on what I consider a very¬†effective approach to both diagnosis and treatment, that have cured thoracic outlet syndrome¬†for most of our patients. Be aware though, that the actual treatment¬†is a demanding¬†procedure that will have to be managed through cooperation with a qualified therapist."


üìù Saved detailed list to: c:\Users\Draco\OneDrive\Documents\MSK_Triage_Chatbot\MSK_Chat\MSKArticlesINDEX\odd_chunks.csv


In [1]:
import json
import requests

# Standard Ollama API URL (unchanged in Intel build)
OLLAMA_URL = "http://localhost:11434/api/generate"

# Minimal prompt payload:
#   model   - or whatever model you've pulled
#   stream  - False requests a single JSON response for simplicity
payload = dict(
    model="llama3:latest",
    prompt="Hello from Intel Arc!",
    stream=False,
    options=dict(num_predict=20),
)

print("‚û°Ô∏è Sending test prompt to Ollama ...")
resp = requests.post(OLLAMA_URL, json=payload, timeout=120)
# Raise on any HTTP error status before trying to parse the body.
resp.raise_for_status()

data = resp.json()
reply_text = data.get("response", "").strip()
print("\n‚úÖ Ollama replied successfully!\n")
print("Response text:\n", reply_text)
print("\nToken stats:")
# Each label is paired with the response key whose value it reports.
for label, key in (
    ("  Prompt tokens:", "prompt_eval_count"),
    ("  Output tokens:", "eval_count"),
):
    print(label, data.get(key))

# Optional: check for backend info in logs (Intel build prints this on startup)


‚û°Ô∏è Sending test prompt to Ollama ...

‚úÖ Ollama replied successfully!

Response text:
 A new player in the graphics card market!

Intel Arc is Intel's foray into the discrete graphics

Token stats:
  Prompt tokens: 15
  Output tokens: 20


In [1]:
import torch

# Report whether this torch build exposes an `xpu` module and, if so,
# whether a device is available, plus its count, index and name.
has_xpu = hasattr(torch, "xpu")
print("Has XPU module:", has_xpu)
if has_xpu:
    xpu_ready = torch.xpu.is_available()
    print("XPU available:", xpu_ready)
    if xpu_ready:
        xpu = torch.xpu
        print("Number of XPUs:", xpu.device_count())
        print("Current device:", xpu.current_device())
        # The name is always queried for device index 0.
        print("Device name:", xpu.get_device_name(0))


Has XPU module: True
XPU available: True
Number of XPUs: 1
Current device: 0
Device name: Intel(R) Arc(TM) 140T GPU (16GB)
