<a href="https://colab.research.google.com/github/Algocrat/slm-dragon-labs/blob/main/labs/colab/lab3_fresh_data_loading_tokenization_revised_fixed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 3 – Data Loading and Tokenization
**Part 3 of the 7 Lab Hands-On SLM Training Series**

This notebook downloads the `ncbi/Open-Patients` dataset, performs basic cleaning and sanity checks, detects the text field automatically, and prepares tokenized chunks for causal language modeling (CLM). It saves a tokenized dataset to disk for use in Lab 4.

### Note on dataset access and licensing
The `ncbi/Open-Patients` dataset is publicly available on the Hugging Face Hub under CC-BY-SA 4.0. No authentication is required to download. Please provide attribution if you reuse the data, and ensure your use complies with the license and any applicable privacy rules.

## Step 0. Install dependencies (if needed)

In [None]:
# Stable, Colab-friendly deps for SLM labs
# - Pins pandas + pyarrow to avoid ABI conflicts with preinstalled GPU libs (RAPIDS)
# - Installs HF stack
# - Verifies imports and prints versions
#
# pip is invoked through the running interpreter (`sys.executable -m pip`)
# rather than a bare `pip install ...` line: a bare line is not Python, and
# IPython only rewrites it into a magic for single-line cells, so inside this
# multi-line cell it is a SyntaxError. Using sys.executable also ensures the
# packages are installed into the environment of the interpreter that will
# import them.

import importlib
import subprocess
import sys
import traceback

PIP_COMMANDS = [
    ["install", "-q", "--force-reinstall", "pandas==2.2.2", "pyarrow==17.0.0"],
    ["install", "-q", "datasets>=2.19.0", "transformers>=4.41.0", "sentencepiece>=0.1.99", "tqdm>=4.66.0"],
]
for pip_args in PIP_COMMANDS:
    # check=False: a failed install should not abort the cell; the import
    # verification below reports what is actually usable.
    completed = subprocess.run([sys.executable, "-m", "pip", *pip_args], check=False)
    if completed.returncode != 0:
        print(f"[pip warning] 'pip {' '.join(pip_args)}' exited with code {completed.returncode}")

mods = ["pandas", "pyarrow", "datasets", "transformers", "sentencepiece", "tqdm"]
ok = True
for m in mods:
    try:
        mod = importlib.import_module(m)
        print(f"{m}: {getattr(mod, '__version__', 'unknown')}")
    except Exception as e:
        # Deliberately broad: this is a diagnostic boundary, and a broken wheel
        # can surface as something other than ImportError.
        ok = False
        print(f"[Import error] {m}: {e}")
        traceback.print_exc(limit=1)

if ok:
    print("Dependencies OK")
else:
    print("\nOne or more imports failed (usually due to mixed wheels after an upgrade).")
    print("Please go to Runtime → Restart runtime, then re-run this cell once.")


## Step 1. Download dataset: `ncbi/Open-Patients`

In [None]:
from datasets import load_dataset

# Open-Patients is public (CC-BY-SA 4.0); the download needs no authentication.
dataset = load_dataset("ncbi/Open-Patients")
print(dataset)

# Show one raw record from whichever split is listed first.
print("Example record:")
first_split = list(dataset)[0]
print(dataset[first_split][0])

## Step 1.1 Clean and sanity check

In [None]:
import re, unicodedata
from datasets import DatasetDict
import numpy as np

# Column names tried first, in priority order, when looking for the text column.
TEXT_FIELD_CANDIDATES = ["text", "content", "description", "body", "note"]

# Inspect the first record of the first split to discover the schema.
sample_split = list(dataset.keys())[0]
sample_item = dataset[sample_split][0]

# 1) Prefer a well-known column name whose value is a string.
text_field = next(
    (k for k in TEXT_FIELD_CANDIDATES if isinstance(sample_item.get(k), str)),
    None,
)
# 2) Otherwise fall back to the first string-valued column, whatever its name.
if text_field is None:
    text_field = next(
        (k for k, v in sample_item.items() if isinstance(v, str)),
        None,
    )
# 3) Fail here with a clear message rather than letting the cleaning and
#    tokenization steps break later on a text_field of None.
if text_field is None:
    raise ValueError(
        f"No string-valued column found in split {sample_split!r}; "
        f"available columns: {list(sample_item)}"
    )
print("Using text field:", text_field)

# Cleaning settings read by basic_clean and is_valid.
KEEP_ASCII_ONLY = False  # when True, basic_clean NFKD-normalizes the text and drops non-ASCII characters
MIN_LEN_CHARS = 10  # is_valid rejects records whose len_chars is below this
MAX_LEN_CHARS = 50000  # is_valid rejects records whose len_chars is above this

def basic_clean(s: object, ascii_only=None) -> str:
    """Return *s* as a single-line, whitespace-normalized string.

    Args:
        s: The text to clean. ``None`` yields ``""`` (so a missing value never
            becomes the literal text ``"None"``); any other non-string value
            is converted with ``str()``.
        ascii_only: When true, NFKD-normalize the text and drop every
            character that has no ASCII form. ``None`` (the default) defers
            to the module-level ``KEEP_ASCII_ONLY`` setting.

    Returns:
        The text with every run of whitespace collapsed to one space and no
        leading or trailing whitespace.
    """
    if s is None:
        return ""
    if not isinstance(s, str):
        s = str(s)
    if ascii_only is None:
        ascii_only = KEEP_ASCII_ONLY
    if ascii_only:
        s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    # Normalize whitespace last: dropping non-ASCII characters above can leave
    # doubled or leading/trailing spaces behind.
    return re.sub(r"\s+", " ", s).strip()

def map_clean(example):
    """Clean one record's text column in place and record its length.

    Reads the column named by the module-level ``text_field`` (an empty
    string is used when the key is absent), passes it through
    ``basic_clean``, writes the result back under the same key, and stores
    the cleaned text's character count under ``"len_chars"``. The same
    ``example`` mapping is returned.
    """
    cleaned_text = basic_clean(example.get(text_field, ""))
    example[text_field] = cleaned_text
    example["len_chars"] = len(cleaned_text)
    return example

def is_valid(example):
    """Return True when ``example["len_chars"]`` is within [MIN_LEN_CHARS, MAX_LEN_CHARS]."""
    return MIN_LEN_CHARS <= example["len_chars"] <= MAX_LEN_CHARS

# Clean every split, then drop the records whose length is out of range.
cleaned = DatasetDict()
for split, raw_split in dataset.items():
    cleaned[split] = (
        raw_split
        .map(map_clean, desc=f"Cleaning {split}")
        .filter(is_valid, desc=f"Filtering {split}")
    )

def dedupe_exact(ds, key):
    """Keep only the first row for each distinct value of column *key*.

    Walks ``ds[key]`` in order, remembers the index at which every value is
    first seen, and returns ``ds.select(...)`` over those indices (ascending).
    """
    first_index = {}
    for position, value in enumerate(ds[key]):
        # setdefault keeps the earliest position; dicts preserve insertion order.
        first_index.setdefault(value, position)
    return ds.select(list(first_index.values()))

# Exact-match dedupe of each split; report only the splits that shrank.
for split in list(cleaned):
    rows_before = len(cleaned[split])
    cleaned[split] = dedupe_exact(cleaned[split], text_field)
    rows_after = len(cleaned[split])
    if rows_after != rows_before:
        print(f"Deduped {split}: {rows_before} -> {rows_after}")

# Character-length summary per split (empty splits are skipped).
for split in cleaned:
    lengths = np.array(cleaned[split]["len_chars"])
    if lengths.size == 0:
        continue
    p50, p90, p99 = np.percentile(lengths, [50, 90, 99])
    print(f"{split}: n={lengths.size} mean={lengths.mean():.1f} p50={p50:.0f} "
          f"p90={p90:.0f} p99={p99:.0f}")

## Step 2. Initialize tokenizer

In [None]:
from transformers import AutoTokenizer

TOKENIZER_MODEL = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL, use_fast=True)
# If the tokenizer has no pad token, reuse its EOS token as the pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Smoke test on one cleaned record: from 'train' if present, else the first split.
if 'train' in cleaned:
    sample_source = cleaned['train']
else:
    sample_source = list(cleaned.values())[0]
sample = sample_source[0][text_field]
encoded = tokenizer(sample, truncation=True, max_length=128)
print("Tokenized sample IDs (first 20):", encoded['input_ids'][:20])

## Step 3. Tokenize dataset and chunk for CLM

In [None]:
from functools import partial
from itertools import chain
SEQ_LEN = 1024  # number of tokens in each fixed-length chunk built by group_texts

def tokenize_function(examples, text_key):
    """Tokenize the *text_key* column of a batch with truncation disabled.

    Uses the module-level ``tokenizer`` and returns its output unchanged.
    """
    texts = examples[text_key]
    return tokenizer(texts, truncation=False)

# Remove all original columns so only token arrays remain.
# The column names are read from 'train' when that split exists and from the
# first available split otherwise, so this cell does not require a split that
# is literally named 'train'.
ref_split = 'train' if 'train' in cleaned else list(cleaned.keys())[0]
remove_cols = cleaned[ref_split].column_names
tokenized = cleaned.map(
    partial(tokenize_function, text_key=text_field),
    batched=True,
    remove_columns=remove_cols,
    desc='Tokenizing',
)

# Sanity check: ensure tokenized has only token-array columns
print(tokenized)
batch = tokenized[ref_split][:2]
for k, v in batch.items():
    # An empty split yields empty columns; print None instead of indexing v[0].
    print(k, type(v), type(v[0]) if len(v) else None)


## Step 4. Group into fixed-length chunks, save tokenized dataset, and preview

In [None]:
from itertools import chain
# from google.colab import drive  # not needed locally
# drive.mount('./drive')  # not needed locally

def group_texts(examples, seq_len=None):
    """Concatenate a batch of token sequences and cut it into equal chunks.

    Every column whose rows are lists (e.g. ``input_ids``, ``attention_mask``)
    is flattened across the batch and sliced into consecutive pieces of
    ``seq_len`` items. The trailing remainder shorter than ``seq_len`` is
    dropped. A ``labels`` column holding a copy of the ``input_ids`` chunks is
    added.

    Args:
        examples: A batch mapping column name -> list of per-record lists.
            Must contain ``'input_ids'``.
        seq_len: Chunk length. ``None`` (the default) uses the module-level
            ``SEQ_LEN``.

    Returns:
        A dict mapping each token column, plus ``'labels'``, to a list of
        chunks. An empty batch yields empty lists for every input column.

    Raises:
        ValueError: If ``seq_len`` is not positive.
        KeyError: If the batch has no usable ``'input_ids'`` column.
    """
    if seq_len is None:
        seq_len = SEQ_LEN
    if seq_len <= 0:
        raise ValueError(f"seq_len must be a positive integer, got {seq_len!r}")

    # Empty batch: nothing to chunk, so return zero rows for every column
    # instead of failing on the 'input_ids' lookup further down.
    if not examples['input_ids']:
        empty = {name: [] for name in examples}
        empty['labels'] = []
        return empty

    # Only columns whose rows are lists can be concatenated.
    token_columns = [
        name for name, rows in examples.items()
        if isinstance(rows, list) and rows and isinstance(rows[0], list)
    ]
    flat = {name: list(chain.from_iterable(examples[name])) for name in token_columns}

    # Largest multiple of seq_len that fits; the tail beyond it is discarded.
    usable = (len(flat['input_ids']) // seq_len) * seq_len
    result = {
        name: [tokens[start:start + seq_len] for start in range(0, usable, seq_len)]
        for name, tokens in flat.items()
    }
    # Copy each chunk so that later in-place edits to labels cannot alter input_ids.
    result['labels'] = [list(chunk) for chunk in result['input_ids']]
    return result

# Apply group_texts batch-wise to build the fixed-length chunks.
lm_datasets = tokenized.map(
    group_texts,
    batched=True,
    desc='Grouping into fixed-length chunks',
)
print(lm_datasets)

# Write the chunked dataset to disk.
OUT_DIR = "./lab3_tokenized"
lm_datasets.save_to_disk(OUT_DIR)
print("Saved:", OUT_DIR)
