In [4]:
# =========================
# 1) Install
# =========================
!pip -q install transformers datasets accelerate peft bitsandbytes trl

# =========================
# 2) Imports & Config
# =========================
import os, json, re, ast, torch
import transformers as t
import datasets as d
import trl as r
import peft as p

# Run-wide configuration: every tunable used by the cells below lives here.

# Set before any tokenizer is created.
# NOTE(review): presumably silences the tokenizers fork/parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Base checkpoint, source dataset, and the directory that receives
# trainer checkpoints and predictions.json.
MODEL_NAME   = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
DATASET_NAME = "ai4privacy/pii-masking-200k"
OUTPUT_DIR   = "./pii_smoketest_json"

# Sizes of the train / validation slices taken from the shuffled English subset.
MAX_TRAIN = 1000
MAX_VAL   = 200
EPOCHS    = 6           # passed to the trainer as num_train_epochs
BATCH     = 4           # per-device train batch size
LR        = 2e-5        # learning rate
SEED      = 42          # used for the dataset shuffle and the trainer seed
USE_BF16  = False       # keep False; we'll run 4-bit compute in fp16 to avoid dtype mismatches

# Create the output directory up front; a no-op when it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# =========================
# 3) Data (English-only subset)
# =========================
def _is_english(example):
    """Return True when the row's `language` field starts with "en" (case-insensitive)."""
    language = str(example.get("language", ""))
    return language.lower().startswith("en")


ds = d.load_dataset(DATASET_NAME)

# English rows only, in a seeded random order.
en = (
    ds["train"]
    .filter(_is_english)
    .shuffle(seed=SEED)
)

# Train takes the first MAX_TRAIN rows; validation takes the next MAX_VAL rows.
# Both bounds are clamped to the number of rows actually available.
_train_end = min(MAX_TRAIN, len(en))
_val_end = min(MAX_TRAIN + MAX_VAL, len(en))
train = en.select(range(_train_end))
val = en.select(range(_train_end, _val_end))

print(f"Train (EN): {len(train)} | Val (EN): {len(val)}")

# =========================
# 4) Prompt formatting (strict JSON array)
# =========================
# Instruction shared by the training prompt and the inference prompt.
INSTR = " ".join([
    "Extract all personal information from the INPUT and return a strict JSON array of objects.",
    'Each object must have keys exactly: "value" (string) and "label" (string).',
    "No extra text; output only the JSON array.",
])


def format_row_for_infer_input(x_text: str):
    """Build the prompt for one input text, ending at the empty OUTPUT slot.

    Args:
        x_text: raw text to extract entities from.

    Returns:
        The three-line prompt (INSTRUCTION / INPUT / OUTPUT) with nothing after
        the "### OUTPUT:" marker.
    """
    sections = (
        f"### INSTRUCTION: {INSTR}",
        f"### INPUT: {x_text}",
        "### OUTPUT:",
    )
    return "\n".join(sections)


def format_row_for_train(x):
    """Build one full training example: the inference prompt plus the gold answer.

    Args:
        x: dataset row providing `source_text` and `privacy_mask`; each mask
            entry must expose `value` and `label`.

    Returns:
        The prompt followed by a space and the gold entities serialised as a
        JSON array (non-ASCII characters are kept as-is).
    """
    entities = []
    for span in x["privacy_mask"]:
        entities.append({"value": span["value"], "label": span["label"]})
    answer = json.dumps(entities, ensure_ascii=False)
    return f"{format_row_for_infer_input(x['source_text'])} {answer}"

# =========================
# 5) Tokenizer & Model (4-bit) + LoRA (with k-bit prep)
# =========================
# Seed the Python / NumPy / PyTorch RNGs before any trainable weights exist.
# The LoRA adapter weights are created in get_peft_model() below, i.e. before
# the trainer is constructed with `seed=SEED`, so without this call their
# random initialisation differs from run to run.
t.set_seed(SEED)

# trust_remote_code is deliberately left at its default (False): this
# checkpoint uses the stock Llama architecture, so no repository code has to be
# executed. Re-enable it only for a model that needs custom code and is trusted.
tok = t.AutoTokenizer.from_pretrained(MODEL_NAME)
# The EOS token doubles as the padding token.
tok.pad_token = tok.eos_token

# 4-bit quantization -> compute in fp16 to avoid Float/Half mismatch
bnb_cfg = t.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
model = t.AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_cfg,
    device_map="auto"
)

# Prepare for k-bit training and attach LoRA
# NOTE(review): prepare_model_for_kbit_training appears to enable gradient
# checkpointing by default, while the trainer config sets
# gradient_checkpointing=False — confirm which one is intended.
model = p.prepare_model_for_kbit_training(model)
lcfg = p.LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05, bias="none",
    task_type="CAUSAL_LM",
    # Adapters on every attention and MLP projection.
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)
model = p.get_peft_model(model, lcfg)

# =========================
# 6) Trainer (SFT)
# =========================
sft_cfg = r.SFTConfig(
    # --- where results go / reproducibility
    output_dir=OUTPUT_DIR,
    seed=SEED,
    # --- optimisation schedule
    optim="adamw_bnb_8bit",
    learning_rate=LR,
    warmup_ratio=0.05,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH,
    gradient_accumulation_steps=2,
    # --- precision / memory
    bf16=USE_BF16,                       # keep False for stability with fp16 compute
    gradient_checkpointing=False,
    # --- evaluate, checkpoint and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
)

# Language-modeling collator with completion-only loss switched off; batches
# are padded with the tokenizer's pad id.
collator = r.trainer.sft_trainer.DataCollatorForLanguageModeling(
    completion_only_loss=False,
    pad_token_id=tok.pad_token_id,
)

# Rows are rendered to text by format_row_for_train before tokenization.
trainer = r.SFTTrainer(
    model=model,
    args=sft_cfg,
    processing_class=tok,
    data_collator=collator,
    formatting_func=format_row_for_train,
    train_dataset=train,
    eval_dataset=val,
)

# =========================
# 7) Train
# =========================
# Run fine-tuning for the configured number of epochs.
trainer.train()
print("Training done.")

# From here on `model` is the trainer's fine-tuned model, switched to eval mode
# for generation.
model = trainer.model
model.eval()

# =========================
# 8) JSON parsing helpers + inference utils
# =========================
def _normalize_entities(items):
    """Keep only dict items that carry both keys, coercing both fields to str."""
    return [
        {"value": str(item["value"]), "label": str(item["label"])}
        for item in items
        if isinstance(item, dict) and "value" in item and "label" in item
    ]


def parse_json_array(text: str):
    """Extract and normalize a JSON array of {value,label} from model text.

    The text is scanned for the first position where a complete JSON array can
    be decoded, so anything the model emits before or after the array (even
    text containing further brackets) is ignored. If no strict-JSON array is
    found, a Python-literal fallback handles single-quoted output.

    Args:
        text: raw model output; may be empty.

    Returns:
        A list of {"value": str, "label": str} dicts. Items that are not dicts
        or lack either key are dropped. Returns [] when nothing parseable is
        found.
    """
    if not text:
        return []

    # 1) Strict JSON: try to decode an array starting at each '[' in turn.
    #    raw_decode stops at the end of the first complete value, so trailing
    #    text after the array cannot break parsing.
    decoder = json.JSONDecoder()
    start = text.find("[")
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            parsed = None
        if isinstance(parsed, list):
            return _normalize_entities(parsed)
        start = text.find("[", start + 1)

    # 2) Fallback for Python-literal style output (e.g. single quotes): try the
    #    widest bracketed span first, then the narrowest.
    for pattern in (r"\[.*\]", r"\[.*?\]"):
        m = re.search(pattern, text, re.DOTALL)
        if m is None:
            continue
        try:
            literal = ast.literal_eval(m.group(0).strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Best-effort parser: an unparseable candidate is simply skipped.
            continue
        if isinstance(literal, list):
            return _normalize_entities(literal)

    return []

@torch.no_grad()
def generate_json_entities(x_text: str, max_new_tokens: int = 200):
    """Run the fine-tuned model on one text and return the parsed entities.

    Args:
        x_text: arbitrary input text to extract entities from.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        Whatever parse_json_array recovers from the generated completion: a
        list of {"value", "label"} dicts, or [] when nothing could be parsed.
    """
    prompt = format_row_for_infer_input(x_text)
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,     # deterministic (greedy) decoding
        top_p=1.0,
        pad_token_id=tok.eos_token_id
    )
    # The returned sequence starts with the prompt tokens. Decode only what was
    # generated after them instead of splitting the decoded text on
    # "### OUTPUT:" — that marker may also occur inside x_text, which would put
    # part of the input into the "response".
    prompt_len = inputs["input_ids"].shape[1]
    completion_ids = out[0][prompt_len:]
    resp = tok.decode(completion_ids, skip_special_tokens=True).strip()
    return parse_json_array(resp)

# =========================
# 9) Batch eval on a small slice + save JSON
# =========================
# Evaluate on at most the first 50 validation rows.
EVAL_N = min(50, len(val))
subset = val.select(range(EVAL_N))

# One record per row: the input, the gold entities, and the model's prediction.
predictions = []
for i, row in enumerate(subset):
    pred = generate_json_entities(row["source_text"])
    gold = [{"value": e["value"], "label": e["label"]} for e in row["privacy_mask"]]
    predictions.append({
        "index": i,
        "input_text": row["source_text"],
        "gold_entities": gold,
        "predicted_entities": pred
    })

out_path = os.path.join(OUTPUT_DIR, "predictions.json")
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(predictions, f, indent=2, ensure_ascii=False)

print(f"Saved JSON to: {out_path}")
# Guard the sample print: with an empty validation slice there is no
# predictions[0], and indexing it would raise IndexError.
if predictions:
    print("Sample record:")
    print(json.dumps(predictions[0], indent=2, ensure_ascii=False))
else:
    print("No validation rows available; predictions.json contains an empty list.")

# =========================
# 10) Try it yourself interactively
# =========================
# Example: run the extractor on a hand-written sentence and pretty-print the result.
test_text = "Hi, I'm Alice Johnson, email alice.j@example.com, phone +1 650-555-1234, SSN 123-45-6789, based in 1600 Amphitheatre Pkwy."
demo_entities = generate_json_entities(test_text)
print("\nYour JSON output:\n", json.dumps(demo_entities, indent=2, ensure_ascii=False))


Train (EN): 1000 | Val (EN): 200


Applying formatting function to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,1.6287,1.046389,1.020121,198167.0,0.777994
2,0.9994,0.957383,0.95221,396334.0,0.791258
3,0.9359,0.919616,0.918662,594501.0,0.797002
4,0.9033,0.901369,0.9091,792668.0,0.7998
5,0.8849,0.892576,0.883323,990835.0,0.801677
6,0.8756,0.889484,0.883428,1189002.0,0.802237


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Training done.
Saved JSON to: ./pii_smoketest_json/predictions.json
Sample record:
{
  "index": 0,
  "input_text": "Device gK1374etEQY0 not properly functioning. Do you have more related to Suite 794 available?",
  "gold_entities": [
    {
      "value": "gK1374etEQY0",
      "label": "PASSWORD"
    },
    {
      "value": "Suite 794",
      "label": "SECONDARYADDRESS"
    }
  ],
  "predicted_entities": [
    {
      "value": "gK1374etEQY0",
      "label": "MASKEDNUMBER"
    },
    {
      "value": "Suite 794",
      "label": "SECONDARYADDRESS"
    }
  ]
}

Your JSON output:
 [
  {
    "value": "Alice",
    "label": "FIRSTNAME"
  },
  {
    "value": "Johnson",
    "label": "LASTNAME"
  },
  {
    "value": "alice.j@example.com",
    "label": "EMAIL"
  },
  {
    "value": "+1 650-555-1234",
    "label": "PHONENUMBER"
  },
  {
    "value": "123-45-6789",
    "label": "SSN"
  },
  {
    "value": "1600 Amphitheatre Pkwy",
    "label": "STREET"
  }
]


In [5]:
# Example: a second ad-hoc sentence, extracted and pretty-printed.
# NOTE(review): the sample below looks like real contact details; consider a
# synthetic name/address before sharing this notebook.
test_text = "Hi,my name is Unnati and you can contact me at ugohil@asu.edu"
custom_entities = generate_json_entities(test_text)
print("\nYour JSON output:\n", json.dumps(custom_entities, indent=2, ensure_ascii=False))


Your JSON output:
 [
  {
    "value": "Unnati",
    "label": "FIRSTNAME"
  },
  {
    "value": "ugohil@asu.edu",
    "label": "EMAIL"
  }
]


In [6]:
# Mount Google Drive at /content/drive; a later cell copies the run directory
# to a folder under /content/drive/MyDrive.
# NOTE(review): force_remount=True presumably re-mounts even when Drive is
# already mounted — confirm against the Colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [7]:
import os, time, shutil, pathlib

# Copy the whole run directory to a fresh, timestamped folder on Drive.
SRC = "./pii_smoketest_json"   # your OUTPUT_DIR in the notebook
ts  = time.strftime("%Y%m%d_%H%M%S")
DEST_ROOT = "/content/drive/MyDrive/pii_runs"
DEST = f"{DEST_ROOT}/pii_smoketest_json_{ts}"

# copytree creates DEST itself; only the parent folder has to exist beforehand.
pathlib.Path(DEST_ROOT).mkdir(parents=True, exist_ok=True)
shutil.copytree(SRC, DEST)

print("✅ Copied to Drive:", DEST)
print("Files:")
# The loop variable is deliberately not called `p`: that name is the `peft`
# module alias in this notebook's namespace and would be overwritten here.
for copied_path in pathlib.Path(DEST).rglob("*"):
    print(" -", copied_path.as_posix())


✅ Copied to Drive: /content/drive/MyDrive/pii_runs/pii_smoketest_json_20251025_220956
Files:
 - /content/drive/MyDrive/pii_runs/pii_smoketest_json_20251025_220956/predictions.json
 - /content/drive/MyDrive/pii_runs/pii_smoketest_json_20251025_220956/checkpoint-375
 - /content/drive/MyDrive/pii_runs/pii_smoketest_json_20251025_220956/README.md
 - /content/drive/MyDrive/pii_runs/pii_smoketest_json_20251025_220956/checkpoint-500
 - /content/drive/MyDrive/pii_runs/pii_smoketest_json_20251025_220956/checkpoint-625
 - /content/drive/MyDrive/pii_runs/pii_smoketest_json_20251025_220956/checkpoint-750
 - /content/drive/MyDrive/pii_runs/pii_smoketest_json_20251025_220956/checkpoint-125
 - /content/drive/MyDrive/pii_runs/pii_smoketest_json_20251025_220956/runs
 - /content/drive/MyDrive/pii_runs/pii_smoketest_json_20251025_220956/checkpoint-250
 - /content/drive/MyDrive/pii_runs/pii_smoketest_json_20251025_220956/checkpoint-375/adapter_config.json
 - /content/drive/MyDrive/pii_runs/pii_smoketest_j

In [8]:
import json, os

# Sanity-check the Drive copy: confirm predictions.json is there and readable.
pred_path = os.path.join(DEST, "predictions.json")
has_predictions = os.path.exists(pred_path)
print("predictions.json exists:", has_predictions)

if has_predictions:
    with open(pred_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    print("records:", len(data))
    # Show at most the first 1000 characters of the first record.
    first_record = json.dumps(data[0], indent=2, ensure_ascii=False)
    print("sample:\n", first_record[:1000])


predictions.json exists: True
records: 50
sample:
 {
  "index": 0,
  "input_text": "Device gK1374etEQY0 not properly functioning. Do you have more related to Suite 794 available?",
  "gold_entities": [
    {
      "value": "gK1374etEQY0",
      "label": "PASSWORD"
    },
    {
      "value": "Suite 794",
      "label": "SECONDARYADDRESS"
    }
  ],
  "predicted_entities": [
    {
      "value": "gK1374etEQY0",
      "label": "MASKEDNUMBER"
    },
    {
      "value": "Suite 794",
      "label": "SECONDARYADDRESS"
    }
  ]
}


In [9]:
import shutil
from google.colab import files

# Zip the run directory and hand the archive to the browser for download.
# make_archive returns the path of the file it wrote (base name + ".zip").
zip_path = shutil.make_archive(
    "/content/pii_smoketest_json",   # archive path without extension
    "zip",
    "./pii_smoketest_json",          # directory whose contents are archived
)
files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>