In [None]:
!pip -q install transformers datasets seqeval peft accelerate
import os, sys, json, random, pathlib
from pathlib import Path
print("deps installed")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
deps installed


In [14]:
# === 1. Connect Google Drive ===
from google.colab import drive
drive.mount('/content/drive')

# === 2. clone E3C rep ===
!rm -rf E3C-Corpus
!git clone --depth 1 https://github.com/hltfbk/E3C-Corpus.git

# === 3. Define path ===
import os
drive_path = "/content/drive/MyDrive/small_data_NER_project/raw"
os.makedirs(drive_path, exist_ok=True)

# === 4. Copy to Drive ===
!cp -r E3C-Corpus/preprocessed_data/clinical_entities/layer1/English {drive_path}/

# === 5. Check ===
!ls {drive_path}/English | head
print("Finished cloning data to Google Drive → MyDrive/small_data_NER_project/raw/English/")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Cloning into 'E3C-Corpus'...
remote: Enumerating objects: 51427, done.[K
remote: Counting objects: 100% (51427/51427), done.[K
remote: Compressing objects: 100% (43656/43656), done.[K
remote: Total 51427 (delta 8353), reused 50577 (delta 7761), pack-reused 0 (from 0)[K
Receiving objects: 100% (51427/51427), 180.38 MiB | 16.70 MiB/s, done.
Resolving deltas: 100% (8353/8353), done.
Updating files: 100% (51361/51361), done.
test.txt
train.txt
Finished cloning data to Google Drive → MyDrive/small_data_NER_project/raw/English/


In [15]:
from pathlib import Path
import random
from collections import Counter

BASE = Path("/content/drive/MyDrive/small_data_NER_project")
RAW  = BASE/"raw/English"
CONLL= BASE/"conll"
CONLL.mkdir(parents=True, exist_ok=True)

def read_conll(p):
    sents, sent = [], []
    with open(p, encoding="utf-8") as f:
        for line in f:
            line=line.strip()
            if not line:
                if sent: sents.append(sent); sent=[]
                continue
            parts=line.split()
            tok, lab = parts[0], parts[-1]
            lab = "O" if lab == "0" else lab  # normalize '0' -> 'O'
            sent.append((tok, lab))
    if sent: sents.append(sent)
    return sents

def write_conll(sents, path):
    with open(path, "w", encoding="utf-8") as w:
        for sent in sents:
            for tok, lab in sent:
                w.write(f"{tok}\t{lab}\n")
            w.write("\n")

def has_entity(sent):
    return any(lab != "O" for _, lab in sent)

def stratified_dev_split(train_sents, dev_ratio=0.1, min_dev=200, seed=42):
    """Ensure dev has entities: split train into new_train/dev with entity presence."""
    random.seed(seed)
    pos = [s for s in train_sents if has_entity(s)]
    neg = [s for s in train_sents if not has_entity(s)]
    n_dev = max(min_dev, int(len(train_sents) * dev_ratio))
    random.shuffle(pos); random.shuffle(neg)
    # proportional selection
    n_pos = min(len(pos), max(1, int(n_dev * (len(pos) / max(1, len(train_sents))))))
    dev = pos[:n_pos] + neg[:n_dev - n_pos]
    new_train = pos[n_pos:] + neg[n_dev - n_pos:]
    random.shuffle(new_train); random.shuffle(dev)
    return new_train, dev

def label_stats(conll_path):
    counts = Counter()
    with open(conll_path, encoding="utf-8") as f:
        for line in f:
            line=line.strip()
            if not line:
                continue
            lab = line.split()[-1]
            lab = "O" if lab == "0" else lab
            counts[lab] += 1
    total = sum(counts.values())
    print(f"=== Label counts in {conll_path.name} (total tokens={total}) ===")
    for k,v in counts.most_common():
        pct = 0.0 if total==0 else v/total*100
        print(f"{k:12s} {v:7d}  ({pct:5.2f}%)")
    print()

# --- Step 1: Load original E3C splits ---
train_sents = read_conll(RAW/"train.txt")
test_sents  = read_conll(RAW/"test.txt")
print(f"Loaded {len(train_sents)} train sentences, {len(test_sents)} test sentences")

# --- Step 2: Split dev (stratified; ensure entities present in dev) ---
new_train, dev_sents = stratified_dev_split(train_sents, dev_ratio=0.1, min_dev=200, seed=42)
print(f"Split -> new_train={len(new_train)}  dev={len(dev_sents)}  test={len(test_sents)}")

# --- Step 3: Save unified CoNLL ---
write_conll(new_train, CONLL/"train.conll")
write_conll(dev_sents, CONLL/"dev.conll")
write_conll(test_sents, CONLL/"test.conll")
print("Saved unified CoNLL files to", CONLL)

# --- Step 4: Label distributions (instead of head) ---
label_stats(CONLL/"train.conll")
label_stats(CONLL/"dev.conll")
label_stats(CONLL/"test.conll")

Loaded 669 train sentences, 851 test sentences
Split -> new_train=469  dev=200  test=851
Saved unified CoNLL files to /content/drive/MyDrive/small_data_NER_project/conll
=== Label counts in train.conll (total tokens=9112) ===
O               8524  (93.55%)
B-ety            303  ( 3.33%)
I-ety            285  ( 3.13%)

=== Label counts in dev.conll (total tokens=3545) ===
O               3285  (92.67%)
B-ety            134  ( 3.78%)
I-ety            126  ( 3.55%)

=== Label counts in test.conll (total tokens=16702) ===
O              15746  (94.28%)
B-ety            516  ( 3.09%)
I-ety            440  ( 2.63%)



In [16]:
# ===== N-way K-shot sampler for E3C (writes to Drive) =====
from pathlib import Path
from collections import Counter, defaultdict
import random

BASE   = Path("/content/drive/MyDrive/small_data_NER_project")
CONLL  = BASE/"conll"
TRAINF = CONLL/"train.conll"
DEVF   = CONLL/"dev.conll"
TESTF  = CONLL/"test.conll"

def read_conll(path: Path):
    sents, sent = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line=line.strip()
            if not line:
                if sent: sents.append(sent); sent=[]
                continue
            tok, lab = line.split()[:2]
            lab = "O" if lab == "0" else lab
            sent.append((tok, lab))
    if sent: sents.append(sent)
    return sents

def write_conll(sents, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as w:
        for sent in sents:
            for tok, lab in sent:
                w.write(f"{tok}\t{lab}\n")
            w.write("\n")

def sentence_types(sent):
    return {lab.split("-",1)[1] for _, lab in sent if lab.startswith("B-")}

def sentence_mention_counts(sent):
    c = Counter()
    for _, lab in sent:
        if lab.startswith("B-"):
            c[lab.split("-",1)[1]] += 1
    return c

def corpus_mention_totals(sents):
    tot = Counter()
    for s in sents: tot.update(sentence_mention_counts(s))
    return tot

def stratified_dev_if_missing(train_sents, ratio=0.1, min_dev=200, seed=42):
    random.seed(seed)
    has_ent = [s for s in train_sents if any(l!="O" for _,l in s)]
    no_ent  = [s for s in train_sents if not any(l!="O" for _,l in s)]
    n_dev = max(min_dev, int(len(train_sents)*ratio))
    random.shuffle(has_ent); random.shuffle(no_ent)
    k_has = min(len(has_ent), max(1, int(n_dev * (len(has_ent)/(len(train_sents)+1e-9)))))
    dev = has_ent[:k_has] + no_ent[:n_dev-k_has]
    tr  = has_ent[k_has:] + no_ent[n_dev-k_has:]
    random.shuffle(tr); random.shuffle(dev)
    return tr, dev

def build_fewshot(k=5, seed=42, mode="sent"):
    """
    mode='sent': ensure each entity type appears in >=K sentences (if possible)
    mode='mention': ensure each entity type has >=K mentions (if possible)
    """
    assert mode in ("sent","mention")
    train = read_conll(TRAINF)
    dev   = read_conll(DEVF) if DEVF.exists() else None
    test  = read_conll(TESTF)
    if dev is None:
        train, dev = stratified_dev_if_missing(train, ratio=0.1, min_dev=200, seed=seed)

    types = sorted(corpus_mention_totals(train).keys())
    random.seed(seed)

    if mode == "sent":
        # Greedy: pick sentence that increases #types still below K
        picked=set()
        covered = Counter()  # per-type sentence count
        def gain(i):
            g=0
            for t in sentence_types(train[i]):
                if covered[t] < k: g += 1
            return g
        # Upper bound per type cannot exceed available sentences containing that type
        contain = defaultdict(list)
        for i, s in enumerate(train):
            for t in sentence_types(s): contain[t].append(i)
        while True:
            if all(covered[t] >= min(k, len(contain[t])) for t in types):
                break
            best_i, best_g = -1, 0
            for i in range(len(train)):
                if i in picked: continue
                g = gain(i)
                if g > best_g:
                    best_g, best_i = g, i
            if best_i == -1 or best_g == 0: break
            picked.add(best_i)
            for t in sentence_types(train[best_i]):
                covered[t] += 1

    else:  # mode == "mention"
        totals = corpus_mention_totals(train)
        target = {t: min(k, totals[t]) for t in types}
        per_sent = [sentence_mention_counts(s) for s in train]
        picked=set(); covered=Counter()
        def gain(i):
            g=0; cnt=per_sent[i]
            for t, need in target.items():
                if covered[t] >= need: continue
                if t in cnt:
                    g += min(cnt[t], need - covered[t])
            return g
        while True:
            if all(covered[t] >= target[t] for t in types): break
            best_i, best_g = -1, 0
            for i in range(len(train)):
                if i in picked: continue
                g = gain(i)
                if g > best_g: best_g, best_i = g, i
            if best_i == -1 or best_g == 0: break
            picked.add(best_i); covered.update(per_sent[best_i])

    few_idxs = sorted(picked)
    few_train = [train[i] for i in few_idxs]

    out_dir = CONLL/f"fewshot_k{k}_seed{seed}_{mode}"
    write_conll(few_train, out_dir/"train.conll")
    write_conll(dev,       out_dir/"dev.conll")
    write_conll(test,      out_dir/"test.conll")

    # Print coverage stats
    print(f"\n== Built few-shot @ {out_dir} ==")
    print(f"train sentences = {len(few_train)}, dev={len(dev)}, test={len(test)}")
    if mode == "sent":
        per_type_sent = Counter()
        for s in few_train:
            for t in sentence_types(s): per_type_sent[t]+=1
        print("Per-type sentence coverage (aim >= K):")
        for t in types:
            print(f"  {t:15s} {per_type_sent[t]:3d}")
    else:
        c = corpus_mention_totals(few_train)
        print("Per-type mention coverage (aim >= K):")
        for t in types:
            print(f"  {t:15s} {c[t]:3d}")

    return out_dir

# ---- Build NK sets you need (edit K_LIST if desired) ----
K_LIST = (1,5,10,20)
for K in K_LIST:
    build_fewshot(k=K, seed=42, mode="sent")     # each type ≥ K sentences
    build_fewshot(k=K, seed=42, mode="mention")  # each type ≥ K mentions


== Built few-shot @ /content/drive/MyDrive/small_data_NER_project/conll/fewshot_k1_seed42_sent ==
train sentences = 1, dev=200, test=851
Per-type sentence coverage (aim >= K):
  ety               1

== Built few-shot @ /content/drive/MyDrive/small_data_NER_project/conll/fewshot_k1_seed42_mention ==
train sentences = 1, dev=200, test=851
Per-type mention coverage (aim >= K):
  ety               1

== Built few-shot @ /content/drive/MyDrive/small_data_NER_project/conll/fewshot_k5_seed42_sent ==
train sentences = 5, dev=200, test=851
Per-type sentence coverage (aim >= K):
  ety               5

== Built few-shot @ /content/drive/MyDrive/small_data_NER_project/conll/fewshot_k5_seed42_mention ==
train sentences = 1, dev=200, test=851
Per-type mention coverage (aim >= K):
  ety               5

== Built few-shot @ /content/drive/MyDrive/small_data_NER_project/conll/fewshot_k10_seed42_sent ==
train sentences = 10, dev=200, test=851
Per-type sentence coverage (aim >= K):
  ety              10

In [17]:
from collections import Counter
from pathlib import Path

def count_cover_and_mentions(dirpath):
    sents = read_conll(Path(dirpath)/"train.conll")
    # 句子覆盖
    per_type_sent = Counter()
    for s in sents:
        for t in sentence_types(s):
            per_type_sent[t]+=1
    # 提及覆盖
    per_type_mentions = corpus_mention_totals(sents)
    print("Per-type sentence coverage:", dict(per_type_sent))
    print("Per-type mention  coverage:", dict(per_type_mentions))

count_cover_and_mentions(CONLL/"fewshot_k5_seed42_sent")

Per-type sentence coverage: {'ety': 5}
Per-type mention  coverage: {'ety': 9}
