In [None]:
import json

# Read the run-time settings (Hub token, checkpoint locations, test-set paths) from config.json.
# JSON is UTF-8 by specification, so do not depend on the platform's default encoding.
with open("config.json", "r", encoding="utf-8") as f:
    config_argument = json.load(f)

access_token = config_argument["huggingface_access_token"]

model_1_path = config_argument["model_checkpoint_1213"]
model_2_path = config_argument["model_checkpoint_1000"]
model_3_path = config_argument["model_checkpoint_500"]


model_test_task1_data_path_txt = config_argument["model_test_task1_data_path_txt"]
model_test_task2_data_path_txt = config_argument["model_test_task2_data_path_txt"]

# Never echo the token itself: cell outputs are stored inside the notebook file and
# would leak the credential to anyone the notebook is shared with.
print( "access_token: ", "<set>" if access_token else "<missing>" )

print( "model_1_path: ", model_1_path )
print( "model_2_path: ", model_2_path )
print( "model_3_path: ", model_3_path )

print( "model_test_task1_data_path_txt", model_test_task1_data_path_txt )
print( "model_test_task2_data_path_txt", model_test_task2_data_path_txt )


access_token:  
model_1_path:  
model_2_path:  
model_3_path:  


In [None]:
import torch
import numpy as np

# Level-1 (coarse category) -> level-2 (fine-grained) entity types it may contain.
# Keys and values are label names without their tagging prefix (B-/I-/...).
#
# TIME, DURATION and SET exist in the level-2 label maps of this notebook but the
# level-1 label set has no category of their own for them; they are grouped under
# DATE so that get_level2_entities_1 does not discard every such prediction.
# NOTE(review): confirm this grouping matches the level-1 labels used at training time.
check_ner = {
    "NAME": ["PATIENT", "DOCTOR", "USERNAME", "FAMILYNAME", "PERSONALNAME"],
    "OCCUPATION": ["PROFESSION"],
    "LOCATION": ["ROOM", "DEPARTMENT", "HOSPITAL", "ORGANIZATION", "STREET", "CITY", "DISTRICT", "COUNTY", "STATE", "COUNTRY", "ZIP", "LOCATION-OTHER"],
    "AGE": ["AGE"],
    "DATE": ["DATE", "TIME", "DURATION", "SET"],
    "CONTACT_INFORMATION": ["PHONE", "FAX", "EMAIL", "URL", "IPADDRESS"],
    "IDENTIFIERS": ["SOCIAL_SECURITY_NUMBER", "MEDICAL_RECORD_NUMBER", "HEALTH_PLAN_NUMBER", "ACCOUNT_NUMBER", "LICENSE_NUMBER", "VEHICLE_ID", "DEVICE_ID", "BIOMETRIC_ID", "ID_NUMBER"],
    "OTHER": ["OTHER"]
}

def strip_prefix(tag):
    """Remove the tagging-scheme prefix from a label, e.g. ``B-PATIENT`` -> ``PATIENT``.

    Handles the BIO prefixes (``B-``, ``I-``) as well as the ``L-`` / ``U-``
    prefixes used by the BIOU label maps in this notebook. Only the leading two
    characters are inspected, so hyphens inside the entity name
    (``LOCATION-OTHER``) are left untouched.

    Args:
        tag: A label string such as ``"B-PATIENT"`` or ``"O"``.

    Returns:
        The label without its prefix; labels that carry no known prefix
        (for example ``"O"``) are returned unchanged.
    """
    if tag[:2] in ("B-", "I-", "L-", "U-"):
        return tag[2:]
    return tag



# Variant 1: keep a level-2 prediction only if it belongs to the predicted level-1 category.
def get_level2_entities_1(model, tokenizer, sentence, label_map_lvl1, label_map_lvl2, choose_loss):
    """Tag ``sentence`` with the two-head model, filtering level 2 by level 1.

    A token is reported only when its level-1 prediction (prefix removed) is a
    key of ``check_ner`` and its level-2 prediction (prefix removed) is listed
    under that key.

    Args:
        model: Model whose output ``.logits`` is a ``(logits_lvl1, logits_lvl2)``
            pair and whose forward accepts ``choose_loss``.
        tokenizer: Tokenizer able to return offset mappings.
        sentence: Text to tag.
        label_map_lvl1: ``{class id: label}`` for the level-1 head.
        label_map_lvl2: ``{class id: label}`` for the level-2 head.
        choose_loss: Forwarded unchanged to the model.

    Returns:
        A list of dicts, one per accepted token, with keys ``entity`` (level-2
        label including its prefix), ``score`` (softmax probability of that
        label as ``np.float32``), ``index`` (token position), ``word`` (token
        string), ``start`` and ``end`` (character offsets).
    """
    # Run on whatever device the model already lives on; feeding CPU tensors to a
    # model that sits on the GPU raises a device-mismatch error.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # 1. Tokenize with offsets. truncation=True keeps over-long inputs within the
    #    tokenizer's maximum length, as get_level2_entities_normal already does.
    encoding = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True, truncation=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    offsets = encoding["offset_mapping"][0].tolist()
    token_ids = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # 2. Model forward
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, choose_loss=choose_loss)
        logits_lvl1, logits_lvl2 = outputs.logits  # tuple of two tensors

    # 3. Arg-max class per token for both heads; level-2 probabilities computed once.
    preds_lvl1 = torch.argmax(logits_lvl1, dim=2)[0].tolist()
    preds_lvl2 = torch.argmax(logits_lvl2, dim=2)[0].tolist()
    probs_lvl2 = torch.softmax(logits_lvl2[0], dim=-1).cpu()

    special_ids = {tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id}

    results = []
    for idx, (pred1_id, pred2_id) in enumerate(zip(preds_lvl1, preds_lvl2)):
        if token_ids[idx] in special_ids:
            continue

        start, end = offsets[idx]
        if start == end:  # token covers no characters of the input text
            continue

        entity_lvl1 = label_map_lvl1.get(pred1_id, "O")
        entity_lvl2 = label_map_lvl2.get(pred2_id, "O")

        # Accept only when level 1 is a known category and level 2 is one of its sub-types.
        allowed_subtypes = check_ner.get(strip_prefix(entity_lvl1))
        if allowed_subtypes is None or strip_prefix(entity_lvl2) not in allowed_subtypes:
            continue

        results.append({
            "entity": entity_lvl2,  # keeps the B-XXX / I-XXX prefix
            "score": np.float32(probs_lvl2[idx, pred2_id].item()),
            "index": idx,
            "word": tokens[idx],
            "start": start,
            "end": end
        })

    return results




# Variant 2: use the level-2 head directly and ignore the level-1 head.
def get_level2_entities_2(model, tokenizer, sentence, label_map_lvl2, choose_loss ):
    """Tag ``sentence`` using only the level-2 head of the two-head model.

    Args:
        model: Model whose output ``.logits`` is a ``(logits_lvl1, logits_lvl2)``
            pair and whose forward accepts ``choose_loss``.
        tokenizer: Tokenizer able to return offset mappings.
        sentence: Text to tag.
        label_map_lvl2: ``{class id: label}`` for the level-2 head.
        choose_loss: Forwarded unchanged to the model.

    Returns:
        A list of dicts, one per token whose level-2 label is not ``"O"``, with
        keys ``entity``, ``score`` (softmax probability as ``np.float32``),
        ``index``, ``word``, ``start`` and ``end``.
    """
    # Run on whatever device the model already lives on; feeding CPU tensors to a
    # model that sits on the GPU raises a device-mismatch error.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # 1. Tokenize with offsets. truncation=True keeps over-long inputs within the
    #    tokenizer's maximum length, as get_level2_entities_normal already does.
    encoding = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True, truncation=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    offsets = encoding["offset_mapping"][0].tolist()
    token_ids = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # 2. Model forward
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, choose_loss=choose_loss)
        _, logits_lvl2 = outputs.logits  # the level-1 logits are not used by this variant

    # 3. Arg-max level-2 class per token; probabilities computed once for the sequence.
    preds_lvl2 = torch.argmax(logits_lvl2, dim=2)[0].tolist()
    probs_lvl2 = torch.softmax(logits_lvl2[0], dim=-1).cpu()

    special_ids = {tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id}

    results = []
    for idx, pred_id in enumerate(preds_lvl2):
        # Skip special tokens.
        if token_ids[idx] in special_ids:
            continue

        start, end = offsets[idx]
        if start == end:  # token covers no characters of the input text
            continue

        entity = label_map_lvl2.get(pred_id, "O")
        if entity == "O":
            continue

        results.append({
            "entity": entity,
            "score": np.float32(probs_lvl2[idx, pred_id].item()),
            "index": idx,
            "word": tokens[idx],  # the tokenizer's own token string
            "start": start,
            "end": end
        })

    return results


# Variant 3: report the level-2 label only where the level-1 head also predicts a non-"O" label.
def get_level2_entities_3(model, tokenizer, sentence, label_map_lvl1, label_map_lvl2, choose_loss):
    """Tag ``sentence`` with the two-head model, gating level 2 on level 1 being non-"O".

    Unlike ``get_level2_entities_1`` the two labels do not have to be
    compatible; it is enough that neither of them is ``"O"``.

    Args:
        model: Model whose output ``.logits`` is a ``(logits_lvl1, logits_lvl2)``
            pair and whose forward accepts ``choose_loss``.
        tokenizer: Tokenizer able to return offset mappings.
        sentence: Text to tag.
        label_map_lvl1: ``{class id: label}`` for the level-1 head.
        label_map_lvl2: ``{class id: label}`` for the level-2 head.
        choose_loss: Forwarded unchanged to the model.

    Returns:
        A list of dicts, one per accepted token, with keys ``entity`` (level-2
        label), ``score`` (softmax probability as ``np.float32``), ``index``,
        ``word``, ``start`` and ``end``.
    """
    # Run on whatever device the model already lives on; feeding CPU tensors to a
    # model that sits on the GPU raises a device-mismatch error.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # 1. Tokenize with offsets. truncation=True keeps over-long inputs within the
    #    tokenizer's maximum length, as get_level2_entities_normal already does.
    encoding = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True, truncation=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    offsets = encoding["offset_mapping"][0].tolist()
    token_ids = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # 2. Model forward
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, choose_loss=choose_loss)
        logits_lvl1, logits_lvl2 = outputs.logits  # tuple of two tensors

    # 3. Arg-max class per token for both heads; level-2 probabilities computed once.
    preds_lvl1 = torch.argmax(logits_lvl1, dim=2)[0].tolist()
    preds_lvl2 = torch.argmax(logits_lvl2, dim=2)[0].tolist()
    probs_lvl2 = torch.softmax(logits_lvl2[0], dim=-1).cpu()

    special_ids = {tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id}

    results = []
    for idx, (pred1_id, pred2_id) in enumerate(zip(preds_lvl1, preds_lvl2)):
        # Skip special tokens.
        if token_ids[idx] in special_ids:
            continue

        start, end = offsets[idx]
        if start == end:  # token covers no characters of the input text
            continue

        entity_lvl1 = label_map_lvl1.get(pred1_id, "O")
        entity_lvl2 = label_map_lvl2.get(pred2_id, "O")

        # Both heads must predict an entity for the token to be reported.
        if entity_lvl1 == "O" or entity_lvl2 == "O":
            continue

        results.append({
            "entity": entity_lvl2,
            "score": np.float32(probs_lvl2[idx, pred2_id].item()),
            "index": idx,
            "word": tokens[idx],  # the tokenizer's own token string
            "start": start,
            "end": end
        })

    return results


# Single-head variant: tag with an ordinary token-classification model (one logits tensor).
def get_level2_entities_normal(model, tokenizer, sentence, label_map):
    """Tag ``sentence`` with a single-head token classifier.

    The model is moved to CUDA when available (CPU otherwise) and switched to
    eval mode as a side effect.

    Args:
        model: Model whose output ``.logits`` has shape
            ``[batch_size, seq_len, num_labels]``.
        tokenizer: Tokenizer able to return offset mappings.
        sentence: Text to tag; truncated to the tokenizer's maximum length.
        label_map: ``{class id: label}`` for the classifier head.

    Returns:
        A list of dicts, one per token whose label is not ``"O"``, with keys
        ``entity``, ``score`` (softmax probability as ``np.float32``),
        ``index``, ``word``, ``start`` and ``end``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1. Tokenize with offsets and place the model inputs on the target device.
    encoding = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True, truncation=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    offsets = encoding["offset_mapping"][0].tolist()
    token_ids = input_ids[0].tolist()  # plain Python ints for the tokenizer lookup
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # 2. Model forward
    model.to(device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # [batch_size, seq_len, num_labels]

    # 3. Arg-max class per token; probabilities computed once for the sequence.
    preds = torch.argmax(logits, dim=2)[0].tolist()
    probs = torch.softmax(logits[0], dim=-1).cpu()

    special_ids = {tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id}

    results = []
    for idx, (pred_id, offset) in enumerate(zip(preds, offsets)):
        # Skip special tokens ...
        if token_ids[idx] in special_ids:
            continue

        # ... and tokens with an empty character span, as the sibling
        # get_level2_entities_* functions do.
        start, end = offset
        if start == end:
            continue

        entity = label_map.get(pred_id, "O")
        if entity == "O":
            continue

        results.append({
            "entity": entity,
            "score": np.float32(probs[idx, pred_id].item()),
            "index": idx,
            "word": tokens[idx],
            "start": start,
            "end": end
        })

    return results





In [None]:
def Choose_label_map( name ) :
    """Return the ``{class id: label}`` table for a tagging scheme.

    Every table starts with ``0: 'O'`` and then lists each entity type in a
    fixed order, one consecutive id per tag prefix. The tables are generated
    from a single entity list so that the schemes cannot drift apart.

    Args:
        name: One of
            ``"BIO"``      - fine-grained types with ``B-``/``I-`` tags (ids 0-76);
            ``"BIOU"``     - fine-grained types with ``B-``/``I-``/``L-``/``U-`` tags (ids 0-152);
            ``"BIG"``      - coarse categories with ``B-``/``I-`` tags (ids 0-16);
            ``"BIO_CRF"``  - ``"BIO"`` plus a trailing ``"IGNORE"`` label (id 77);
            ``"BIOU_CRF"`` - ``"BIOU"`` plus a trailing ``"IGNORE"`` label (id 153).

    Returns:
        A newly built dict on every call, or ``None`` for an unknown ``name``.
    """
    # Order matters: the position in this list determines the class ids.
    fine_types = [
        "PATIENT", "DOCTOR", "USERNAME", "FAMILYNAME", "PERSONALNAME",
        "PROFESSION",
        "ROOM", "DEPARTMENT", "HOSPITAL", "ORGANIZATION", "STREET", "CITY",
        "DISTRICT", "COUNTY", "STATE", "COUNTRY", "ZIP", "LOCATION-OTHER",
        "AGE",
        "DATE", "TIME", "DURATION", "SET",
        "PHONE", "FAX", "EMAIL", "URL", "IPADDRESS",
        "SOCIAL_SECURITY_NUMBER", "MEDICAL_RECORD_NUMBER", "HEALTH_PLAN_NUMBER",
        "ACCOUNT_NUMBER", "LICENSE_NUMBER", "VEHICLE_ID", "DEVICE_ID",
        "BIOMETRIC_ID", "ID_NUMBER",
        "OTHER",
    ]
    coarse_types = [
        "NAME", "OCCUPATION", "LOCATION", "AGE", "DATE",
        "CONTACT_INFORMATION", "IDENTIFIERS", "OTHER",
    ]

    def build(entity_types, prefixes, with_ignore=False):
        """Enumerate 'O', then prefix-type labels per entity type, then optional 'IGNORE'."""
        labels = ["O"]
        for entity_type in entity_types:
            labels.extend(f"{prefix}-{entity_type}" for prefix in prefixes)
        if with_ignore:
            labels.append("IGNORE")
        return dict(enumerate(labels))

    bio = ("B", "I")
    biou = ("B", "I", "L", "U")

    if name == "BIO":
        return build(fine_types, bio)
    elif name == "BIOU":
        return build(fine_types, biou)
    elif name == "BIG":
        return build(coarse_types, bio)
    elif name == "BIO_CRF":
        return build(fine_types, bio, with_ignore=True)
    elif name == "BIOU_CRF":
        return build(fine_types, biou, with_ignore=True)
    else:
        return None



# Materialise every tagging scheme once, together with its inverse (label -> id) lookup.
# For each scheme, ``new_id2label_*`` is the very same dict object as ``*_label_map``.

# Fine-grained types, B-/I- tags.
BIO_label_map = Choose_label_map("BIO")
new_id2label_BIO = BIO_label_map
new_label2id_BIO = {label: idx for idx, label in BIO_label_map.items()}

# Fine-grained types, B-/I-/L-/U- tags.
BIOU_label_map = Choose_label_map("BIOU")
new_id2label_BIOU = BIOU_label_map
new_label2id_BIOU = {label: idx for idx, label in BIOU_label_map.items()}

# Coarse (level-1) categories, B-/I- tags.
BIG_label_map = Choose_label_map("BIG")
new_id2label_BIG = BIG_label_map
new_label2id_BIG = {label: idx for idx, label in BIG_label_map.items()}

# BIO scheme with the extra "IGNORE" label.
BIO_CRF_label_map = Choose_label_map("BIO_CRF")
new_id2label_BIO_CRF = BIO_CRF_label_map
new_label2id_BIO_CRF = {label: idx for idx, label in BIO_CRF_label_map.items()}

# BIOU scheme with the extra "IGNORE" label.
BIOU_CRF_label_map = Choose_label_map("BIOU_CRF")
new_id2label_BIOU_CRF = BIOU_CRF_label_map
new_label2id_BIOU_CRF = {label: idx for idx, label in BIOU_CRF_label_map.items()}

# `name` is left holding the last scheme requested, in case later cells read it.
name = "BIOU_CRF"

In [None]:
def Process_Predict_Ner_BIOUL(pre):
    """Merge token-level B-/I-/L-/U- predictions into entity spans.

    Rules, applied to the tokens in order:
      * ``B-X`` closes any open span and opens a new span of type X.
      * ``I-X`` extends the open span if it is of type X; otherwise it closes
        the open span and opens a new one of type X.
      * ``L-X`` extends and closes the open span if it is of type X; otherwise
        the token is emitted as a span of its own.
      * ``U-X`` is always emitted as a span of its own.
      * Any other tag (e.g. ``O``) closes the open span.
    A span that is still open when an unrelated ``L-``/``U-`` token arrives is
    emitted first rather than discarded, so no prediction is lost.

    When pieces are joined, a piece that begins with the SentencePiece word
    marker "▁" is preceded by a single space; the marker itself is removed.

    Args:
        pre: Iterable of dicts with keys ``entity`` and ``word`` and, optionally,
            ``start`` / ``end`` character offsets (``None`` when absent).

    Returns:
        A list of dicts with keys ``entity`` (type without prefix), ``word``,
        ``start`` and ``end``, in order of appearance.
    """
    answer_list = []
    pending = None  # span currently being assembled, or None

    def flush_pending():
        """Emit the open span (if it has both a type and text) and clear it."""
        nonlocal pending
        if pending is not None and pending["entity"] and pending["word"]:
            answer_list.append(pending)
        pending = None

    for dic in pre:
        entity_type = dic['entity']
        raw_word = dic['word']
        word = raw_word.replace("▁", "")
        token_start = dic.get('start')
        token_end = dic.get('end')
        # Text to append when this token continues an open span.
        piece = " " + word if raw_word.startswith("▁") else word
        # Slice rather than str.replace so only the leading prefix is removed.
        prefix, ent = entity_type[:2], entity_type[2:]
        continues_pending = pending is not None and pending["entity"] == ent

        if prefix == "B-":
            flush_pending()
            pending = {"entity": ent, "word": word, "start": token_start, "end": token_end}

        elif prefix == "I-":
            if continues_pending:
                pending["word"] += piece
                pending["end"] = token_end
            else:
                flush_pending()
                pending = {"entity": ent, "word": word, "start": token_start, "end": token_end}

        elif prefix == "L-":
            if continues_pending:
                pending["word"] += piece
                pending["end"] = token_end
                answer_list.append(pending)
                pending = None
            else:
                # Orphan L- token: keep whatever span was open, then emit the
                # token as an entity of its own.
                flush_pending()
                answer_list.append({"entity": ent, "word": word, "start": token_start, "end": token_end})

        elif prefix == "U-":
            # Single-token entity; an open span of another entity is kept, not dropped.
            flush_pending()
            answer_list.append({"entity": ent, "word": word, "start": token_start, "end": token_end})

        else:  # O (or any unrecognised tag)
            flush_pending()

    # Emit a span that is still open at the end of the input.
    flush_pending()

    return answer_list



def Process_Predict_Ner(pre):
    """Merge token-level B-/I- predictions into entity spans.

    Rules, applied to the tokens in order:
      * ``B-X`` closes any open span and opens a new span of type X.
      * ``I-X`` extends the open span if it is of type X; otherwise it closes
        the open span and opens a new one of type X.
      * Any other tag (e.g. ``O``) closes the open span.

    When pieces are joined, a piece that begins with the SentencePiece word
    marker "▁" is preceded by a single space; the marker itself is removed.

    NOTE(review): the ``get_level2_entities_*`` helpers in this notebook only
    return non-"O" tokens. If their output is passed here, two same-type spans
    separated only by untagged text are merged whenever the second one starts
    with ``I-`` - confirm whether that is intended.

    Args:
        pre: Iterable of dicts with keys ``entity`` and ``word`` and, optionally,
            ``start`` / ``end`` character offsets (``None`` when absent).

    Returns:
        A list of dicts with keys ``entity`` (type without prefix), ``word``,
        ``start`` and ``end``, in order of appearance.
    """
    answer_list = []
    pending = None  # span currently being assembled, or None

    def flush_pending():
        """Emit the open span (if it has both a type and text) and clear it."""
        nonlocal pending
        if pending is not None and pending["entity"] and pending["word"]:
            answer_list.append(pending)
        pending = None

    for dic in pre:
        entity_type = dic['entity']
        raw_word = dic['word']
        word = raw_word.replace("▁", "")
        token_start = dic.get('start')
        token_end = dic.get('end')
        # Slice rather than str.replace so only the leading prefix is removed
        # (replace would also delete "B-"/"I-" occurring inside the type name).
        prefix, ent = entity_type[:2], entity_type[2:]

        if prefix == "I-" and pending is not None and pending["entity"] == ent:
            # Continuation of the open span.
            pending["word"] += " " + word if raw_word.startswith("▁") else word
            pending["end"] = token_end
        elif prefix in ("B-", "I-"):
            # B- always starts a new span; an I- that does not match the open
            # span is treated as the start of a new one as well.
            flush_pending()
            pending = {"entity": ent, "word": word, "start": token_start, "end": token_end}
        else:  # O (or any unrecognised tag)
            flush_pending()

    # Emit a span that is still open at the end of the input.
    flush_pending()

    return answer_list


def Process_Predict_level(pre):
    """Merge token-level B-/I- predictions into entity spans.

    Alias of ``Process_Predict_Ner``: both names apply the same B-/I- merging
    rules, so the logic lives in one place only.

    Args:
        pre: Iterable of dicts with keys ``entity`` and ``word`` and, optionally,
            ``start`` / ``end`` character offsets.

    Returns:
        A list of dicts with keys ``entity`` (type without prefix), ``word``,
        ``start`` and ``end``, in order of appearance.
    """
    return Process_Predict_Ner(pre)


# **CRF**

In [None]:
!pip install torchcrf
!pip install pytorch-crf

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoConfig
from torchcrf import CRF


from transformers.modeling_outputs import TokenClassifierOutput

from transformers import XLMRobertaConfig, XLMRobertaModel, XLMRobertaPreTrainedModel
from transformers import PreTrainedModel
from transformers import XLMRobertaForTokenClassification



class XLMRobertaWithCRF(XLMRobertaPreTrainedModel):
    """XLM-RoBERTa encoder followed by a linear emission layer and a CRF loss.

    ``forward`` returns the raw per-token emission scores as ``logits``; the CRF
    is used only to compute the training loss (no Viterbi decoding happens here).
    """

    def __init__(self, config):
        """Build the encoder, dropout, emission layer and CRF.

        Args:
            config: Model config; ``num_labels``, ``hidden_dropout_prob`` and
                ``hidden_size`` are read from it.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        # The encoder weights are loaded from the Hub checkpoint right here, using the
        # notebook-level `access_token`; they are not merely initialised from `config`.
        self.roberta = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english", config=config, token=access_token)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.crf = CRF(config.num_labels, batch_first=True)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Compute emission scores and, when ``labels`` are given, the CRF loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional attention mask; when omitted, every position
                is treated as valid for the CRF loss.
            labels: Optional gold label ids. They are handed to the CRF as-is, so
                they must be valid class indices.

        Returns:
            ``TokenClassifierOutput`` with ``loss`` (mean negative CRF
            log-likelihood, or ``None`` without labels) and ``logits`` (emissions).
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)
        emissions = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if attention_mask is None:
                # The encoder accepts a missing mask, so the loss must cope with it too.
                mask = torch.ones_like(labels, dtype=torch.bool)
            else:
                # clone(): .bool() returns the same tensor when the mask is already
                # boolean, and the caller's mask must not be modified in place.
                mask = attention_mask.bool().clone()
                # Force the first position on - presumably because the CRF layer
                # rejects masks whose first timestep is off (see pytorch-crf docs).
                mask[:, 0] = True
            loss = -self.crf(emissions, labels, mask=mask, reduction='mean')
        return TokenClassifierOutput(
            loss=loss,
            logits=emissions,
        )


# **FOCAL LOSS**

In [None]:
from transformers import XLMRobertaForTokenClassification
import torch.nn as nn
from transformers.modeling_outputs import TokenClassifierOutput
import torch.nn.functional as F # Import F



class XLMRobertaForTokenClassificationWithFocalLoss(XLMRobertaForTokenClassification):
    """Token classifier whose training loss is focal loss instead of the parent's loss.

    The parent model is always called without labels; the loss is then computed
    here from the returned logits.
    """

    def __init__(self, config):
        """Set up the parent model plus the focal-loss hyper-parameters."""
        super().__init__(config)
        # NOTE(review): the lookup key is the string "0" (digit zero). Unless label2id
        # really contains that key, the ignore index is -100 - confirm the intent.
        if hasattr(config, "label2id"):
            self.ignore_index = config.label2id.get("0", -100)
        else:
            self.ignore_index = -100
        self.gamma = 2.0  # focal loss gamma

    def compute_focal_loss(self, logits, targets):
        """Return the mean focal loss over all targets different from ``ignore_index``.

        Per kept token the loss is ``-(1 - p) ** gamma * log(p)``, where ``p`` is
        the softmax probability of the target class. Returns a zero scalar
        (dtype and device of ``logits``) when every target is ignored.
        """
        flat_targets = targets.view(-1)
        all_log_probs = F.log_softmax(logits, dim=-1)
        flat_log_probs = all_log_probs.view(-1, all_log_probs.size(-1))

        # Keep only the positions that carry a real label.
        keep = flat_targets != self.ignore_index
        kept_targets = flat_targets[keep]
        if kept_targets.numel() == 0:
            return torch.tensor(0.0, dtype=logits.dtype, device=logits.device)

        kept_log_probs = flat_log_probs[keep]
        kept_probs = torch.exp(kept_log_probs)

        # Probability and log-probability assigned to each token's target class.
        target_column = kept_targets.unsqueeze(1)
        target_prob = kept_probs.gather(1, target_column).squeeze()
        target_log_prob = kept_log_probs.gather(1, target_column).squeeze()

        focal_weight = (1 - target_prob) ** self.gamma
        per_token_loss = -focal_weight * target_log_prob
        return per_token_loss.mean()

    def forward(self, input_ids=None, attention_mask=None, labels=None, ):
        """Run the parent classifier and attach the focal loss when labels are given."""
        # labels=None: the parent only produces logits; the loss is computed below.
        base_outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=None,
        )
        logits = base_outputs.logits

        if labels is None:
            loss = None
        else:
            loss = self.compute_focal_loss(logits, labels)

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=base_outputs.hidden_states,
            attentions=base_outputs.attentions,
        )

# **LEVEL**

In [None]:
from transformers import XLMRobertaPreTrainedModel, XLMRobertaModel
import torch.nn as nn
from transformers.modeling_outputs import TokenClassifierOutput

class XLMRobertaForHierarchicalTokenClassification(XLMRobertaPreTrainedModel):
    """XLM-RoBERTa encoder with two parallel token-classification heads.

    ``classifier_lvl1`` and ``classifier_lvl2`` are independent linear layers
    applied to the same (dropout-regularised) encoder output. Their sizes are
    read from the custom config attributes ``num_labels_lvl1`` and
    ``num_labels_lvl2``.
    """

    def __init__(self, config):
        super().__init__(config)
        # nn.Linear below raises if either attribute is missing from the config.
        self.num_labels_lvl1 = getattr(config, "num_labels_lvl1", None)
        self.num_labels_lvl2 = getattr(config, "num_labels_lvl2", None)

        self.roberta = XLMRobertaModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.classifier_lvl1 = nn.Linear(config.hidden_size, self.num_labels_lvl1)
        self.classifier_lvl2 = nn.Linear(config.hidden_size, self.num_labels_lvl2)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, labels_lvl1=None, labels_lvl2=None, choose_loss=1):
        """Compute both heads' logits and, when both label tensors are given, the joint loss.

        Args:
            input_ids, attention_mask: forwarded to the XLM-RoBERTa encoder.
            labels_lvl1, labels_lvl2: gold label ids for each head; the loss is
                only computed when both are provided.
            choose_loss: ``1`` sums plain cross-entropy over both heads;
                ``2`` computes the level-2 term only on tokens whose level-1
                label is not id 0 (id 0 is assumed to be "O").

        Returns:
            TokenClassifierOutput whose ``logits`` is the tuple
            ``(logits_lvl1, logits_lvl2)``.

        Raises:
            ValueError: if ``choose_loss`` is neither 1 nor 2 (previously the
                method fell through and silently returned ``None``).
        """
        if choose_loss not in (1, 2):
            raise ValueError(f"choose_loss must be 1 or 2, got {choose_loss!r}")

        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs[0])

        logits_lvl1 = self.classifier_lvl1(sequence_output)
        logits_lvl2 = self.classifier_lvl2(sequence_output)

        loss = None
        if labels_lvl1 is not None and labels_lvl2 is not None:
            loss_fct = nn.CrossEntropyLoss()  # default ignore_index is -100

            flat_logits_lvl1 = logits_lvl1.view(-1, self.num_labels_lvl1)
            flat_logits_lvl2 = logits_lvl2.view(-1, self.num_labels_lvl2)
            flat_labels_lvl1 = labels_lvl1.view(-1)
            flat_labels_lvl2 = labels_lvl2.view(-1)

            loss1 = loss_fct(flat_logits_lvl1, flat_labels_lvl1)

            if choose_loss == 1:
                loss2 = loss_fct(flat_logits_lvl2, flat_labels_lvl2)
            else:
                # Level-2 loss only where level 1 is not "O" (id 0). Positions
                # whose level-2 label is the ignore index are removed from the
                # mask as well: they never contribute to the mean, but if they
                # were the only ones selected the mean-reduced cross-entropy
                # would be 0/0 = NaN and poison the whole training step.
                active_mask = (flat_labels_lvl1 != 0) & (flat_labels_lvl2 != loss_fct.ignore_index)
                if active_mask.any():
                    loss2 = loss_fct(flat_logits_lvl2[active_mask], flat_labels_lvl2[active_mask])
                else:
                    # No scorable level-2 token in this batch: contribute nothing.
                    loss2 = logits_lvl2.new_zeros(())

            loss = loss1 + loss2

        return TokenClassifierOutput(
            loss=loss,
            logits=(logits_lvl1, logits_lvl2),  # NOTE: a tuple, one entry per level
        )




# **weight_class**

In [None]:
from transformers import XLMRobertaForTokenClassification
import torch.nn as nn
from transformers.modeling_outputs import TokenClassifierOutput
import torch.nn.functional as F # Import F



from transformers import XLMRobertaForTokenClassification, AutoConfig
import torch.nn as nn
from transformers.modeling_outputs import TokenClassifierOutput
import torch


class XLMRobertaForTokenClassificationWithClassWeight(XLMRobertaForTokenClassification):
    """XLM-RoBERTa token classifier trained with (optionally) class-weighted cross-entropy.

    The parent's logits are reused; the returned loss always comes from
    ``self.loss_fct`` so that ``class_weights`` take effect.
    """

    def __init__(self, config, class_weights=None):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Label -100 is excluded from the loss; the weight tensor is only
        # passed when the caller supplied one.
        loss_kwargs = {"ignore_index": -100}
        if class_weights is not None:
            loss_kwargs["weight"] = class_weights
        self.loss_fct = nn.CrossEntropyLoss(**loss_kwargs)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Return logits from the parent model plus the class-weighted loss when labels are given."""
        # The original author marked forwarding `labels` to the parent as
        # required; the parent's own loss is discarded and replaced below.
        base_outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        token_logits = base_outputs.logits

        weighted_loss = None
        if labels is not None:
            flat_logits = token_logits.contiguous().view(-1, self.num_labels)
            flat_labels = labels.contiguous().view(-1)
            weighted_loss = self.loss_fct(flat_logits, flat_labels)

        return TokenClassifierOutput(
            loss=weighted_loss,
            logits=token_logits,
            hidden_states=base_outputs.hidden_states,
            attentions=base_outputs.attentions,
        )




# **預測**

In [None]:
import numpy as np
import pandas as pd

def calculate_overlap(pred_start, pred_end, gt_start, gt_end):
    """Return the length of the overlap between two intervals (0 when they are disjoint)."""
    return max(0, min(pred_end, gt_end) - max(pred_start, gt_start))

def evaluate_task2(prediction_file, ground_truth_file):
    """Score predicted entity spans against ground truth and return the macro-averaged F1.

    Both files are read as header-less, tab-separated tables with the columns
    ``id, type, start, end, content``. For every id, each prediction is paired
    with the same-type ground-truth span it overlaps most; the overlap length
    is accumulated as TP, the uncovered part of the prediction as FP and the
    uncovered part of that ground-truth span as FN. Per-type precision, recall
    and F1 are printed, followed by the macro average.

    Args:
        prediction_file: path of the prediction TSV.
        ground_truth_file: path of the ground-truth TSV.

    Returns:
        The mean of the per-type F1 scores over every type that appears in
        either file.
    """
    # Load predictions and ground-truth annotations.
    pred_df = pd.read_csv(prediction_file, sep='\t', header=None,
                         names=['id', 'type', 'start', 'end', 'content'])
    gt_df = pd.read_csv(ground_truth_file, sep='\t', header=None,
                       names=['id', 'type', 'start', 'end', 'content'])

    # Every distinct entity type seen in either file.
    all_types = sorted(set(gt_df['type'].unique()) | set(pred_df['type'].unique()))

    # Per-type accumulators (lengths, not counts).
    metrics = {shi_type: {'tp': 0, 'fp': 0, 'fn': 0} for shi_type in all_types}

    # Process the records one id at a time.
    unique_ids = sorted(set(gt_df['id'].unique()) | set(pred_df['id'].unique()))

    for audio_id in unique_ids:
        gt_records = gt_df[gt_df['id'] == audio_id].copy()
        pred_records = pred_df[pred_df['id'] == audio_id].copy()

        # Flags tracking which ground-truth / predicted rows have been paired.
        gt_matched = [False] * len(gt_records)
        pred_matched = [False] * len(pred_records)

        # Pass 1: true positives plus the FP/FN remainders of partial matches.
        for i, pred_row in enumerate(pred_records.itertuples()):
            pred_type = pred_row.type
            pred_start = pred_row.start
            pred_end = pred_row.end
            pred_duration = pred_end - pred_start

            best_overlap = 0
            best_gt_idx = -1

            # Find the same-type ground-truth span with the largest overlap.
            for j, gt_row in enumerate(gt_records.itertuples()):
                if gt_row.type != pred_type:
                    continue

                overlap = calculate_overlap(pred_start, pred_end, gt_row.start, gt_row.end)
                if overlap > best_overlap:
                    best_overlap = overlap
                    best_gt_idx = j

            if best_gt_idx >= 0:  # a (possibly partial) match was found
                gt_row = gt_records.iloc[best_gt_idx]
                gt_duration = gt_row.end - gt_row.start

                # True positive: the overlapping length.
                metrics[pred_type]['tp'] += best_overlap

                # False positive: the part of the prediction outside the overlap.
                metrics[pred_type]['fp'] += pred_duration - best_overlap

                # False negative: the part of the ground truth outside the overlap.
                # A ground-truth span is not removed once matched, so several
                # predictions can pair with it and each adds its own remainder.
                metrics[pred_type]['fn'] += gt_duration - best_overlap

                # Mark both rows as handled.
                gt_matched[best_gt_idx] = True
                pred_matched[i] = True
            else:
                # No same-type overlap at all: the whole prediction is a false positive.
                metrics[pred_type]['fp'] += pred_duration

        # Pass 2: ground-truth spans that no prediction matched are false negatives.
        for j, matched in enumerate(gt_matched):
            if not matched:
                gt_row = gt_records.iloc[j]
                gt_type = gt_row.type
                gt_duration = gt_row.end - gt_row.start
                metrics[gt_type]['fn'] += gt_duration

        # Pass 3: unmatched predictions that overlap a span of a different type.
        # NOTE(review): these predictions were already charged their full
        # duration as FP in the `else` branch of pass 1, so this adds the same
        # duration a second time - confirm the double penalty is intended
        # (e.g. that it mirrors the reference scorer).
        for i, (matched, pred_row) in enumerate(zip(pred_matched, pred_records.itertuples())):
            if matched:
                continue

            # Check for an overlap with a ground-truth span of another type.
            pred_type = pred_row.type
            pred_start = pred_row.start
            pred_end = pred_row.end
            pred_duration = pred_end - pred_start

            for gt_row in gt_records.itertuples():
                if gt_row.type == pred_type:
                    continue  # same-type spans were handled in pass 1

                overlap = calculate_overlap(pred_start, pred_end, gt_row.start, gt_row.end)
                if overlap > 0:
                    # Type mismatch with overlapping span: whole prediction counted as FP.
                    metrics[pred_type]['fp'] += pred_duration
                    break

    # Per-type precision, recall and F1 (0 whenever a denominator is 0).
    f1_scores = []
    for shi_type in all_types:
        m = metrics[shi_type]
        precision = m['tp'] / (m['tp'] + m['fp']) if (m['tp'] + m['fp']) > 0 else 0
        recall = m['tp'] / (m['tp'] + m['fn']) if (m['tp'] + m['fn']) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)

        print(f"類型 {shi_type}:")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1: {f1:.4f}")
        print(f"  TP: {m['tp']:.2f}, FP: {m['fp']:.2f}, FN: {m['fn']:.2f}")
        print()

    # Macro average: unweighted mean of the per-type F1 scores.
    macro_f1 = np.mean(f1_scores)
    print(f"Macro-Average F1: {macro_f1:.4f}")

    return macro_f1



In [None]:
from transformers import AutoTokenizer
import pandas as pd


# Tokenizer of the base checkpoint; `access_token` is read from config.json in
# the first cell. Note that `model_name` is rebound to a list of model paths
# in the evaluation cell further down.
model_name = "xlm-roberta-large-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)

# Load the task1 file as a mapping {id: sentence}
def load_task1(filepath):
    """Read a task1 text file and return ``{id: text}``.

    Each line is stripped and split once on whitespace into an id and the
    remaining text; lines that do not yield both parts are skipped. When an
    id occurs more than once, the last occurrence wins.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        split_lines = (raw.strip().split(maxsplit=1) for raw in handle)
        return {fields[0]: fields[1] for fields in split_lines if len(fields) == 2}

# Load the task2 file as a DataFrame
def load_task2(filepath):
    """Read a task2 TSV file into a DataFrame of string columns.

    Lines are stripped first; only lines that split into exactly five
    tab-separated fields are kept, and every field is stripped again.
    Columns: id, type, start_time, end_time, content.
    """
    column_names = ["id", "type", "start_time", "end_time", "content"]
    with open(filepath, "r", encoding="utf-8") as handle:
        # A blank line splits into a single empty field and is dropped by the
        # five-field check together with any other malformed line.
        candidates = (raw.strip().split('\t') for raw in handle)
        records = [
            [field.strip() for field in fields]
            for fields in candidates
            if len(fields) == 5
        ]
    return pd.DataFrame(records, columns=column_names)

def find_nth_occurrence(text, substring, n):
    """Return the start index of the n-th (1-based) occurrence of ``substring`` in ``text``.

    Occurrences may overlap, because each search resumes one character after
    the previous hit. Returns -1 when there are fewer than ``n`` occurrences
    or when ``n`` is not positive.
    """
    found_at = -1
    search_from = 0
    for _ in range(n):
        found_at = text.find(substring, search_from)
        if found_at < 0:
            return -1
        search_from = found_at + 1
    return found_at


def map_entities_to_char_indices_with_duplicates(task1_file, task2_file, output_file=None):
    """Locate every task2 entity inside its task1 sentence as character offsets.

    The k-th row that repeats the same (id, entity text) pair is mapped to the
    k-th occurrence of that text in the sentence. Rows whose id is missing
    from task1, or whose text cannot be found that many times, get
    ``start_char == end_char == -1``.

    Args:
        task1_file: path understood by ``load_task1``.
        task2_file: path understood by ``load_task2``.
        output_file: optional path; when truthy the rows are also written
            there as tab-separated lines.

    Returns:
        DataFrame with columns id, type, start_char, end_char, content.
    """
    sentences = load_task1(task1_file)
    annotations = load_task2(task2_file)

    # How many times each (id, entity text) pair has been seen so far.
    seen_counts = {}

    rows = []
    for annotation in annotations.itertuples():
        doc_id = str(annotation.id)
        surface = annotation.content.strip()
        label = annotation.type

        begin, finish = -1, -1
        if doc_id in sentences:
            pair = (doc_id, surface)
            seen_counts[pair] = seen_counts.get(pair, 0) + 1

            begin = find_nth_occurrence(sentences[doc_id], surface, seen_counts[pair])
            if begin != -1:
                finish = begin + len(surface)

        rows.append([doc_id, label, begin, finish, surface])

    if output_file:
        with open(output_file, "w", encoding="utf-8") as out:
            out.writelines("\t".join(map(str, row)) + "\n" for row in rows)

    return pd.DataFrame(rows, columns=['id', 'type', 'start_char', 'end_char', 'content'])




In [None]:
# Build the character-offset reference annotations from the task1/task2 test
# files and write them to ./entity_token_indices.txt, which the evaluation
# cell below passes to evaluate_task2 as the ground-truth file.
df_result = map_entities_to_char_indices_with_duplicates(
    model_test_task1_data_path_txt,
    model_test_task2_data_path_txt,
    "./entity_token_indices.txt"
)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tabulate import tabulate  # pip install tabulate
from collections import Counter




# Task1 test data: one raw line per sample, parsed later in the evaluation loop
with open( model_test_task1_data_path_txt, "r", encoding="utf-8" ) as f :
  test_data = f.readlines()


model_name = [
           model_1_path
             ]  # list of model paths to evaluate

# One summary dict per (model, decoding pass); rendered as a table at the end.
results = []

# Evaluate every model in `model_name`: load it according to the keywords in
# its path, predict entities for each test line, write the predictions to a
# TSV file and score that file against ./entity_token_indices.txt.
for model_id in model_name:
    # 0 = flat tagger; 1 / 2 = hierarchical ("level") model, decoded with the
    # loss variant named in the model path.
    choose_loss = 0

    # ---- Load tokenizer / model and pick the label maps -------------------
    # The branch order matters: more specific keywords are tested first.
    if "level" in model_id:
        if "loss_1" in model_id:
            choose_loss = 1
        elif "loss_2" in model_id:
            choose_loss = 2

        tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
        model = XLMRobertaForHierarchicalTokenClassification.from_pretrained(model_id)

        new_label2id = new_label2id_BIO
        new_id2label = new_id2label_BIO
        label_map = BIO_label_map
        label_map_big = BIG_label_map

    elif "focal_loss_BIOU" in model_id:
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
        model = XLMRobertaForTokenClassificationWithFocalLoss.from_pretrained(model_id)

        new_label2id = new_label2id_BIOU
        new_id2label = new_id2label_BIOU
        label_map = BIOU_label_map

    elif "focal" in model_id:
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
        model = XLMRobertaForTokenClassificationWithFocalLoss.from_pretrained(model_id)

        new_label2id = new_label2id_BIO
        new_id2label = new_id2label_BIO
        label_map = BIO_label_map

    elif "crf_BIOU" in model_id:
        tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english", token=access_token)
        model = XLMRobertaWithCRF.from_pretrained(model_id)

        new_label2id = new_label2id_BIOU_CRF
        new_id2label = new_id2label_BIOU_CRF
        label_map = BIOU_CRF_label_map

    elif "crf" in model_id:
        # Also covers "crf_FGM", which was loaded in exactly the same way.
        tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english", token=access_token)
        model = XLMRobertaWithCRF.from_pretrained(model_id)

        new_label2id = new_label2id_BIO_CRF
        new_id2label = new_id2label_BIO_CRF
        label_map = BIO_CRF_label_map

    elif "weight_class_BIOU" in model_id:
        tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english", token=access_token)
        model = XLMRobertaForTokenClassificationWithClassWeight.from_pretrained(model_id, token=access_token, ignore_mismatched_sizes=True)

        new_label2id = new_label2id_BIOU
        new_id2label = new_id2label_BIOU
        label_map = BIOU_label_map

    elif "weight_class" in model_id:
        tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english", token=access_token)
        model = XLMRobertaForTokenClassificationWithClassWeight.from_pretrained(model_id, token=access_token, ignore_mismatched_sizes=True)

        new_label2id = new_label2id_BIO
        new_id2label = new_id2label_BIO
        label_map = BIO_label_map

    else:
        # Plain token-classification checkpoint: decoded through the HF pipeline.
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
        model = AutoModelForTokenClassification.from_pretrained(model_id, token=access_token)
        classifier = pipeline("ner", model=model, tokenizer=tokenizer)

        if "BIOU" in model_id:  # "FGM_BIOU" contains "BIOU" as well
            new_label2id = new_label2id_BIOU
            new_id2label = new_id2label_BIOU
            label_map = BIOU_label_map
        elif "BIO" in model_id or "FGM" in model_id:
            new_label2id = new_label2id_BIO
            new_id2label = new_id2label_BIO
            label_map = BIO_label_map

    # ---- Pick decoder + post-processor for flat taggers -------------------
    # Depends only on the model path, so it is resolved once per model instead
    # of once per test line. The keyword order is the same as before.
    # NOTE(review): this chain tests "BIOU"/"BIO" before "focal"/"crf"/
    # "weight_class", so a path such as "..._focal_..._BIO" would be decoded
    # with `classifier`, which the loading branch for that model never sets.
    # Confirm the real model paths cannot hit this combination.
    use_pipeline = False
    postprocess = None
    if choose_loss == 0:
        if any(tag in model_id for tag in ("crf_BIOU", "focal_loss_BIOU", "weight_class_BIOU")):
            postprocess = Process_Predict_Ner_BIOUL
        elif "BIOU" in model_id:
            use_pipeline, postprocess = True, Process_Predict_Ner_BIOUL
        elif "BIO" in model_id:
            use_pipeline, postprocess = True, Process_Predict_Ner
        elif any(tag in model_id for tag in ("focal", "crf", "weight_class")):
            postprocess = Process_Predict_Ner
        elif "FGM" in model_id:
            use_pipeline, postprocess = True, Process_Predict_Ner

    # Hierarchical models are decoded three times, once per decoding strategy.
    total_time = 3 if "level" in model_id else 1

    for times in range(total_time):
        # Start every pass with an empty prediction buffer. Previously the
        # buffer was created once per model, so passes 1 and 2 of a "level"
        # model were scored on their own predictions plus those of all
        # earlier passes.
        answer_lines = []

        for line in test_data:
            # Skip blank / malformed lines instead of crashing on the unpack,
            # in line with how load_task1 treats them.
            fields = line.strip().split("\t", 1)
            if len(fields) != 2:
                continue
            sample_id, text = fields

            answer_list = []

            if choose_loss == 0:
                if postprocess is not None:
                    if use_pipeline:
                        pre = classifier(text)
                    else:
                        pre = get_level2_entities_normal(model, tokenizer, text, label_map)
                    if len(pre) != 0:
                        answer_list = postprocess(pre)
            else:
                if times == 0:
                    pre = get_level2_entities_1(model, tokenizer, text, label_map_big, label_map, choose_loss)
                elif times == 1:
                    pre = get_level2_entities_2(model, tokenizer, text, label_map, choose_loss)
                else:
                    pre = get_level2_entities_3(model, tokenizer, text, label_map_big, label_map, choose_loss)

                if len(pre) != 0:
                    answer_list = Process_Predict_level(pre)

            for i in answer_list:
                answer_lines.append(f"{sample_id}\t{i['entity']}\t{i['start']}\t{i['end']}\t{i['word']}\n")

        answer = "".join(answer_lines)

        # The file name does not include the pass index, so each pass of a
        # hierarchical model overwrites the previous one (unchanged behaviour);
        # the score is taken right after writing.
        name = model_id[model_id.find("model"):]
        file_name = f"./pre_answer_{ name }_{choose_loss}.txt"

        with open(file_name, "w", encoding="utf-8") as f:
            f.write(answer)

        macro_f1 = evaluate_task2(file_name, "./entity_token_indices.txt")

        results.append({
            "Model": model_id,
            "Choose Loss": choose_loss,
            "F1 Score": round(macro_f1, 4),
            "total_time": total_time,
            "now_time": times
        })

# Finally, render all collected scores as one table.
print(tabulate(results, headers="keys", tablefmt="github"))


### **=============================================================================================**