In [None]:
import json

# Load the run configuration (credentials and data/model paths) from config.json.
with open("config.json", "r", encoding="utf-8") as f:
    config_argument = json.load(f)

access_token = config_argument["huggingface_access_token"]

task1_train_data_path_txt = config_argument["model_train_task1_data_path_txt"]
task2_train_data_path_txt = config_argument["model_train_task2_data_path_txt"]

task1_val_data_path_txt = config_argument["model_val_task1_data_path_txt"]
task2_val_data_path_txt = config_argument["model_val_task2_data_path_txt"]

answer_val_data_path_txt = config_argument["answer_val_data_path_txt"]

model_save_path = config_argument["model_save_path"]
model_logging_dir = config_argument["model_logging_dir"]


# Never echo the token itself: cell outputs are stored in the notebook file and
# get shared/committed together with it. Only report whether a token is present.
print( "access_token: ", "<set>" if access_token else "<missing>" )
print( "task1_train_data_path: ", task1_train_data_path_txt )
print( "task2_train_data_path: ", task2_train_data_path_txt )
print( "task1_val_data_path: ", task1_val_data_path_txt )
print( "task2_val_data_path: ", task2_val_data_path_txt )
print( "answer_val_data_path: ", answer_val_data_path_txt )
print( "model_save_path: ", model_save_path )
print( "model_logging_dir: ", model_logging_dir )


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from transformers import pipeline

model_id = "xlm-roberta-large-finetuned-conll03-english"

# Load the pretrained checkpoint and run one sentence through the "ner"
# pipeline as a smoke test. `access_token` comes from the configuration cell.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
model = AutoModelForTokenClassification.from_pretrained(model_id, token=access_token)
classifier = pipeline("ner", model=model, tokenizer=tokenizer)
classifier("Alya told Jasmine that Andrew could pay with cash..")



In [None]:
# Dropout settings stored in the encoder's configuration.
encoder_config = model.roberta.config
for option in ("hidden_dropout_prob", "attention_probs_dropout_prob"):
    print(f"{option}:", getattr(encoder_config, option))

# Probability held by the model's own dropout layer.
print("model.dropout.p:", model.dropout.p)

In [None]:
# List every special token the tokenizer declares together with its vocabulary id.
print("Special tokens:")
for name, token in tokenizer.special_tokens_map.items():
    token_id = tokenizer.convert_tokens_to_ids(token)
    print(f"{name:20} -> {token:10} (ID: {token_id})")

In [23]:
# Tokenize one sample utterance (also requesting character offsets) and print
# the resulting sub-word pieces to see how the tokenizer splits the text.
tokens = tokenizer("really I used to have that. I used to get excited when I could have the afternoon to myself, but I don't really have that any more. I try to fill my time with other things. On Wednesdays, for example, Amelia is at school until nine o'clock at night, so Wednesday is like my night to try to go see some friends or something like that, and keep myself stay at work late or do something right after work, so I'm not just like at home by myself.", return_offsets_mapping=True)
print(tokens.tokens())

['<s>', '▁really', '▁I', '▁used', '▁to', '▁have', '▁that', '.', '▁I', '▁used', '▁to', '▁get', '▁excited', '▁when', '▁I', '▁could', '▁have', '▁the', '▁afternoon', '▁to', '▁myself', ',', '▁but', '▁I', '▁don', "'", 't', '▁really', '▁have', '▁that', '▁any', '▁more', '.', '▁I', '▁try', '▁to', '▁fill', '▁my', '▁time', '▁with', '▁other', '▁things', '.', '▁On', '▁Wednesday', 's', ',', '▁for', '▁example', ',', '▁Am', 'elia', '▁is', '▁at', '▁school', '▁until', '▁ni', 'ne', '▁o', "'", 'c', 'lock', '▁at', '▁night', ',', '▁so', '▁Wednesday', '▁is', '▁like', '▁my', '▁night', '▁to', '▁try', '▁to', '▁go', '▁see', '▁some', '▁friends', '▁or', '▁something', '▁like', '▁that', ',', '▁and', '▁keep', '▁myself', '▁stay', '▁at', '▁work', '▁late', '▁or', '▁do', '▁something', '▁right', '▁after', '▁work', ',', '▁so', '▁I', "'", 'm', '▁not', '▁just', '▁like', '▁at', '▁home', '▁by', '▁myself', '.', '</s>']


In [24]:

# Fetch the checkpoint's configuration to inspect the label set it was trained with.
config = AutoConfig.from_pretrained(model_id, token=access_token)
print(config.id2label)  # lists the label that each id maps to

{0: 'B-LOC', 1: 'B-MISC', 2: 'B-ORG', 3: 'I-LOC', 4: 'I-MISC', 5: 'I-ORG', 6: 'I-PER', 7: 'O'}


In [25]:
# Entity types of the tagging scheme, in id order. Id 0 is the outside tag 'O';
# after that every type owns four consecutive ids, one per span prefix
# (B- begin, I- inside, L- last, U- single-token unit).
_ENTITY_TYPES = (
    'PATIENT', 'DOCTOR', 'USERNAME', 'FAMILYNAME', 'PERSONALNAME',
    'PROFESSION',
    'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY',
    'DISTRICT', 'COUNTY', 'STATE', 'COUNTRY', 'ZIP', 'LOCATION-OTHER',
    'AGE',
    'DATE', 'TIME', 'DURATION', 'SET',
    'PHONE', 'FAX', 'EMAIL', 'URL', 'IPADDRESS',
    'SOCIAL_SECURITY_NUMBER', 'MEDICAL_RECORD_NUMBER', 'HEALTH_PLAN_NUMBER',
    'ACCOUNT_NUMBER', 'LICENSE_NUMBER', 'VEHICLE_ID', 'DEVICE_ID',
    'BIOMETRIC_ID', 'ID_NUMBER',
    'OTHER',
)
_SPAN_PREFIXES = ('B', 'I', 'L', 'U')

label_map = {0: 'O'}
label_map.update(
    (1 + len(_SPAN_PREFIXES) * type_pos + prefix_pos, f"{prefix}-{entity_type}")
    for type_pos, entity_type in enumerate(_ENTITY_TYPES)
    for prefix_pos, prefix in enumerate(_SPAN_PREFIXES)
)


資料準備


In [26]:
def Caculate_Wav_File_Times( inputs ) :
    """Count how many lines start with each leading tab-separated field.

    Args:
        inputs: iterable of text lines; each line is stripped and split on tabs.

    Returns:
        dict mapping the first field of a line to the number of lines that
        carry it, in order of first appearance.
    """
    occurrences = {}
    for raw_line in inputs:
        key = raw_line.strip().split('\t')[0]
        occurrences[key] = occurrences.get(key, 0) + 1
    return occurrences

In [27]:
# with open( "/content/task2_answer.txt", "r", encoding="utf-8" ) as f :
#   data = f.readlines()

def Prepare_Task2_NER(data) :
    """Group annotation lines by their file id.

    Each non-blank line is stripped and split on tabs; field 0 is the file id,
    field 1 the tag and field 4 the annotated text.

    Lines are grouped by id in a single pass, so the annotations of one id do
    not have to be contiguous in ``data`` and blank lines are ignored. The
    caller's list is not modified.

    Args:
        data: iterable of annotation lines.

    Returns:
        dict mapping each file id (in order of first appearance) to a list of
        single-entry dicts ``{annotated_text: tag}``, in line order.

    Raises:
        IndexError: if a non-blank line has fewer than five tab-separated fields.
    """
    data_list = {}

    for raw_line in data:
        line = raw_line.strip()
        if not line:
            # e.g. an empty line at the end of the file
            continue

        line_split = line.split("\t")
        name = line_split[0]
        tag = line_split[1]
        word = line_split[4]

        data_list.setdefault(name, []).append({word: tag})

    return data_list



In [28]:
#en
# Read the task-2 training annotation file configured in config.json.
with open( task2_train_data_path_txt, "r", encoding="utf-8" ) as f :
  data = f.readlines()

# Number of annotation lines that were read.
print(len(data))

# Disabled: appending the annotation files of two further folds (hold_2, hold_3).
# with open( "/content/drive/MyDrive/AICUP_DATA/en-dataset/3_fold/hold_2/task2_answer_change.txt", "r", encoding="utf-8" ) as f :
#   data = data + f.readlines()


# with open( "/content/drive/MyDrive/AICUP_DATA/en-dataset/3_fold/hold_3/task2_answer_change.txt", "r", encoding="utf-8" ) as f :
#   data = data + f.readlines()





# Group the annotation lines by file id.
data_list = Prepare_Task2_NER( data )

940


In [None]:
# Print every file id together with its annotation list.
for file_id, annotations in data_list.items():
  print(file_id, annotations)

In [30]:
#en
# Read the task-2 validation annotation file configured in config.json.
with open( task2_val_data_path_txt, "r", encoding="utf-8" ) as f :
  val_data = f.readlines()

# Number of annotation lines that were read.
print(len(val_data))

# Group the validation annotation lines by file id.
val_data_list = Prepare_Task2_NER( val_data )

1275


In [None]:
# Print every validation file id together with its annotation list.
for file_id, annotations in val_data_list.items():
  print(file_id, annotations)

In [None]:
# Replace the checkpoint's label set with the custom one from `label_map`:
# id -> label is the map itself, label -> id is its inverse.
new_id2label = label_map
new_label2id = {label: label_id for label_id, label in label_map.items()}

# Update the configuration.
config.id2label = new_id2label
config.label2id = new_label2id

# Print the new id2label and label2id mappings.
print("新的 id2label:", config.id2label)
print("新的 label2id:", config.label2id)

In [33]:
def Prepare_Task1_NER( data, data_list):
    """Turn transcript lines into token-classification examples with B/I/L/U labels.

    Uses the notebook-level ``tokenizer`` and ``config`` (``config.label2id``).

    Args:
        data: iterable of lines whose first two tab-separated fields are the
            file id and the transcript text. Blank lines are skipped.
        data_list: mapping from file id to a list of single-entry dicts
            ``{annotated_text: tag}``. Ids missing from the mapping yield an
            example labelled 'O' throughout.

    Returns:
        list of dicts with ``input_ids``, ``labels`` and ``attention_mask``
        (plain Python lists). The first and last label are -100; every other
        position holds an id from ``config.label2id``.

    Raises:
        KeyError: if an annotation tag has no ``B-/I-/L-/U-`` entry in
            ``config.label2id``.
    """
    outside_id = config.label2id["O"]
    train_data = []

    for raw_line in data:
        line = raw_line.strip()
        if not line:
            # Blank lines carry no example.
            continue
        line_split = line.split("\t")

        name = line_split[0]
        # Strip once and use the SAME string for tokenizing and for locating
        # entities; otherwise leading whitespace shifts every entity span
        # relative to the tokenizer's character offsets.
        text = line_split[1].strip()

        tokens = tokenizer(text, return_offsets_mapping=True, return_tensors="pt", truncation=True, add_special_tokens=True)
        offsets = tokens["offset_mapping"][0].tolist()
        input_ids = tokens["input_ids"][0].tolist()
        attention_mask = tokens["attention_mask"][0].tolist()

        # Initialise every label to 'O'; the first and last positions are set
        # to -100.
        label = [outside_id] * len(input_ids)
        label[0] = -100
        label[-1] = -100

        # Resolve each annotation to a (start, end, tag) character span.
        entities = []
        used_indices = set()  # characters already claimed by an earlier annotation
        annotations = data_list[name] if name in data_list else []
        for ent in annotations:
            for word, tag in ent.items():
                if not word:
                    # An empty annotation cannot cover any token.
                    continue

                # First occurrence of `word` whose start character is still free,
                # so that repeated words map to successive occurrences.
                start = text.find(word)
                while start != -1 and start in used_indices:
                    start = text.find(word, start + 1)

                if start == -1:
                    print(f"[未找到實體] name={name}, word='{word}', tag='{tag}'")
                    print(f"→ 原始句子：{text}")
                    continue

                end = start + len(word)
                used_indices.update(range(start, end))
                entities.append((start, end, tag))

        # Compare token offsets with the entity spans and assign the labels.
        for idx, (start, end) in enumerate(offsets):
            if start == end:
                # zero-width offset: nothing to label
                continue
            for ent_start, ent_end, tag in entities:
                if start < ent_start or end > ent_end:
                    continue  # token is not fully inside this entity
                if start == ent_start and end == ent_end:
                    prefix = "U"  # single-token entity
                elif start == ent_start:
                    prefix = "B"
                elif end == ent_end:
                    prefix = "L"
                else:
                    prefix = "I"
                label[idx] = config.label2id[f"{prefix}-{tag}"]
                break  # the first containing entity wins

        train_data.append({
            "input_ids": input_ids,
            "labels": label,
            "attention_mask": attention_mask
        })

    return train_data


In [34]:
#en
# Read the task-1 training transcript file configured in config.json.
with open( task1_train_data_path_txt, "r", encoding="utf-8" ) as f :
  data = f.readlines()

# Number of transcript lines that were read.
print( len(data) )

# Disabled: appending the transcript files of two further folds (hold_2, hold_3).
# with open( "/content/drive/MyDrive/AICUP_DATA/en-dataset/3_fold/hold_2/task1_answer_change.txt", "r", encoding="utf-8" ) as f :
#   data = data + f.readlines()

# with open( "/content/drive/MyDrive/AICUP_DATA/en-dataset/3_fold/hold_3/task1_answer_change.txt", "r", encoding="utf-8" ) as f :
#   data = data + f.readlines()


# Build labelled training examples from the transcripts and the grouped annotations.
train_data = Prepare_Task1_NER( data, data_list )

310


In [None]:
# Peek at four of the last five examples (the slice stops before the final one).
for example in train_data[-5:-1]:
  print(example)

In [36]:
#en
# Read the task-1 validation transcript file configured in config.json.
# NOTE: this rebinds `val_data`, which previously held the task-2 validation
# annotation lines; evaluate_task2 reads this transcript version.
with open( task1_val_data_path_txt, "r", encoding="utf-8" ) as f :
  val_data = f.readlines()

# Number of transcript lines that were read.
print( len(val_data) )



# Build labelled evaluation examples from the validation transcripts and annotations.
test_data = Prepare_Task1_NER( val_data, val_data_list )

449


In [None]:
!pip install datasets

In [38]:
from datasets import Dataset
# Wrap the prepared example dicts in `datasets.Dataset` objects.
# `test_data` was built from the validation files.
data_train = Dataset.from_list(train_data)
data_test = Dataset.from_list(test_data)

In [None]:
# Display both datasets as the cell's output.
data_train, data_test

In [None]:
# model.config.label2id = new_label2id
# model.config.id2label = new_id2label

# Same checkpoint as before; it is re-loaded below with the custom label set.
model_id = "xlm-roberta-large-finetuned-conll03-english"   #xlm-roberta-large-finetuned-conll03-english


import torch.nn as nn
# tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)

# Use a data collator to handle padding (it pads input_ids, attention_mask and labels automatically).
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


from transformers import AutoModelForTokenClassification

# Rebinds `model`: the checkpoint is loaded again, this time with the custom
# label mappings and a label count taken from `new_label2id`.
model = AutoModelForTokenClassification.from_pretrained(
    model_id,
    num_labels=len(new_label2id),
    id2label=new_id2label,
    label2id=new_label2id,
    ignore_mismatched_sizes=True   # Important: the checkpoint's label count differs from num_labels, so mismatched weights must not abort loading.
)


In [41]:
def bio_u_to_bio_es(labels):
    """Rename B/I/L/U tags to the B/I/E/S convention.

    ``L-X`` becomes ``E-X`` and ``U-X`` becomes ``S-X``; every other label is
    passed through unchanged.

    Args:
        labels: iterable of label strings.

    Returns:
        A new list with the converted labels, in the same order.
    """
    prefix_swap = {'L-': 'E-', 'U-': 'S-'}
    return [prefix_swap.get(label[:2], label[:2]) + label[2:] for label in labels]


In [42]:
def Process_Predict_Ner_BIOUL(pre):
    """Merge token-level B/I/L/U predictions into entity spans.

    Decoding is lenient: an entity that never receives its closing ``L-`` tag
    is still emitted when the next entity starts, when a token of another kind
    follows, or at the end of the input. An ``I-``/``L-`` tag that does not
    continue the entity under construction starts (``I-``) or forms (``L-``)
    an entity of its own.

    When tokens carry an ``index``, a jump in that index is treated as an
    entity boundary. This matters when the producer omits 'O' tokens, as
    get_level2_entities_normal does; two fragments separated by unlabelled
    tokens would otherwise be glued into a single entity.

    Args:
        pre: token predictions in reading order. Each item is a dict with
            ``entity`` (e.g. ``"B-DATE"``) and ``word`` (token text, where a
            leading ``"▁"`` marks a preceding space). ``start``, ``end`` and
            ``index`` are optional.

    Returns:
        list of dicts with keys ``entity`` (tag without prefix), ``word``,
        ``start`` and ``end``.
    """
    answer_list = []
    pending = None      # entity currently under construction, or None
    prev_index = None   # `index` of the previous token, when it had one

    def flush():
        """Emit the entity under construction (if it has a type and text) and reset."""
        nonlocal pending
        if pending is not None and pending["entity"] and pending["word"]:
            answer_list.append(pending)
        pending = None

    def extend(word, has_space, token_end):
        """Append one more token to the entity under construction."""
        pending["word"] += (" " + word) if has_space else word
        pending["end"] = token_end

    for dic in pre:
        entity_type = dic['entity']
        raw_word = dic['word']
        word = raw_word.replace("▁", "")
        has_space = raw_word.startswith("▁")
        token_start = dic.get('start')
        token_end = dic.get('end')
        token_index = dic.get('index')

        # A gap in the token index means tokens in between were left out,
        # so whatever was being built ends here.
        if prev_index is not None and token_index is not None and token_index != prev_index + 1:
            flush()
        prev_index = token_index

        # Split only the leading two-character prefix off the tag.
        prefix, ent = entity_type[:2], entity_type[2:]
        continues = pending is not None and pending["entity"] == ent

        if prefix == "B-":
            flush()
            pending = {"entity": ent, "word": word, "start": token_start, "end": token_end}

        elif prefix == "I-":
            if continues:
                extend(word, has_space, token_end)
            else:
                flush()
                pending = {"entity": ent, "word": word, "start": token_start, "end": token_end}

        elif prefix == "L-":
            if continues:
                extend(word, has_space, token_end)
                answer_list.append(pending)
                pending = None
            else:
                # Does not continue the previous entity: keep that one, and
                # treat this token as a standalone entity.
                flush()
                answer_list.append({"entity": ent, "word": word, "start": token_start, "end": token_end})

        elif prefix == "U-":
            # Keep an unfinished entity instead of silently dropping it.
            flush()
            answer_list.append({"entity": ent, "word": word, "start": token_start, "end": token_end})

        else:  # 'O' or anything unrecognised
            flush()

    # Emit whatever is still open at the end of the input.
    flush()

    return answer_list

In [43]:
def get_level2_entities_normal(model, tokenizer, sentence, label_map):
    """Run the token classifier on one sentence and list its non-'O' token predictions.

    Args:
        model: token-classification model; it is switched to eval mode and its
            output must expose ``.logits``.
        tokenizer: tokenizer matching ``model`` (must support offset mapping).
        sentence: text to tag; it is truncated by the tokenizer if too long.
        label_map: dict mapping a predicted class id to its label string; ids
            missing from the map count as 'O'.

    Returns:
        list of dicts, one per token labelled other than 'O', with keys
        ``entity``, ``score`` (softmax probability as ``np.float32``),
        ``index`` (token position), ``word`` (token text), ``start`` and
        ``end`` (character offsets). Pad/CLS/SEP tokens are never reported.
    """
    device = next(model.parameters()).device  # device the model lives on

    # 1. Tokenize with character offsets and move the model inputs to the device.
    encoding = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True, truncation=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    offsets = encoding["offset_mapping"][0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())

    # 2. Forward pass without gradients; logits: [batch_size, seq_len, num_labels].
    model.eval()
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    preds = torch.argmax(logits, dim=2)[0].cpu().numpy()

    # 3. Collect every labelled token, skipping pad/CLS/SEP positions.
    skipped_ids = [tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id]
    results = []
    for idx, (pred_id, (start, end)) in enumerate(zip(preds, offsets)):
        if input_ids[0, idx].item() in skipped_ids:
            continue

        entity = label_map.get(pred_id, "O")
        if entity == "O":
            continue

        score = torch.softmax(logits[0, idx], dim=0)[pred_id].item()
        results.append({
            "entity": entity,
            "score": np.float32(score),
            "index": idx,
            "word": tokens[idx],
            "start": start,
            "end": end
        })

    return results


In [None]:
import numpy as np
import pandas as pd

def calculate_overlap(pred_start, pred_end, gt_start, gt_end):
    """Return the length of the overlap between two intervals (0 if they do not overlap)."""
    shared = min(pred_end, gt_end) - max(pred_start, gt_start)
    return shared if shared > 0 else 0

def evaluate_task2( ground_truth_file, model, tokenizer ) :
    """Tag the validation transcripts with ``model`` and return the macro-averaged F1.

    Every line of the notebook-level ``val_data`` (file id and text separated
    by a tab) is tagged with ``get_level2_entities_normal`` followed by
    ``Process_Predict_Ner_BIOUL``, using the notebook-level ``label_map``.
    The predictions are written to ``model_eval.txt`` (overwritten on each
    call) and compared with ``ground_truth_file``.

    Args:
        ground_truth_file: path to a tab-separated file without header whose
            columns are id, type, start, end, content.
        model: token-classification model to evaluate.
        tokenizer: tokenizer matching ``model``.

    Returns:
        Mean over all entity types (seen in predictions or ground truth) of the
        per-type F1, where TP/FP/FN are accumulated as span-overlap lengths.
    """
    import csv  # only needed for the quoting constant passed to read_csv

    columns = ['id', 'type', 'start', 'end', 'content']

    # ---- 1. Predict entities for every validation line ----
    prediction_rows = []
    for line in val_data :
        text_split = line.strip().split("\t")
        if len(text_split) < 2:
            # blank line or line without text: nothing to tag
            continue
        name = text_split[0]
        text = text_split[1]

        answer_list = []
        pre = get_level2_entities_normal(model, tokenizer, text, label_map)
        if len(pre) != 0:
            answer_list = Process_Predict_Ner_BIOUL(pre)

        for i in answer_list:
            prediction_rows.append(f"{name}\t{i['entity']}\t{i['start']}\t{i['end']}\t{i['word']}\n")

    answer = "".join(prediction_rows)

    prediction_file = "model_eval.txt"
    with open( prediction_file, "w", encoding="utf-8") as f:
        f.write(answer)

    # ---- 2. Load predictions and ground truth ----
    if prediction_rows:
        pred_df = pd.read_csv(
            prediction_file,
            sep='\t',
            header=None,
            names=columns,
            quoting=csv.QUOTE_NONE,        # do not interpret quote characters
            encoding='utf-8',
            on_bad_lines='skip',           # skip malformed lines
            engine='python'                # more tolerant parser
        )
    else:
        # No entity was predicted: the file is empty and read_csv would raise
        # EmptyDataError, so score an empty prediction table instead.
        pred_df = pd.DataFrame(columns=columns)

    gt_df = pd.read_csv(ground_truth_file, sep='\t', header=None,
                        names=columns)

    # All distinct entity types seen in either table.
    all_types = sorted(set(gt_df['type'].unique()) | set(pred_df['type'].unique()))

    # Per-type accumulators.
    metrics = {shi_type: {'tp': 0, 'fp': 0, 'fn': 0} for shi_type in all_types}

    # ---- 3. Accumulate TP / FP / FN per file id ----
    unique_ids = sorted(set(gt_df['id'].unique()) | set(pred_df['id'].unique()))

    for audio_id in unique_ids:
        gt_records = gt_df[gt_df['id'] == audio_id].copy()
        pred_records = pred_df[pred_df['id'] == audio_id].copy()

        # Track which ground-truth rows and predictions have been matched.
        gt_matched = [False] * len(gt_records)
        pred_matched = [False] * len(pred_records)

        # True positives plus the FP/FN parts of partial matches.
        for i, pred_row in enumerate(pred_records.itertuples()):
            pred_type = pred_row.type
            pred_start = pred_row.start
            pred_end = pred_row.end
            pred_duration = pred_end - pred_start

            best_overlap = 0
            best_gt_idx = -1

            # Find the same-type ground-truth row with the largest overlap.
            for j, gt_row in enumerate(gt_records.itertuples()):
                if gt_row.type != pred_type:
                    continue

                overlap = calculate_overlap(pred_start, pred_end, gt_row.start, gt_row.end)
                if overlap > best_overlap:
                    best_overlap = overlap
                    best_gt_idx = j

            if best_gt_idx >= 0:  # (partial) match found
                gt_row = gt_records.iloc[best_gt_idx]
                gt_duration = gt_row.end - gt_row.start

                # Overlapping part counts as true positive.
                metrics[pred_type]['tp'] += best_overlap

                # Predicted part outside the ground truth counts as false positive.
                metrics[pred_type]['fp'] += pred_duration - best_overlap

                # Ground-truth part that was not covered counts as false negative.
                metrics[pred_type]['fn'] += gt_duration - best_overlap

                # Mark both sides as handled.
                gt_matched[best_gt_idx] = True
                pred_matched[i] = True
            else:
                # No overlapping ground truth of the same type: the whole
                # prediction is a false positive.
                metrics[pred_type]['fp'] += pred_duration

        # Ground-truth rows that no prediction matched are false negatives.
        for j, matched in enumerate(gt_matched):
            if not matched:
                gt_row = gt_records.iloc[j]
                gt_type = gt_row.type
                gt_duration = gt_row.end - gt_row.start
                metrics[gt_type]['fn'] += gt_duration

        # Unmatched predictions that overlap a ground-truth row of another type.
        # NOTE(review): these predictions were already charged their full
        # duration as FP in the loop above, so this adds it a second time.
        # Left unchanged in case it mirrors a reference scorer - confirm.
        for i, (matched, pred_row) in enumerate(zip(pred_matched, pred_records.itertuples())):
            if matched:
                continue

            pred_type = pred_row.type
            pred_start = pred_row.start
            pred_end = pred_row.end
            pred_duration = pred_end - pred_start

            for gt_row in gt_records.itertuples():
                if gt_row.type == pred_type:
                    continue  # same type was handled in the first loop

                overlap = calculate_overlap(pred_start, pred_end, gt_row.start, gt_row.end)
                if overlap > 0:
                    # Different type but overlapping span: whole prediction is FP.
                    metrics[pred_type]['fp'] += pred_duration
                    break

    # ---- 4. Precision, recall and F1 per type ----
    f1_scores = []
    for shi_type in all_types:
        m = metrics[shi_type]
        precision = m['tp'] / (m['tp'] + m['fp']) if (m['tp'] + m['fp']) > 0 else 0
        recall = m['tp'] / (m['tp'] + m['fn']) if (m['tp'] + m['fn']) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)

    # Macro average over all types.
    macro_f1 = np.mean(f1_scores)

    return macro_f1


In [45]:
from transformers import TrainerCallback

class CharBasedEvaluationCallback(TrainerCallback):
    """Trainer callback that prints the macro-F1 from ``evaluate_task2`` after each evaluation."""

    def __init__(self, task2_path, tokenizer):
        # Path of the ground-truth answer file and the tokenizer handed to evaluate_task2.
        self.tokenizer = tokenizer
        self.task2_path = task2_path

    def on_evaluate(self, args, state, control, **kwargs):
        """Score the model passed in ``kwargs["model"]`` and print the result."""
        score = evaluate_task2(self.task2_path, kwargs["model"], self.tokenizer)

        print(f"[Char-based Evaluation after epoch {state.epoch}]")
        print("Macro-F1:", score)

In [None]:
# ✅ 建立 Trainer
from transformers import TrainingArguments, Trainer
import torch

# training_args = TrainingArguments(
#     output_dir="./ner_results",
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     logging_strategy="epoch",          # 👈 新增這行：每個 epoch log 一次
#     logging_first_step=True,           # 👈 第一步就 log（可選）
#     logging_dir="./ner_logs",          # 👈 log 檔儲存資料夾（可選）
#     learning_rate=1e-5,
#     do_train=True,
#     do_eval=True,
#     num_train_epochs=50,
#     weight_decay=0.01,
# )

# Training configuration: evaluate, save and log once per epoch; checkpoints go
# to `model_save_path`, logs to `model_logging_dir` (both from config.json).
training_args = TrainingArguments(
    output_dir=model_save_path,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_first_step=True,
    logging_dir=model_logging_dir,
    learning_rate=3e-5,                     # slightly raised (adjust as needed): 3e-5
    num_train_epochs=20,                   # avoid jumping straight to 50
    weight_decay=0.03,                     # moderate regularization: 0.03
    # lr_scheduler_type="linear",  # linear decay
    # per_device_train_batch_size=8,        # a larger batch also helps stability
    per_device_eval_batch_size=64,
    # load_best_model_at_end=True,           # important when combined with EarlyStopping
    # metric_for_best_model="eval_loss",     # pick the best model by loss
)



# Ground-truth answer file used by the evaluation callback.
task2_path = answer_val_data_path_txt

# Build the Trainer; the callback prints the macro-F1 after every evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_test,
    data_collator=data_collator,
    callbacks=[CharBasedEvaluationCallback(task2_path, tokenizer)]
)


trainer.train()