In [None]:
!pip install pytorch-crf

Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)
Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [None]:
!pip install torchcrf

Collecting torchcrf
  Downloading TorchCRF-1.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->torchcrf)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->torchcrf)
 

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoConfig
from torchcrf import CRF
access_token = ""

from transformers.modeling_outputs import TokenClassifierOutput

from transformers import XLMRobertaConfig, XLMRobertaModel, XLMRobertaPreTrainedModel
from transformers import PreTrainedModel
from transformers import XLMRobertaForTokenClassification
from transformers import XLMRobertaTokenizer


class XLMRobertaWithCRF(XLMRobertaPreTrainedModel):
    """XLM-RoBERTa encoder followed by dropout, a linear emission layer and a CRF.

    ``forward`` returns a ``TokenClassifierOutput`` whose ``logits`` are the
    per-token emission scores and whose ``loss`` (only when ``labels`` is
    given) is the negated CRF score with ``reduction='mean'``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Load the pretrained encoder weights via from_pretrained.
        # NOTE(review): because this runs inside __init__, every construction of
        # this class (including a later XLMRobertaWithCRF.from_pretrained on a
        # checkpoint) will first fetch these hub weights - confirm that is intended.
        self.roberta = XLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english", config=config, token=access_token)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.crf = CRF(config.num_labels, batch_first=True)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Run the encoder and, when ``labels`` is given, compute the CRF loss.

        Args:
            input_ids: token ids passed straight to the encoder.
            attention_mask: optional mask passed to the encoder; also used as
                the CRF mask. When omitted, every position is treated as valid.
            labels: optional tag ids aligned with ``input_ids``.

        Returns:
            TokenClassifierOutput with ``loss`` (or None) and ``logits``.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)
        emissions = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if attention_mask is None:
                # No mask supplied: treat every position as a real token.
                mask = torch.ones(emissions.shape[:2], dtype=torch.bool, device=emissions.device)
            else:
                # clone() so the in-place write below can never leak into the
                # caller's tensor (``.bool()`` returns the same tensor when the
                # mask is already boolean).
                mask = attention_mask.bool().clone()
            mask[:, 0] = True  # make sure the first token is always unmasked
            loss = -self.crf(emissions, labels, mask=mask, reduction='mean')
        return TokenClassifierOutput(
            loss=loss,
            logits=emissions,
        )


In [None]:
# Entity types in id order. Each type owns two consecutive ids:
# an odd id for its "B-" tag and the following even id for its "I-" tag.
_ENTITY_TYPES = (
    'PATIENT', 'DOCTOR', 'USERNAME', 'FAMILYNAME', 'PERSONALNAME',
    'PROFESSION', 'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION',
    'STREET', 'CITY', 'DISTRICT', 'COUNTY', 'STATE', 'COUNTRY', 'ZIP',
    'LOCATION-OTHER', 'AGE', 'DATE', 'TIME', 'DURATION', 'SET',
    'PHONE', 'FAX', 'EMAIL', 'URL', 'IPADDRESS',
    'SOCIAL_SECURITY_NUMBER', 'MEDICAL_RECORD_NUMBER', 'HEALTH_PLAN_NUMBER',
    'ACCOUNT_NUMBER', 'LICENSE_NUMBER', 'VEHICLE_ID', 'DEVICE_ID',
    'BIOMETRIC_ID', 'ID_NUMBER', 'OTHER',
)

# id -> tag name: 0 is 'O', 1..76 are the B-/I- pairs, 77 is 'IGNORE'.
label_map = {0: 'O'}
label_map.update({
    2 * type_pos + 1 + prefix_pos: f"{prefix}-{type_name}"
    for type_pos, type_name in enumerate(_ENTITY_TYPES)
    for prefix_pos, prefix in enumerate("BI")
})
label_map[2 * len(_ENTITY_TYPES) + 1] = 'IGNORE'

In [None]:
# Build the tokenizer, the config and the CRF model for the pretrained checkpoint.
from transformers import AutoTokenizer

# Hub access token; left empty here.
access_token = ""

model_name = "FacebookAI/xlm-roberta-large-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_name, token = access_token )

from transformers import XLMRobertaConfig

# num_labels is taken from label_map so the classifier and the CRF get one
# output per tag id (including the 'IGNORE' id).
config = XLMRobertaConfig.from_pretrained(model_name, num_labels=len(label_map))
model = XLMRobertaWithCRF(config)


# Install the project's tag names on the model config.
# NOTE(review): Prepare_Task1_NER reads `config.label2id`, so this presumably
# relies on `model.config` being the same object as `config` - confirm.
model.config.id2label = label_map
model.config.label2id = {v:k for k,v in label_map.items()}

In [None]:
# Sanity check: show the tag mappings now stored on the model config.
print("新的 id2label:", model.config.id2label)
print("新的 label2id:", model.config.label2id)

新的 id2label: {0: 'O', 1: 'B-PATIENT', 2: 'I-PATIENT', 3: 'B-DOCTOR', 4: 'I-DOCTOR', 5: 'B-USERNAME', 6: 'I-USERNAME', 7: 'B-FAMILYNAME', 8: 'I-FAMILYNAME', 9: 'B-PERSONALNAME', 10: 'I-PERSONALNAME', 11: 'B-PROFESSION', 12: 'I-PROFESSION', 13: 'B-ROOM', 14: 'I-ROOM', 15: 'B-DEPARTMENT', 16: 'I-DEPARTMENT', 17: 'B-HOSPITAL', 18: 'I-HOSPITAL', 19: 'B-ORGANIZATION', 20: 'I-ORGANIZATION', 21: 'B-STREET', 22: 'I-STREET', 23: 'B-CITY', 24: 'I-CITY', 25: 'B-DISTRICT', 26: 'I-DISTRICT', 27: 'B-COUNTY', 28: 'I-COUNTY', 29: 'B-STATE', 30: 'I-STATE', 31: 'B-COUNTRY', 32: 'I-COUNTRY', 33: 'B-ZIP', 34: 'I-ZIP', 35: 'B-LOCATION-OTHER', 36: 'I-LOCATION-OTHER', 37: 'B-AGE', 38: 'I-AGE', 39: 'B-DATE', 40: 'I-DATE', 41: 'B-TIME', 42: 'I-TIME', 43: 'B-DURATION', 44: 'I-DURATION', 45: 'B-SET', 46: 'I-SET', 47: 'B-PHONE', 48: 'I-PHONE', 49: 'B-FAX', 50: 'I-FAX', 51: 'B-EMAIL', 52: 'I-EMAIL', 53: 'B-URL', 54: 'I-URL', 55: 'B-IPADDRESS', 56: 'I-IPADDRESS', 57: 'B-SOCIAL_SECURITY_NUMBER', 58: 'I-SOCIAL_SECURIT

# **資料準備**


In [None]:
# Mount Google Drive; the dataset paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def Caculate_Wav_File_Times( inputs ) :
    """Count how many lines share each id.

    The id of a line is its first tab-separated field after stripping
    surrounding whitespace.

    Args:
        inputs: iterable of text lines.

    Returns:
        dict mapping id -> number of lines carrying that id.
    """
    occurrences = {}
    for raw_line in inputs:
        key = raw_line.strip().split('\t')[0]
        occurrences[key] = occurrences.get(key, 0) + 1
    return occurrences

In [None]:
# with open( "/content/task2_answer.txt", "r", encoding="utf-8" ) as f :
#   data = f.readlines()

def Prepare_Task2_NER(data) :
    """Group task-2 annotation lines by their id.

    Each non-blank line is tab-separated; field 0 is the id, field 1 the
    entity type and field 4 the entity text. Lines are grouped in a single
    pass, so the lines of one id do not have to be contiguous in the file.

    Args:
        data: list of annotation lines.

    Returns:
        dict mapping id -> list of single-entry ``{entity_text: entity_type}``
        dicts, in file order. The dict is also printed.

    Raises:
        ValueError: if a non-blank line has fewer than five fields.
    """
    data_list = {}

    for line_no, raw_line in enumerate(data, start=1):
        line = raw_line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) carry no annotation.
            continue

        line_split = line.split("\t")
        if len(line_split) < 5:
            raise ValueError(
                f"line {line_no}: expected at least 5 tab-separated fields, got {len(line_split)}: {line!r}"
            )

        data_list.setdefault(line_split[0], []).append({line_split[4]: line_split[1]})

    print(data_list)

    return data_list



In [None]:
#en
# Read the task-2 training annotations and group them by id.

with open( "/content/drive/MyDrive/AICUP_DATA/offical_change/task2_train.txt", "r", encoding="utf-8" ) as f :
  data = f.readlines()


# Alternative source (disabled): concatenate the three fold files instead.
# with open( "/content/drive/MyDrive/AICUP_DATA/en-dataset/3_fold/hold_1/task2_answer_change.txt", "r", encoding="utf-8" ) as f :
#   data = f.readlines()

# print(len(data))

# with open( "/content/drive/MyDrive/AICUP_DATA/en-dataset/3_fold/hold_2/task2_answer_change.txt", "r", encoding="utf-8" ) as f :
#   data = data + f.readlines()


# with open( "/content/drive/MyDrive/AICUP_DATA/en-dataset/3_fold/hold_3/task2_answer_change.txt", "r", encoding="utf-8" ) as f :
#   data = data + f.readlines()



# Number of annotation lines read.
print(len(data))

# id -> list of {entity_text: entity_type}; consumed by Prepare_Task1_NER below.
data_list = Prepare_Task2_NER( data )

2113
{'1848': [{'quite a while': 'DURATION'}], '1933': [{'Kelly': 'PERSONALNAME'}, {'a week': 'DURATION'}, {'today': 'DATE'}, {'a week': 'DURATION'}, {'yesterday': 'DATE'}], '2043': [{'Josh': 'FAMILYNAME'}], '2053': [{'now': 'DATE'}], '2087': [{'12 sessions': 'DURATION'}, {'12 sessions': 'DURATION'}], '10042': [{'June': 'PATIENT'}, {'today': 'DATE'}, {'Jones': 'DOCTOR'}, {'last few months': 'DURATION'}], '10224': [{'yesterday': 'DATE'}], '10858': [{'next week': 'DATE'}, {'July': 'DATE'}], '10868': [{'Board': 'DOCTOR'}], '10971': [{'today': 'DATE'}, {'day': 'DATE'}, {'today': 'DATE'}, {'day': 'DATE'}, {'morning': 'TIME'}, {'morning': 'TIME'}, {'Micah': 'DOCTOR'}], '11053': [{'six months': 'DURATION'}], '11057': [{'now': 'DATE'}], '11081': [{'80': 'AGE'}, {'Harold': 'FAMILYNAME'}], '11546': [{'Amy': 'PERSONALNAME'}, {'Erin': 'PERSONALNAME'}, {'Erin': 'PERSONALNAME'}, {'two hours': 'DURATION'}, {'nights': 'TIME'}], '11982': [{'two weeks': 'DURATION'}], '12371': [{'next week': 'DATE'}, {'m

In [None]:
#en
# Read the task-2 validation annotations and group them by id.

with open( "/content/drive/MyDrive/AICUP_DATA/offical_change/task2_val.txt", "r", encoding="utf-8" ) as f :
  val_data = f.readlines()


# Alternative source (disabled): the 3-fold test set.
# with open( "/content/drive/MyDrive/AICUP_DATA/en-dataset/3_fold/test_set/task2_answer_change.txt", "r", encoding="utf-8" ) as f :
#   val_data = f.readlines()

# Number of annotation lines read.
print(len(val_data))


# NOTE: `val_data` is rebound to the task-1 validation lines in a later cell;
# only `val_data_list` keeps the annotations loaded here.
val_data_list = Prepare_Task2_NER( val_data )

1997
{'2505': [{'10 minutes': 'DURATION'}, {'last night': 'TIME'}, {'today': 'DATE'}, {'James': 'FAMILYNAME'}], '2943': [{'Friday': 'DATE'}, {'Saturday': 'DATE'}, {'Tuesday': 'DATE'}, {'yesterday': 'DATE'}], '2983': [{'James': 'FAMILYNAME'}], '3104': [{'Stefan': 'FAMILYNAME'}], '4808': [{'now': 'DATE'}], '24444': [{'20s': 'AGE'}, {'Franco': 'PERSONALNAME'}, {'Franco': 'PERSONALNAME'}, {'Andover': 'LOCATION-OTHER'}], '24653': [{'this weekend': 'DATE'}], '24947': [{'day': 'DURATION'}, {'per day': 'SET'}, {'morning': 'TIME'}], '25002': [{'all weekend': 'DURATION'}], '25010': [{'now': 'DATE'}, {'an hour': 'DURATION'}], '25077': [{'Sunday': 'DATE'}, {'noontime': 'TIME'}], '25176': [{'James': 'FAMILYNAME'}], '25344': [{'Jeez': 'PERSONALNAME'}, {'now': 'DATE'}, {'now': 'DATE'}], '25390': [{'now': 'DATE'}], '25516': [{'2:30': 'TIME'}], '25533': [{'six': 'DURATION'}, {'eight hours': 'DURATION'}], '25598': [{'Franco': 'PERSONALNAME'}], '25724': [{'13 hours': 'DURATION'}, {'40 a week': 'SET'}], '

In [None]:
def _find_entity_spans(name, text, annotations):
    """Locate every annotated entity string in ``text``.

    For each annotation the first occurrence whose starting character has not
    been claimed by an earlier annotation is used. Annotations that cannot be
    found are reported on stdout and skipped.

    Returns a list of ``(start, end, tag)`` character spans.
    """
    spans = []
    used_indices = set()  # characters already claimed, so repeated words map to later occurrences
    for ent in annotations:
        for word, tag in ent.items():
            start = -1
            for idx in range(len(text)):
                if idx in used_indices:
                    continue
                if text.startswith(word, idx):
                    start = idx
                    used_indices.update(range(start, start + len(word)))
                    break
            if start != -1:
                spans.append((start, start + len(word), tag))
            else:
                print(f"[未找到實體] name={name}, word='{word}', tag='{tag}'")
                print(f"→ 原始句子：{text}")
    return spans


def _assign_bio_labels(label, offsets, spans, label2id):
    """Write B-/I- tag ids into ``label`` for tokens that start inside a span.

    A token starting exactly at a span start gets ``B-<tag>``; a token starting
    strictly inside a span gets ``I-<tag>``. Tokens with an empty offset
    (start == end) are left untouched. The first matching span wins.
    """
    for idx, (start, end) in enumerate(offsets):
        if start == end:
            continue
        for ent_start, ent_end, tag in spans:
            if start == ent_start:
                label[idx] = label2id[f"B-{tag}"]
                break
            if ent_start < start < ent_end:
                label[idx] = label2id[f"I-{tag}"]
                break


def Prepare_Task1_NER( data, data_list):
    """Turn task-1 transcript lines into token-level training examples.

    Uses the notebook-level ``tokenizer`` and ``config`` (for ``label2id``).

    Args:
        data: list of lines, each ``id<TAB>text``. Blank lines are skipped.
        data_list: dict id -> list of ``{entity_text: entity_type}`` as
            produced by Prepare_Task2_NER. Ids missing from it yield an
            example without entity tags.

    Returns:
        list of dicts with ``input_ids``, ``labels`` and ``attention_mask``
        (plain Python lists of equal length). The first and last token of
        every example carry the 'IGNORE' tag id.
    """
    outside_id = config.label2id["O"]
    ignore_id = config.label2id["IGNORE"]  # replaces the hard-coded 77

    train_data = []

    for raw_line in data:
        line = raw_line.strip()
        if not line:
            continue
        line_split = line.split("\t")

        name = line_split[0]
        # Strip once and use the same string for tokenization AND entity
        # search, so token offsets and entity spans share one coordinate system.
        text = line_split[1].strip()

        tokens = tokenizer(text, return_offsets_mapping=True, truncation=True, add_special_tokens=True)
        offsets = tokens["offset_mapping"]
        input_ids = list(tokens["input_ids"])
        attention_mask = list(tokens["attention_mask"])

        # Initialise every token as 'O' and the first/last token as 'IGNORE'.
        label = [outside_id] * len(input_ids)
        label[0] = ignore_id
        label[-1] = ignore_id

        # Only ids that have annotations get entity tags.
        if name in data_list:
            entities = _find_entity_spans(name, text, data_list[name])
            _assign_bio_labels(label, offsets, entities, config.label2id)

        train_data.append({
            "input_ids": input_ids,
            "labels": label,
            "attention_mask": attention_mask
        })

    return train_data


In [None]:
#en
# Read the task-1 training transcripts and build token-level training examples.
with open( "/content/drive/MyDrive/AICUP_DATA/offical_change/task1_train.txt", "r", encoding="utf-8" ) as f :
  data = f.readlines()

# Alternative source (disabled): concatenate the three fold files instead.
# with open( "/content/drive/MyDrive/AICUP_DATA/en-dataset/3_fold/hold_1/task1_answer_change.txt", "r", encoding="utf-8" ) as f :
#   data = f.readlines()

# print( len(data) )

# with open( "/content/drive/MyDrive/AICUP_DATA/en-dataset/3_fold/hold_2/task1_answer_change.txt", "r", encoding="utf-8" ) as f :
#   data = data + f.readlines()

# with open( "/content/drive/MyDrive/AICUP_DATA/en-dataset/3_fold/hold_3/task1_answer_change.txt", "r", encoding="utf-8" ) as f :
#   data = data + f.readlines()


# Number of transcript lines read.
print( len(data) )

# NOTE: `data` now holds transcript lines; the annotation lines it held
# earlier survive only as `data_list`.
train_data = Prepare_Task1_NER( data, data_list )


836


In [None]:
#en
# Read the task-1 validation transcripts and build token-level examples.
# NOTE: `val_data` is rebound here to the transcript lines; evaluate_task2
# later iterates this global and expects `id<TAB>text` lines.
with open( "/content/drive/MyDrive/AICUP_DATA/offical_change/task1_val.txt", "r", encoding="utf-8" ) as f :
  val_data = f.readlines()

# Alternative source (disabled): the 3-fold test set.
# with open( "/content/drive/MyDrive/AICUP_DATA/en-dataset/3_fold/test_set/task1_answer_change.txt", "r", encoding="utf-8" ) as f :
#   val_data = f.readlines()

# Number of transcript lines read.
print( len(val_data) )


test_data = Prepare_Task1_NER( val_data, val_data_list )

534


In [None]:
!pip install datasets



In [None]:
from datasets import Dataset
# Wrap the lists of example dicts as datasets; these are passed to the trainer
# below as train_dataset and eval_dataset.
data_train = Dataset.from_list(train_data)
data_test = Dataset.from_list(test_data)

In [None]:
# Display both datasets (features and row counts).
data_train, data_test

(Dataset({
     features: ['input_ids', 'labels', 'attention_mask'],
     num_rows: 836
 }),
 Dataset({
     features: ['input_ids', 'labels', 'attention_mask'],
     num_rows: 534
 }))

# **TRAIN**

In [None]:
# Spot-check one prepared example (input ids, tag ids, attention mask).
print(train_data[20])

{'input_ids': [0, 119057, 4, 41866, 16792, 87, 14037, 10, 3687, 221, 87, 25, 39, 959, 3853, 90908, 214, 1672, 450, 5, 8622, 25, 7, 10, 10176, 4785, 111, 25813, 9, 17489, 63920, 450, 621, 11405, 214, 163, 6637, 87, 25, 272, 37842, 6626, 53095, 136, 1632, 509, 7730, 47, 186, 100, 6088, 106, 271, 1884, 3650, 110763, 142, 56816, 678, 10, 37195, 56816, 15440, 4, 3129, 509, 7730, 47, 186, 645, 3650, 93712, 5, 87, 25, 39, 28875, 9077, 759, 25813, 831, 186, 16940, 23, 17262, 9, 188, 9, 160018, 21974, 5, 581, 14380, 450, 87, 6777, 3564, 1257, 7730, 678, 4, 2], 'labels': [77, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 39, 40, 40, 0, 0, 0, 43, 44, 0, 0, 43, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43, 0, 0, 0, 43, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 77], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
def Process_Predict_Ner(pre):
    """Merge token-level B-/I- predictions into entity spans.

    Args:
        pre: list of token dicts with keys ``entity`` (tag such as "B-DATE")
            and ``word`` (token text, where a leading "▁" marks a preceding
            space), and optionally ``start``, ``end`` and ``index``.

    Returns:
        list of dicts ``{"entity", "word", "start", "end"}``, one per merged
        entity, in input order.

    Merging rules:
        * ``B-X`` always opens a new entity.
        * ``I-X`` extends the open entity only if it has the same type AND is
          the directly following token. Adjacency is checked through
          ``index`` when both tokens carry one; this matters because callers
          may pass only the non-"O" tokens, in which case an ``I-`` token can
          follow the open entity in the list while being far away in the
          text. Without ``index`` information the type check alone decides.
        * any other tag closes the open entity.
    """
    answer_list = []
    current_entity = None
    current_word = ""
    start_pos = None
    end_pos = None
    last_index = None  # `index` of the most recent token of the open entity

    def flush():
        # Emit the open entity, if there is one with non-empty text.
        if current_entity and current_word:
            answer_list.append({
                "entity": current_entity,
                "word": current_word,
                "start": start_pos,
                "end": end_pos
            })

    for dic in pre:
        entity_type = dic['entity']
        raw_word = dic['word']
        word = raw_word.replace("▁", "")
        token_start = dic.get('start')
        token_end = dic.get('end')
        token_index = dic.get('index')
        has_space = raw_word.startswith("▁")

        if entity_type.startswith("B-"):
            ent = entity_type.replace("B-", "")
            extends = False
        elif entity_type.startswith("I-"):
            ent = entity_type.replace("I-", "")
            adjacent = (
                last_index is None
                or token_index is None
                or token_index == last_index + 1
            )
            extends = current_entity == ent and adjacent
        else:  # O (or any tag without a B-/I- prefix)
            flush()
            current_entity = None
            current_word = ""
            start_pos = None
            end_pos = None
            last_index = None
            continue

        if extends:
            current_word += (" " + word) if has_space else word
            end_pos = token_end
        else:
            flush()
            current_entity = ent
            current_word = word
            start_pos = token_start
            end_pos = token_end
        last_index = token_index

    # Emit whatever is still open at the end of the sequence.
    flush()

    return answer_list

In [None]:
# 小心 -100 和 0   "labels": pad_sequence(labels, batch_first=True, padding_value=0)  # Changed padding_value to -100
from torch.nn.utils.rnn import pad_sequence
def custom_collator(features):
    """Pad a list of examples into one batch of tensors.

    Each feature is a dict with ``input_ids``, ``attention_mask`` and
    ``labels`` sequences. Sequences are right-padded to the longest one in
    the batch: ``input_ids`` with the tokenizer's pad id, ``attention_mask``
    with 0 and ``labels`` with 77.
    """
    def _padded(field, fill_value):
        # Stack one field across the batch, padding shorter rows with fill_value.
        rows = [torch.tensor(example[field]) for example in features]
        return pad_sequence(rows, batch_first=True, padding_value=fill_value)

    return {
        "input_ids": _padded("input_ids", tokenizer.pad_token_id),
        "attention_mask": _padded("attention_mask", 0),
        "labels": _padded("labels", 77),
    }

In [None]:
def get_level2_entities_normal(model, tokenizer, sentence, label_map, use_crf_decode=False):
    """Predict a tag for every token of ``sentence`` and return the non-"O" ones.

    Args:
        model: module whose forward accepts ``input_ids`` / ``attention_mask``
            and returns an object with ``.logits`` of shape
            (batch, seq_len, num_labels). It is switched to eval mode.
        tokenizer: tokenizer supporting ``return_offsets_mapping`` and
            ``convert_ids_to_tokens``.
        sentence: the text to tag.
        label_map: dict tag id -> tag name; unknown ids are treated as "O".
        use_crf_decode: when True, tags come from ``model.crf.decode`` (Viterbi
            over the emissions) instead of a per-token argmax. Defaults to
            False, which is the per-token argmax.

    Returns:
        list of dicts with ``entity``, ``score`` (softmax probability of the
        chosen tag, as np.float32), ``index`` (token position), ``word``
        (token text), ``start`` and ``end`` (character offsets). Pad, cls and
        sep tokens are skipped.
    """
    # Local import: the notebook only imports numpy in a later cell.
    import numpy as np

    device = next(model.parameters()).device  # device the model lives on

    # 1. Tokenize with offsets
    encoding = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True, truncation=True)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    offsets = encoding["offset_mapping"][0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())  # ids moved to CPU for the tokenizer

    # 2. Model forward
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # [batch_size, seq_len, num_labels]

        if use_crf_decode:
            preds = model.crf.decode(logits, mask=attention_mask.bool())[0]
        else:
            preds = torch.argmax(logits, dim=2)[0].cpu().tolist()

        # Softmax once for the whole sequence instead of once per token.
        probs = torch.softmax(logits[0], dim=-1).cpu()

    special_ids = (tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id)

    results = []
    for idx, (pred_id, offset) in enumerate(zip(preds, offsets)):
        # Skip special tokens.
        if input_ids[0, idx].item() in special_ids:
            continue

        start, end = offset
        pred_id = int(pred_id)
        entity = label_map.get(pred_id, "O")

        if entity != "O":
            results.append({
                "entity": entity,
                "score": np.float32(probs[idx, pred_id].item()),
                "index": idx,
                "word": tokens[idx],
                "start": start,
                "end": end
            })

    return results


In [None]:
import numpy as np
import pandas as pd

def calculate_overlap(pred_start, pred_end, gt_start, gt_end):
    """Return the length of the intersection of two intervals (0 if disjoint)."""
    return max(0, min(pred_end, gt_end) - max(pred_start, gt_start))

def evaluate_task2( ground_truth_file, model, tokenizer ) :
    """Predict entities for the validation lines and score them against a ground-truth file.

    Iterates the notebook-level ``val_data`` (lines of ``id<TAB>text``), runs
    get_level2_entities_normal + Process_Predict_Ner on each text, writes the
    predictions to "model_eval.txt" and compares them with
    ``ground_truth_file`` using overlap-weighted TP/FP/FN per entity type.

    Args:
        ground_truth_file: path of a header-less TSV with the columns
            id, type, start, end, content.
        model: model passed through to get_level2_entities_normal.
        tokenizer: tokenizer passed through to get_level2_entities_normal.

    Returns:
        The mean of the per-type F1 scores (macro F1).

    NOTE(review): the input lines come from the global ``val_data``, not from
    a parameter, so the result depends on what that global holds at call time.
    """



    answer = ""

    # Build the prediction file content, one line per predicted entity.
    for text in val_data :

      answer_list = []

      text_split = text.strip().split("\t")
      name = text_split[0]
      text = text_split[1]

      pre = get_level2_entities_normal(model, tokenizer, text, label_map)
      if len(pre) != 0:
        answer_list = Process_Predict_Ner(pre)

      for i in answer_list:
        answer += f"{name}\t{i['entity']}\t{i['start']}\t{i['end']}\t{i['word']}\n"

    prediction_file = "model_eval.txt"
    with open( prediction_file, "w", encoding="utf-8") as f:
      f.write(answer)


    # Read the prediction and ground-truth data.
    # NOTE(review): the prediction file is read with QUOTE_NONE and bad lines
    # skipped, the ground truth with pandas defaults - confirm the asymmetry is intended.
    import csv
    pred_df = pd.read_csv(
          prediction_file,
          sep='\t',
          header=None,
          names=['id', 'type', 'start', 'end', 'content'],
          quoting=csv.QUOTE_NONE,        # do not interpret quote characters
          encoding='utf-8',              # or try utf-8-sig
          on_bad_lines='skip',           # skip malformed lines
          engine='python'                # more lenient parser
      )
    gt_df = pd.read_csv(ground_truth_file, sep='\t', header=None,
                       names=['id', 'type', 'start', 'end', 'content'])

    # Collect every distinct SHI type seen in either file.
    all_types = sorted(set(gt_df['type'].unique()) | set(pred_df['type'].unique()))

    # Initialise the counters for each type.
    metrics = {shi_type: {'tp': 0, 'fp': 0, 'fn': 0} for shi_type in all_types}

    # Process the records grouped by audio id.
    unique_ids = sorted(set(gt_df['id'].unique()) | set(pred_df['id'].unique()))

    for audio_id in unique_ids:
        gt_records = gt_df[gt_df['id'] == audio_id].copy()
        pred_records = pred_df[pred_df['id'] == audio_id].copy()

        # Flags tracking which ground-truth rows and predictions have been matched.
        gt_matched = [False] * len(gt_records)
        pred_matched = [False] * len(pred_records)

        # Accumulate true positives plus the FP/FN parts of partial matches.
        for i, pred_row in enumerate(pred_records.itertuples()):
            pred_type = pred_row.type
            pred_start = pred_row.start
            pred_end = pred_row.end
            pred_duration = pred_end - pred_start

            best_overlap = 0
            best_gt_idx = -1

            # Find the same-type ground-truth row with the largest overlap.
            # NOTE(review): a ground-truth row can be chosen by several
            # predictions; each one adds its own FN remainder - confirm intended.
            for j, gt_row in enumerate(gt_records.itertuples()):
                if gt_row.type != pred_type:
                    continue

                overlap = calculate_overlap(pred_start, pred_end, gt_row.start, gt_row.end)
                if overlap > best_overlap:
                    best_overlap = overlap
                    best_gt_idx = j

            if best_gt_idx >= 0:  # found an overlapping same-type row
                gt_row = gt_records.iloc[best_gt_idx]
                gt_duration = gt_row.end - gt_row.start

                # True positive: the overlapping part.
                metrics[pred_type]['tp'] += best_overlap

                # False positive: the part of the prediction outside the overlap.
                metrics[pred_type]['fp'] += pred_duration - best_overlap

                # False negative: the part of the ground truth outside the overlap.
                metrics[pred_type]['fn'] += gt_duration - best_overlap

                # Mark both sides as handled.
                gt_matched[best_gt_idx] = True
                pred_matched[i] = True
            else:
                # No same-type overlap at all: the whole prediction is a false positive.
                metrics[pred_type]['fp'] += pred_duration

        # Ground-truth rows that nothing matched count fully as false negatives.
        for j, matched in enumerate(gt_matched):
            if not matched:
                gt_row = gt_records.iloc[j]
                gt_type = gt_row.type
                gt_duration = gt_row.end - gt_row.start
                metrics[gt_type]['fn'] += gt_duration

        # Unmatched predictions that overlap a ground-truth row of another type.
        # NOTE(review): these predictions already had pred_duration added to
        # 'fp' in the else-branch above, so this adds it a second time -
        # confirm whether the double count matches the official scorer.
        for i, (matched, pred_row) in enumerate(zip(pred_matched, pred_records.itertuples())):
            if matched:
                continue

            # Check for an overlap with a row of a different type.
            pred_type = pred_row.type
            pred_start = pred_row.start
            pred_end = pred_row.end
            pred_duration = pred_end - pred_start

            for gt_row in gt_records.itertuples():
                if gt_row.type == pred_type:
                    continue  # same-type rows were handled in the first pass

                overlap = calculate_overlap(pred_start, pred_end, gt_row.start, gt_row.end)
                if overlap > 0:
                    # Type mismatch with overlap: the whole prediction is a false positive.
                    metrics[pred_type]['fp'] += pred_duration
                    break

    # Compute precision, recall and F1 for each type.
    f1_scores = []
    for shi_type in all_types:
        m = metrics[shi_type]
        precision = m['tp'] / (m['tp'] + m['fp']) if (m['tp'] + m['fp']) > 0 else 0
        recall = m['tp'] / (m['tp'] + m['fn']) if (m['tp'] + m['fn']) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)

        # print(f"類型 {shi_type}:")
        # print(f"  Precision: {precision:.4f}")
        # print(f"  Recall: {recall:.4f}")
        # print(f"  F1: {f1:.4f}")
        # print(f"  TP: {m['tp']:.2f}, FP: {m['fp']:.2f}, FN: {m['fn']:.2f}")
        # print()

    # Macro average: unweighted mean of the per-type F1 scores.
    macro_f1 = np.mean(f1_scores)
    # print(f"Macro-Average F1: {macro_f1:.4f}")

    return macro_f1



In [None]:
from transformers import TrainingArguments, Trainer
# Training configuration: evaluate, save and log once per epoch.
training_args = TrainingArguments(
    output_dir="./ner_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_first_step=True,
    logging_dir="./ner_logs",
    learning_rate=3e-5,                     # slightly raised (adjust as needed): 3e-5
    num_train_epochs=40,                   # avoid jumping straight to 50
    weight_decay=0.03,                     # moderate regularisation: 0.03
    # per_device_train_batch_size=8,        # a somewhat larger batch also helps stability
    # per_device_eval_batch_size=8,
    # load_best_model_at_end=True,           # important when combined with EarlyStopping
    # metric_for_best_model="eval_loss",     # pick the best model by loss
)

In [None]:
from transformers import TrainerCallback

class CharBasedEvaluationCallback(TrainerCallback):
    """Trainer callback that prints the evaluate_task2 macro-F1 after each evaluation."""

    def __init__(self, task2_path, tokenizer):
        # Ground-truth file path and tokenizer handed to evaluate_task2.
        self.tokenizer = tokenizer
        self.task2_path = task2_path

    def on_evaluate(self, args, state, control, **kwargs):
        """Score the model supplied in ``kwargs["model"]`` and print the result."""
        score = evaluate_task2(self.task2_path, kwargs["model"], self.tokenizer)

        print(f"[Char-based Evaluation after epoch {state.epoch}]")
        print("Macro-F1:", score)

In [None]:
class FGM:
    """Gradient-direction perturbation of embedding weights for adversarial training.

    ``attack`` adds ``epsilon * grad / ||grad||`` to every trainable parameter
    whose name contains ``emb_name`` after backing up its values; ``restore``
    puts the backed-up values back.
    """

    def __init__(self, model, epsilon=1.0):
        self.model = model
        self.epsilon = epsilon
        self.backup = {}  # parameter name -> copy of its data before the attack

    def attack(self, emb_name='embeddings.word_embeddings'):
        """Perturb the matching parameters along their current gradient.

        Parameters without a gradient (no backward pass yet, or not part of
        the graph) are skipped. A zero or NaN gradient norm leaves the
        weights untouched.
        """
        for name, param in self.model.named_parameters():
            if not (param.requires_grad and emb_name in name):
                continue
            if param.grad is None:
                # Nothing to follow: torch.norm(None) would raise.
                continue
            self.backup[name] = param.data.clone()
            norm = torch.norm(param.grad)
            if norm != 0 and not torch.isnan(norm):
                r_at = self.epsilon * param.grad / norm
                param.data.add_(r_at)

    def restore(self, emb_name='embeddings.word_embeddings'):
        """Restore every backed-up parameter and clear the backup."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name and name in self.backup:
                param.data = self.backup[name]
        self.backup = {}


In [None]:
from transformers import Trainer

class TrainerWithFGM(Trainer):
    """Trainer whose training step adds an adversarial pass on perturbed embeddings.

    Args:
        fgm: optional object with ``attack()`` and ``restore()`` methods (such
            as FGM). When None, the step is a plain forward/backward.
    """

    def __init__(self, *args, fgm=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.fgm = fgm

    def training_step(self, model, inputs, num_items=None):
        """Run one clean backward pass and, with ``fgm`` set, one adversarial pass.

        ``num_items`` is accepted for Trainer versions that pass a third
        positional argument and is otherwise unused; it defaults to None so
        versions that call ``training_step(model, inputs)`` keep working.

        Returns the detached loss of the clean pass.

        NOTE(review): the loss is backpropagated with ``loss.backward()``
        directly rather than through the Trainer's own backward helper -
        confirm before enabling mixed precision or gradient accumulation.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)

        # Clean pass.
        loss = self.compute_loss(model, inputs)
        loss.backward()

        # Adversarial pass: gradients accumulate on top of the clean ones,
        # since there is no zero_grad between the two backward calls.
        if self.fgm is not None:
            self.fgm.attack()
            adv_loss = self.compute_loss(model, inputs)
            adv_loss.backward()
            self.fgm.restore()

        return loss.detach()


In [None]:
from transformers import Trainer


# Ground-truth file that evaluate_task2 scores the predictions against.
task2_path = "/content/drive/MyDrive/AICUP_DATA/offical_change/entity_token_indices.txt"


# Adversarial perturbation helper, using the default epsilon.
fgm = FGM(model)

trainer = TrainerWithFGM(
    model=model,
    args=training_args,
    train_dataset=data_train,
    eval_dataset=data_test,
    data_collator=custom_collator,
    callbacks=[CharBasedEvaluationCallback(task2_path, tokenizer)],
    fgm=fgm,  # enables the adversarial pass in training_step
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,18.8753,15.303363
2,6.6493,12.282393
3,4.2144,11.460523
4,3.1007,10.774789
5,2.0206,11.842018
6,2.024,12.281533
7,1.3892,12.959299
8,0.9848,13.806341
9,0.7783,14.365185
10,0.6122,15.615187


[Char-based Evaluation after epoch 1.0]
Macro-F1: 0.4816576469325218
[Char-based Evaluation after epoch 2.0]
Macro-F1: 0.6002536323369817
[Char-based Evaluation after epoch 3.0]
Macro-F1: 0.618552906100802
[Char-based Evaluation after epoch 4.0]
Macro-F1: 0.6704181948208405
[Char-based Evaluation after epoch 5.0]
Macro-F1: 0.6703708450960366
[Char-based Evaluation after epoch 6.0]
Macro-F1: 0.6594484957639974
[Char-based Evaluation after epoch 7.0]
Macro-F1: 0.7030044213661656
[Char-based Evaluation after epoch 8.0]
Macro-F1: 0.6877784603967375
[Char-based Evaluation after epoch 9.0]
Macro-F1: 0.6757309925104699
[Char-based Evaluation after epoch 10.0]
Macro-F1: 0.6758555796301355
[Char-based Evaluation after epoch 11.0]
Macro-F1: 0.689032496233497
[Char-based Evaluation after epoch 12.0]
Macro-F1: 0.6724634105769386
[Char-based Evaluation after epoch 13.0]
Macro-F1: 0.6819186071968986
[Char-based Evaluation after epoch 14.0]
Macro-F1: 0.6859891532759896
[Char-based Evaluation after ep

KeyboardInterrupt: 

儲存模型

In [None]:
import shutil
# Copy one saved checkpoint directory from the local output dir to Google Drive.
# NOTE(review): the checkpoint step (1680) is hard-coded - update it to the
# checkpoint you actually want to keep.
shutil.copytree('/content/ner_results/checkpoint-1680', '/content/drive/MyDrive/AI_CUP_NER/k_fold_new/crf_FGM_offical/model_checkpoint-1680')

'/content/drive/MyDrive/AI_CUP_NER/k_fold_new/crf_FGM_offical/model_checkpoint-1680'