In [None]:
import json

# Read the notebook configuration. JSON documents are UTF-8 by specification,
# so the encoding is stated explicitly instead of relying on the platform
# default (which is not UTF-8 on every system).
with open("config.json", "r", encoding="utf-8") as f:
    config_argument = json.load(f)

# Paths of the two input text files consumed by the cells below.
model_test_task1_data_path_txt = config_argument["model_test_task1_data_path_txt"]
model_test_task2_data_path_txt = config_argument["model_test_task2_data_path_txt"]

# Echo the resolved paths so a reader can confirm which files are used.
print("model_test_task1_data_path_txt", model_test_task1_data_path_txt)
print("model_test_task2_data_path_txt", model_test_task2_data_path_txt)


In [None]:

import pandas as pd


# Load task1 data as a mapping: {id: sentence}
def load_task1(filepath):
    """Read a task1 text file into an ``{id: sentence}`` dictionary.

    Every line is stripped and then split on its first run of whitespace
    into an id and the remaining text. Lines that do not produce both parts
    (blank lines, or an id with no text) are skipped. When the same id
    occurs on several lines, the last one wins.

    The file is decoded as ``utf-8-sig`` so that an optional UTF-8
    byte-order mark at the start of the file is discarded instead of
    becoming part of the first id. Files without a BOM are read exactly as
    plain UTF-8.

    Args:
        filepath: Path of the text file to read.

    Returns:
        dict: Maps each id (str) to its text (str).
    """
    id2text = {}
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for line in f:
            # maxsplit=1 keeps any whitespace inside the sentence intact.
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:
                id_, text = parts
                id2text[id_] = text
    return id2text

# Load task2 as a DataFrame
def load_task2(filepath):
    """Read a tab-separated task2 file into a DataFrame.

    Blank lines are ignored. A line is kept only when it splits into exactly
    five tab-separated fields; any other line is silently skipped. Each kept
    field has its surrounding whitespace removed and stays a string.

    The file is decoded as ``utf-8-sig`` so that an optional UTF-8
    byte-order mark at the start of the file is discarded instead of
    becoming part of the first row's ``id``. Files without a BOM are read
    exactly as plain UTF-8.

    Args:
        filepath: Path of the text file to read.

    Returns:
        pandas.DataFrame: Columns ``id``, ``type``, ``start_time``,
        ``end_time`` and ``content``, one row per accepted line.
    """
    rows = []
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for line in f:
            line = line.strip()  # drop leading/trailing whitespace and the newline first
            if not line:
                continue
            parts = line.split('\t')
            if len(parts) == 5:
                parts = [p.strip() for p in parts]  # then trim each individual field
                rows.append(parts)
    df = pd.DataFrame(rows, columns=["id", "type", "start_time", "end_time", "content"])
    return df

def find_nth_occurrence(text, substring, n):
    """Return the start index of the n-th occurrence of ``substring`` in ``text``.

    Occurrences are counted from 1 and may overlap, because each search
    resumes one character after the previous match. Returns -1 when there
    are fewer than ``n`` occurrences, or when ``n`` is zero or negative.
    """
    index = -1
    for _attempt in range(n):
        index = text.find(substring, index + 1)
        if index < 0:
            # str.find reports a miss as -1, which is also our "not found" result.
            break
    return index


def map_entities_to_char_indices_with_duplicates(task1_file, task2_file, output_file=None):
    """Locate each task2 entity inside its task1 sentence by character offset.

    Rows of the task2 file are processed in order. The k-th row that carries
    the same (id, entity text) pair is matched against the k-th occurrence of
    that text in the sentence, so repeated entities get distinct positions.

    Args:
        task1_file: Path passed to ``load_task1`` (id -> sentence).
        task2_file: Path passed to ``load_task2`` (entity rows).
        output_file: Optional path. When truthy, the result rows are also
            written there as tab-separated UTF-8 lines.

    Returns:
        pandas.DataFrame: Columns ``id``, ``type``, ``start_char``,
        ``end_char`` and ``content``. ``start_char`` and ``end_char`` are
        both -1 when the id has no sentence or the occurrence is not found;
        otherwise ``end_char`` is ``start_char`` plus the entity length.
    """
    sentences_by_id = load_task1(task1_file)
    entities = load_task2(task2_file)

    # Number of times each (id, entity text) pair has been seen so far.
    seen_counts = {}

    mapped_rows = []
    for record in entities.itertuples():
        sample_id = str(record.id)
        surface = record.content.strip()
        label = record.type

        if sample_id not in sentences_by_id:
            # No sentence for this id: emit a placeholder row.
            mapped_rows.append([sample_id, label, -1, -1, surface])
            continue

        counter_key = (sample_id, surface)
        rank = seen_counts.get(counter_key, 0) + 1
        seen_counts[counter_key] = rank

        begin = find_nth_occurrence(sentences_by_id[sample_id], surface, rank)
        finish = -1 if begin == -1 else begin + len(surface)

        mapped_rows.append([sample_id, label, begin, finish, surface])

    if output_file:
        with open(output_file, "w", encoding="utf-8") as sink:
            sink.writelines(
                "\t".join(map(str, fields)) + "\n" for fields in mapped_rows
            )

    return pd.DataFrame(mapped_rows, columns=['id', 'type', 'start_char', 'end_char', 'content'])




In [None]:
# Run the mapping on the configured test files and also save the rows to disk.
df_result = map_entities_to_char_indices_with_duplicates(
    task1_file=model_test_task1_data_path_txt,
    task2_file=model_test_task2_data_path_txt,
    output_file="./entity_token_indices.txt",
)