In [None]:
import json
import string
from copy import deepcopy
import re

# 1. Load the JSON data.
input_filename = "./task1_answer_timestamps.json"
output_filename = "./task1_answer_timestamps_cleaned.json"

# Read the whole file as UTF-8 text and parse it in one step.
with open(input_filename, encoding="utf-8") as f:
    data = json.loads(f.read())

# 2. Work on a deep copy so the freshly loaded `data` stays unmodified.
cleaned_data = deepcopy(data)

# 3. Helper that removes surrounding punctuation.
def strip_punctuation(word):
    """Return *word* without leading or trailing ASCII punctuation.

    Only characters listed in ``string.punctuation`` are removed, and only
    from the two ends; punctuation inside the word (e.g. the dot in
    ``"p.m"`` or the apostrophe in ``"don't"``) is kept.
    """
    marks = string.punctuation
    return word.lstrip(marks).rstrip(marks)

# 4. Set of capitalized English month names (membership tests are
#    case-sensitive).
months = set(
    "January February March April May June "
    "July August September October November December".split()
)
# Dotted clock times such as "7.30" or "12.5": one or two digits on each side
# of the dot, delimited by word boundaries. Compiled once at import time.
_DOTTED_TIME_RE = re.compile(r"\b(\d{1,2})\.(\d{1,2})\b")


def normalize_time(text: str) -> str:
    """Rewrite dotted clock times in *text* to colon form.

    ``"7.30"`` becomes ``"7:30"`` and a single-digit minute is zero-padded
    (``"7.5"`` becomes ``"7:05"``). Numbers whose parts cannot be a clock
    reading (hour above 23 or minute above 59, e.g. ``"45.75"``) are left
    untouched so obvious decimals are not turned into bogus times.

    Args:
        text: Arbitrary text that may contain dotted times.

    Returns:
        The text with every plausible dotted time rewritten as ``H:MM``.
    """
    def repl(m):
        hour, minute = m.group(1), m.group(2)
        # Not a valid clock reading: return the matched text unchanged.
        if int(hour) > 23 or int(minute) > 59:
            return m.group(0)
        return f"{hour}:{minute.zfill(2)}"

    return _DOTTED_TIME_RE.sub(repl, text)

# Patterns and token sets used by the word passes, built once instead of on
# every token.
_DAY_NUMBER_RE = re.compile(r"^\d+(?:st|nd|rd|th)?$", flags=re.IGNORECASE)
_SINGLE_CAPITAL_RE = re.compile(r"^[A-Z]$")
_DECIMAL_TOKEN_RE = re.compile(r"^\d+\.\d+$")
_MERIDIEM_STRIPPED = {"p.m", "a.m"}
_MERIDIEM_TOKENS = {"p.m.", "p.m", "a.m.", "a.m", "am", "pm"}


def _clean_words(words):
    """Normalize the ``"word"`` field of each entry in *words*, in place.

    Three passes run in order, because later passes rely on the tokens
    already being stripped:

    1. Strip surrounding punctuation from every token except ``"Dr."``.
    2. Append a comma to a day number (``5``, ``5th``) that directly follows
       a month name, and restore the final dot of ``a.m`` / ``p.m``.
    3. Restore the dot on a single capital letter that directly follows
       ``"Dr."``, and convert a dotted number that is directly followed by
       an a.m./p.m. token into a colon time.

    Args:
        words: List of dicts, each with a ``"word"`` string entry.
    """
    # Pass 1: strip punctuation, keeping the title "Dr." intact.
    for entry in words:
        if entry["word"] != "Dr.":
            entry["word"] = strip_punctuation(entry["word"])

    # Pass 2: month/day commas and a.m./p.m. dots.
    for idx, entry in enumerate(words):
        current = entry["word"]

        if current in months and idx + 1 < len(words):
            following = words[idx + 1]["word"]
            # `following` was stripped in pass 1, so it cannot already end
            # with a comma when it matches the day-number pattern.
            if _DAY_NUMBER_RE.match(following):
                words[idx + 1]["word"] = following + ","

        # Pass 1 turned "p.m." into "p.m"; put the trailing dot back.
        if current.lower() in _MERIDIEM_STRIPPED:
            entry["word"] = current + "."

    # Pass 3: initials after "Dr." and dotted times before a.m./p.m.
    for idx, entry in enumerate(words):
        token = entry["word"]

        if (
            idx > 0
            and words[idx - 1]["word"] == "Dr."
            and _SINGLE_CAPITAL_RE.match(token)
        ):
            entry["word"] = token + "."

        if _DECIMAL_TOKEN_RE.match(token) and idx + 1 < len(words):
            if words[idx + 1]["word"].lower() in _MERIDIEM_TOKENS:
                # Reuse normalize_time so the word token gets exactly the
                # same form (including zero-padded minutes) as the segment
                # text.
                entry["word"] = normalize_time(token)


# NOTE(review): the segment text converts every dotted time it finds, while
# word tokens are only converted when followed by an a.m./p.m. token, so the
# two can still differ for a bare "7.30" — confirm which rule is intended.
for record in cleaned_data.values():
    for segment in record.get("segments", []):
        # Tolerate segments without "text", matching the .get() lookups used
        # for "segments" and "words".
        if "text" in segment:
            segment["text"] = normalize_time(segment["text"])

        _clean_words(segment.get("words", []))

# Write the cleaned data back out as pretty-printed UTF-8 JSON, keeping
# non-ASCII characters unescaped.
with open(output_filename, "w", encoding="utf-8") as f:
    f.write(json.dumps(cleaned_data, indent=2, ensure_ascii=False))

# Report where the result was saved.
print(f"所有 ID 處理完成，已將結果儲存在：{output_filename}")


所有 ID 處理完成，已將結果儲存在：./task1_answer_timestamps_cleaned.json
