In [1]:
import re
import json

In [2]:

def load_raw_file(filepath):
    """Read a text file in full and return its contents as a string.

    Args:
        filepath: Path of the file to read; passed straight to ``open``.

    Returns:
        The whole file decoded as UTF-8.
    """
    with open(filepath, encoding="utf-8") as source:
        contents = source.read()
    return contents

In [3]:
def parse_blocks(raw_text):
    """Extract ``type`` / ``title`` / ``content`` records from a raw text dump.

    The text is stripped and then split on a run of exactly 50 dashes. Within
    each resulting segment the ``[Type]``, ``[Title]`` and ``[Content]`` tags
    are located with regular expressions:

    * ``type``    - the first run of word characters after ``[Type]``.
    * ``title``   - the text after ``[Title]`` up to the next newline.
    * ``content`` - everything after ``[Content]`` to the end of the segment
      (newlines included).

    Segments in which any of the three tags cannot be matched are skipped.

    Args:
        raw_text: The full text to parse.

    Returns:
        A list of dicts with the keys ``"type"``, ``"title"`` and
        ``"content"`` (in that order); every value is whitespace-stripped.
    """
    records = []

    for segment in raw_text.strip().split("-" * 50):
        found = {
            "type": re.search(r"\[Type\]\s*(\w+)", segment),
            "title": re.search(r"\[Title\]\s*(.+?)\n", segment),
            # DOTALL lets the content capture span multiple lines.
            "content": re.search(r"\[Content\]\s*(.+)", segment, re.DOTALL),
        }

        # A segment is only usable when all three fields were found.
        if any(hit is None for hit in found.values()):
            continue

        records.append(
            {field: hit.group(1).strip() for field, hit in found.items()}
        )

    return records

In [4]:
def clean_text(text):
    """Strip markup and stray symbols from ``text`` and normalise whitespace.

    Steps, in order:

    1. Remove HTML/XML-like tags. Only spans that start like markup are
       removed: ``<`` followed by a letter (optionally preceded by ``/``),
       or by ``!`` / ``?`` (comments, doctype, processing instructions).
       A bare ``<`` used as a comparison sign (``"BP < 120 and pulse > 60"``)
       is therefore NOT treated as the start of a tag, so the text between
       such signs is preserved.
    2. Remove every character that is not a word character, whitespace, or
       one of ``. , ? !``. Note that this also drops hyphens, apostrophes,
       slashes and percent signs (``"COVID-19"`` becomes ``"COVID19"``).
    3. Collapse each run of whitespace to a single space and trim the ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Requiring a tag-like opener avoids deleting prose between "<" and ">".
    # Limitation: "a<b and c>d" still looks like a tag and is removed.
    text = re.sub(r"<(?:/?[A-Za-z]|[!?])[^>]*>", "", text)
    # Keep word characters, whitespace and basic sentence punctuation only.
    text = re.sub(r"[^\w\s.,?!]", "", text)
    # Collapse whitespace runs (spaces, tabs, newlines) into single spaces.
    return re.sub(r"\s+", " ", text).strip()

In [5]:
def save_as_jsonl(data, output_path):
    """Write ``data`` to ``output_path`` in JSON Lines format.

    Each item is serialised as one JSON document on its own line. Non-ASCII
    characters are written as-is (not ``\\u``-escaped) and the file is
    encoded as UTF-8. An existing file at ``output_path`` is overwritten.

    Args:
        data: Iterable of JSON-serialisable items.
        output_path: Destination file path; passed straight to ``open``.
    """
    with open(output_path, "w", encoding="utf-8") as sink:
        for record in data:
            line = json.dumps(record, ensure_ascii=False)
            sink.write(line)
            sink.write("\n")

In [7]:
# Pipeline configuration: source text dump and destination JSONL file.
input_path = "preprocessed_output2.txt"
output_path = "cleaned_health_data2.jsonl"

# Load the raw dump and split it into type/title/content records.
raw_text = load_raw_file(input_path)
blocks = parse_blocks(raw_text)

# Clean the free-text fields; the "type" value is carried over unchanged.
cleaned_chunks = []
for block in blocks:
    cleaned_title = clean_text(block["title"])
    cleaned_content = clean_text(block["content"])
    cleaned_chunks.append(
        {"type": block["type"], "title": cleaned_title, "content": cleaned_content}
    )

# Persist one JSON object per line and report where it went.
save_as_jsonl(cleaned_chunks, output_path)
print(f"✅ Done. Output saved to: {output_path}")

✅ Done. Output saved to: cleaned_health_data2.jsonl
