In [None]:
import json, re, os

# 1) Helper functions
# Tags that render as a line or paragraph break. Deleting them outright would
# glue the neighbouring words (and sentences) together, so a run of them,
# plus any whitespace that follows, is replaced by a single space instead.
_BREAK_TAGS_RE = re.compile(
    r'(?:<\s*/?\s*(?:br|p|div|li|ul|ol|tr|td|th|h[1-6]|hr|blockquote)\b[^>]*>\s*)+',
    re.IGNORECASE,
)
# Any remaining <...> span (inline markup such as <b>, <i>, <a href=...>).
_ANY_TAG_RE = re.compile(r'<[^>]+>')
# Everything that is not a word character, whitespace, or one of . , ! ?
_DISALLOWED_RE = re.compile(r'[^\w\s.,!?]')

def clean(text):
    """Normalise a raw review string for sentence splitting.

    Steps, in order:
      1. Break-style HTML tags (<br>, <p>, <div>, ...) become a single space
         so the text on either side stays separated.
      2. All other tags are removed without inserting anything.
      3. HTML character references (&amp;, &#39;, &nbsp;, ...) are decoded.
      4. Every character other than word characters, whitespace and the
         punctuation . , ! ? is dropped.
      5. The result is lower-cased and stripped of surrounding whitespace.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    import html  # function-local so this block does not depend on file-level imports

    text = _BREAK_TAGS_RE.sub(' ', text)
    text = _ANY_TAG_RE.sub('', text)
    # Decode entities only after tag removal, so an escaped "&lt;b&gt;" is
    # never mistaken for real markup and stripped as a tag.
    text = html.unescape(text)
    text = _DISALLOWED_RE.sub('', text)
    return text.lower().strip()

def split_sentences(text):
    """Break *text* into sentence-like pieces.

    A boundary is any run of whitespace that directly follows ``.``, ``!``
    or ``?``. The terminating punctuation stays attached to the piece it
    ends, the separating whitespace is discarded, and empty pieces are
    dropped.

    Args:
        text: The string to split; surrounding whitespace is ignored.

    Returns:
        A list of non-empty strings in their original order.
    """
    trimmed = text.strip()
    # The lookbehind matches the position after the terminator, so only the
    # whitespace itself is consumed by the split.
    pieces = re.split(r'(?<=[\.!?])\s+', trimmed)
    return list(filter(None, pieces))

# 2) File paths
IN_FILE  = 'data/processed/merged.json'
OUT_FILE = 'data/processed/cleaned.json'
# Make sure the output directory exists before any work is done, so the
# final write cannot fail on a missing folder.
os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)

# 3) Load merged data
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode or reject UTF-8 JSON.
with open(IN_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)

# 4) Clean & split each review
# Each entry is accessed with .get()/item assignment, i.e. treated as a dict;
# the result is stored in place under the 'sentences' key.
for entry in data:
    # `or ''` also covers entries whose 'text' is present but None/empty.
    raw = entry.get('text', '') or ''
    cleaned_block = clean(raw)
    entry['sentences'] = split_sentences(cleaned_block)

# 5) Write out cleaned JSON
# json.dump escapes non-ASCII by default, so the bytes written are the same
# as before; the explicit encoding just keeps read and write symmetric.
with open(OUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2)

print(f"✅ Cleaned {len(data)} reviews → {OUT_FILE}")
