In [7]:
import os
import json
import pandas as pd
import numpy as np
import re
from tqdm import tqdm


In [8]:
RAW_JSON_DIR = "../data/raw/"
OUTPUT_PATH = "../data/processed/health_news_clean_final.csv"


In [9]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+", "", text)      # remove URLs
    text = re.sub(r"[^a-z0-9\s]", "", text)  # remove symbols
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [10]:
def rating_to_label(rating):
    if rating <= 2:
        return 1   # misinformation
    else:
        return 0   # reliable


In [12]:
rows = []

json_files = [f for f in os.listdir(RAW_JSON_DIR) if f.endswith(".json")]

for file in tqdm(json_files, desc="Loading JSON files"):
    with open(os.path.join(RAW_JSON_DIR, file), "r", encoding="utf-8") as f:
        data = json.load(f)

        # Some files may contain lists, others single objects
        if isinstance(data, dict):
            data = [data]

        for item in data:
            rows.append({
                "title": item.get("title", ""),
                "description": item.get("description", ""),
                "original_title": item.get("original_title", ""),
                "category": item.get("category", ""),
                "rating": item.get("rating", np.nan),
                "summary": item.get("summary", {}).get("Our Review Summary", ""),
                "why_it_matters": item.get("summary", {}).get("Why This Matters", ""),
                "tags": ", ".join(item.get("tags", []))
            })


NotADirectoryError: [WinError 267] The directory name is invalid: '../data/raw/HealthRelease.json'

In [None]:
df = pd.DataFrame(rows)
print("Rows loaded:", len(df))
df.head()


In [None]:
df.fillna("", inplace=True)


In [None]:
df["text"] = (
    df["title"] + " " +
    df["original_title"] + " " +
    df["description"] + " " +
    df["summary"] + " " +
    df["why_it_matters"]
)


In [None]:
df["text"] = df["text"].apply(clean_text)


In [None]:
df["label"] = df["rating"].apply(
    lambda x: rating_to_label(int(x)) if str(x).isdigit() else np.nan
)

df = df.dropna(subset=["label"])
df["label"] = df["label"].astype(int)


In [None]:
print("Class distribution:")
df["label"].value_counts(normalize=True)


In [None]:
df[["rating", "label"]].describe()


In [None]:
OUTPUT_PATH = "../data/processed/health_info_clean.csv"

final_df = df[["text", "label"]]
final_df.to_csv(OUTPUT_PATH, index=False)

print("✅ Clean dataset saved successfully")
print("📁 File:", OUTPUT_PATH)
print("🧾 Total rows:", len(final_df))


In [None]:
final_df.sample(5)

