# Scraping Berita

## Scraping Edukasi

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Output folder on Drive (created if missing) and the CSV file for the Edukasi scrape.
DRIVE_FOLDER = "/content/drive/MyDrive/KompasLestari"
OUTPUT_CSV = os.path.join(DRIVE_FOLDER, "kompas_edukasi_scraped.csv")
os.makedirs(DRIVE_FOLDER, exist_ok=True)

# Echo the resolved locations.
for _label, _location in (("Folder Drive:", DRIVE_FOLDER), ("File output :", OUTPUT_CSV)):
    print(_label, _location)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Folder Drive: /content/drive/MyDrive/KompasLestari
File output : /content/drive/MyDrive/KompasLestari/kompas_edukasi_scraped.csv


In [None]:
# Scraper Kompas Edukasi (menu-aware + subcategory + tags + resumeable)

import os, re, time, csv, random, json
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
from tqdm import tqdm

# ========== Konfigurasi ==========
BASE = "https://edukasi.kompas.com/"
TIMEOUT = 20                   # timeout value handed to SESSION.get
REQUEST_DELAY = (0.8, 1.6)     # (min, max) of the random pause between requests, in seconds, to stay polite
MAX_PAGES_PER_SUB = None       # None = keep paging until 3 consecutive listing pages add nothing, per sub-category

# Switches for sub-channels hosted on other domains:
ALLOW_SKOLA = True             # www.kompas.com/skola
ALLOW_EDU_NEWS = True          # www.kompas.com/edu (a.k.a. Edu News)
ALLOW_KILASPENDIDIKAN = False  # kilaspendidikan.kompas.com (advertorial) -> excluded by default

# The CSV path (OUTPUT_CSV) is set by the previous cell.

# Browser-like User-Agent sent with every request.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
HEADERS = {"User-Agent": _USER_AGENT}

# ========== HTTP session (retry) ==========
from requests.adapters import HTTPAdapter, Retry

def make_session():
    """Build a requests.Session that sends HEADERS and retries GET requests.

    The retry policy allows up to 5 retries (total / connect / read) with
    backoff_factor=1.2 and also retries on HTTP 429, 500, 502, 503 and 504.
    The same policy is mounted for both https:// and http:// URLs.
    """
    retry_policy = Retry(
        total=5,
        connect=5,
        read=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"]),
    )
    session = requests.Session()
    session.headers.update(HEADERS)
    for prefix in ("https://", "http://"):
        session.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return session


# Shared session used by safe_get for every request made in this cell.
SESSION = make_session()

# ========== CSV helpers ==========
# Column order of the output CSV.
FIELDNAMES = ["url", "category", "subcategory", "tanggal_publish", "judul", "tags", "konten"]

def ensure_csv(path):
    """Create the output CSV with its header row when it is missing or empty.

    A zero-byte file is treated like a missing one: without a header, the
    first appended article row would later be read as the header by
    csv.DictReader. A file that already has content is left untouched.

    Args:
        path: Location of the CSV file.
    """
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return
    with open(path, "w", newline="", encoding="utf-8") as f:
        csv.DictWriter(f, fieldnames=FIELDNAMES).writeheader()

def load_done_urls(path):
    """Return the set of article URLs already stored in the CSV at ``path``.

    Values come from the "url" column, are stripped, and blank ones are
    ignored. A missing file yields an empty set.

    Args:
        path: Location of the CSV file.

    Returns:
        set[str]: URLs found in the file.
    """
    if not os.path.exists(path):
        return set()
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation requires, so quoted fields containing line breaks are
    # read back as a single field.
    with open(path, "r", newline="", encoding="utf-8") as f:
        stripped = ((row.get("url") or "").strip() for row in csv.DictReader(f))
        return {u for u in stripped if u}

def append_row(path, row: dict):
    """Append one article record to the CSV at ``path``.

    Only the FIELDNAMES columns are written; a missing or falsy value is
    stored as an empty string. The file is opened and closed on every call.
    """
    record = {}
    for column in FIELDNAMES:
        record[column] = row.get(column) or ""
    with open(path, "a", newline="", encoding="utf-8") as handle:
        csv.DictWriter(handle, fieldnames=FIELDNAMES).writerow(record)

# ========== Utils ==========
def normalize_ws(text):
    """Trim ``text`` and collapse each whitespace run into a single space.

    ``None`` (or any other falsy value) yields an empty string.
    """
    trimmed = (text or "").strip()
    return re.sub(r"\s+", " ", trimmed)

def looks_like_baca_juga(text):
    """Tell whether ``text`` contains the phrase "baca juga", ignoring case."""
    lowered = (text or "").lower()
    return lowered.find("baca juga") != -1

def safe_get(url):
    """GET ``url`` through the shared SESSION and return the response.

    The request uses TIMEOUT; ``raise_for_status()`` is called on the
    response before it is returned, so HTTP error statuses raise.
    """
    response = SESSION.get(url, timeout=TIMEOUT)
    response.raise_for_status()
    return response

def absolutize(base_url, href):
    """Turn a protocol-relative or root-relative ``href`` into an absolute URL.

    - ``//host/path`` gets an ``https:`` scheme.
    - ``/path`` is joined to the scheme and host of ``base_url``.
    - Anything else (already absolute, empty, or a relative path without a
      leading slash) is returned unchanged.
    """
    if href.startswith("//"):
        return "https:" + href
    if not href.startswith("/"):
        return href
    origin = urlparse(base_url)
    return f"{origin.scheme}://{origin.netloc}{href}"

# ========== Menu discovery ==========
def discover_menu_links():
    """
    Collect the sub-category links of the Edukasi channel.

    Anchors under ``.kanalHeader .kanalMenu`` on BASE are read first and
    filtered with the ALLOW_* flags. The hard-coded SEED list is then merged
    in with the same filter, so the known sub-categories are present even when
    the menu cannot be fetched or parsed.

    Returns:
        list[tuple[str, str]]: ``(sub_url, sub_name)`` pairs, menu entries
        first, then seeds that were not already found.
    """
    def allowed(href):
        # Host and path are judged separately: a plain substring test for
        # "/edu" on the whole URL also matches "//edukasi.kompas.com".
        parsed = urlparse(href)
        netloc = parsed.netloc.lower()
        if "kilaspendidikan.kompas.com" in netloc:
            return bool(ALLOW_KILASPENDIDIKAN)
        if "www.kompas.com" in netloc:
            # /edu and /skola live on the main www host.
            if "/edu" in parsed.path and not ALLOW_EDU_NEWS:
                return False
            if "/skola" in parsed.path and not ALLOW_SKOLA:
                return False
            return True
        # Any other host must belong to the Edukasi domain.
        return "edukasi.kompas.com" in netloc

    found = OrderedDict()
    try:
        r = safe_get(BASE)
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select(".kanalHeader .kanalMenu a[href]"):
            name = normalize_ws(a.get_text())
            href = absolutize(BASE, a.get("href", "").strip())
            if href and name and allowed(href):
                found[href] = name
    except Exception as e:
        # Best effort: the seeds below still apply, but report the failure
        # instead of hiding it.
        print(f"WARN menu {BASE} -> {e.__class__.__name__}: {e}")

    # Minimal fallback seeds in case menu parsing fails.
    SEED = OrderedDict([
        ("https://www.kompas.com/edu", "Edu News"),
        ("https://edukasi.kompas.com/perguruan-tinggi", "Perguruan Tinggi"),
        ("https://edukasi.kompas.com/sekolah", "Sekolah"),
        ("https://edukasi.kompas.com/pendidikan-khusus", "Pendidikan Khusus"),
        ("https://edukasi.kompas.com/beasiswa", "Beasiswa"),
        ("https://edukasi.kompas.com/literasi", "Literasi"),
        ("https://www.kompas.com/skola", "Skola"),
        ("https://edukasi.kompas.com/ideaksi", "IdeAksi"),
        # ("https://kilaspendidikan.kompas.com", "Kilas Pendidikan"),  # exclude by default
    ])
    for href, name in SEED.items():
        if allowed(href):
            found.setdefault(href, name)

    return list(found.items())  # [(sub_url, sub_name)]

# ========== List artikel per sub-kategori ==========
def is_article_url(href: str):
    """Return True when ``href`` points at a Kompas article page.

    Accepted URLs have a host that is ``kompas.com`` or one of its
    subdomains, a path containing ``/read/``, and no ``/komentar/`` or
    ``/copy/`` segment. Host and path are checked on the parsed URL, so links
    on other sites that merely embed a Kompas article URL in their query
    string (share buttons, for example) are rejected.
    """
    if not href:
        return False
    try:
        parsed = urlparse(href)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): not an article.
        return False
    if host != "kompas.com" and not host.endswith(".kompas.com"):
        return False
    if "/read/" not in parsed.path:
        return False
    return not any(blocked in href for blocked in ("/komentar/", "/copy/"))

def collect_urls_from_list(listing_url):
    """Collect article URLs from a single listing page.

    Args:
        listing_url: URL of the listing page to fetch.

    Returns:
        set[str]: Absolute article URLs accepted by is_article_url. An empty
        set is returned when the page cannot be fetched; the failure is
        printed so it is not mistaken for a genuinely empty page.
    """
    try:
        res = safe_get(listing_url)
    except Exception as e:
        print(f"  WARN listing {listing_url} -> {e.__class__.__name__}: {e}")
        return set()
    soup = BeautifulSoup(res.text, "lxml")
    urls = set()

    # One selector is enough: every anchor matched by the narrower,
    # class-scoped selectors is also matched by this one.
    for a in soup.select("a[href*='/read/']"):
        href = (a.get("href") or "").strip()
        if not href:
            continue
        full = absolutize(listing_url, href)
        if is_article_url(full):
            urls.add(full)
    return urls

def page_url(base, page):
    """Return the listing URL for page number ``page``.

    Page 1 (or lower) is ``base`` itself. Later pages get ``page=N``
    appended, joined with ``&`` when ``base`` already has a query string and
    with ``?`` otherwise.
    """
    if page <= 1:
        return base
    joiner = "?" if not urlparse(base).query else "&"
    return f"{base}{joiner}page={page}"

def crawl_subcategory_urls(sub_url, max_pages=None):
    """Page through one sub-category and return its article URLs.

    Paging stops after 3 consecutive pages that add no new URL, or once
    ``max_pages`` pages have been requested (when it is not None). A random
    pause within REQUEST_DELAY follows every page request.

    Returns:
        list[str]: Unique URLs in discovery order (sorted within each page).
    """
    # NOTE(review): in the recorded run every page after the first added
    # 0 URLs - confirm the listings actually honour the ?page=N parameter.
    discovered = OrderedDict()
    empty_streak = 0
    page = 1
    while max_pages is None or page <= max_pages:
        page_links = collect_urls_from_list(page_url(sub_url, page))
        fresh = [link for link in sorted(page_links) if link not in discovered]
        discovered.update((link, None) for link in fresh)
        gained = len(fresh)
        print(f"  [page={page}] +{gained} URL (total {len(discovered)})")
        time.sleep(random.uniform(*REQUEST_DELAY))
        empty_streak = 0 if gained else empty_streak + 1
        if empty_streak >= 3:
            break
        page += 1
    return list(discovered)

# ========== Parser artikel ==========
def extract_publish_date(soup):
    """Return the article's publication date as written in the page, or "".

    Sources are tried in this order:
      1. ``<meta name="content_PublishedDate">``
      2. ``<meta property="article:published_time">``
      3. ``datePublished`` in any ``application/ld+json`` script
      4. the text of ``.read__time`` or ``.read__date``

    The value is returned as found; no date parsing is done.
    """
    # Kompas meta tag
    meta = soup.find("meta", attrs={"name": "content_PublishedDate"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    # og:article
    meta_pub = soup.select_one('meta[property="article:published_time"]')
    if meta_pub and meta_pub.get("content"):
        return meta_pub["content"].strip()
    # JSON-LD: look through every block, and accept a top-level list of
    # objects as well as a single object.
    for script_ld in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script_ld.string or "")
        except (TypeError, ValueError):
            continue
        for item in (data if isinstance(data, list) else [data]):
            if isinstance(item, dict) and item.get("datePublished"):
                return item["datePublished"]
    # Fallback: visible time element
    el = soup.select_one(".read__time") or soup.select_one(".read__date")
    return normalize_ws(el.get_text()) if el else ""

def extract_tags(soup):
    """Return the article's tags as one string; may be empty.

    The ``content_tags`` meta value is preferred (returned stripped).
    Otherwise the non-blank texts of ``a.tag__article__link`` anchors are
    joined with ", ".
    """
    meta = soup.find("meta", attrs={"name": "content_tags"})
    content = meta.get("content") if meta else None
    if content:
        return content.strip()
    labels = []
    for link in soup.select("a.tag__article__link"):
        if link.get_text().strip():
            labels.append(normalize_ws(link.get_text()))
    return ", ".join(labels)

def parse_article(url):
    """Download one article page and extract title, date, tags and body text.

    Returns:
        dict: keys ``judul``, ``tanggal_publish``, ``tags`` and ``konten``.

    Raises:
        Whatever safe_get raises when the page cannot be fetched.
    """
    soup = BeautifulSoup(safe_get(url).text, "lxml")

    # Title: the article heading, then any <h1>, then the <title> tag.
    heading = soup.select_one("h1.read__title") or soup.select_one("h1")
    if heading:
        judul = normalize_ws(heading.get_text())
    elif soup.title:
        judul = normalize_ws(soup.title.get_text())
    else:
        judul = normalize_ws("")

    # Body: join the <p> texts of the content wrapper (whole page when no
    # wrapper is found), dropping empty paragraphs, "Baca juga" paragraphs and
    # paragraphs that mention both "KOMPAS.com" and "Download".
    container = soup.select_one(".read__content") or soup.select_one(".story__content") or soup
    kept = []
    for node in container.find_all("p"):
        raw = node.get_text(" ").strip()
        mentions_download = "KOMPAS.com" in raw and "Download" in raw
        if raw and not looks_like_baca_juga(raw) and not mentions_download:
            kept.append(normalize_ws(raw))

    return {
        "judul": judul,
        "tanggal_publish": extract_publish_date(soup),
        "tags": extract_tags(soup),
        "konten": " ".join(kept),
    }

# ========== Pipeline utama ==========
def run_scrape_edukasi(output_csv_path, max_pages_per_sub=None):
    """Scrape the Edukasi channel into ``output_csv_path``, resuming earlier runs.

    Steps: make sure the CSV exists, load the URLs it already holds, discover
    the sub-categories, collect article URLs per sub-category, then parse and
    append every article whose URL is not in the CSV yet. Rows are appended
    one at a time, so an interrupted run keeps what was already written.

    Args:
        output_csv_path: CSV file to create or extend.
        max_pages_per_sub: Listing-page cap per sub-category, passed on to
            crawl_subcategory_urls (None = no cap).
    """
    ensure_csv(output_csv_path)
    done = load_done_urls(output_csv_path)
    print(f"URL sudah ada di CSV: {len(done)}")

    # Sub-categories taken from the channel menu (plus fallback seeds).
    submenus = discover_menu_links()
    print("Sub-kategori terdeteksi:")
    for href, name in submenus:
        print(f" - {name}: {href}")

    # Candidates are keyed by (url, sub-category): a URL listed under two
    # sub-categories shows up twice and is skipped via `done` once saved.
    all_urls = OrderedDict()
    for href, name in submenus:
        print(f"\nCrawl sub-kategori: {name}")
        for u in crawl_subcategory_urls(href, max_pages=max_pages_per_sub):
            all_urls.setdefault((u, name), None)

    print(f"\nTotal kandidat (URL unik per sub): {len(all_urls)}")

    category = "Edukasi"
    saved = 0
    progress = tqdm(list(all_urls), desc="Scraping", unit="url")
    for i, (url, subcat) in enumerate(progress, 1):
        if url in done:
            continue
        try:
            data = parse_article(url)
            # Keep only articles that have both a title and body text.
            if data and data.get("judul") and data.get("konten"):
                row = {"url": url, "category": category, "subcategory": subcat}
                for key in ("tanggal_publish", "judul", "tags", "konten"):
                    row[key] = data[key]
                append_row(output_csv_path, row)
                done.add(url)
                saved += 1
        except KeyboardInterrupt:
            print("\n⛔ Dihentikan manual, progres tersimpan.")
            break
        except Exception as e:
            print(f"\nERROR [{i}] {url} -> {e.__class__.__name__}: {e}")
        finally:
            # Pause after every attempted article, whether it succeeded or not.
            time.sleep(random.uniform(*REQUEST_DELAY))

    print(f"Selesai. Artikel baru tersimpan: {saved}.")
    print("CSV:", output_csv_path)

# ==== RUN ====
# OUTPUT_CSV must come from the Edukasi Drive-setup cell: later sections
# reassign it, so run that cell right before this one.
run_scrape_edukasi(OUTPUT_CSV, max_pages_per_sub=MAX_PAGES_PER_SUB)


URL sudah ada di CSV: 0
Sub-kategori terdeteksi:
 - Edu News: https://www.kompas.com/edu
 - Perguruan Tinggi: https://edukasi.kompas.com/perguruan-tinggi
 - Sekolah: https://edukasi.kompas.com/sekolah
 - Pendidikan Khusus: https://edukasi.kompas.com/pendidikan-khusus
 - Beasiswa: https://edukasi.kompas.com/beasiswa
 - Literasi: https://edukasi.kompas.com/literasi
 - Skola: https://www.kompas.com/skola
 - IdeAksi: https://edukasi.kompas.com/ideaksi

Crawl sub-kategori: Edu News
  [page=1] +31 URL (total 31)
  [page=2] +0 URL (total 31)
  [page=3] +0 URL (total 31)
  [page=4] +0 URL (total 31)

Crawl sub-kategori: Perguruan Tinggi
  [page=1] +28 URL (total 28)
  [page=2] +0 URL (total 28)
  [page=3] +0 URL (total 28)
  [page=4] +0 URL (total 28)

Crawl sub-kategori: Sekolah
  [page=1] +27 URL (total 27)
  [page=2] +0 URL (total 27)
  [page=3] +0 URL (total 27)
  [page=4] +0 URL (total 27)

Crawl sub-kategori: Pendidikan Khusus
  [page=1] +32 URL (total 32)
  [page=2] +0 URL (total 32)
  

Scraping: 100%|██████████| 227/227 [04:07<00:00,  1.09s/url]

Selesai. Artikel baru tersimpan: 150.
CSV: /content/drive/MyDrive/KompasLestari/kompas_edukasi_scraped.csv





## Scraping Parapuan

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Output folder on Drive (created if missing) and the CSV file for the Parapuan scrape.
DRIVE_FOLDER = "/content/drive/MyDrive/KompasLestari"
OUTPUT_CSV = os.path.join(DRIVE_FOLDER, "kompas_parapuan_scraped.csv")
os.makedirs(DRIVE_FOLDER, exist_ok=True)

# Echo the resolved locations.
for _label, _location in (("Folder Drive:", DRIVE_FOLDER), ("File output :", OUTPUT_CSV)):
    print(_label, _location)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Folder Drive: /content/drive/MyDrive/KompasLestari
File output : /content/drive/MyDrive/KompasLestari/kompas_parapuan_scraped.csv


In [None]:
# Scraper Kompas Parapuan (menu-aware + subcategory + tags + resumeable)

import os, re, time, csv, random, json
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
from tqdm import tqdm

# ==========================
# Konfigurasi umum
# ==========================
BASE = "https://www.kompas.com/parapuan"
TIMEOUT = 20                # timeout value handed to SESSION.get
REQUEST_DELAY = (0.8, 1.6)  # (min, max) of the random pause between requests, in seconds
MAX_PAGES_PER_SUB = None    # None = continue until 3 consecutive listing pages add nothing

# Browser-like User-Agent sent with every request.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
HEADERS = {"User-Agent": _USER_AGENT}

# ==========================
# Session HTTP dengan retry
# ==========================
from requests.adapters import HTTPAdapter, Retry

def make_session():
    """Build a requests.Session that sends HEADERS and retries GET requests.

    The retry policy allows up to 5 retries (total / connect / read) with
    backoff_factor=1.2 and also retries on HTTP 429, 500, 502, 503 and 504.
    The same policy is mounted for both https:// and http:// URLs.
    """
    retry_policy = Retry(
        total=5,
        connect=5,
        read=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"]),
    )
    session = requests.Session()
    session.headers.update(HEADERS)
    for prefix in ("https://", "http://"):
        session.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return session


# Shared session used by safe_get for every request made in this cell.
SESSION = make_session()

# ==========================
# CSV helper
# ==========================
# Column order of the output CSV.
FIELDNAMES = ["url", "category", "subcategory", "tanggal_publish", "judul", "tags", "konten"]

def ensure_csv(path):
    """Create the output CSV with its header row when it is missing or empty.

    A zero-byte file is treated like a missing one: without a header, the
    first appended article row would later be read as the header by
    csv.DictReader. A file that already has content is left untouched.

    Args:
        path: Location of the CSV file.
    """
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return
    with open(path, "w", newline="", encoding="utf-8") as f:
        csv.DictWriter(f, fieldnames=FIELDNAMES).writeheader()

def load_done_urls(path):
    """Return the set of article URLs already stored in the CSV at ``path``.

    Values come from the "url" column, are stripped, and blank ones are
    ignored. A missing file yields an empty set.

    Args:
        path: Location of the CSV file.

    Returns:
        set[str]: URLs found in the file.
    """
    if not os.path.exists(path):
        return set()
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation requires, so quoted fields containing line breaks are
    # read back as a single field.
    with open(path, "r", newline="", encoding="utf-8") as f:
        stripped = ((row.get("url") or "").strip() for row in csv.DictReader(f))
        return {u for u in stripped if u}

def append_row(path, row: dict):
    """Append one article record to the CSV at ``path``.

    Only the FIELDNAMES columns are written; a missing or falsy value is
    stored as an empty string. The file is opened and closed on every call.
    """
    record = {}
    for column in FIELDNAMES:
        record[column] = row.get(column) or ""
    with open(path, "a", newline="", encoding="utf-8") as handle:
        csv.DictWriter(handle, fieldnames=FIELDNAMES).writerow(record)

# ==========================
# Utilities
# ==========================
def normalize_ws(text):
    """Trim ``text`` and collapse each whitespace run into a single space.

    ``None`` (or any other falsy value) yields an empty string.
    """
    trimmed = (text or "").strip()
    return re.sub(r"\s+", " ", trimmed)

def looks_like_baca_juga(text):
    """Tell whether ``text`` contains the phrase "baca juga", ignoring case."""
    lowered = (text or "").lower()
    return lowered.find("baca juga") != -1

def safe_get(url):
    """GET ``url`` through the shared SESSION and return the response.

    The request uses TIMEOUT; ``raise_for_status()`` is called on the
    response before it is returned, so HTTP error statuses raise.
    """
    response = SESSION.get(url, timeout=TIMEOUT)
    response.raise_for_status()
    return response

def absolutize(base_url, href):
    """Turn a protocol-relative or root-relative ``href`` into an absolute URL.

    - ``//host/path`` gets an ``https:`` scheme.
    - ``/path`` is joined to the scheme and host of ``base_url``.
    - Anything else (already absolute, empty, or a relative path without a
      leading slash) is returned unchanged.
    """
    if href.startswith("//"):
        return "https:" + href
    if not href.startswith("/"):
        return href
    origin = urlparse(base_url)
    return f"{origin.scheme}://{origin.netloc}{href}"

# ==========================
# Dapatkan daftar sub-kategori dari menu
# ==========================
def discover_menu_links():
    """Collect the sub-category links of the Parapuan channel.

    Anchors under ``.kanalHeader[data-kanal='parapuan'] .kanalMenu`` on BASE
    are read first; the hard-coded SEED list is then merged in, so the known
    sub-categories are present even when the menu cannot be fetched or parsed.

    Returns:
        list[tuple[str, str]]: ``(sub_url, sub_name)`` pairs, menu entries
        first, then seeds that were not already found.
    """
    found = OrderedDict()
    try:
        r = safe_get(BASE)
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select(".kanalHeader[data-kanal='parapuan'] .kanalMenu a[href]"):
            name = normalize_ws(a.get_text())
            href = absolutize(BASE, a.get("href", "").strip())
            if not href or not name:
                continue
            # Skip entries that are not fetchable pages ("#", "javascript:",
            # unresolved relative paths); they cannot be crawled.
            if not href.startswith(("http://", "https://")):
                continue
            found[href] = name
    except Exception as e:
        # Best effort: the seeds below still apply, but report the failure
        # instead of hiding it.
        print(f"WARN menu {BASE} -> {e.__class__.__name__}: {e}")

    # Fallback seeds in case menu parsing fails.
    SEED = OrderedDict([
        ("https://www.kompas.com/parapuan/trending-topic", "Trending Topic"),
        ("https://www.kompas.com/parapuan/love-life", "Love & Life"),
        ("https://www.kompas.com/parapuan/wellness", "Wellness"),
        ("https://www.kompas.com/parapuan/fashion-beauty", "Fashion & Beauty"),
        ("https://www.kompas.com/parapuan/lady-boss", "Lady Boss"),
    ])
    for href, name in SEED.items():
        found.setdefault(href, name)

    return list(found.items())

# ==========================
# Ambil daftar URL artikel per subcategory
# ==========================
def is_article_url(href: str):
    """Return True when ``href`` points at a Kompas article page.

    Accepted URLs have a host that is ``kompas.com`` or one of its
    subdomains, a path containing ``/read/``, and no ``/komentar/`` or
    ``/copy/`` segment. Host and path are checked on the parsed URL, so links
    on other sites that merely embed a Kompas article URL in their query
    string (share buttons, for example) are rejected.
    """
    if not href:
        return False
    try:
        parsed = urlparse(href)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): not an article.
        return False
    if host != "kompas.com" and not host.endswith(".kompas.com"):
        return False
    if "/read/" not in parsed.path:
        return False
    return not any(blocked in href for blocked in ("/komentar/", "/copy/"))

def collect_urls_from_list(listing_url):
    """Collect article URLs from a single listing page.

    Args:
        listing_url: URL of the listing page to fetch.

    Returns:
        set[str]: Absolute article URLs accepted by is_article_url. An empty
        set is returned when the page cannot be fetched; the failure is
        printed so it is not mistaken for a genuinely empty page.
    """
    try:
        res = safe_get(listing_url)
    except Exception as e:
        print(f"  WARN listing {listing_url} -> {e.__class__.__name__}: {e}")
        return set()
    soup = BeautifulSoup(res.text, "lxml")
    urls = set()
    for a in soup.select("a[href*='/read/']"):
        # Strip stray whitespace around the attribute value before resolving.
        href = (a.get("href") or "").strip()
        if not href:
            continue
        full = absolutize(listing_url, href)
        if is_article_url(full):
            urls.add(full)
    return urls

def page_url(base, page):
    """Return the listing URL for page number ``page``.

    Page 1 (or lower) is ``base`` itself. Later pages get ``page=N``
    appended, joined with ``&`` when ``base`` already has a query string and
    with ``?`` otherwise.
    """
    if page <= 1:
        return base
    joiner = "?" if not urlparse(base).query else "&"
    return f"{base}{joiner}page={page}"

def crawl_subcategory_urls(sub_url, max_pages=None):
    """Page through one sub-category and return its article URLs.

    Paging stops after 3 consecutive pages that add no new URL, or once
    ``max_pages`` pages have been requested (when it is not None). A random
    pause within REQUEST_DELAY follows every page request.

    Returns:
        list[str]: Unique URLs in discovery order (sorted within each page).
    """
    # NOTE(review): in the recorded run every page after the first added
    # 0 URLs - confirm the listings actually honour the ?page=N parameter.
    discovered = OrderedDict()
    empty_streak = 0
    page = 1
    while max_pages is None or page <= max_pages:
        page_links = collect_urls_from_list(page_url(sub_url, page))
        fresh = [link for link in sorted(page_links) if link not in discovered]
        discovered.update((link, None) for link in fresh)
        gained = len(fresh)
        print(f"  [page={page}] +{gained} URL (total {len(discovered)})")
        time.sleep(random.uniform(*REQUEST_DELAY))
        empty_streak = 0 if gained else empty_streak + 1
        if empty_streak >= 3:
            break
        page += 1
    return list(discovered)

# ==========================
# Parsing artikel
# ==========================
def extract_publish_date(soup):
    """Return the article's publication date as written in the page, or "".

    The ``content_PublishedDate`` meta tag is preferred; the
    ``article:published_time`` meta tag is the fallback. The value is
    returned stripped, without any date parsing.
    """
    node = soup.find("meta", attrs={"name": "content_PublishedDate"})
    if not (node and node.get("content")):
        node = soup.select_one('meta[property="article:published_time"]')
    if node and node.get("content"):
        return node["content"].strip()
    return ""

def extract_tags(soup):
    """Return the article's tags as one string; may be empty.

    The ``content_tags`` meta value is preferred (returned stripped).
    Otherwise the non-blank texts of the tag anchors
    (``a.tag__article__link`` or ``.tag__item a``) are joined with ", ".
    """
    meta = soup.find("meta", attrs={"name": "content_tags"})
    content = meta.get("content") if meta else None
    if content:
        return content.strip()
    labels = []
    for link in soup.select("a.tag__article__link, .tag__item a"):
        if link.get_text().strip():
            labels.append(normalize_ws(link.get_text()))
    return ", ".join(labels)

def parse_article(url):
    """Download one article page and extract title, date, tags and body text.

    Returns:
        dict: keys ``judul``, ``tanggal_publish``, ``tags`` and ``konten``.

    Raises:
        Whatever safe_get raises when the page cannot be fetched.
    """
    soup = BeautifulSoup(safe_get(url).text, "lxml")

    # Title: the article heading variants, then any <h1>, then the <title> tag.
    heading = (
        soup.select_one("h1.read__title")
        or soup.select_one("h1.article__title")
        or soup.select_one("h1")
    )
    if heading:
        judul = normalize_ws(heading.get_text())
    elif soup.title:
        judul = normalize_ws(soup.title.get_text())
    else:
        judul = normalize_ws("")

    # Body: join the <p> texts of the content wrapper (whole page when no
    # wrapper is found), dropping empty paragraphs, "Baca juga" paragraphs and
    # paragraphs that mention both "KOMPAS.com" and "Download".
    container = soup.select_one(".read__content") or soup.select_one(".article__content") or soup
    kept = []
    for node in container.find_all("p"):
        raw = node.get_text(" ").strip()
        mentions_download = "KOMPAS.com" in raw and "Download" in raw
        if raw and not looks_like_baca_juga(raw) and not mentions_download:
            kept.append(normalize_ws(raw))

    return {
        "judul": judul,
        "tanggal_publish": extract_publish_date(soup),
        "tags": extract_tags(soup),
        "konten": " ".join(kept),
    }

# ==========================
# Pipeline utama
# ==========================
def run_scrape_parapuan(output_csv_path, max_pages_per_sub=None):
    """Scrape the Parapuan channel into ``output_csv_path``, resuming earlier runs.

    Steps: make sure the CSV exists, load the URLs it already holds, discover
    the sub-categories, collect article URLs per sub-category, then parse and
    append every article whose URL is not in the CSV yet. Rows are appended
    one at a time, so an interrupted run keeps what was already written.

    Args:
        output_csv_path: CSV file to create or extend.
        max_pages_per_sub: Listing-page cap per sub-category, passed on to
            crawl_subcategory_urls (None = no cap).
    """
    ensure_csv(output_csv_path)
    done = load_done_urls(output_csv_path)
    print(f"URL sudah ada di CSV: {len(done)}")

    submenus = discover_menu_links()
    print("Sub-kategori terdeteksi:")
    for href, name in submenus:
        print(f" - {name}: {href}")

    # Candidates are keyed by (url, sub-category): a URL listed under two
    # sub-categories shows up twice and is skipped via `done` once saved.
    all_urls = OrderedDict()
    for href, name in submenus:
        print(f"\nCrawl sub-kategori: {name}")
        for u in crawl_subcategory_urls(href, max_pages=max_pages_per_sub):
            all_urls.setdefault((u, name), None)

    print(f"\nTotal kandidat (URL unik per sub): {len(all_urls)}")

    saved = 0
    category = "Parapuan"
    progress = tqdm(list(all_urls), desc="Scraping", unit="url")
    for i, (url, subcat) in enumerate(progress, 1):
        if url in done:
            continue
        try:
            data = parse_article(url)
            # Keep only articles that have both a title and body text.
            if data and data.get("judul") and data.get("konten"):
                row = {"url": url, "category": category, "subcategory": subcat}
                for key in ("tanggal_publish", "judul", "tags", "konten"):
                    row[key] = data[key]
                append_row(output_csv_path, row)
                done.add(url)
                saved += 1
        except KeyboardInterrupt:
            print("\n⛔ Dihentikan manual, progres tersimpan.")
            break
        except Exception as e:
            print(f"\nERROR [{i}] {url} -> {e.__class__.__name__}: {e}")
        finally:
            # Pause after every attempted article, whether it succeeded or not.
            time.sleep(random.uniform(*REQUEST_DELAY))

    print(f"Selesai. Artikel baru tersimpan: {saved}.")
    print("CSV:", output_csv_path)

# ==========================
# Run
# OUTPUT_CSV must come from the Parapuan Drive-setup cell: other sections
# reassign it, so run that cell right before this one.
# ==========================
run_scrape_parapuan(OUTPUT_CSV, max_pages_per_sub=MAX_PAGES_PER_SUB)


URL sudah ada di CSV: 0
Sub-kategori terdeteksi:
 - Trending Topic: https://www.kompas.com/parapuan/trending-topic
 - Love & Life: https://www.kompas.com/parapuan/love-life
 - Wellness: https://www.kompas.com/parapuan/wellness
 - Fashion & Beauty: https://www.kompas.com/parapuan/fashion-beauty
 - Lady Boss: https://www.kompas.com/parapuan/lady-boss

Crawl sub-kategori: Trending Topic
  [page=1] +34 URL (total 34)
  [page=2] +0 URL (total 34)
  [page=3] +0 URL (total 34)
  [page=4] +0 URL (total 34)

Crawl sub-kategori: Love & Life
  [page=1] +34 URL (total 34)
  [page=2] +0 URL (total 34)
  [page=3] +0 URL (total 34)
  [page=4] +0 URL (total 34)

Crawl sub-kategori: Wellness
  [page=1] +34 URL (total 34)
  [page=2] +0 URL (total 34)
  [page=3] +0 URL (total 34)
  [page=4] +0 URL (total 34)

Crawl sub-kategori: Fashion & Beauty
  [page=1] +34 URL (total 34)
  [page=2] +0 URL (total 34)
  [page=3] +0 URL (total 34)
  [page=4] +0 URL (total 34)

Crawl sub-kategori: Lady Boss
  [page=1] +3

Scraping: 100%|██████████| 170/170 [03:47<00:00,  1.34s/url]

Selesai. Artikel baru tersimpan: 130.
CSV: /content/drive/MyDrive/KompasLestari/kompas_parapuan_scraped.csv





## Scraping Properti

In [None]:
!pip -q install requests beautifulsoup4 lxml pandas tqdm python-dateutil

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Output folder on Drive (created if missing) and the CSV file for the Properti scrape.
DRIVE_FOLDER = "/content/drive/MyDrive/KompasLestari"
OUTPUT_CSV = os.path.join(DRIVE_FOLDER, "kompas_properti_scraped.csv")
os.makedirs(DRIVE_FOLDER, exist_ok=True)

# Echo the resolved locations.
for _label, _location in (("Folder Drive:", DRIVE_FOLDER), ("File output :", OUTPUT_CSV)):
    print(_label, _location)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Folder Drive: /content/drive/MyDrive/KompasLestari
File output : /content/drive/MyDrive/KompasLestari/kompas_properti_scraped.csv


In [None]:
# Scraper Kompas Properti (menu-aware + subcategory + tags + resumeable)

import os, re, time, csv, random, json
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
from tqdm import tqdm

# ==========================
# Konfigurasi
# ==========================
BASE = "https://properti.kompas.com/"
TIMEOUT = 20                # timeout value handed to SESSION.get
REQUEST_DELAY = (0.8, 1.6)  # (min, max) of the random pause between requests, in seconds
MAX_PAGES_PER_SUB = None    # None = continue until 3 consecutive listing pages add nothing

# Entries of the channel's kanalMenu, as inspected by the author:
#   News, Listing Properti, Arsitektur, Konstruksi, Tips Properti, IKN, Homey, Indeks, Sorot Properti
# By default the core Properti channel is included; external entries are off (can be switched on).
ALLOW_IKN = False     # https://ikn.kompas.com/
ALLOW_HOMEY = False   # https://www.kompas.com/homey
ALLOW_SOROT = False   # https://sorot.kompas.com/
ALLOW_INDEKS = True   # https://indeks.kompas.com/?site=properti (archive across time)

# Browser-like User-Agent sent with every request.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
HEADERS = {"User-Agent": _USER_AGENT}

# ==========================
# HTTP session (retry)
# ==========================
from requests.adapters import HTTPAdapter, Retry
def make_session():
    """Build a requests.Session that sends HEADERS and retries GET requests.

    The retry policy allows up to 5 retries (total / connect / read) with
    backoff_factor=1.2 and also retries on HTTP 429, 500, 502, 503 and 504.
    The same policy is mounted for both https:// and http:// URLs.
    """
    retry_policy = Retry(
        total=5,
        connect=5,
        read=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"]),
    )
    session = requests.Session()
    session.headers.update(HEADERS)
    for prefix in ("https://", "http://"):
        session.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return session


# Shared session used by safe_get for every request made in this cell.
SESSION = make_session()

# ==========================
# CSV helpers
# ==========================
# Column order of the output CSV.
FIELDNAMES = ["url", "category", "subcategory", "tanggal_publish", "judul", "tags", "konten"]

def ensure_csv(path):
    """Create the output CSV with its header row when it is missing or empty.

    A zero-byte file is treated like a missing one: without a header, the
    first appended article row would later be read as the header by
    csv.DictReader. A file that already has content is left untouched.

    Args:
        path: Location of the CSV file.
    """
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return
    with open(path, "w", newline="", encoding="utf-8") as f:
        csv.DictWriter(f, fieldnames=FIELDNAMES).writeheader()

def load_done_urls(path):
    """Return the set of article URLs already stored in the CSV at ``path``.

    Values come from the "url" column, are stripped, and blank ones are
    ignored. A missing file yields an empty set.

    Args:
        path: Location of the CSV file.

    Returns:
        set[str]: URLs found in the file.
    """
    if not os.path.exists(path):
        return set()
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation requires, so quoted fields containing line breaks are
    # read back as a single field.
    with open(path, "r", newline="", encoding="utf-8") as f:
        stripped = ((row.get("url") or "").strip() for row in csv.DictReader(f))
        return {u for u in stripped if u}

def append_row(path, row: dict):
    """Append one article record to the CSV at ``path``.

    Only the FIELDNAMES columns are written; a missing or falsy value is
    stored as an empty string. The file is opened and closed on every call.
    """
    record = {}
    for column in FIELDNAMES:
        record[column] = row.get(column) or ""
    with open(path, "a", newline="", encoding="utf-8") as handle:
        csv.DictWriter(handle, fieldnames=FIELDNAMES).writerow(record)

# ==========================
# Utils
# ==========================
def normalize_ws(text):
    """Trim ``text`` and collapse each whitespace run into a single space.

    ``None`` (or any other falsy value) yields an empty string.
    """
    trimmed = (text or "").strip()
    return re.sub(r"\s+", " ", trimmed)

def looks_like_baca_juga(text):
    """Tell whether ``text`` contains the phrase "baca juga", ignoring case."""
    lowered = (text or "").lower()
    return lowered.find("baca juga") != -1

def safe_get(url):
    """GET ``url`` through the shared SESSION and return the response.

    The request uses TIMEOUT; ``raise_for_status()`` is called on the
    response before it is returned, so HTTP error statuses raise.
    """
    response = SESSION.get(url, timeout=TIMEOUT)
    response.raise_for_status()
    return response

def absolutize(base_url, href):
    """Turn a protocol-relative or root-relative ``href`` into an absolute URL.

    - ``//host/path`` gets an ``https:`` scheme.
    - ``/path`` is joined to the scheme and host of ``base_url``.
    - Anything else (already absolute, empty, or a relative path without a
      leading slash) is returned unchanged.
    """
    if href.startswith("//"):
        return "https:" + href
    if not href.startswith("/"):
        return href
    origin = urlparse(base_url)
    return f"{origin.scheme}://{origin.netloc}{href}"

# ==========================
# Temukan link subcategory dari kanalMenu
# ==========================
def discover_menu_links():
    """
    Collect the sub-category links of the Properti channel.

    Anchors under ``.kanalHeader[data-kanal="properti"] .kanalMenu`` on BASE
    are read first and filtered with the ALLOW_* flags. The hard-coded SEED
    list is then merged in, and the Indeks archive is added when ALLOW_INDEKS
    is set, so the known sub-categories are present even when the menu cannot
    be fetched or parsed.

    Returns:
        list[tuple[str, str]]: ``(sub_url, sub_name)`` pairs, menu entries
        first, then seeds that were not already found.
    """
    found = OrderedDict()
    try:
        r = safe_get(BASE)
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select(".kanalHeader[data-kanal='properti'] .kanalMenu a[href]"):
            name = normalize_ws(a.get_text())
            href = absolutize(BASE, a.get("href", "").strip())
            if not href or not name:
                continue
            # Skip entries that are not fetchable pages ("#", "javascript:",
            # unresolved relative paths); they cannot be crawled.
            if not href.startswith(("http://", "https://")):
                continue
            netloc = urlparse(href).netloc.lower()
            # Filter external channels according to the flags.
            if "ikn.kompas.com" in netloc and not ALLOW_IKN:
                continue
            if "sorot.kompas.com" in netloc and not ALLOW_SOROT:
                continue
            if "www.kompas.com" in netloc and "/homey" in href and not ALLOW_HOMEY:
                continue
            if "indeks.kompas.com" in netloc and not ALLOW_INDEKS:
                continue
            # Everything else is accepted.
            found[href] = name
    except Exception as e:
        # Best effort: the seeds below still apply, but report the failure
        # instead of hiding it.
        print(f"WARN menu {BASE} -> {e.__class__.__name__}: {e}")

    # Fallback seeds matching the inspected menu structure (in case the
    # selector fails).
    SEED = OrderedDict([
        ("https://properti.kompas.com/news", "News"),
        ("https://properti.kompas.com/listing-properti", "Listing Properti"),
        ("https://properti.kompas.com/arsitektur", "Arsitektur"),
        ("https://properti.kompas.com/konstruksi", "Konstruksi"),
        ("https://properti.kompas.com/tips-properti", "Tips Properti"),
    ])
    for href, name in SEED.items():
        found.setdefault(href, name)

    # Optionally include the Indeks archive.
    if ALLOW_INDEKS:
        found.setdefault("https://indeks.kompas.com/?site=properti", "Indeks")

    return list(found.items())  # [(sub_url, sub_name)]

# ==========================
# Kumpulkan URL artikel dari sebuah listing
# ==========================
def is_article_url(href: str):
    """Return True when ``href`` points at a Kompas Properti article page.

    Accepted URLs have a host that is ``kompas.com`` or one of its
    subdomains, a path containing ``/read/``, and no ``/komentar/`` or
    ``/copy/`` segment. On the ``www.kompas.com`` host the URL must also
    contain ``/properti/``. Host and path are checked on the parsed URL, so
    links on other sites that merely embed a Kompas article URL in their
    query string (share buttons, for example) are rejected.
    """
    if not href:
        return False
    try:
        parsed = urlparse(href)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): not an article.
        return False
    if host != "kompas.com" and not host.endswith(".kompas.com"):
        return False
    if "/read/" not in parsed.path:
        return False
    if any(blocked in href for blocked in ("/komentar/", "/copy/")):
        return False
    # On the main www host, keep only articles of the Properti channel.
    if "www.kompas.com" in host and "/properti/" not in href:
        return False
    return True

def collect_urls_from_list(listing_url):
    """Collect article URLs from a single listing page.

    Args:
        listing_url: URL of the listing page to fetch.

    Returns:
        set[str]: Absolute article URLs accepted by is_article_url. An empty
        set is returned when the page cannot be fetched; the failure is
        printed so it is not mistaken for a genuinely empty page.
    """
    try:
        res = safe_get(listing_url)
    except Exception as e:
        print(f"  WARN listing {listing_url} -> {e.__class__.__name__}: {e}")
        return set()
    soup = BeautifulSoup(res.text, "lxml")
    urls = set()

    # Candidates: every anchor whose href contains /read/.
    for a in soup.select("a[href*='/read/']"):
        href = (a.get("href") or "").strip()
        if not href:
            continue
        full = absolutize(listing_url, href)
        if is_article_url(full):
            urls.add(full)

    return urls

def page_url(base, page):
    """Return the listing URL for page number ``page``.

    Page 1 (or lower) is ``base`` itself. Later pages get ``page=N``
    appended, joined with ``&`` when ``base`` already has a query string and
    with ``?`` otherwise.
    """
    if page <= 1:
        return base
    joiner = "?" if not urlparse(base).query else "&"
    return f"{base}{joiner}page={page}"

def crawl_subcategory_urls(sub_url, max_pages=None):
    """
    Walk the pages of one sub-category listing and collect article URLs.

    Paging stops after three consecutive pages that add no new URL, or once
    ``max_pages`` pages have been visited when a limit is given. A random
    pause drawn from ``REQUEST_DELAY`` follows every page request.
    (Original note: the ``?page=`` pattern is said to work for the Indeks
    listing as well.)

    Args:
        sub_url: Base URL of the sub-category listing.
        max_pages: Maximum number of pages to visit; None means no limit.

    Returns:
        List of unique article URLs in discovery order.
    """
    seen = OrderedDict()
    empty_streak = 0
    page_no = 1
    while max_pages is None or page_no <= max_pages:
        found = collect_urls_from_list(page_url(sub_url, page_no))
        count_before = len(seen)
        # Sorted so the insertion order is deterministic for a given page.
        for link in sorted(found):
            if link not in seen:
                seen[link] = None
        new_count = len(seen) - count_before
        print(f"  [page={page_no}] +{new_count} URL (total {len(seen)})")
        time.sleep(random.uniform(*REQUEST_DELAY))
        if new_count == 0:
            empty_streak += 1
        else:
            empty_streak = 0
        if empty_streak >= 3:
            break
        page_no += 1
    return list(seen)

# ==========================
# Parsing artikel
# ==========================
def extract_publish_date(soup):
    """
    Return the publish date string of an article page.

    Sources are tried in order: the ``content_PublishedDate`` meta tag, the
    ``article:published_time`` meta property, then the text of the first
    visible date element. Returns "" when none is present.
    """
    meta_lookups = (
        lambda: soup.find("meta", attrs={"name": "content_PublishedDate"}),  # Kompas meta
        lambda: soup.select_one('meta[property="article:published_time"]'),  # og:article
    )
    for lookup in meta_lookups:
        tag = lookup()
        if tag and tag.get("content"):
            return tag["content"].strip()

    # Fallback: visible time element in the article header.
    time_el = soup.select_one(".read__time, .read__date, .articlePost-date")
    if not time_el:
        return ""
    return normalize_ws(time_el.get_text())

def extract_tags(soup):
    """
    Return the article tags as one comma-separated string.

    The ``content_tags`` meta tag wins when it has content; otherwise the
    text of the tag links on the page is joined with ", ". The result may be
    an empty string.
    """
    meta_tag = soup.find("meta", attrs={"name": "content_tags"})
    meta_value = meta_tag.get("content") if meta_tag else None
    if meta_value:
        return meta_value.strip()

    labels = []
    for link in soup.select("a.tag__article__link, .tag__item a, .read__keyword a"):
        label = link.get_text()
        if label.strip():
            labels.append(normalize_ws(label))
    return ", ".join(labels)  # "" when no tag link has text

def parse_article(url):
    """
    Download one article page and extract its fields.

    Args:
        url: Article URL to fetch.

    Returns:
        Dict with keys ``judul`` (title), ``tanggal_publish`` (publish date),
        ``tags`` (may be "") and ``konten`` (body paragraphs joined by one
        space).

    Raises:
        Whatever ``safe_get`` raises when the download fails.
    """
    response = safe_get(url)
    doc = BeautifulSoup(response.text, "lxml")

    # Title: prefer a headline element, fall back to <title>.
    heading = doc.select_one("h1.read__title, h1.article__title, h1.detail__title, h1")
    if heading:
        judul = normalize_ws(heading.get_text())
    elif doc.title:
        judul = normalize_ws(doc.title.get_text())
    else:
        judul = normalize_ws("")

    tanggal_publish = extract_publish_date(doc)
    tags = extract_tags(doc)

    # Body: paragraphs of the content wrapper (whole page if no wrapper),
    # minus "Baca juga" cross-links and the app-download promo line.
    body_root = doc.select_one(".read__content, .article__content, .kcm__read__content") or doc
    cleaned = []
    for node in body_root.find_all("p"):
        text = node.get_text(" ").strip()
        is_app_promo = "KOMPAS.com" in text and "Download" in text
        if text and not looks_like_baca_juga(text) and not is_app_promo:
            cleaned.append(normalize_ws(text))

    return {
        "judul": judul,
        "tanggal_publish": tanggal_publish,
        "tags": tags,        # may be ""
        "konten": " ".join(cleaned),
    }

# ==========================
# Pipeline utama
# ==========================
def run_scrape_properti(output_csv_path, max_pages_per_sub=None):
    """
    Run the full Properti scrape and append new articles to a CSV.

    Steps: make sure the CSV exists, load the URLs already stored in it,
    discover the sub-category listings, crawl each listing for article URLs,
    then parse and append every article whose URL is not stored yet. Rows
    are appended one by one, so an interrupted run keeps its progress.

    Args:
        output_csv_path: Path of the CSV file to create or extend.
        max_pages_per_sub: Page limit per sub-category; None means no limit.
    """
    ensure_csv(output_csv_path)
    done = load_done_urls(output_csv_path)
    print(f"URL sudah ada di CSV: {len(done)}")

    submenus = discover_menu_links()
    print("Sub-kategori terdeteksi dari kanalMenu:")
    for sub_href, sub_name in submenus:
        print(f" - {sub_name}: {sub_href}")

    # Keyed by (url, sub-category) so the first sub-category seen is kept.
    candidates = OrderedDict()
    for sub_href, sub_name in submenus:
        print(f"\nCrawl sub-kategori: {sub_name}")
        for article_url in crawl_subcategory_urls(sub_href, max_pages=max_pages_per_sub):
            candidates.setdefault((article_url, sub_name), None)

    print(f"\nTotal kandidat (URL unik per sub): {len(candidates)}")

    saved = 0
    progress = tqdm(candidates.keys(), desc="Scraping", unit="url")
    for position, (url, subcat) in enumerate(progress, 1):
        if url in done:
            continue
        try:
            article = parse_article(url)
            # Only rows with both a title and a body are stored.
            if article and article.get("judul") and article.get("konten"):
                append_row(output_csv_path, {
                    "url": url,
                    "category": "Properti",
                    "subcategory": subcat,
                    "tanggal_publish": article["tanggal_publish"],
                    "judul": article["judul"],
                    "tags": article["tags"],
                    "konten": article["konten"]
                })
                done.add(url)
                saved += 1
        except KeyboardInterrupt:
            print("\n⛔ Dihentikan manual, progres tersimpan.")
            break
        except Exception as e:
            print(f"\nERROR [{position}] {url} -> {e.__class__.__name__}: {e}")
        finally:
            # Pause after every fetch attempt, successful or not.
            time.sleep(random.uniform(*REQUEST_DELAY))

    print(f"Selesai. Artikel baru tersimpan: {saved}.")
    print("CSV:", output_csv_path)

# ==========================
# Run
# ==========================
# Starts the Properti scrape; OUTPUT_CSV and MAX_PAGES_PER_SUB come from earlier cells.
run_scrape_properti(OUTPUT_CSV, max_pages_per_sub=MAX_PAGES_PER_SUB)


URL sudah ada di CSV: 0
Sub-kategori terdeteksi dari kanalMenu:
 - News: https://properti.kompas.com/news
 - Listing Properti: https://properti.kompas.com/listing-properti
 - Arsitektur: https://properti.kompas.com/arsitektur
 - Konstruksi: https://properti.kompas.com/konstruksi
 - Tips Properti: https://properti.kompas.com/tips-properti
 - Indeks: https://indeks.kompas.com/?site=properti

Crawl sub-kategori: News
  [page=1] +26 URL (total 26)
  [page=2] +0 URL (total 26)
  [page=3] +0 URL (total 26)
  [page=4] +0 URL (total 26)

Crawl sub-kategori: Listing Properti
  [page=1] +24 URL (total 24)
  [page=2] +0 URL (total 24)
  [page=3] +0 URL (total 24)
  [page=4] +0 URL (total 24)

Crawl sub-kategori: Arsitektur
  [page=1] +26 URL (total 26)
  [page=2] +0 URL (total 26)
  [page=3] +0 URL (total 26)
  [page=4] +0 URL (total 26)

Crawl sub-kategori: Konstruksi
  [page=1] +25 URL (total 25)
  [page=2] +0 URL (total 25)
  [page=3] +0 URL (total 25)
  [page=4] +0 URL (total 25)

Crawl sub-k

Scraping: 100%|██████████| 143/143 [03:11<00:00,  1.34s/url]

Selesai. Artikel baru tersimpan: 103.
CSV: /content/drive/MyDrive/KompasLestari/kompas_properti_scraped.csv





## Scraping Lestari

In [None]:
!pip install requests beautifulsoup4 lxml tqdm pandas




In [None]:
from google.colab import drive
import os

# Mount Google Drive and define where the Lestari CSV is read from / written to.
drive.mount('/content/drive')

INPUT_DIR  = "/content/drive/MyDrive/scrapping-kompas"   # <- source folder
OUTPUT_DIR = "/content/drive/MyDrive/scrapping-kompas"      # <- folder for the results
os.makedirs(OUTPUT_DIR, exist_ok=True)

INPUT_FILE  = "kompas_lestari_scraped.csv"               # <- file to process
OUTPUT_FILE = "kompas_lestari_with_subcategory.csv"      # <- result file name

print("Input :", os.path.join(INPUT_DIR, INPUT_FILE))
print("Output:", os.path.join(OUTPUT_DIR, OUTPUT_FILE))


Mounted at /content/drive
Input : /content/drive/MyDrive/scrapping-kompas/kompas_lestari_scraped.csv
Output: /content/drive/MyDrive/scrapping-kompas/kompas_lestari_with_subcategory.csv


In [None]:
import re, json, time, random
from urllib.parse import urlparse
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from requests.adapters import HTTPAdapter, Retry

# ===== HTTP session dengan retry =====
def make_session():
    """
    Build a ``requests`` session with browser-like headers and retries.

    GET requests are retried up to 5 times with a back-off factor of 1.2 on
    HTTP 429/500/502/503/504, for both ``https://`` and ``http://`` URLs.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"]),
    )
    browser_headers = {
        "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/120.0.0.0 Safari/537.36"),
        "Accept-Language": "id,en;q=0.9",
    }

    session = requests.Session()
    session.headers.update(browser_headers)
    for scheme in ("https://", "http://"):
        session.mount(scheme, HTTPAdapter(max_retries=retry_policy))
    return session

# Shared session used by safe_get().
SESSION = make_session()

def safe_get(url, timeout=20):
    """
    GET ``url`` through the shared session and return the response.

    Raises the ``requests`` HTTP error produced by ``raise_for_status()``
    when the final response has an error status.
    """
    response = SESSION.get(url, timeout=timeout)
    response.raise_for_status()
    return response

def text_or_none(el):
    """Return the stripped text of ``el``, or None when ``el`` is falsy."""
    if not el:
        return None
    return el.get_text(strip=True)

def extract_subcategory_from_html(html: str) -> str:
    """
    Extract the sub-category of an article from its HTML.

    Detection order:
      1) <meta name="content_subcategory" content="...">
      2) a script that mentions dataLayer and holds a quoted
         content_subcategory key with a quoted value (double or single quotes)
      3) breadcrumb links (the second link, when there are at least two)
      4) otherwise ""
    """
    soup = BeautifulSoup(html, "lxml")

    # 1) meta tag
    meta = soup.find("meta", attrs={"name": "content_subcategory"})
    if meta and meta.get("content"):
        return meta["content"].strip()

    # 2) dataLayer script: double-quoted form first, then single-quoted.
    key_patterns = (
        r"content_subcategory\"\s*:\s*\"([^\"]+)\"",
        r"content_subcategory'\s*:\s*'([^']+)'",
    )
    for script in soup.find_all("script"):
        code = script.string or ""
        if "dataLayer" not in code or "content_subcategory" not in code:
            continue
        for pattern in key_patterns:
            match = re.search(pattern, code)
            if match:
                return match.group(1).strip()

    # 3) breadcrumb
    crumbs = soup.select(".breadcrumb__item a, ul.breadcrumb li a")
    if len(crumbs) < 2:
        return ""
    return text_or_none(crumbs[1]) or ""

def infer_subcategory(url: str) -> str:
    """
    Fetch an article and return its sub-category.

    URLs on the umkm.kompas.com host are skipped on purpose and yield "".
    A failed download also yields "". Runs of whitespace in the result are
    collapsed to single spaces.
    """
    host = urlparse(url).netloc.lower()
    if "umkm.kompas.com" in host:
        return ""  # skipped by request

    try:
        response = safe_get(url)
    except Exception:
        return ""

    raw = extract_subcategory_from_html(response.text) or ""
    # Whitespace normalisation only; letter case is left as found.
    return re.sub(r"\s+", " ", raw).strip()

def enrich_subcategory(df: pd.DataFrame, subcol="subcategory", urlcol="url") -> pd.DataFrame:
    """
    Fill in missing sub-categories by fetching each article.

    Rows whose ``subcol`` value is null or blank are looked up with
    ``infer_subcategory``; rows that already have a value are left alone.
    ``df`` is modified in place and also returned.

    Args:
        df: Frame holding at least the ``urlcol`` column.
        subcol: Column to fill; created (empty) when it does not exist.
        urlcol: Column holding the article URLs.

    Returns:
        The same DataFrame object, with ``subcol`` filled where possible.
    """
    if subcol not in df.columns:
        df[subcol] = ""

    # Nulls are replaced BEFORE the string conversion: astype(str) alone turns
    # NaN into the literal "nan", which would never compare equal to "".
    current = df[subcol]
    normalized = current.astype(object).where(current.notna(), "").astype(str).str.strip()
    idxs = normalized[normalized.eq("")].index.tolist()

    if not idxs:
        print("Semua baris sudah punya subcategory. Tidak ada yang perlu diisi.")
        return df

    # A column read from CSV with no values at all is float; make it able to
    # hold strings before assigning into it.
    if df[subcol].dtype != object:
        df[subcol] = df[subcol].astype(object)

    print(f"Baris yang akan diisi subcategory: {len(idxs)}")
    for i in tqdm(idxs, desc="Fetch subcategory", unit="row"):
        raw_url = df.at[i, urlcol]
        # A missing URL must not be fetched as the string "nan".
        url = "" if pd.isna(raw_url) else str(raw_url).strip()
        if not url:
            continue
        df.at[i, subcol] = infer_subcategory(url)
        time.sleep(random.uniform(0.6, 1.2))  # be polite to the server

    return df


In [None]:
# Preview the first rows of the enriched Lestari frame.
# NOTE(review): `df2` is not assigned in any cell visible here — presumably the
# result of enrich_subcategory() on the Lestari CSV; confirm the producing cell exists.
df2.head()

Unnamed: 0,url,kategori,tanggal_publish,judul,tags,konten,subcategory
0,https://lestari.kompas.com/partner/april-group...,Lestari,2025-10-27 08:52:56,IUCN dan APRIL Perkuat Ilmu Konservasi lewat K...,"sektor swasta, restorasi, IUCN, APRIL Group, k...",KOMPAS.com – Sebagai salah satu negara megabio...,Swasta
1,https://lestari.kompas.com/partner/ernst-young...,Lestari,2024-12-20 16:19:13,IAI Terbitkan Peta Jalan Standar Pengungkapan ...,"IAI, SDGs, SPK, keberlanjutan, IFRS, SDG13-Pen...",KOMPAS.com - Indonesia telah menunjukkan komit...,LSM/Figur
2,https://lestari.kompas.com/partner/nusantara-i...,Lestari,2025-03-07 08:03:00,Ramdani Basri: Fleksibilitas Skema KPBU Bantu ...,"jalan tol, pembangunan infrastruktur, tol, KPB...","JAKARTA, KOMPAS.com - Pemerintah Indonesia pun...",Swasta
3,https://lestari.kompas.com/partner/pertamina-g...,Lestari,2025-01-02 10:58:58,"Atasi Masalah Sampah Plastik, PGN Gelar Edukas...","Semarang , PGN, urban farming, perusahaan gas ...",KOMPAS.com - Sampah plastik masih menjadi pers...,BUMN
4,https://lestari.kompas.com/read/2025/11/05/085...,Lestari,2025-11-05 08:57:39,"COP 30: Dagang Karbon Kuno dan Terbukti Gagal,...","emisi karbon dioksida, transisi energi, Perdag...",Oleh Denny Gunawan* KOMPAS.com - Menjelang Con...,LSM/Figur


In [None]:
# Write the enriched frame to CSV only when SAVE is True.
# NOTE(review): `df2` and `out_path` are not defined in the cells visible here —
# presumably out_path is os.path.join(OUTPUT_DIR, OUTPUT_FILE); confirm.
SAVE = True
if SAVE:
    df2.to_csv(out_path, index=False, encoding="utf-8")
    print("✅ Disimpan ke:", out_path)
else:
    print("ℹ️ SAVE=False → belum disimpan. Set SAVE=True lalu jalankan cell ini lagi untuk menyimpan.")

✅ Disimpan ke: /content/drive/MyDrive/scrapping-kompas/kompas_lestari_with_subcategory.csv


In [None]:
# Column dtypes and non-null counts of the enriched Lestari frame.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 807 entries, 0 to 806
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              807 non-null    object
 1   kategori         807 non-null    object
 2   tanggal_publish  807 non-null    object
 3   judul            807 non-null    object
 4   tags             807 non-null    object
 5   konten           807 non-null    object
 6   subcategory      807 non-null    object
dtypes: object(7)
memory usage: 44.3+ KB


# Preprocessing

## Install and Mount Drive

In [None]:
!pip install pandas python-dateutil unidecode tqdm

import re, json
import pandas as pd
from dateutil import parser as dtparser
from unidecode import unidecode
from tqdm import tqdm
from google.colab import drive
import os

# Mount Google Drive so the scraped CSVs can be read and the cleaned ones written.
drive.mount('/content/drive')

# Input and output folders
INPUT_DIR  = "/content/drive/MyDrive/scrapping-kompas"
OUTPUT_DIR = "/content/drive/MyDrive/prepro-kompas"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("INPUT_DIR :", INPUT_DIR)
print("OUTPUT_DIR:", OUTPUT_DIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
INPUT_DIR : /content/drive/MyDrive/scrapping-kompas
OUTPUT_DIR: /content/drive/MyDrive/prepro-kompas


## Input Data and Parameter Control

In [None]:
# ===> EDIT daftar file yang mau dibersihkan (harus ada di INPUT_DIR)
# Files to clean; each must exist in INPUT_DIR (missing files are skipped by the audit cell).
INPUT_FILES = [
    "kompas_parapuan_scraped.csv",
    "kompas_edukasi_scraped.csv",
    "kompas_properti_scraped.csv",
    # "kompas_lestari_scraped.csv",
    # "kompas_news_scraped.csv",
]

# Minimum content length, in characters, measured after preprocessing
MIN_CHARS = 120

# Save the cleaned result? (set to True once the audit output has been reviewed)
SAVE = True


## First Cleaning Process

In [None]:
# Register tqdm's pandas integration (adds the progress_apply methods).
tqdm.pandas()

# Column order of every cleaned frame.
FINAL_COLUMNS = ["url","category","subcategory","tanggal_publish","judul_berita","tags","konten_berita"]

# Media/channel datelines at the very start of the text.
# The separator class is repeated (`+`) because the text has already been
# transliterated to ASCII when these run, and one dash character can become
# several hyphens; consuming only one left leads such as "- Superbrands, ...".
RE_BAD_PREFIXES = [
    r"^[A-ZÀ-ÖØ-Þ][A-ZÀ-ÖØ-Þ\s\.'’/,-]{1,30},\s*KOMPAS\.?com\s*[-–—:]+\s*",  # <CITY>, KOMPAS.com -
    r"^\s*KOMPAS\.?com\s*[-–—:]+\s*",                                        # KOMPAS.com -
    r"^\s*Kompas\.?com\s*[-–—:]+\s*",                                        # Kompas.com –
    r"^\s*parapuan\s*\.?\s*co\s*[-–—:]+\s*",                                 # Parapuan.co -
]

# Whole lines that are dropped when they match (searched case-insensitively).
RE_REMOVE_LINES = [
    r"^\s*(Baca\s+juga|BACA\s+JUGA)\s*:.*$",
    r"^\s*(Penulis|Editor|Penulis\W*Editor)\s*:\s*.*$",
    r"^\s*Artikel ini.*$",
    r"^\s*Ikuti\s+berita.*$",
    r"^\s*Unduh.*(Aplikasi|aplikasi).*Kompas.*$",
    r"^\s*Download.*KOMPAS\.com.*$",
]

# Fragments removed from inside the text. These are applied with
# re.IGNORECASE, so the ad markers switch case-insensitivity off locally with
# (?-i:...): only the upper-case markers are removed, never the ordinary
# Indonesian words "iklan" / "promosi" (or words containing them).
RE_INLINE_JUNK = [
    r"\bBaca\s+juga\s*:\s*.+?(?=$|\.)",
    r"(?-i:ADVERTISEMENT)",
    r"(?-i:\bIKLAN\b)",
    r"(?-i:\bPROMOSI\b)",
]
RE_URL = r"(https?://\S+)"
RE_MULTI_PUNCT = r"([,.!?]){2,}"
RE_MULTI_SPACE = r"[ \t]{2,}"
RE_ZERO_WIDTH = r"[\u200B-\u200D\uFEFF]"

# Source-column -> final-column maps, tried in order by normalize_columns().
COLUMN_MAP_CANDIDATES = [
    {"url":"url","category":"category","subcategory":"subcategory","tanggal_publish":"tanggal_publish",
     "judul":"judul_berita","judul_berita":"judul_berita","tags":"tags",
     "konten":"konten_berita","konten_berita":"konten_berita"},
    # Alternative schemas; "kategori" is the category column name used by the Lestari CSV.
    {"title":"judul_berita","content":"konten_berita","published_at":"tanggal_publish","content_tags":"tags",
     "kategori":"category"},
]

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Bring a raw scraped frame to the FINAL_COLUMNS schema.

    Column names are stripped and lower-cased, then renamed according to
    COLUMN_MAP_CANDIDATES (a rename is skipped when its target column already
    exists). Missing final columns are added as empty strings and the result
    contains exactly FINAL_COLUMNS, in that order. The input is not modified.
    """
    result = df.rename(columns={name: name.strip().lower() for name in df.columns})

    for candidate in COLUMN_MAP_CANDIDATES:
        for old_name, new_name in candidate.items():
            has_source = old_name in result.columns
            has_target = new_name in result.columns
            if has_source and not has_target:
                result = result.rename(columns={old_name: new_name})

    missing = [name for name in FINAL_COLUMNS if name not in result.columns]
    for name in missing:
        result[name] = ""

    return result[FINAL_COLUMNS].copy()

def parse_date_safe(x: str):
    """
    Convert a date string to ISO 8601 text without ever raising.

    Non-strings and blank strings give "". The WIB/WITA/WIT zone labels are
    removed before parsing. When parsing fails, the original value is
    returned unchanged.
    """
    if not (isinstance(x, str) and x.strip()):
        return ""
    without_zone = re.sub(r"\b(WIB|WITA|WIT)\b", "", x.strip(), flags=re.I)
    try:
        parsed = dtparser.parse(without_zone, fuzzy=True)
        return parsed.isoformat()
    except Exception:
        return x

def fix_punct_and_spacing(text: str):
    """
    Normalise characters, punctuation runs and spacing in a text.

    Non-strings give "". Zero-width characters are removed, non-breaking
    spaces become plain spaces, the text is passed through ``unidecode``,
    and then punctuation/spacing rewrites are applied in a fixed order.
    """
    if not isinstance(text, str):
        return ""

    cleaned = re.sub(RE_ZERO_WIDTH, "", text).replace("\xa0", " ")
    cleaned = unidecode(cleaned)                 # normalise quotes/accents

    rewrites = (
        (RE_MULTI_PUNCT, r"\1"),                 # "...." -> "."
        (RE_MULTI_SPACE, " "),
        (r"\s+([,.!?;:])", r"\1"),               # space before punctuation
        (r"([(\[{])\s+", r"\1"),                 # space after an opening bracket
        (r"\s+([)\]}])", r"\1"),                 # space before a closing bracket
    )
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def remove_garbage_lines(text: str):
    """
    Drop boilerplate lines and inline junk from an article body.

    Non-strings give "". Blank lines are discarded, URLs are removed from
    each remaining line, and a line is dropped entirely when it matches any
    RE_REMOVE_LINES pattern. Finally every RE_INLINE_JUNK pattern is removed
    from the joined text.

    NOTE(review): matching is per line; a body stored as one single line is
    dropped as a whole if it starts with one of the RE_REMOVE_LINES phrases.
    """
    if not isinstance(text, str):
        return ""

    kept = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        without_urls = re.sub(RE_URL, "", line).strip()
        is_boilerplate = any(
            re.search(pattern, without_urls, flags=re.IGNORECASE)
            for pattern in RE_REMOVE_LINES
        )
        if not is_boilerplate:
            kept.append(without_urls)

    result = "\n".join(kept)
    for junk in RE_INLINE_JUNK:
        result = re.sub(junk, "", result, flags=re.IGNORECASE).strip()
    return result

def strip_kompas_lead(text: str):
    """
    Remove a leading media dateline from an article body.

    Non-strings give "". Every RE_BAD_PREFIXES pattern is applied in turn,
    case-insensitively, to the stripped text.
    """
    if not isinstance(text, str):
        return ""
    body = text.strip()
    for prefix in RE_BAD_PREFIXES:
        body = re.sub(prefix, "", body, flags=re.IGNORECASE)
    return body.strip()

def content_minimal_ok(text: str, min_chars=120):
    """Return True when ``text`` is a string of at least ``min_chars`` characters after stripping."""
    if not isinstance(text, str):
        return False
    return len(text.strip()) >= min_chars


def auditable_clean(df_raw: pd.DataFrame, min_chars=120, drop_short=True):
    """
    Clean a raw scraped frame and report what was dropped and why.

    The frame is normalised to FINAL_COLUMNS, text fields are cleaned, and
    rows are flagged in priority order: duplicate URL, then duplicate
    title+body, then too-short content.

    Args:
        df_raw: Raw frame as read from a scraped CSV; it is not modified.
        min_chars: Minimum length of the cleaned content, in characters.
        drop_short: When True, too-short rows are dropped; when False they
            are only counted in the summary and stay in ``df_kept``.

    Returns:
        Tuple ``(df_kept, df_dropped, summary_df)``. ``df_kept`` has
        FINAL_COLUMNS; ``df_dropped`` additionally has ``reason`` and
        ``__len``; ``summary_df`` is a one-row frame of counts.
    """
    df = normalize_columns(df_raw.copy())
    for c in FINAL_COLUMNS:
        # Replace nulls BEFORE converting to str: astype(str) on its own turns
        # NaN/None into the literal strings "nan"/"None", which then pass as
        # real values (e.g. tags == "nan").
        col = df[c]
        df[c] = col.astype(object).where(col.notna(), "").astype(str).str.strip()

    # Transform first, so the drop decisions are made on the cleaned text.
    df["tanggal_publish"] = df["tanggal_publish"].apply(parse_date_safe)
    df["judul_berita"]    = df["judul_berita"].apply(fix_punct_and_spacing)
    df["konten_berita"]   = df["konten_berita"].apply(fix_punct_and_spacing)
    df["konten_berita"]   = df["konten_berita"].apply(remove_garbage_lines)
    df["konten_berita"]   = df["konten_berita"].apply(strip_kompas_lead)
    df["konten_berita"]   = df["konten_berita"].apply(lambda s: re.sub(r"\n{2,}", "\n\n", s).strip())

    df["__len"] = df["konten_berita"].apply(lambda x: len(x.strip()))

    # 1) duplicate URL (first occurrence is kept)
    mask_dup_url = df.duplicated(subset=["url"], keep="first")

    # 2) duplicate title+body, evaluated only among rows that survived step 1
    df_nourl = df.loc[~mask_dup_url].copy()
    mask_dup_tb_partial = df_nourl.duplicated(subset=["judul_berita","konten_berita"], keep="first")
    mask_dup_title_body = df.index.isin(df_nourl.index[mask_dup_tb_partial])

    # 3) too short (always detected; dropped only when drop_short=True)
    mask_short = ~df["konten_berita"].apply(lambda s: content_minimal_ok(s, min_chars=min_chars))

    # Drop reason per row; earlier reasons take priority over later ones.
    reason = pd.Series("", index=df.index, dtype=str)
    reason[mask_dup_url] = "duplicate_url"
    reason[~mask_dup_url & mask_dup_title_body] = "duplicate_title_body"
    if drop_short:
        reason[~mask_dup_url & ~mask_dup_title_body & mask_short] = "too_short"

    # Rows with a reason are dropped: duplicates always, too-short rows only
    # when drop_short=True.
    df_dropped = df[reason != ""].copy()
    df_dropped["reason"] = reason[reason != ""]
    df_kept    = df[reason == ""].copy()

    # Summary
    summary_df = pd.DataFrame([{
        "rows_initial": len(df),
        "drop_duplicate_url": int(mask_dup_url.sum()),
        "drop_duplicate_title_body": int((~mask_dup_url & mask_dup_title_body).sum()),
        "detected_too_short": int((~mask_dup_url & ~mask_dup_title_body & mask_short).sum()),
        "rows_final": len(df_kept),
        "note": "too_short hanya dideteksi, tidak dihapus" if not drop_short else "too_short dihapus",
    }])

    df_kept    = df_kept[FINAL_COLUMNS].reset_index(drop=True)
    df_dropped = df_dropped[FINAL_COLUMNS + ["reason","__len"]].reset_index(drop=True)
    return df_kept, df_dropped, summary_df


def save_by_category(df: pd.DataFrame, out_dir: str):
    """
    Write one ``kompas_<category>_clean.csv`` file per category.

    The category is lower-cased and every run of characters outside a-z/0-9
    becomes "_" to form the file name ("unknown" when nothing is left).

    Args:
        df: Frame with a ``category`` column.
        out_dir: Existing directory to write the CSV files into.

    Returns:
        DataFrame with one row per written file: ``category``, ``rows``, ``path``.
    """
    report = []
    for category, group in df.groupby("category"):
        slug = re.sub(r"[^a-z0-9]+", "_", category.lower()).strip("_")
        if not slug:
            slug = "unknown"
        target = os.path.join(out_dir, "kompas_" + slug + "_clean.csv")
        group.to_csv(target, index=False, encoding="utf-8")
        report.append(dict(category=category, rows=len(group), path=target))
    return pd.DataFrame(report)


## Audit Results

In [None]:
# Run auditable_clean() on every input file, show a per-file audit, and
# collect the kept / dropped / summary frames for the combined view below.
all_kept = []
all_summaries = []
all_dropped = []

for fname in INPUT_FILES:
    in_path = os.path.join(INPUT_DIR, fname)
    if not os.path.exists(in_path):
        print(f"⚠️ file tidak ditemukan: {in_path}")
        continue

    print(f"\n=== AUDIT: {fname} ===")
    raw = pd.read_csv(in_path)
    kept, dropped, summary_df = auditable_clean(raw, min_chars=MIN_CHARS)

    display(summary_df)                 # summary as a DataFrame
    print("• Contoh KEPT (5 baris):")
    display(kept.head(5))
    if len(dropped):
        print("• Contoh DROPPED (5 baris):")
        display(dropped.head(5))
        print("• Rekap alasan (value_counts):")
        display(dropped["reason"].value_counts().to_frame("count"))
    else:
        print("• Tidak ada baris yang di-drop.")

    # Tag each row with its source file and keep the frames for the save step
    kept["__source_file"] = fname
    dropped["__source_file"] = fname
    all_kept.append(kept)
    all_dropped.append(dropped)
    all_summaries.append(summary_df.assign(file=fname))

# Combined frames (in memory, for inspection)
df_kept_all    = pd.concat(all_kept, ignore_index=True) if all_kept else pd.DataFrame(columns=FINAL_COLUMNS)
df_dropped_all = pd.concat(all_dropped, ignore_index=True) if all_dropped else pd.DataFrame(columns=FINAL_COLUMNS+["reason","__len"])
df_summary_all = pd.concat(all_summaries, ignore_index=True) if all_summaries else pd.DataFrame()

print("\n=== RINGKASAN GABUNGAN ===")
display(df_summary_all)
print("Jumlah baris KEPT   :", len(df_kept_all))
print("Jumlah baris DROPPED:", len(df_dropped_all))

# Optional: look only at the 'too_short' / 'duplicate' rows
# display(df_dropped_all[df_dropped_all['reason']=='too_short'].head(10))
# display(df_dropped_all[df_dropped_all['reason'].str.contains('duplicate')].head(10))



=== AUDIT: kompas_parapuan_scraped.csv ===


Unnamed: 0,rows_initial,drop_duplicate_url,drop_duplicate_title_body,detected_too_short,rows_final,note
0,130,0,0,0,130,too_short dihapus


• Contoh KEPT (5 baris):


Unnamed: 0,url,category,subcategory,tanggal_publish,judul_berita,tags,konten_berita
0,https://money.kompas.com/read/2019/08/01/12421...,Parapuan,Trending Topic,2019-08-01T12:42:15,Kompas.com Kembali Jadi Pemenang Kategori Medi...,"media online, Kompas.com, Superbrands","- Superbrands, lembaga arbiter internasional u..."
1,https://www.kompas.com/parapuan/read/532725712...,Parapuan,Trending Topic,2021-06-05T12:45:00,Mengapa Tempe Bongkrek Beracun? Ini Dia Sejara...,"hari tempe sedunia, tempe bongkrek, tempe bong...","Baru-baru ini, tempe sudah didaftarkan UNESCO ..."
2,https://www.kompas.com/parapuan/read/532860362...,Parapuan,Trending Topic,2021-08-27T20:45:00,"Hotel Bintang 1 sampai 5, Apa Artinya? Ini Per...","Hotel Berbintang, klasifikasi hotel berbintang...",Hotel menjadi salah satu tempat tinggal sement...
3,https://www.kompas.com/parapuan/read/532876621...,Parapuan,Trending Topic,2021-09-06T21:00:00,Ini 4 Rekomendasi Hijab Motif Print yang Penuh...,"hijab motif print, rekomendasi hijab motif pri...","Inovasi hijab semakin lama makin bervariasi, s..."
4,https://www.kompas.com/parapuan/read/532975308...,Parapuan,Trending Topic,2021-11-03T19:35:00,"Selain Pernikahan Perak, Ini Nama Perayaan Ula...","ulang tahun pernikahan, wedding anniversary, p...","Kawan Puan, perayaan ulang tahun pernikahan me..."


• Tidak ada baris yang di-drop.

=== AUDIT: kompas_edukasi_scraped.csv ===


Unnamed: 0,rows_initial,drop_duplicate_url,drop_duplicate_title_body,detected_too_short,rows_final,note
0,150,0,20,3,127,too_short dihapus


• Contoh KEPT (5 baris):


Unnamed: 0,url,category,subcategory,tanggal_publish,judul_berita,tags,konten_berita
0,http://www.kompas.com/edu/read/2025/11/04/0809...,Edukasi,Edu News,2025-11-04T08:09:56,"TKA SMA 2025, Kemendikdasmen: Siswa Enggak Usa...","siswa, sekolah, TKA, Tes Kemampuan Akademik, K...",Pihak Kementerian Pendidikan Dasar dan Menenga...
1,http://www.kompas.com/edu/read/2025/11/05/1040...,Edukasi,Edu News,2025-11-05T10:40:00,20 SMA Swasta Paling Berprestasi se-Indonesia ...,"sekolah, SMA swasta, Puspresnas, sma swasta pa...",Sudah banyak SMA swasta membuka pendaftaran ta...
2,http://www.kompas.com/edu/read/2025/11/06/0822...,Edukasi,Edu News,2025-11-06T08:22:20,"Biaya Kuliah Semester 1-7 Jalur SNBP di UI, UG...","UI, itb, Perguruan tinggi, UGM, Unair, biaya k...",Berapa biaya kuliah jalur Seleksi Nasional Ber...
3,http://www.kompas.com/edu/read/2025/11/06/0919...,Edukasi,Edu News,2025-11-06T09:19:08,11 Universitas Swasta Terbaik di Indonesia Ver...,"Binus University , perguruan tinggi, universit...",Kegiatan QS Higher Ed Summit: Asia Pacific 202...
4,http://www.kompas.com/edu/read/2025/11/06/1517...,Edukasi,Edu News,2025-11-06T15:17:11,Pendidikan Ariel NOAH yang Akan Perankan Dilan...,"Ariel NOAH, perguruan tinggi, Dilan ITB 1997, ...",Ariel vokalis band NOAH akan memerankan tokoh ...


• Contoh DROPPED (5 baris):


Unnamed: 0,url,category,subcategory,tanggal_publish,judul_berita,tags,konten_berita,reason,__len
0,https://www.kompas.com/edu/read/2025/11/04/080...,Edukasi,Perguruan Tinggi,2025-11-04T08:09:56,"TKA SMA 2025, Kemendikdasmen: Siswa Enggak Usa...","siswa, sekolah, TKA, Tes Kemampuan Akademik, K...",Pihak Kementerian Pendidikan Dasar dan Menenga...,duplicate_title_body,2290
1,https://www.kompas.com/edu/read/2025/11/05/104...,Edukasi,Perguruan Tinggi,2025-11-05T10:40:00,20 SMA Swasta Paling Berprestasi se-Indonesia ...,"sekolah, SMA swasta, Puspresnas, sma swasta pa...",Sudah banyak SMA swasta membuka pendaftaran ta...,duplicate_title_body,1967
2,https://www.kompas.com/edu/read/2025/11/06/082...,Edukasi,Perguruan Tinggi,2025-11-06T08:22:20,"Biaya Kuliah Semester 1-7 Jalur SNBP di UI, UG...","UI, itb, Perguruan tinggi, UGM, Unair, biaya k...",Berapa biaya kuliah jalur Seleksi Nasional Ber...,duplicate_title_body,2090
3,https://www.kompas.com/edu/read/2025/11/06/091...,Edukasi,Perguruan Tinggi,2025-11-06T09:19:08,11 Universitas Swasta Terbaik di Indonesia Ver...,"Binus University , perguruan tinggi, universit...",Kegiatan QS Higher Ed Summit: Asia Pacific 202...,duplicate_title_body,2171
4,https://www.kompas.com/edu/read/2025/11/06/151...,Edukasi,Perguruan Tinggi,2025-11-06T15:17:11,Pendidikan Ariel NOAH yang Akan Perankan Dilan...,"Ariel NOAH, perguruan tinggi, Dilan ITB 1997, ...",Ariel vokalis band NOAH akan memerankan tokoh ...,duplicate_title_body,1785


• Rekap alasan (value_counts):


Unnamed: 0_level_0,count
reason,Unnamed: 1_level_1
duplicate_title_body,20
too_short,3



=== AUDIT: kompas_properti_scraped.csv ===


Unnamed: 0,rows_initial,drop_duplicate_url,drop_duplicate_title_body,detected_too_short,rows_final,note
0,103,0,0,0,103,too_short dihapus


• Contoh KEPT (5 baris):


Unnamed: 0,url,category,subcategory,tanggal_publish,judul_berita,tags,konten_berita
0,https://money.kompas.com/read/2019/08/01/12421...,Properti,News,2019-08-01T12:42:15,Kompas.com Kembali Jadi Pemenang Kategori Medi...,"media online, Kompas.com, Superbrands","- Superbrands, lembaga arbiter internasional u..."
1,https://properti.kompas.com/read/xml/2016/09/2...,Properti,News,2016-09-22T07:53:00,"MEA, SCG Siap Menguasai Pasar Semen Indonesia",,Segudang harapan membuncah saat Masyarakat Eko...
2,https://www.kompas.com/properti/read/2025/11/0...,Properti,News,2025-11-04T17:02:24,Investor China Bikin Kawasan Industri di Suban...,"industri tekstil, Subang, investasi China di I...","Kawasan industri dan pergudangan di Jakarta, B..."
3,https://www.kompas.com/properti/read/2025/11/0...,Properti,News,2025-11-04T20:00:00,Pemerintah Targetkan Proyek Tol Terpanjang Ind...,"Kementerian PU, Tol Terpanjang di Indonesia, T...",Proyek Jalan Tol Gedebage-Tasikmalaya-Cilacap ...
4,https://www.kompas.com/properti/read/2025/11/0...,Properti,News,2025-11-05T09:00:00,"Mulai Dibangun 2026, Inilah Profil Calon Tol T...","bandung, infrastruktur, Kementerian PU, Tasikm...",Peta jaringan jalan tol di Indonesia akan sege...


• Tidak ada baris yang di-drop.

=== RINGKASAN GABUNGAN ===


Unnamed: 0,rows_initial,drop_duplicate_url,drop_duplicate_title_body,detected_too_short,rows_final,note,file
0,130,0,0,0,130,too_short dihapus,kompas_parapuan_scraped.csv
1,150,0,20,3,127,too_short dihapus,kompas_edukasi_scraped.csv
2,103,0,0,0,103,too_short dihapus,kompas_properti_scraped.csv


Jumlah baris KEPT   : 360
Jumlah baris DROPPED: 23


Note:
*   Dari kategori `Parapuan` total rows data nya 130, tidak ada data yang too short maupun duplicate yang perlu dihapus
*   Dari kategori `Edukasi` total rows data awalnya 150, terdeteksi ada 3 data yang terlalu pendek (di mana atribut `konten_berita` tidak ada beritanya atau null), dan ada 20 baris data yang duplicate jadi perlu di-dropped
*   Dari kategori `Properti` total rows data nya 103, tidak ada data yang too short maupun duplicate yang perlu dihapus


---


Summary: Untuk  kategori `Parapuan` total rows data tetap 130. Untuk kategori `Edukasi` total rows data menjadi 127. Untuk kategori `Properti` total rows data tetap 103.

### Too-Short `konten_berita` (Min.Character = 120)

In [None]:
# Show up to 10 dropped rows whose reason is 'too_short'.
display(df_dropped_all[df_dropped_all['reason']=='too_short'].head(10))

Unnamed: 0,url,category,subcategory,tanggal_publish,judul_berita,tags,konten_berita,reason,__len,__source_file
10,https://www.kompas.com/edu/read/2024/09/19/144...,Edukasi,Pendidikan Khusus,2024-09-19T14:47:36,"Kisah Rafa Kusuma, Dalang Cilik Down Syndrome ...","dalang cilik, down syndrome, pendidikan khusus",,too_short,0,kompas_edukasi_scraped.csv
21,https://edukasi.kompas.com/ideaksi/read/2025/0...,Edukasi,IdeAksi,2025-09-19T20:00:00,Program Inovasi Anti Kesel: Ciptakan Ruang Ama...,"sekolah, perguruan tinggi , pengabdian masyarakat",,too_short,0,kompas_edukasi_scraped.csv
22,https://edukasi.kompas.com/ideaksi/read/2025/1...,Edukasi,IdeAksi,2025-11-08T11:00:34,Sinergi Kampus dan Generasi Muda: Bangun Ekono...,"Perguruan tinggi, pengabdian masyarakat",,too_short,0,kompas_edukasi_scraped.csv


Note:

*   Ada 3 baris data yang `konten_berita`-nya kosong setelah proses pembersihan (panjang 0 karakter), semuanya pada kategori `Edukasi`, sehingga langsung di-drop.



### Duplicate `url` or title and body of the news

In [None]:
# Show up to 100 dropped rows whose reason contains 'duplicate' (URL or title+body).
display(df_dropped_all[df_dropped_all['reason'].str.contains('duplicate')].head(100))

Unnamed: 0,url,category,subcategory,tanggal_publish,judul_berita,tags,konten_berita,reason,__len,__source_file
0,https://www.kompas.com/edu/read/2025/11/04/080...,Edukasi,Perguruan Tinggi,2025-11-04T08:09:56,"TKA SMA 2025, Kemendikdasmen: Siswa Enggak Usa...","siswa, sekolah, TKA, Tes Kemampuan Akademik, K...",Pihak Kementerian Pendidikan Dasar dan Menenga...,duplicate_title_body,2290,kompas_edukasi_scraped.csv
1,https://www.kompas.com/edu/read/2025/11/05/104...,Edukasi,Perguruan Tinggi,2025-11-05T10:40:00,20 SMA Swasta Paling Berprestasi se-Indonesia ...,"sekolah, SMA swasta, Puspresnas, sma swasta pa...",Sudah banyak SMA swasta membuka pendaftaran ta...,duplicate_title_body,1967,kompas_edukasi_scraped.csv
2,https://www.kompas.com/edu/read/2025/11/06/082...,Edukasi,Perguruan Tinggi,2025-11-06T08:22:20,"Biaya Kuliah Semester 1-7 Jalur SNBP di UI, UG...","UI, itb, Perguruan tinggi, UGM, Unair, biaya k...",Berapa biaya kuliah jalur Seleksi Nasional Ber...,duplicate_title_body,2090,kompas_edukasi_scraped.csv
3,https://www.kompas.com/edu/read/2025/11/06/091...,Edukasi,Perguruan Tinggi,2025-11-06T09:19:08,11 Universitas Swasta Terbaik di Indonesia Ver...,"Binus University , perguruan tinggi, universit...",Kegiatan QS Higher Ed Summit: Asia Pacific 202...,duplicate_title_body,2171,kompas_edukasi_scraped.csv
4,https://www.kompas.com/edu/read/2025/11/06/151...,Edukasi,Perguruan Tinggi,2025-11-06T15:17:11,Pendidikan Ariel NOAH yang Akan Perankan Dilan...,"Ariel NOAH, perguruan tinggi, Dilan ITB 1997, ...",Ariel vokalis band NOAH akan memerankan tokoh ...,duplicate_title_body,1785,kompas_edukasi_scraped.csv
5,https://www.kompas.com/edu/read/2025/11/07/081...,Edukasi,Perguruan Tinggi,2025-11-07T08:18:02,Ketika Universitas Terbaik Menolak Pemeringkat...,"peringkat universitas, ranking universitas di ...","DALAM dunia yang terobsesi peringkat, keputusa...",duplicate_title_body,5796,kompas_edukasi_scraped.csv
6,https://www.kompas.com/edu/read/2025/11/07/193...,Edukasi,Perguruan Tinggi,2025-11-07T19:34:51,7 PTN di Pulau Jawa yang Lulusannya Banyak Jad...,"Perguruan tinggi, CPNS, PTN, lulusannya banyak...",Jadwal Seleksi Calon Pegawai Negeri Sipil (CPN...,duplicate_title_body,1471,kompas_edukasi_scraped.csv
7,https://www.kompas.com/edu/read/2025/11/08/114...,Edukasi,Perguruan Tinggi,2025-11-08T11:46:37,8 Ekstrakurikuler buat Masuk Sekolah Kedinasan...,"sekolah, Perguruan tinggi, CPNS, ekstrakurikul...",Jika kamu adalah siswa SMA-SMK yang aktif di k...,duplicate_title_body,2135,kompas_edukasi_scraped.csv
8,https://www.kompas.com/edu/read/2025/11/08/155...,Edukasi,Perguruan Tinggi,2025-11-08T15:54:58,Ini Biaya Kuliah Binus University 2026 untuk S...,"calon mahasiswa, perguruan tinggi, Biaya kulia...",Calon mahasiswa perlu mempersiapkan rencana B ...,duplicate_title_body,1935,kompas_edukasi_scraped.csv
9,https://www.kompas.com/edu/read/2025/11/09/122...,Edukasi,Perguruan Tinggi,2025-11-09T12:21:46,"13 SMA Negeri Paling Berprestasi di Indonesia,...","sekolah, SMA Negeri, sma negeri paling berpres...",Masuk SMA negeri masih menjadi pilihan banyak ...,duplicate_title_body,1553,kompas_edukasi_scraped.csv


Note:

*   Ada 20 rows data pada kategori `Edukasi` yang duplicate, sehingga semuanya harus di-drop.



## Save to CSV

In [None]:
# Export the cleaned rows to one CSV per category, but only when SAVE is enabled.
if not SAVE:
    print("SAVE=False → hanya audit/preview ditampilkan. Set SAVE=True lalu jalankan sel ini lagi untuk menyimpan.")
elif len(df_kept_all) == 0:
    print("Tidak ada data untuk disimpan (df_kept_all kosong).")
else:
    print("Menyimpan hasil bersih per kategori...")
    # The bookkeeping column is removed first so it does not end up in the exported files.
    save_report_df = save_by_category(
        df_kept_all.drop(columns=["__source_file"], errors="ignore"),
        OUTPUT_DIR,
    )
    display(save_report_df)
    print("Done ✅")


Menyimpan hasil bersih per kategori...


Unnamed: 0,category,rows,path
0,Edukasi,127,/content/drive/MyDrive/prepro-kompas/kompas_ed...
1,Parapuan,130,/content/drive/MyDrive/prepro-kompas/kompas_pa...
2,Properti,103,/content/drive/MyDrive/prepro-kompas/kompas_pr...


Done ✅


## Second Cleaning Process [Hanya untuk kategori `Lestari` dan `News`]

In [None]:
# Input and output folders (paths under the mounted Google Drive).
# INPUT_DIR holds the raw scraped CSVs; OUTPUT_DIR receives the cleaned CSVs.
INPUT_DIR  = "/content/drive/MyDrive/scrapping-kompas"
OUTPUT_DIR = "/content/drive/MyDrive/prepro-kompas"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # create the output folder if it does not exist yet

print("INPUT_DIR :", INPUT_DIR)
print("OUTPUT_DIR:", OUTPUT_DIR)


INPUT_DIR : /content/drive/MyDrive/scrapping-kompas
OUTPUT_DIR: /content/drive/MyDrive/prepro-kompas


In [None]:
# ===> EDIT daftar file yang mau dibersihkan (harus ada di INPUT_DIR)
# File names to clean in this pass; each one is looked up inside INPUT_DIR
# and skipped with a warning by the audit loop when it is missing.
INPUT_FILES = [
    "kompas_lestari_scraped.csv",
    "kompas_news_scraped.csv",
]

# Minimum length of the article body after preprocessing (in characters)
MIN_CHARS = 120

# Save the cleaned results? (set to True once you have reviewed the audit output)
SAVE = True


In [None]:
# Enable tqdm's pandas integration (registers `progress_apply` on pandas objects).
# NOTE(review): the cleaning helpers defined in this cell use plain `.apply`,
# so this only matters if a later cell switches to `progress_apply`.
tqdm.pandas()

# Canonical output schema: every cleaned frame ends up with exactly these columns, in this order.
FINAL_COLUMNS = ["url","category","subcategory","tanggal_publish","judul_berita","tags","konten_berita"]

# Media/channel datelines that may open the article body (stripped by strip_kompas_lead)
RE_BAD_PREFIXES = [
    r"^[A-ZÀ-ÖØ-Þ][A-ZÀ-ÖØ-Þ\s\.'’/,-]{1,30},\s*KOMPAS\.?com\s*[-–—:]\s*",  # <CITY>, KOMPAS.com -
    r"^\s*KOMPAS\.?com\s*[-–—:]\s*",                                        # KOMPAS.com -
    r"^\s*Kompas\.?com\s*[-–—:]\s*",                                        # Kompas.com –
    r"^\s*parapuan\s*\.?\s*co\s*[-–—:]\s*",                                 # Parapuan.co -
]

# Whole lines that are boilerplate (read-also links, bylines, app promos) and get dropped
RE_REMOVE_LINES = [
    r"^\s*(Baca\s+juga|BACA\s+JUGA)\s*:.*$",
    r"^\s*(Penulis|Editor|Penulis\W*Editor)\s*:\s*.*$",
    r"^\s*Artikel ini.*$",
    r"^\s*Ikuti\s+berita.*$",
    r"^\s*Unduh.*(Aplikasi|aplikasi).*Kompas.*$",
    r"^\s*Download.*KOMPAS\.com.*$",
]

# Junk removed from inside a line. These patterns are applied with re.IGNORECASE by
# remove_garbage_lines, so the ad markers are wrapped in a case-sensitive, word-bounded
# group `(?-i:...)`: only the upper-case banner tokens are removed. Without this the
# bare patterns also deleted "iklan" inside "periklanan" and the ordinary word
# "promosi" from article text.
RE_INLINE_JUNK = [
    r"\bBaca\s+juga\s*:\s*.+?(?=$|\.)",
    r"(?-i:\bADVERTISEMENT\b)",
    r"(?-i:\bIKLAN\b)",
    r"(?-i:\bPROMOSI\b)",
]
RE_URL = r"(https?://\S+)"
RE_MULTI_PUNCT = r"([,.!?]){2,}"
RE_MULTI_SPACE = r"[ \t]{2,}"
RE_ZERO_WIDTH = r"[\u200B-\u200D\uFEFF]"

# Known source-column spellings mapped to the canonical names in FINAL_COLUMNS
COLUMN_MAP_CANDIDATES = [
    {"url":"url","category":"category","subcategory":"subcategory","tanggal_publish":"tanggal_publish",
     "judul":"judul_berita","judul_berita":"judul_berita","tags":"tags",
     "konten":"konten_berita","konten_berita":"konten_berita"},
    {"title":"judul_berita","content":"konten_berita","published_at":"tanggal_publish","content_tags":"tags","kategori":"category"},
]

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` restricted to FINAL_COLUMNS, in that order.

    Column names are stripped and lower-cased, known aliases from
    COLUMN_MAP_CANDIDATES are renamed to their canonical name (only when the
    canonical column is not already present), and any canonical column still
    missing is added as an empty-string column.
    """
    out = df.rename(columns={name: name.strip().lower() for name in df.columns})

    for alias_map in COLUMN_MAP_CANDIDATES:
        for alias, canonical in alias_map.items():
            # Skip aliases that are absent, and never overwrite an existing canonical column.
            if alias not in out.columns or canonical in out.columns:
                continue
            out = out.rename(columns={alias: canonical})

    for wanted in FINAL_COLUMNS:
        if wanted in out.columns:
            continue
        out[wanted] = ""

    return out[FINAL_COLUMNS].copy()

def parse_date_safe(x: str):
    """Parse a date string into ISO-8601 text without ever raising.

    Returns "" for non-string or blank input. The WIB/WITA/WIT timezone labels
    are removed before parsing; if parsing fails for any reason the original
    value is returned unchanged.
    """
    if isinstance(x, str) and x.strip():
        without_zone = re.sub(r"\b(WIB|WITA|WIT)\b", "", x.strip(), flags=re.I)
        try:
            parsed = dtparser.parse(without_zone, fuzzy=True)
            return parsed.isoformat()
        except Exception:
            # Best effort: keep the raw value rather than lose the field.
            return x
    return ""

def fix_punct_and_spacing(text: str):
    """Normalise characters, punctuation runs and spacing in `text`.

    Returns "" for non-string input. Otherwise: zero-width characters are
    removed, non-breaking spaces become plain spaces, the text is passed through
    `unidecode` (quote/accent normalisation), and a fixed sequence of regex
    substitutions is applied before the result is stripped.
    """
    if not isinstance(text, str):
        return ""

    cleaned = re.sub(RE_ZERO_WIDTH, "", text).replace("\xa0", " ")
    cleaned = unidecode(cleaned)

    # Order matters: each rule runs on the output of the previous one.
    substitutions = (
        (RE_MULTI_PUNCT, r"\1"),        # "...." -> "."
        (RE_MULTI_SPACE, " "),          # runs of spaces/tabs -> one space
        (r"\s+([,.!?;:])", r"\1"),      # whitespace before punctuation
        (r"([(\[{])\s+", r"\1"),        # whitespace after an opening bracket
        (r"\s+([)\]}])", r"\1"),        # whitespace before a closing bracket
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def remove_garbage_lines(text: str):
    """Drop boilerplate lines and inline junk from an article body.

    Returns "" for non-string input. Blank lines are discarded, URLs are removed
    from each remaining line, lines matching any RE_REMOVE_LINES pattern
    (case-insensitive) are skipped, and finally every RE_INLINE_JUNK pattern is
    removed from the re-joined text.
    """
    if not isinstance(text, str):
        return ""

    survivors = []
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        candidate = re.sub(RE_URL, "", candidate).strip()
        is_boilerplate = any(
            re.search(pattern, candidate, flags=re.IGNORECASE)
            for pattern in RE_REMOVE_LINES
        )
        if not is_boilerplate:
            survivors.append(candidate)

    result = "\n".join(survivors)
    for junk in RE_INLINE_JUNK:
        result = re.sub(junk, "", result, flags=re.IGNORECASE).strip()
    return result

def strip_kompas_lead(text: str):
    """Remove a leading media/channel dateline from `text`.

    Returns "" for non-string input. Each RE_BAD_PREFIXES pattern is applied in
    turn (case-insensitive) to the stripped text, and the result is stripped again.
    """
    if isinstance(text, str):
        remainder = text.strip()
        for prefix_pattern in RE_BAD_PREFIXES:
            remainder = re.sub(prefix_pattern, "", remainder, flags=re.IGNORECASE)
        return remainder.strip()
    return ""

def content_minimal_ok(text: str, min_chars=120):
    """Return True when `text` is a string whose stripped length is at least `min_chars`."""
    if not isinstance(text, str):
        return False
    stripped_length = len(text.strip())
    return stripped_length >= min_chars


def auditable_clean(df_raw: pd.DataFrame, min_chars=120, drop_short=True):
    """
    Clean a raw scraped frame and keep an audit trail of every dropped row.

    The frame is normalised to FINAL_COLUMNS, every column is coerced to a
    stripped string, the publish date is parsed, and the title/body text is
    tidied. Rows are then flagged, in priority order, as duplicate URL,
    duplicate title+body, or too short.

    Args:
        df_raw: raw frame as read from a scraped CSV (not modified).
        min_chars: minimum cleaned body length, in characters.
        drop_short: when True, too-short rows are dropped with reason
            "too_short"; when False they are only counted in the summary
            and stay in the kept frame.

    Return:
      df_kept, df_dropped, summary_df
        df_kept    -> FINAL_COLUMNS only
        df_dropped -> FINAL_COLUMNS + ["reason", "__len"]
        summary_df -> one-row frame with the counts of each outcome
    """
    # A fresh 0..n-1 index keeps the index-based duplicate bookkeeping below
    # correct even if the caller's frame has repeated index labels.
    df = normalize_columns(df_raw.copy()).reset_index(drop=True)
    for c in FINAL_COLUMNS:
        # Blank out missing values BEFORE casting to str: astype(str) on NaN
        # produces the literal text "nan", which a later fillna("") cannot undo
        # and which would be saved as if it were a real value.
        col = df[c].astype(object)
        df[c] = col.where(col.notna(), "").astype(str).str.strip()

    # transform first, so the drop rules are evaluated on the cleaned text
    df["tanggal_publish"] = df["tanggal_publish"].apply(parse_date_safe)
    df["judul_berita"]    = df["judul_berita"].apply(fix_punct_and_spacing)
    df["konten_berita"]   = df["konten_berita"].apply(fix_punct_and_spacing)
    df["konten_berita"]   = df["konten_berita"].apply(remove_garbage_lines)
    df["konten_berita"]   = df["konten_berita"].apply(strip_kompas_lead)
    df["konten_berita"]   = df["konten_berita"].apply(lambda s: re.sub(r"\n{2,}", "\n\n", s).strip())

    df["__len"] = df["konten_berita"].apply(lambda x: len(x.strip()))

    # 1) duplicate URL (first occurrence is kept)
    mask_dup_url = df.duplicated(subset=["url"], keep="first")

    # 2) duplicate title+body, evaluated only among rows that survived the URL check
    df_nourl = df.loc[~mask_dup_url].copy()
    mask_dup_tb_partial = df_nourl.duplicated(subset=["judul_berita","konten_berita"], keep="first")
    mask_dup_title_body = df.index.isin(df_nourl.index[mask_dup_tb_partial])

    # 3) too short (always detected; only dropped when drop_short=True)
    mask_short = ~df["konten_berita"].apply(lambda s: content_minimal_ok(s, min_chars=min_chars))

    # drop reason, assigned by priority: URL duplicate > title/body duplicate > too short
    reason = pd.Series("", index=df.index, dtype=str)
    reason[mask_dup_url] = "duplicate_url"
    reason[~mask_dup_url & mask_dup_title_body] = "duplicate_title_body"
    if drop_short:
        reason[~mask_dup_url & ~mask_dup_title_body & mask_short] = "too_short"

    # rows with a non-empty reason are dropped; everything else is kept
    df_dropped = df[reason != ""].copy()
    df_dropped["reason"] = reason[reason != ""]
    df_kept    = df[reason == ""].copy()

    # summary
    summary_df = pd.DataFrame([{
        "rows_initial": len(df),
        "drop_duplicate_url": int(mask_dup_url.sum()),
        "drop_duplicate_title_body": int((~mask_dup_url & mask_dup_title_body).sum()),
        "detected_too_short": int((~mask_dup_url & ~mask_dup_title_body & mask_short).sum()),
        "rows_final": len(df_kept),
        "note": "too_short hanya dideteksi, tidak dihapus" if not drop_short else "too_short dihapus",
    }])

    df_kept    = df_kept[FINAL_COLUMNS].reset_index(drop=True)
    df_dropped = df_dropped[FINAL_COLUMNS + ["reason","__len"]].reset_index(drop=True)
    return df_kept, df_dropped, summary_df


def save_by_category(df: pd.DataFrame, out_dir: str):
    """Write one `kompas_<slug>_clean.csv` per category and return a report frame.

    The slug is the lower-cased category with every run of characters outside
    a-z/0-9 replaced by "_" and outer underscores trimmed; an empty slug becomes
    "unknown". The returned frame has one row per written file with the
    columns `category`, `rows` and `path`.
    """
    def _slug(label):
        return re.sub(r"[^a-z0-9]+", "_", label.lower()).strip("_") or "unknown"

    records = []
    for category, group in df.groupby("category"):
        target = os.path.join(out_dir, f"kompas_{_slug(category)}_clean.csv")
        group.to_csv(target, index=False, encoding="utf-8")
        records.append(dict(category=category, rows=len(group), path=target))
    return pd.DataFrame(records)


In [None]:
# Audit every file in INPUT_FILES: clean it, show a summary plus small previews,
# and collect the kept/dropped/summary frames for the combined views below.
all_kept = []
all_summaries = []
all_dropped = []

for fname in INPUT_FILES:
    in_path = os.path.join(INPUT_DIR, fname)
    if not os.path.exists(in_path):
        # Missing inputs are reported and skipped rather than aborting the whole run.
        print(f"⚠️ file tidak ditemukan: {in_path}")
        continue

    print(f"\n=== AUDIT: {fname} ===")
    raw = pd.read_csv(in_path)
    kept, dropped, summary_df = auditable_clean(raw, min_chars=MIN_CHARS)

    display(summary_df)                 # summary as a DataFrame
    print("• Contoh KEPT (5 baris):")
    display(kept.head(5))
    if len(dropped):
        print("• Contoh DROPPED (5 baris):")
        display(dropped.head(5))
        print("• Rekap alasan (value_counts):")
        display(dropped["reason"].value_counts().to_frame("count"))
    else:
        print("• Tidak ada baris yang di-drop.")

    # collect the results in case you later decide to save them (SAVE=True);
    # "__source_file" records which input file each row came from
    kept["__source_file"] = fname
    dropped["__source_file"] = fname
    all_kept.append(kept)
    all_dropped.append(dropped)
    all_summaries.append(summary_df.assign(file=fname))

# Combined frames (in memory, for inspection); empty fallbacks are used when no file was processed
df_kept_all    = pd.concat(all_kept, ignore_index=True) if all_kept else pd.DataFrame(columns=FINAL_COLUMNS)
df_dropped_all = pd.concat(all_dropped, ignore_index=True) if all_dropped else pd.DataFrame(columns=FINAL_COLUMNS+["reason","__len"])
df_summary_all = pd.concat(all_summaries, ignore_index=True) if all_summaries else pd.DataFrame()

print("\n=== RINGKASAN GABUNGAN ===")
display(df_summary_all)
print("Jumlah baris KEPT   :", len(df_kept_all))
print("Jumlah baris DROPPED:", len(df_dropped_all))

# Optional: inspect only the 'too_short' / 'duplicate' rows
# display(df_dropped_all[df_dropped_all['reason']=='too_short'].head(10))
# display(df_dropped_all[df_dropped_all['reason'].str.contains('duplicate')].head(10))



=== AUDIT: kompas_lestari_scraped.csv ===


Unnamed: 0,rows_initial,drop_duplicate_url,drop_duplicate_title_body,detected_too_short,rows_final,note
0,807,0,2,0,805,too_short dihapus


• Contoh KEPT (5 baris):


Unnamed: 0,url,category,subcategory,tanggal_publish,judul_berita,tags,konten_berita
0,https://lestari.kompas.com/partner/april-group...,Lestari,Swasta,2025-10-27T08:52:56,IUCN dan APRIL Perkuat Ilmu Konservasi lewat K...,"sektor swasta, restorasi, IUCN, APRIL Group, k...",Sebagai salah satu negara megabiodiversitas te...
1,https://lestari.kompas.com/partner/ernst-young...,Lestari,LSM/Figur,2024-12-20T16:19:13,IAI Terbitkan Peta Jalan Standar Pengungkapan ...,"IAI, SDGs, SPK, keberlanjutan, IFRS, SDG13-Pen...",Indonesia telah menunjukkan komitmen kuat dala...
2,https://lestari.kompas.com/partner/nusantara-i...,Lestari,Swasta,2025-03-07T08:03:00,Ramdani Basri: Fleksibilitas Skema KPBU Bantu ...,"jalan tol, pembangunan infrastruktur, tol, KPB...",Pemerintah Indonesia punya target untuk menuru...
3,https://lestari.kompas.com/partner/pertamina-g...,Lestari,BUMN,2025-01-02T10:58:58,"Atasi Masalah Sampah Plastik, PGN Gelar Edukas...","Semarang , PGN, urban farming, perusahaan gas ...",Sampah plastik masih menjadi persoalan serius ...
4,https://lestari.kompas.com/read/2025/11/05/085...,Lestari,LSM/Figur,2025-11-05T08:57:39,"COP 30: Dagang Karbon Kuno dan Terbukti Gagal,...","emisi karbon dioksida, transisi energi, Perdag...",Oleh Denny Gunawan* KOMPAS.com - Menjelang Con...


• Contoh DROPPED (5 baris):


Unnamed: 0,url,category,subcategory,tanggal_publish,judul_berita,tags,konten_berita,reason,__len
0,https://lestari.kompas.com/read/2025/10/27/085...,Lestari,Swasta,2025-10-27T08:52:56,IUCN dan APRIL Perkuat Ilmu Konservasi lewat K...,"sektor swasta, restorasi, IUCN, APRIL Group, k...",Sebagai salah satu negara megabiodiversitas te...,duplicate_title_body,4319
1,https://lestari.kompas.com/read/2025/01/02/105...,Lestari,BUMN,2025-01-02T10:58:58,"Atasi Masalah Sampah Plastik, PGN Gelar Edukas...","Semarang , PGN, urban farming, perusahaan gas ...",Sampah plastik masih menjadi persoalan serius ...,duplicate_title_body,2952


• Rekap alasan (value_counts):


Unnamed: 0_level_0,count
reason,Unnamed: 1_level_1
duplicate_title_body,2



=== AUDIT: kompas_news_scraped.csv ===


Unnamed: 0,rows_initial,drop_duplicate_url,drop_duplicate_title_body,detected_too_short,rows_final,note
0,100,0,0,0,100,too_short dihapus


• Contoh KEPT (5 baris):


Unnamed: 0,url,category,subcategory,tanggal_publish,judul_berita,tags,konten_berita
0,https://makassar.kompas.com/read/2025/11/10/06...,Regional,,2025-11-10T06:53:19,Tertangkap Kamera Gandeng Dua Anak Selain Bilq...,"Penculikan anak di Makassar, Makassar, bilqis ...",Kasus dugaan penculikan terhadap bocah empat t...
1,https://medan.kompas.com/read/2025/11/10/05300...,Regional,,2025-11-10T05:30:00,"Firasat Driver Ojol Benar, Tangannya sampai Be...","Medan, sumut, paket narkotika driver ojol medan",Seorang driver ojek online berinisial J melapo...
2,https://megapolitan.kompas.com/read/2025/11/10...,News,,2025-11-10T06:06:00,Sejarah dan Rumitnya Status Kepemilikan Menara...,"Menara Saidah, sejarah Menara Saidah, menara s...",Di antara padatnya arus kendaraan di Jalan MT ...
3,https://megapolitan.kompas.com/read/2025/11/10...,News,,2025-11-10T07:05:00,"Menara Saidah, Bayangan Kemegahan yang Terbeng...","Menara Saidah, bangunan terbengkalai, menara s...",Di antara hiruk-pikuk kendaraan di Jalan MT Ha...
4,https://money.kompas.com/read/2019/08/01/12421...,Money,,2019-08-01T12:42:15,Kompas.com Kembali Jadi Pemenang Kategori Medi...,"media online, Kompas.com, Superbrands","- Superbrands, lembaga arbiter internasional u..."


• Tidak ada baris yang di-drop.

=== RINGKASAN GABUNGAN ===


Unnamed: 0,rows_initial,drop_duplicate_url,drop_duplicate_title_body,detected_too_short,rows_final,note,file
0,807,0,2,0,805,too_short dihapus,kompas_lestari_scraped.csv
1,100,0,0,0,100,too_short dihapus,kompas_news_scraped.csv


Jumlah baris KEPT   : 905
Jumlah baris DROPPED: 2


Note:
*   Dari kategori `Lestari` total rows data awalnya 807; tidak ada data yang too short, namun ada 2 data terdeteksi duplicate yang perlu dihapus.
*   Dari kategori `News` total rows data ada 100, dan tidak ada null maupun duplicate yang terdeteksi.


---


Summary: Untuk kategori `Lestari` total rows data menjadi 805. Untuk kategori `News` total rows data tetap 100.

In [None]:
# Preview (up to 10 rows) the dropped records whose drop reason mentions "duplicate".
display(df_dropped_all.loc[df_dropped_all["reason"].str.contains("duplicate")].head(10))

Unnamed: 0,url,category,subcategory,tanggal_publish,judul_berita,tags,konten_berita,reason,__len,__source_file
0,https://lestari.kompas.com/read/2025/10/27/085...,,Swasta,2025-10-27T08:52:56,IUCN dan APRIL Perkuat Ilmu Konservasi lewat K...,"sektor swasta, restorasi, IUCN, APRIL Group, k...",Sebagai salah satu negara megabiodiversitas te...,duplicate_title_body,4319,kompas_lestari_scraped.csv
1,https://lestari.kompas.com/read/2025/01/02/105...,,BUMN,2025-01-02T10:58:58,"Atasi Masalah Sampah Plastik, PGN Gelar Edukas...","Semarang , PGN, urban farming, perusahaan gas ...",Sampah plastik masih menjadi persoalan serius ...,duplicate_title_body,2952,kompas_lestari_scraped.csv


In [None]:
# Export the cleaned rows to one CSV per category, but only when SAVE is enabled.
if SAVE:
    if len(df_kept_all) == 0:
        print("Tidak ada data untuk disimpan (df_kept_all kosong).")
    else:
        print("Menyimpan hasil bersih per kategori...")
        # The bookkeeping column is named "__source_file" (double underscore). The
        # previous single-underscore spelling matched nothing and, because of
        # errors="ignore", silently left the column in every exported CSV.
        save_report_df = save_by_category(df_kept_all.drop(columns=["__source_file"], errors="ignore"), OUTPUT_DIR)
        display(save_report_df)
        print("Done ✅")
else:
    print("SAVE=False → hanya audit/preview ditampilkan. Set SAVE=True lalu jalankan sel ini lagi untuk menyimpan.")


Menyimpan hasil bersih per kategori...


Unnamed: 0,category,rows,path
0,Lestari,805,/content/drive/MyDrive/prepro-kompas/kompas_le...
1,Money,1,/content/drive/MyDrive/prepro-kompas/kompas_mo...
2,News,96,/content/drive/MyDrive/prepro-kompas/kompas_ne...
3,Regional,3,/content/drive/MyDrive/prepro-kompas/kompas_re...


Done ✅


Notes:

*   Karena fokusnya hanya pada kategori Lestari dan News, kategori Money dan Regional tidak digunakan atau dihapus secara manual.
*   Jadi untuk Kategori News sekarang ada 96 rows data



