In [None]:
!pip install requests beautifulsoup4 lxml tqdm




# Scraping Money

In [None]:
from google.colab import drive
import os

# Tentukan folder dan file output di Drive
DRIVE_FOLDER = "/content/drive/MyDrive/KompasMoney"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

OUTPUT_CSV = os.path.join(DRIVE_FOLDER, "kompas_money_scraped.csv")
print("Folder Drive:", DRIVE_FOLDER)
print("File output :", OUTPUT_CSV)


Folder Drive: /content/drive/MyDrive/KompasMoney
File output : /content/drive/MyDrive/KompasMoney/kompas_money_scraped.csv


In [None]:
# Scraper Kompas Money (menu-aware + subcategory + tags + resumeable)

import os, re, time, csv, random, json
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
from tqdm import tqdm

# ==========================
# Konfigurasi umum
# ==========================
BASE = "https://money.kompas.com/"
REQUEST_DELAY = (0.8, 1.6)
TIMEOUT = 20
MAX_PAGES_PER_SUB = None  # None = lanjut sampai 3 halaman kosong berturut-turut
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# ==========================
# Session HTTP dengan retry
# ==========================
from requests.adapters import HTTPAdapter, Retry

def make_session():
    s = requests.Session()
    retries = Retry(
        total=5, connect=5, read=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"])
    )
    s.headers.update(HEADERS)
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.mount("http://", HTTPAdapter(max_retries=retries))
    return s

SESSION = make_session()

# ==========================
# CSV helper
# ==========================
FIELDNAMES = ["url", "category", "subcategory", "tanggal_publish", "judul", "tags", "konten"]

def ensure_csv(path):
    if not os.path.exists(path):
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()

def load_done_urls(path):
    done = set()
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            r = csv.DictReader(f)
            for row in r:
                u = (row.get("url") or "").strip()
                if u:
                    done.add(u)
    return done

def append_row(path, row: dict):
    clean = {k: (row.get(k) or "") for k in FIELDNAMES}
    with open(path, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=FIELDNAMES)
        w.writerow(clean)

# ==========================
# Utilities
# ==========================
def normalize_ws(text):
    return re.sub(r"\s+", " ", (text or "").strip())

def looks_like_baca_juga(text):
    return "baca juga" in (text or "").lower()

def safe_get(url):
    r = SESSION.get(url, timeout=TIMEOUT)
    r.raise_for_status()
    return r

def absolutize(base_url, href):
    if href.startswith("//"):
        return "https:" + href
    if href.startswith("/"):
        parsed = urlparse(base_url)
        return f"{parsed.scheme}://{parsed.netloc}{href}"
    return href

# ==========================
# Dapatkan daftar sub-kategori dari menu
# ==========================
def discover_menu_links():
    found = OrderedDict()
    try:
        r = safe_get(BASE)
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select(".kanalHeader[data-kanal='money'] .kanalMenu a[href]"):
            name = normalize_ws(a.get_text())
            href = absolutize(BASE, a.get("href","").strip())
            if not href or not name:
                continue
            found[href] = name
    except Exception:
        pass

    # fallback kalau parsing gagal
    SEED = OrderedDict([
        ("https://money.kompas.com/ekbis", "Ekbis"),
        ("https://money.kompas.com/keuangan", "Keuangan"),
        ("https://money.kompas.com/syariah", "Syariah"),
        ("https://money.kompas.com/industri", "Industri"),
        ("https://money.kompas.com/energi", "Energi"),
        ("https://money.kompas.com/belanja", "Belanja"),
        ("https://money.kompas.com/cuan", "Cuan"),
        ("https://money.kompas.com/karier", "Karier"),
    ])
    for href, name in SEED.items():
        found.setdefault(href, name)

    return list(found.items())

# ==========================
# Ambil daftar URL artikel per subcategory
# ==========================
def is_article_url(href: str):
    if not href or "kompas.com" not in href:
        return False
    if "/read/" not in href:
        return False
    if any(b in href for b in ["/komentar/", "/copy/"]):
        return False
    return True

def collect_urls_from_list(listing_url):
    try:
        res = safe_get(listing_url)
    except Exception:
        return set()
    soup = BeautifulSoup(res.text, "lxml")
    urls = set()
    for a in soup.select("a[href*='/read/']"):
        href = a.get("href")
        if not href:
            continue
        full = absolutize(listing_url, href)
        if is_article_url(full):
            urls.add(full)
    return urls

def page_url(base, page):
    if page <= 1:
        return base
    parsed = urlparse(base)
    joiner = "&" if parsed.query else "?"
    return f"{base}{joiner}page={page}"

def crawl_subcategory_urls(sub_url, max_pages=None):
    discovered = OrderedDict()
    consecutive_empty = 0
    page = 1
    while True:
        if max_pages is not None and page > max_pages:
            break
        url = page_url(sub_url, page)
        urls = collect_urls_from_list(url)
        before = len(discovered)
        for u in sorted(urls):
            discovered.setdefault(u, None)
        gained = len(discovered) - before
        print(f"  [page={page}] +{gained} URL (total {len(discovered)})")
        time.sleep(random.uniform(*REQUEST_DELAY))
        consecutive_empty = consecutive_empty + 1 if gained == 0 else 0
        if consecutive_empty >= 3:
            break
        page += 1
    return list(discovered.keys())

# ==========================
# Parsing artikel
# ==========================
def extract_publish_date(soup):
    meta = soup.find("meta", attrs={"name": "content_PublishedDate"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    meta2 = soup.select_one('meta[property="article:published_time"]')
    if meta2 and meta2.get("content"):
        return meta2["content"].strip()
    return ""

def extract_tags(soup):
    meta = soup.find("meta", attrs={"name": "content_tags"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    tag_links = soup.select("a.tag__article__link, .tag__item a")
    if tag_links:
        tags = [normalize_ws(a.get_text()) for a in tag_links if a.get_text().strip()]
        return ", ".join(tags)
    return ""

def parse_article(url):
    res = safe_get(url)
    soup = BeautifulSoup(res.text, "lxml")
    title_el = soup.select_one("h1.read__title") or soup.select_one("h1.article__title") or soup.select_one("h1")
    judul = normalize_ws(title_el.get_text()) if title_el else normalize_ws(soup.title.get_text() if soup.title else "")
    tanggal_publish = extract_publish_date(soup)
    tags = extract_tags(soup)
    wrapper = soup.select_one(".read__content") or soup.select_one(".article__content") or soup
    ps = wrapper.find_all("p")
    paragraphs = []
    for p in ps:
        txt = p.get_text(" ").strip()
        if not txt:
            continue
        if looks_like_baca_juga(txt):
            continue
        if "KOMPAS.com" in txt and "Download" in txt:
            continue
        paragraphs.append(normalize_ws(txt))
    konten = " ".join(paragraphs)
    return {"judul": judul, "tanggal_publish": tanggal_publish, "tags": tags, "konten": konten}

# ==========================
# Pipeline utama
# ==========================
def run_scrape_money(output_csv_path, max_pages_per_sub=None):
    ensure_csv(output_csv_path)
    done = load_done_urls(output_csv_path)
    print(f"URL sudah ada di CSV: {len(done)}")

    submenus = discover_menu_links()
    print("Sub-kategori terdeteksi:")
    for href, name in submenus:
        print(f" - {name}: {href}")

    all_urls = OrderedDict()
    for href, name in submenus:
        print(f"\nCrawl sub-kategori: {name}")
        urls = crawl_subcategory_urls(href, max_pages=max_pages_per_sub)
        for u in urls:
            all_urls.setdefault((u, name), None)

    print(f"\nTotal kandidat (URL unik per sub): {len(all_urls)}")

    category = "Money"
    saved = 0
    for i, ((url, subcat), _) in enumerate(tqdm(all_urls.items(), desc="Scraping", unit="url"), 1):
        if url in done:
            continue
        try:
            data = parse_article(url)
            if not data or not data.get("judul") or not data.get("konten"):
                continue
            row = {
                "url": url,
                "category": category,
                "subcategory": subcat,
                "tanggal_publish": data["tanggal_publish"],
                "judul": data["judul"],
                "tags": data["tags"],
                "konten": data["konten"]
            }
            append_row(output_csv_path, row)
            done.add(url)
            saved += 1
        except KeyboardInterrupt:
            print("\n⛔ Dihentikan manual, progres tersimpan.")
            break
        except Exception as e:
            print(f"\nERROR [{i}] {url} -> {e.__class__.__name__}: {e}")
        finally:
            time.sleep(random.uniform(*REQUEST_DELAY))

    print(f"Selesai. Artikel baru tersimpan: {saved}.")
    print("CSV:", output_csv_path)

# ==========================
# Jalankan
# ==========================
run_scrape_money(OUTPUT_CSV, max_pages_per_sub=MAX_PAGES_PER_SUB)


URL sudah ada di CSV: 0
Sub-kategori terdeteksi:
 - Ekbis: https://money.kompas.com/ekbis
 - Keuangan: https://money.kompas.com/keuangan
 - Syariah: https://money.kompas.com/syariah
 - Industri: https://money.kompas.com/industri
 - Energi: https://money.kompas.com/energi
 - Belanja: https://money.kompas.com/belanja
 - Cuan: https://money.kompas.com/cuan
 - Karier: https://money.kompas.com/karier
 - Indeks: https://indeks.kompas.com/?site=money
 - Kilas: #
 - Kilas Badan: https://kilasbadan.kompas.com/
 - Kilas Fintech: https://kilasfintech.kompas.com/
 - Kilas Transportasi: https://kilastransportasi.kompas.com/
 - Kilas Investasi: https://kilasinvestasi.kompas.com/
 - Kilas Perbankan: https://kilasperbankan.kompas.com/

Crawl sub-kategori: Ekbis
  [page=1] +40 URL (total 40)
  [page=2] +0 URL (total 40)
  [page=3] +0 URL (total 40)
  [page=4] +0 URL (total 40)

Crawl sub-kategori: Keuangan
  [page=1] +43 URL (total 43)
  [page=2] +0 URL (total 43)
  [page=3] +0 URL (total 43)
  [page=4

Scraping: 100%|██████████| 516/516 [12:13<00:00,  1.42s/url]

Selesai. Artikel baru tersimpan: 354.
CSV: /content/drive/MyDrive/KompasMoney/kompas_money_scraped.csv





# Scraping Tekno

In [None]:
from google.colab import drive
import os

# Tentukan folder dan file output di Drive
DRIVE_FOLDER = "/content/drive/MyDrive/KompasTekno"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

OUTPUT_CSV = os.path.join(DRIVE_FOLDER, "kompas_tekno_scraped.csv")
print("Folder Drive:", DRIVE_FOLDER)
print("File output :", OUTPUT_CSV)


Folder Drive: /content/drive/MyDrive/KompasTekno
File output : /content/drive/MyDrive/KompasTekno/kompas_tekno_scraped.csv


In [None]:
# Scraper Kompas Money (menu-aware + subcategory + tags + resumeable)

import os, re, time, csv, random, json
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
from tqdm import tqdm

# ==========================
# Konfigurasi umum
# ==========================
BASE = "https://tekno.kompas.com/"
REQUEST_DELAY = (0.8, 1.6)
TIMEOUT = 20
MAX_PAGES_PER_SUB = None  # None = lanjut sampai 3 halaman kosong berturut-turut
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# ==========================
# Session HTTP dengan retry
# ==========================
from requests.adapters import HTTPAdapter, Retry

def make_session():
    s = requests.Session()
    retries = Retry(
        total=5, connect=5, read=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"])
    )
    s.headers.update(HEADERS)
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.mount("http://", HTTPAdapter(max_retries=retries))
    return s

SESSION = make_session()

# ==========================
# CSV helper
# ==========================
FIELDNAMES = ["url", "category", "subcategory", "tanggal_publish", "judul", "tags", "konten"]

def ensure_csv(path):
    if not os.path.exists(path):
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()

def load_done_urls(path):
    done = set()
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            r = csv.DictReader(f)
            for row in r:
                u = (row.get("url") or "").strip()
                if u:
                    done.add(u)
    return done

def append_row(path, row: dict):
    clean = {k: (row.get(k) or "") for k in FIELDNAMES}
    with open(path, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=FIELDNAMES)
        w.writerow(clean)

# ==========================
# Utilities
# ==========================
def normalize_ws(text):
    return re.sub(r"\s+", " ", (text or "").strip())

def looks_like_baca_juga(text):
    return "baca juga" in (text or "").lower()

def safe_get(url):
    r = SESSION.get(url, timeout=TIMEOUT)
    r.raise_for_status()
    return r

def absolutize(base_url, href):
    if href.startswith("//"):
        return "https:" + href
    if href.startswith("/"):
        parsed = urlparse(base_url)
        return f"{parsed.scheme}://{parsed.netloc}{href}"
    return href

# ==========================
# Dapatkan daftar sub-kategori dari menu
# ==========================
def discover_menu_links():
    found = OrderedDict()
    try:
        r = safe_get(BASE)
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select(".kanalHeader[data-kanal='tekno'] .kanalMenu a[href]"):
            name = normalize_ws(a.get_text())
            href = absolutize(BASE, a.get("href","").strip())
            if not href or not name:
                continue
            found[href] = name
    except Exception:
        pass

    # fallback kalau parsing gagal
    SEED = OrderedDict([
        ("https://tekno.kompas.com/apps-os", "Apps & OS"),
        ("https://tekno.kompas.com/gadget", "Gadget"),
        ("https://tekno.kompas.com/internet", "Internet"),
        ("https://tekno.kompas.com/hardware", "Hardware"),
        ("https://tekno.kompas.com/business", "Business"),
        ("https://tekno.kompas.com/game", "Game"),
    ])
    for href, name in SEED.items():
        found.setdefault(href, name)

    return list(found.items())

# ==========================
# Ambil daftar URL artikel per subcategory
# ==========================
def is_article_url(href: str):
    if not href or "kompas.com" not in href:
        return False
    if "/read/" not in href:
        return False
    if any(b in href for b in ["/komentar/", "/copy/"]):
        return False
    return True

def collect_urls_from_list(listing_url):
    try:
        res = safe_get(listing_url)
    except Exception:
        return set()
    soup = BeautifulSoup(res.text, "lxml")
    urls = set()
    for a in soup.select("a[href*='/read/']"):
        href = a.get("href")
        if not href:
            continue
        full = absolutize(listing_url, href)
        if is_article_url(full):
            urls.add(full)
    return urls

def page_url(base, page):
    if page <= 1:
        return base
    parsed = urlparse(base)
    joiner = "&" if parsed.query else "?"
    return f"{base}{joiner}page={page}"

def crawl_subcategory_urls(sub_url, max_pages=None):
    discovered = OrderedDict()
    consecutive_empty = 0
    page = 1
    while True:
        if max_pages is not None and page > max_pages:
            break
        url = page_url(sub_url, page)
        urls = collect_urls_from_list(url)
        before = len(discovered)
        for u in sorted(urls):
            discovered.setdefault(u, None)
        gained = len(discovered) - before
        print(f"  [page={page}] +{gained} URL (total {len(discovered)})")
        time.sleep(random.uniform(*REQUEST_DELAY))
        consecutive_empty = consecutive_empty + 1 if gained == 0 else 0
        if consecutive_empty >= 3:
            break
        page += 1
    return list(discovered.keys())

# ==========================
# Parsing artikel
# ==========================
def extract_publish_date(soup):
    meta = soup.find("meta", attrs={"name": "content_PublishedDate"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    meta2 = soup.select_one('meta[property="article:published_time"]')
    if meta2 and meta2.get("content"):
        return meta2["content"].strip()
    return ""

def extract_tags(soup):
    meta = soup.find("meta", attrs={"name": "content_tags"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    tag_links = soup.select("a.tag__article__link, .tag__item a")
    if tag_links:
        tags = [normalize_ws(a.get_text()) for a in tag_links if a.get_text().strip()]
        return ", ".join(tags)
    return ""

def parse_article(url):
    res = safe_get(url)
    soup = BeautifulSoup(res.text, "lxml")
    title_el = soup.select_one("h1.read__title") or soup.select_one("h1.article__title") or soup.select_one("h1")
    judul = normalize_ws(title_el.get_text()) if title_el else normalize_ws(soup.title.get_text() if soup.title else "")
    tanggal_publish = extract_publish_date(soup)
    tags = extract_tags(soup)
    wrapper = soup.select_one(".read__content") or soup.select_one(".article__content") or soup
    ps = wrapper.find_all("p")
    paragraphs = []
    for p in ps:
        txt = p.get_text(" ").strip()
        if not txt:
            continue
        if looks_like_baca_juga(txt):
            continue
        if "KOMPAS.com" in txt and "Download" in txt:
            continue
        paragraphs.append(normalize_ws(txt))
    konten = " ".join(paragraphs)
    return {"judul": judul, "tanggal_publish": tanggal_publish, "tags": tags, "konten": konten}

# ==========================
# Pipeline utama
# ==========================
def run_scrape_tekno(output_csv_path, max_pages_per_sub=None):
    ensure_csv(output_csv_path)
    done = load_done_urls(output_csv_path)
    print(f"URL sudah ada di CSV: {len(done)}")

    submenus = discover_menu_links()
    print("Sub-kategori terdeteksi:")
    for href, name in submenus:
        print(f" - {name}: {href}")

    all_urls = OrderedDict()
    for href, name in submenus:
        print(f"\nCrawl sub-kategori: {name}")
        urls = crawl_subcategory_urls(href, max_pages=max_pages_per_sub)
        for u in urls:
            all_urls.setdefault((u, name), None)

    print(f"\nTotal kandidat (URL unik per sub): {len(all_urls)}")

    category = "Tekno"
    saved = 0
    for i, ((url, subcat), _) in enumerate(tqdm(all_urls.items(), desc="Scraping", unit="url"), 1):
        if url in done:
            continue
        try:
            data = parse_article(url)
            if not data or not data.get("judul") or not data.get("konten"):
                continue
            row = {
                "url": url,
                "category": category,
                "subcategory": subcat,
                "tanggal_publish": data["tanggal_publish"],
                "judul": data["judul"],
                "tags": data["tags"],
                "konten": data["konten"]
            }
            append_row(output_csv_path, row)
            done.add(url)
            saved += 1
        except KeyboardInterrupt:
            print("\n⛔ Dihentikan manual, progres tersimpan.")
            break
        except Exception as e:
            print(f"\nERROR [{i}] {url} -> {e.__class__.__name__}: {e}")
        finally:
            time.sleep(random.uniform(*REQUEST_DELAY))

    print(f"Selesai. Artikel baru tersimpan: {saved}.")
    print("CSV:", output_csv_path)

# ==========================
# Jalankan
# ==========================
run_scrape_tekno(OUTPUT_CSV, max_pages_per_sub=MAX_PAGES_PER_SUB)


URL sudah ada di CSV: 0
Sub-kategori terdeteksi:
 - Apps & OS: https://tekno.kompas.com/apps-os
 - Gadget: https://tekno.kompas.com/gadget
 - Internet: https://tekno.kompas.com/internet
 - Hardware: https://tekno.kompas.com/hardware
 - Business: https://tekno.kompas.com/business
 - Game: https://tekno.kompas.com/game
 - Galeri: https://tekno.kompas.com/galeri
 - Indeks: https://indeks.kompas.com/?site=tekno
 - Tech Innovation: https://pubads.g.doubleclick.net/gampad/clk?id=6163907479&iu=/31800665/KOMPAS.COM/tekno
 - Kilas Internet: https://kilasinternet.kompas.com/

Crawl sub-kategori: Apps & OS
  [page=1] +34 URL (total 34)
  [page=2] +0 URL (total 34)
  [page=3] +0 URL (total 34)
  [page=4] +0 URL (total 34)

Crawl sub-kategori: Gadget
  [page=1] +29 URL (total 29)
  [page=2] +0 URL (total 29)
  [page=3] +0 URL (total 29)
  [page=4] +0 URL (total 29)

Crawl sub-kategori: Internet
  [page=1] +32 URL (total 32)
  [page=2] +0 URL (total 32)
  [page=3] +0 URL (total 32)
  [page=4] +0 URL

Scraping: 100%|██████████| 504/504 [12:29<00:00,  1.49s/url]

Selesai. Artikel baru tersimpan: 401.
CSV: /content/drive/MyDrive/KompasTekno/kompas_tekno_scraped.csv





# Scraping Otomotif

In [None]:
from google.colab import drive
import os

# Tentukan folder dan file output di Drive
DRIVE_FOLDER = "/content/drive/MyDrive/KompasOtomotif"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

OUTPUT_CSV = os.path.join(DRIVE_FOLDER, "kompas_otomotif_scraped.csv")
print("Folder Drive:", DRIVE_FOLDER)
print("File output :", OUTPUT_CSV)

Folder Drive: /content/drive/MyDrive/KompasOtomotif
File output : /content/drive/MyDrive/KompasOtomotif/kompas_otomotif_scraped.csv


In [None]:
# Scraper Kompas Money (menu-aware + subcategory + tags + resumeable)

import os, re, time, csv, random, json
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
from tqdm import tqdm

# ==========================
# Konfigurasi umum
# ==========================
BASE = "https://otomotif.kompas.com/"
REQUEST_DELAY = (0.8, 1.6)
TIMEOUT = 20
MAX_PAGES_PER_SUB = None  # None = lanjut sampai 3 halaman kosong berturut-turut
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# ==========================
# Session HTTP dengan retry
# ==========================
from requests.adapters import HTTPAdapter, Retry

def make_session():
    s = requests.Session()
    retries = Retry(
        total=5, connect=5, read=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"])
    )
    s.headers.update(HEADERS)
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.mount("http://", HTTPAdapter(max_retries=retries))
    return s

SESSION = make_session()

# ==========================
# CSV helper
# ==========================
FIELDNAMES = ["url", "category", "subcategory", "tanggal_publish", "judul", "tags", "konten"]

def ensure_csv(path):
    if not os.path.exists(path):
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()

def load_done_urls(path):
    done = set()
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            r = csv.DictReader(f)
            for row in r:
                u = (row.get("url") or "").strip()
                if u:
                    done.add(u)
    return done

def append_row(path, row: dict):
    clean = {k: (row.get(k) or "") for k in FIELDNAMES}
    with open(path, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=FIELDNAMES)
        w.writerow(clean)

# ==========================
# Utilities
# ==========================
def normalize_ws(text):
    return re.sub(r"\s+", " ", (text or "").strip())

def looks_like_baca_juga(text):
    return "baca juga" in (text or "").lower()

def safe_get(url):
    r = SESSION.get(url, timeout=TIMEOUT)
    r.raise_for_status()
    return r

def absolutize(base_url, href):
    if href.startswith("//"):
        return "https:" + href
    if href.startswith("/"):
        parsed = urlparse(base_url)
        return f"{parsed.scheme}://{parsed.netloc}{href}"
    return href

# ==========================
# Dapatkan daftar sub-kategori dari menu
# ==========================
def discover_menu_links():
    found = OrderedDict()
    try:
        r = safe_get(BASE)
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select(".kanalHeader[data-kanal='otomotif'] .kanalMenu a[href]"):
            name = normalize_ws(a.get_text())
            href = absolutize(BASE, a.get("href","").strip())
            if not href or not name:
                continue
            found[href] = name
    except Exception:
        pass

    # fallback kalau parsing gagal
    SEED = OrderedDict([
        ("https://otomotif.kompas.com/news", "News"),
        ("https://otomotif.kompas.com/mobil", "Mobil"),
        ("https://otomotif.kompas.com/motor", "Motor"),
        ("https://otomotif.kompas.com/sport", "Sport"),
        ("https://otomotif.kompas.com/feature", "Feature"),
        ("https://otomotif.kompas.com/niaga", "Niaga"),
        ("https://otomotif.kompas.com/komunitas", "Komunitas"),
        ("https://otomotif.kompas.com/otopedia", "Otopedia"),
    ])
    for href, name in SEED.items():
        found.setdefault(href, name)

    return list(found.items())

# ==========================
# Ambil daftar URL artikel per subcategory
# ==========================
def is_article_url(href: str):
    if not href or "kompas.com" not in href:
        return False
    if "/read/" not in href:
        return False
    if any(b in href for b in ["/komentar/", "/copy/"]):
        return False
    return True

def collect_urls_from_list(listing_url):
    try:
        res = safe_get(listing_url)
    except Exception:
        return set()
    soup = BeautifulSoup(res.text, "lxml")
    urls = set()
    for a in soup.select("a[href*='/read/']"):
        href = a.get("href")
        if not href:
            continue
        full = absolutize(listing_url, href)
        if is_article_url(full):
            urls.add(full)
    return urls

def page_url(base, page):
    if page <= 1:
        return base
    parsed = urlparse(base)
    joiner = "&" if parsed.query else "?"
    return f"{base}{joiner}page={page}"

def crawl_subcategory_urls(sub_url, max_pages=None):
    discovered = OrderedDict()
    consecutive_empty = 0
    page = 1
    while True:
        if max_pages is not None and page > max_pages:
            break
        url = page_url(sub_url, page)
        urls = collect_urls_from_list(url)
        before = len(discovered)
        for u in sorted(urls):
            discovered.setdefault(u, None)
        gained = len(discovered) - before
        print(f"  [page={page}] +{gained} URL (total {len(discovered)})")
        time.sleep(random.uniform(*REQUEST_DELAY))
        consecutive_empty = consecutive_empty + 1 if gained == 0 else 0
        if consecutive_empty >= 3:
            break
        page += 1
    return list(discovered.keys())

# ==========================
# Parsing artikel
# ==========================
def extract_publish_date(soup):
    meta = soup.find("meta", attrs={"name": "content_PublishedDate"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    meta2 = soup.select_one('meta[property="article:published_time"]')
    if meta2 and meta2.get("content"):
        return meta2["content"].strip()
    return ""

def extract_tags(soup):
    meta = soup.find("meta", attrs={"name": "content_tags"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    tag_links = soup.select("a.tag__article__link, .tag__item a")
    if tag_links:
        tags = [normalize_ws(a.get_text()) for a in tag_links if a.get_text().strip()]
        return ", ".join(tags)
    return ""

def parse_article(url):
    res = safe_get(url)
    soup = BeautifulSoup(res.text, "lxml")
    title_el = soup.select_one("h1.read__title") or soup.select_one("h1.article__title") or soup.select_one("h1")
    judul = normalize_ws(title_el.get_text()) if title_el else normalize_ws(soup.title.get_text() if soup.title else "")
    tanggal_publish = extract_publish_date(soup)
    tags = extract_tags(soup)
    wrapper = soup.select_one(".read__content") or soup.select_one(".article__content") or soup
    ps = wrapper.find_all("p")
    paragraphs = []
    for p in ps:
        txt = p.get_text(" ").strip()
        if not txt:
            continue
        if looks_like_baca_juga(txt):
            continue
        if "KOMPAS.com" in txt and "Download" in txt:
            continue
        paragraphs.append(normalize_ws(txt))
    konten = " ".join(paragraphs)
    return {"judul": judul, "tanggal_publish": tanggal_publish, "tags": tags, "konten": konten}

# ==========================
# Pipeline utama
# ==========================
def run_scrape_otomotif(output_csv_path, max_pages_per_sub=None):
    ensure_csv(output_csv_path)
    done = load_done_urls(output_csv_path)
    print(f"URL sudah ada di CSV: {len(done)}")

    submenus = discover_menu_links()
    print("Sub-kategori terdeteksi:")
    for href, name in submenus:
        print(f" - {name}: {href}")

    all_urls = OrderedDict()
    for href, name in submenus:
        print(f"\nCrawl sub-kategori: {name}")
        urls = crawl_subcategory_urls(href, max_pages=max_pages_per_sub)
        for u in urls:
            all_urls.setdefault((u, name), None)

    print(f"\nTotal kandidat (URL unik per sub): {len(all_urls)}")

    category = "Otomotif"
    saved = 0
    for i, ((url, subcat), _) in enumerate(tqdm(all_urls.items(), desc="Scraping", unit="url"), 1):
        if url in done:
            continue
        try:
            data = parse_article(url)
            if not data or not data.get("judul") or not data.get("konten"):
                continue
            row = {
                "url": url,
                "category": category,
                "subcategory": subcat,
                "tanggal_publish": data["tanggal_publish"],
                "judul": data["judul"],
                "tags": data["tags"],
                "konten": data["konten"]
            }
            append_row(output_csv_path, row)
            done.add(url)
            saved += 1
        except KeyboardInterrupt:
            print("\n⛔ Dihentikan manual, progres tersimpan.")
            break
        except Exception as e:
            print(f"\nERROR [{i}] {url} -> {e.__class__.__name__}: {e}")
        finally:
            time.sleep(random.uniform(*REQUEST_DELAY))

    print(f"Selesai. Artikel baru tersimpan: {saved}.")
    print("CSV:", output_csv_path)

# ==========================
# Jalankan
# ==========================
run_scrape_otomotif(OUTPUT_CSV, max_pages_per_sub=MAX_PAGES_PER_SUB)


URL sudah ada di CSV: 0
Sub-kategori terdeteksi:
 - News: https://otomotif.kompas.com/news
 - Mobil: https://otomotif.kompas.com/mobil
 - Produk: https://otomotif.kompas.com/mobil/produk
 - Modifikasi: https://otomotif.kompas.com/mobil/modifikasi
 - Aksesoris: https://otomotif.kompas.com/mobil/aksesoris
 - Tes: https://otomotif.kompas.com/mobil/tes
 - Teknologi: https://otomotif.kompas.com/mobil/teknologi
 - Motor: https://otomotif.kompas.com/motor
 - Produk: https://otomotif.kompas.com/motor/produk
 - Modifikasi: https://otomotif.kompas.com/motor/modifikasi
 - Aksesoris: https://otomotif.kompas.com/motor/aksesoris
 - Tes: https://otomotif.kompas.com/motor/tes
 - Teknologi: https://otomotif.kompas.com/motor/teknologi
 - Sport: https://otomotif.kompas.com/sport
 - Feature: https://otomotif.kompas.com/feature
 - Niaga: https://otomotif.kompas.com/niaga
 - Komunitas: https://otomotif.kompas.com/komunitas
 - Otopedia: https://otomotif.kompas.com/otopedia
 - Galeri: https://otomotif.kompas.

Scraping: 100%|██████████| 582/582 [12:03<00:00,  1.24s/url]

Selesai. Artikel baru tersimpan: 362.
CSV: /content/drive/MyDrive/KompasOtomotif/kompas_otomotif_scraped.csv





# Scraping Nusaraya

In [None]:
from google.colab import drive
import os

# Tentukan folder dan file output di Drive
DRIVE_FOLDER = "/content/drive/MyDrive/KompasNusaraya"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

OUTPUT_CSV = os.path.join(DRIVE_FOLDER, "kompas_nusaraya_scraped.csv")
print("Folder Drive:", DRIVE_FOLDER)
print("File output :", OUTPUT_CSV)

Folder Drive: /content/drive/MyDrive/KompasNusaraya
File output : /content/drive/MyDrive/KompasNusaraya/kompas_nusaraya_scraped.csv


In [None]:
# Scraper Kompas Money (menu-aware + subcategory + tags + resumeable)

import os, re, time, csv, random, json
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
from tqdm import tqdm

# ==========================
# Konfigurasi umum
# ==========================
BASE = "https://www.kompas.com/nusaraya"
REQUEST_DELAY = (0.8, 1.6)
TIMEOUT = 20
MAX_PAGES_PER_SUB = None  # None = lanjut sampai 3 halaman kosong berturut-turut
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# ==========================
# Session HTTP dengan retry
# ==========================
from requests.adapters import HTTPAdapter, Retry

def make_session():
    s = requests.Session()
    retries = Retry(
        total=5, connect=5, read=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"])
    )
    s.headers.update(HEADERS)
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.mount("http://", HTTPAdapter(max_retries=retries))
    return s

SESSION = make_session()

# ==========================
# CSV helper
# ==========================
FIELDNAMES = ["url", "category", "subcategory", "tanggal_publish", "judul", "tags", "konten"]

def ensure_csv(path):
    if not os.path.exists(path):
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()

def load_done_urls(path):
    done = set()
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            r = csv.DictReader(f)
            for row in r:
                u = (row.get("url") or "").strip()
                if u:
                    done.add(u)
    return done

def append_row(path, row: dict):
    clean = {k: (row.get(k) or "") for k in FIELDNAMES}
    with open(path, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=FIELDNAMES)
        w.writerow(clean)

# ==========================
# Utilities
# ==========================
def normalize_ws(text):
    return re.sub(r"\s+", " ", (text or "").strip())

def looks_like_baca_juga(text):
    return "baca juga" in (text or "").lower()

def safe_get(url):
    r = SESSION.get(url, timeout=TIMEOUT)
    r.raise_for_status()
    return r

def absolutize(base_url, href):
    if href.startswith("//"):
        return "https:" + href
    if href.startswith("/"):
        parsed = urlparse(base_url)
        return f"{parsed.scheme}://{parsed.netloc}{href}"
    return href

# ==========================
# Dapatkan daftar sub-kategori dari menu
# ==========================
def discover_menu_links():
    found = OrderedDict()
    try:
        r = safe_get(BASE)
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select(".kanalHeader[data-kanal='nusaraya'] .kanalMenu a[href]"):
            name = normalize_ws(a.get_text())
            href = absolutize(BASE, a.get("href","").strip())
            if not href or not name:
                continue
            found[href] = name
    except Exception:
        pass

    # fallback kalau parsing gagal
    SEED = OrderedDict([
        ("https://www.kompas.com/sumatera-utara", "Sumatera Utara"),
        ("https://www.kompas.com/sumatera-selatan", "Sumatera Selatan"),
        ("https://www.kompas.com/sumatera-barat", "Sumatera Barat"),
        ("https://www.kompas.com/riau", "Riau"),
        ("https://www.kompas.com/lampung", "Lampung"),
        ("https://www.kompas.com/banten", "Banten"),
        ("https://www.kompas.com/yogyakarta", "Yogyakarta"),
        ("https://www.kompas.com/jawa-barat", "Jawa Barat"),
        ("https://www.kompas.com/jawa-tengah", "Jawa Tengah"),
        ("https://www.kompas.com/jawa-timur", "Jawa Timur"),
        ("https://www.kompas.com/kalimantan-barat", "Kalimantan Barat"),
        ("https://www.kompas.com/kalimantan-timur", "Kalimantan Timur"),
        ("https://www.kompas.com/sulawesi-selatan", "Sulawesi Selatan"),
        ("https://www.kompas.com/bali", "Bali"),
    ])
    for href, name in SEED.items():
        found.setdefault(href, name)

    return list(found.items())

# ==========================
# Ambil daftar URL artikel per subcategory
# ==========================
def is_article_url(href: str):
    if not href or "kompas.com" not in href:
        return False
    if "/read/" not in href:
        return False
    if any(b in href for b in ["/komentar/", "/copy/"]):
        return False
    return True

def collect_urls_from_list(listing_url):
    try:
        res = safe_get(listing_url)
    except Exception:
        return set()
    soup = BeautifulSoup(res.text, "lxml")
    urls = set()
    for a in soup.select("a[href*='/read/']"):
        href = a.get("href")
        if not href:
            continue
        full = absolutize(listing_url, href)
        if is_article_url(full):
            urls.add(full)
    return urls

def page_url(base, page):
    if page <= 1:
        return base
    parsed = urlparse(base)
    joiner = "&" if parsed.query else "?"
    return f"{base}{joiner}page={page}"

def crawl_subcategory_urls(sub_url, max_pages=None):
    discovered = OrderedDict()
    consecutive_empty = 0
    page = 1
    while True:
        if max_pages is not None and page > max_pages:
            break
        url = page_url(sub_url, page)
        urls = collect_urls_from_list(url)
        before = len(discovered)
        for u in sorted(urls):
            discovered.setdefault(u, None)
        gained = len(discovered) - before
        print(f"  [page={page}] +{gained} URL (total {len(discovered)})")
        time.sleep(random.uniform(*REQUEST_DELAY))
        consecutive_empty = consecutive_empty + 1 if gained == 0 else 0
        if consecutive_empty >= 3:
            break
        page += 1
    return list(discovered.keys())

# ==========================
# Parsing artikel
# ==========================
def extract_publish_date(soup):
    meta = soup.find("meta", attrs={"name": "content_PublishedDate"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    meta2 = soup.select_one('meta[property="article:published_time"]')
    if meta2 and meta2.get("content"):
        return meta2["content"].strip()
    return ""

def extract_tags(soup):
    meta = soup.find("meta", attrs={"name": "content_tags"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    tag_links = soup.select("a.tag__article__link, .tag__item a")
    if tag_links:
        tags = [normalize_ws(a.get_text()) for a in tag_links if a.get_text().strip()]
        return ", ".join(tags)
    return ""

def parse_article(url):
    res = safe_get(url)
    soup = BeautifulSoup(res.text, "lxml")
    title_el = soup.select_one("h1.read__title") or soup.select_one("h1.article__title") or soup.select_one("h1")
    judul = normalize_ws(title_el.get_text()) if title_el else normalize_ws(soup.title.get_text() if soup.title else "")
    tanggal_publish = extract_publish_date(soup)
    tags = extract_tags(soup)
    wrapper = soup.select_one(".read__content") or soup.select_one(".article__content") or soup
    ps = wrapper.find_all("p")
    paragraphs = []
    for p in ps:
        txt = p.get_text(" ").strip()
        if not txt:
            continue
        if looks_like_baca_juga(txt):
            continue
        if "KOMPAS.com" in txt and "Download" in txt:
            continue
        paragraphs.append(normalize_ws(txt))
    konten = " ".join(paragraphs)
    return {"judul": judul, "tanggal_publish": tanggal_publish, "tags": tags, "konten": konten}

# ==========================
# Pipeline utama
# ==========================
def run_scrape_nusaraya(output_csv_path, max_pages_per_sub=None):
    ensure_csv(output_csv_path)
    done = load_done_urls(output_csv_path)
    print(f"URL sudah ada di CSV: {len(done)}")

    submenus = discover_menu_links()
    print("Sub-kategori terdeteksi:")
    for href, name in submenus:
        print(f" - {name}: {href}")

    all_urls = OrderedDict()
    for href, name in submenus:
        print(f"\nCrawl sub-kategori: {name}")
        urls = crawl_subcategory_urls(href, max_pages=max_pages_per_sub)
        for u in urls:
            all_urls.setdefault((u, name), None)

    print(f"\nTotal kandidat (URL unik per sub): {len(all_urls)}")

    category = "Nusaraya"
    saved = 0
    for i, ((url, subcat), _) in enumerate(tqdm(all_urls.items(), desc="Scraping", unit="url"), 1):
        if url in done:
            continue
        try:
            data = parse_article(url)
            if not data or not data.get("judul") or not data.get("konten"):
                continue
            row = {
                "url": url,
                "category": category,
                "subcategory": subcat,
                "tanggal_publish": data["tanggal_publish"],
                "judul": data["judul"],
                "tags": data["tags"],
                "konten": data["konten"]
            }
            append_row(output_csv_path, row)
            done.add(url)
            saved += 1
        except KeyboardInterrupt:
            print("\n⛔ Dihentikan manual, progres tersimpan.")
            break
        except Exception as e:
            print(f"\nERROR [{i}] {url} -> {e.__class__.__name__}: {e}")
        finally:
            time.sleep(random.uniform(*REQUEST_DELAY))

    print(f"Selesai. Artikel baru tersimpan: {saved}.")
    print("CSV:", output_csv_path)

# ==========================
# Jalankan
# ==========================
run_scrape_nusaraya(OUTPUT_CSV, max_pages_per_sub=MAX_PAGES_PER_SUB)


URL sudah ada di CSV: 0
Sub-kategori terdeteksi:
 - Kalimantan: javascript:void(0)
 - Sumatera Utara: https://www.kompas.com/sumatera-utara
 - Sumatera Selatan: https://www.kompas.com/sumatera-selatan
 - Sumatera Barat: https://www.kompas.com/sumatera-barat
 - Riau: https://www.kompas.com/riau
 - Lampung: https://www.kompas.com/lampung
 - Banten: https://www.kompas.com/banten
 - Yogyakarta: https://yogyakarta.kompas.com
 - Jawa Barat: https://www.kompas.com/jawa-barat
 - Jawa Tengah: https://www.kompas.com/jawa-tengah
 - Jawa Timur: https://www.kompas.com/jawa-timur
 - Kalimantan Barat: https://www.kompas.com/kalimantan-barat
 - Kalimantan Timur: https://www.kompas.com/kalimantan-timur
 - Sulawesi Selatan: https://www.kompas.com/sulawesi-selatan
 - Bali: https://denpasar.kompas.com
 - Indeks: https://indeks.kompas.com/?site=nusaraya
 - Yogyakarta: https://www.kompas.com/yogyakarta
 - Bali: https://www.kompas.com/bali

Crawl sub-kategori: Kalimantan
  [page=1] +0 URL (total 0)
  [page=2

Scraping:   6%|▌         | 3778/66563 [2:03:50<34:17:56,  1.97s/url]


KeyboardInterrupt: 

# Scraping Lifestyle

In [None]:
from google.colab import drive
import os

# Tentukan folder dan file output di Drive
DRIVE_FOLDER = "/content/drive/MyDrive/KompasLifestyle"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

OUTPUT_CSV = os.path.join(DRIVE_FOLDER, "kompas_lifestyle_scraped.csv")
print("Folder Drive:", DRIVE_FOLDER)
print("File output :", OUTPUT_CSV)

Folder Drive: /content/drive/MyDrive/KompasLifestyle
File output : /content/drive/MyDrive/KompasLifestyle/kompas_lifestyle_scraped.csv


In [None]:
# Scraper Kompas Money (menu-aware + subcategory + tags + resumeable)

import os, re, time, csv, random, json
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
from tqdm import tqdm

# ==========================
# Konfigurasi umum
# ==========================
BASE = "https://lifestyle.kompas.com/"
REQUEST_DELAY = (0.8, 1.6)
TIMEOUT = 20
MAX_PAGES_PER_SUB = None  # None = lanjut sampai 3 halaman kosong berturut-turut
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# ==========================
# Session HTTP dengan retry
# ==========================
from requests.adapters import HTTPAdapter, Retry

def make_session():
    s = requests.Session()
    retries = Retry(
        total=5, connect=5, read=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"])
    )
    s.headers.update(HEADERS)
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.mount("http://", HTTPAdapter(max_retries=retries))
    return s

SESSION = make_session()

# ==========================
# CSV helper
# ==========================
FIELDNAMES = ["url", "category", "subcategory", "tanggal_publish", "judul", "tags", "konten"]

def ensure_csv(path):
    if not os.path.exists(path):
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()

def load_done_urls(path):
    done = set()
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            r = csv.DictReader(f)
            for row in r:
                u = (row.get("url") or "").strip()
                if u:
                    done.add(u)
    return done

def append_row(path, row: dict):
    clean = {k: (row.get(k) or "") for k in FIELDNAMES}
    with open(path, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=FIELDNAMES)
        w.writerow(clean)

# ==========================
# Utilities
# ==========================
def normalize_ws(text):
    return re.sub(r"\s+", " ", (text or "").strip())

def looks_like_baca_juga(text):
    return "baca juga" in (text or "").lower()

def safe_get(url):
    r = SESSION.get(url, timeout=TIMEOUT)
    r.raise_for_status()
    return r

def absolutize(base_url, href):
    if href.startswith("//"):
        return "https:" + href
    if href.startswith("/"):
        parsed = urlparse(base_url)
        return f"{parsed.scheme}://{parsed.netloc}{href}"
    return href

# ==========================
# Dapatkan daftar sub-kategori dari menu
# ==========================
def discover_menu_links():
    found = OrderedDict()
    try:
        r = safe_get(BASE)
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select(".kanalHeader[data-kanal='lifestyle'] .kanalMenu a[href]"):
            name = normalize_ws(a.get_text())
            href = absolutize(BASE, a.get("href","").strip())
            if not href or not name:
                continue
            found[href] = name
    except Exception:
        pass

    # fallback kalau parsing gagal
    SEED = OrderedDict([
        ("https://lifestyle.kompas.com/beauty", "Beauty & Grooming"),
        ("https://lifestyle.kompas.com/fashion", "Fashion"),
        ("https://lifestyle.kompas.com/wellness", "Wellness"),
        ("https://lifestyle.kompas.com/relationship", "Relationship"),
        ("https://lifestyle.kompas.com/parenting", "Parenting"),
    ])
    for href, name in SEED.items():
        found.setdefault(href, name)

    return list(found.items())

# ==========================
# Ambil daftar URL artikel per subcategory
# ==========================
def is_article_url(href: str):
    if not href or "kompas.com" not in href:
        return False
    if "/read/" not in href:
        return False
    if any(b in href for b in ["/komentar/", "/copy/"]):
        return False
    return True

def collect_urls_from_list(listing_url):
    try:
        res = safe_get(listing_url)
    except Exception:
        return set()
    soup = BeautifulSoup(res.text, "lxml")
    urls = set()
    for a in soup.select("a[href*='/read/']"):
        href = a.get("href")
        if not href:
            continue
        full = absolutize(listing_url, href)
        if is_article_url(full):
            urls.add(full)
    return urls

def page_url(base, page):
    if page <= 1:
        return base
    parsed = urlparse(base)
    joiner = "&" if parsed.query else "?"
    return f"{base}{joiner}page={page}"

def crawl_subcategory_urls(sub_url, max_pages=None):
    discovered = OrderedDict()
    consecutive_empty = 0
    page = 1
    while True:
        if max_pages is not None and page > max_pages:
            break
        url = page_url(sub_url, page)
        urls = collect_urls_from_list(url)
        before = len(discovered)
        for u in sorted(urls):
            discovered.setdefault(u, None)
        gained = len(discovered) - before
        print(f"  [page={page}] +{gained} URL (total {len(discovered)})")
        time.sleep(random.uniform(*REQUEST_DELAY))
        consecutive_empty = consecutive_empty + 1 if gained == 0 else 0
        if consecutive_empty >= 3:
            break
        page += 1
    return list(discovered.keys())

# ==========================
# Parsing artikel
# ==========================
def extract_publish_date(soup):
    meta = soup.find("meta", attrs={"name": "content_PublishedDate"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    meta2 = soup.select_one('meta[property="article:published_time"]')
    if meta2 and meta2.get("content"):
        return meta2["content"].strip()
    return ""

def extract_tags(soup):
    meta = soup.find("meta", attrs={"name": "content_tags"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    tag_links = soup.select("a.tag__article__link, .tag__item a")
    if tag_links:
        tags = [normalize_ws(a.get_text()) for a in tag_links if a.get_text().strip()]
        return ", ".join(tags)
    return ""

def parse_article(url):
    res = safe_get(url)
    soup = BeautifulSoup(res.text, "lxml")
    title_el = soup.select_one("h1.read__title") or soup.select_one("h1.article__title") or soup.select_one("h1")
    judul = normalize_ws(title_el.get_text()) if title_el else normalize_ws(soup.title.get_text() if soup.title else "")
    tanggal_publish = extract_publish_date(soup)
    tags = extract_tags(soup)
    wrapper = soup.select_one(".read__content") or soup.select_one(".article__content") or soup
    ps = wrapper.find_all("p")
    paragraphs = []
    for p in ps:
        txt = p.get_text(" ").strip()
        if not txt:
            continue
        if looks_like_baca_juga(txt):
            continue
        if "KOMPAS.com" in txt and "Download" in txt:
            continue
        paragraphs.append(normalize_ws(txt))
    konten = " ".join(paragraphs)
    return {"judul": judul, "tanggal_publish": tanggal_publish, "tags": tags, "konten": konten}

# ==========================
# Pipeline utama
# ==========================
def run_scrape_lifestyle(output_csv_path, max_pages_per_sub=None):
    ensure_csv(output_csv_path)
    done = load_done_urls(output_csv_path)
    print(f"URL sudah ada di CSV: {len(done)}")

    submenus = discover_menu_links()
    print("Sub-kategori terdeteksi:")
    for href, name in submenus:
        print(f" - {name}: {href}")

    all_urls = OrderedDict()
    for href, name in submenus:
        print(f"\nCrawl sub-kategori: {name}")
        urls = crawl_subcategory_urls(href, max_pages=max_pages_per_sub)
        for u in urls:
            all_urls.setdefault((u, name), None)

    print(f"\nTotal kandidat (URL unik per sub): {len(all_urls)}")

    category = "Lifestyle"
    saved = 0
    for i, ((url, subcat), _) in enumerate(tqdm(all_urls.items(), desc="Scraping", unit="url"), 1):
        if url in done:
            continue
        try:
            data = parse_article(url)
            if not data or not data.get("judul") or not data.get("konten"):
                continue
            row = {
                "url": url,
                "category": category,
                "subcategory": subcat,
                "tanggal_publish": data["tanggal_publish"],
                "judul": data["judul"],
                "tags": data["tags"],
                "konten": data["konten"]
            }
            append_row(output_csv_path, row)
            done.add(url)
            saved += 1
        except KeyboardInterrupt:
            print("\n⛔ Dihentikan manual, progres tersimpan.")
            break
        except Exception as e:
            print(f"\nERROR [{i}] {url} -> {e.__class__.__name__}: {e}")
        finally:
            time.sleep(random.uniform(*REQUEST_DELAY))

    print(f"Selesai. Artikel baru tersimpan: {saved}.")
    print("CSV:", output_csv_path)

# ==========================
# Jalankan
# ==========================
run_scrape_lifestyle(OUTPUT_CSV, max_pages_per_sub=MAX_PAGES_PER_SUB)


URL sudah ada di CSV: 0
Sub-kategori terdeteksi:
 - Beauty & Grooming: https://lifestyle.kompas.com/beauty
 - Fashion: https://lifestyle.kompas.com/fashion
 - Wellness: https://lifestyle.kompas.com/wellness
 - Relationship: https://lifestyle.kompas.com/relationship
 - Parenting: https://lifestyle.kompas.com/parenting
 - Buku: https://buku.kompas.com
 - Indeks: https://indeks.kompas.com/?site=lifestyle
 - Sadar Stunting: https://genbest.kompas.com
 - Kilas Lifestyle: https://kilaslifestyle.kompas.com

Crawl sub-kategori: Beauty & Grooming
  [page=1] +37 URL (total 37)
  [page=2] +0 URL (total 37)
  [page=3] +0 URL (total 37)
  [page=4] +0 URL (total 37)

Crawl sub-kategori: Fashion
  [page=1] +36 URL (total 36)
  [page=2] +0 URL (total 36)
  [page=3] +0 URL (total 36)
  [page=4] +0 URL (total 36)

Crawl sub-kategori: Wellness
  [page=1] +33 URL (total 33)
  [page=2] +0 URL (total 33)
  [page=3] +0 URL (total 33)
  [page=4] +0 URL (total 33)

Crawl sub-kategori: Relationship
  [page=1] +

Scraping: 100%|██████████| 243/243 [04:24<00:00,  1.09s/url]

Selesai. Artikel baru tersimpan: 137.
CSV: /content/drive/MyDrive/KompasLifestyle/kompas_lifestyle_scraped.csv





# Scraping Travel

In [None]:
from google.colab import drive
import os

# Tentukan folder dan file output di Drive
DRIVE_FOLDER = "/content/drive/MyDrive/KompasTravel"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

OUTPUT_CSV = os.path.join(DRIVE_FOLDER, "kompas_travel_scraped.csv")
print("Folder Drive:", DRIVE_FOLDER)
print("File output :", OUTPUT_CSV)

Folder Drive: /content/drive/MyDrive/KompasTravel
File output : /content/drive/MyDrive/KompasTravel/kompas_travel_scraped.csv


In [None]:
# Scraper Kompas Money (menu-aware + subcategory + tags + resumeable)

import os, re, time, csv, random, json
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
from tqdm import tqdm

# ==========================
# Konfigurasi umum
# ==========================
BASE = "https://travel.kompas.com/"
REQUEST_DELAY = (0.8, 1.6)
TIMEOUT = 20
MAX_PAGES_PER_SUB = None  # None = lanjut sampai 3 halaman kosong berturut-turut
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# ==========================
# Session HTTP dengan retry
# ==========================
from requests.adapters import HTTPAdapter, Retry

def make_session():
    s = requests.Session()
    retries = Retry(
        total=5, connect=5, read=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"])
    )
    s.headers.update(HEADERS)
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.mount("http://", HTTPAdapter(max_retries=retries))
    return s

SESSION = make_session()

# ==========================
# CSV helper
# ==========================
FIELDNAMES = ["url", "category", "subcategory", "tanggal_publish", "judul", "tags", "konten"]

def ensure_csv(path):
    if not os.path.exists(path):
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()

def load_done_urls(path):
    done = set()
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            r = csv.DictReader(f)
            for row in r:
                u = (row.get("url") or "").strip()
                if u:
                    done.add(u)
    return done

def append_row(path, row: dict):
    clean = {k: (row.get(k) or "") for k in FIELDNAMES}
    with open(path, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=FIELDNAMES)
        w.writerow(clean)

# ==========================
# Utilities
# ==========================
def normalize_ws(text):
    return re.sub(r"\s+", " ", (text or "").strip())

def looks_like_baca_juga(text):
    return "baca juga" in (text or "").lower()

def safe_get(url):
    r = SESSION.get(url, timeout=TIMEOUT)
    r.raise_for_status()
    return r

def absolutize(base_url, href):
    if href.startswith("//"):
        return "https:" + href
    if href.startswith("/"):
        parsed = urlparse(base_url)
        return f"{parsed.scheme}://{parsed.netloc}{href}"
    return href

# ==========================
# Dapatkan daftar sub-kategori dari menu
# ==========================
def discover_menu_links():
    found = OrderedDict()
    try:
        r = safe_get(BASE)
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select(".kanalHeader[data-kanal='travel'] .kanalMenu a[href]"):
            name = normalize_ws(a.get_text())
            href = absolutize(BASE, a.get("href","").strip())
            if not href or not name:
                continue
            found[href] = name
    except Exception:
        pass

    # fallback kalau parsing gagal
    SEED = OrderedDict([
        ("https://travel.kompas.com/travel-news", "Travel News"),
        ("https://travel.kompas.com/travel-ideas", "Travel Ideas"),
        ("https://travel.kompas.com/hotel-story", "Hotel Story"),
        ("https://travel.kompas.com/travelpedia", "Travelpedia"),
    ])
    for href, name in SEED.items():
        found.setdefault(href, name)

    return list(found.items())

# ==========================
# Ambil daftar URL artikel per subcategory
# ==========================
def is_article_url(href: str):
    if not href or "kompas.com" not in href:
        return False
    if "/read/" not in href:
        return False
    if any(b in href for b in ["/komentar/", "/copy/"]):
        return False
    return True

def collect_urls_from_list(listing_url):
    try:
        res = safe_get(listing_url)
    except Exception:
        return set()
    soup = BeautifulSoup(res.text, "lxml")
    urls = set()
    for a in soup.select("a[href*='/read/']"):
        href = a.get("href")
        if not href:
            continue
        full = absolutize(listing_url, href)
        if is_article_url(full):
            urls.add(full)
    return urls

def page_url(base, page):
    if page <= 1:
        return base
    parsed = urlparse(base)
    joiner = "&" if parsed.query else "?"
    return f"{base}{joiner}page={page}"

def crawl_subcategory_urls(sub_url, max_pages=None):
    discovered = OrderedDict()
    consecutive_empty = 0
    page = 1
    while True:
        if max_pages is not None and page > max_pages:
            break
        url = page_url(sub_url, page)
        urls = collect_urls_from_list(url)
        before = len(discovered)
        for u in sorted(urls):
            discovered.setdefault(u, None)
        gained = len(discovered) - before
        print(f"  [page={page}] +{gained} URL (total {len(discovered)})")
        time.sleep(random.uniform(*REQUEST_DELAY))
        consecutive_empty = consecutive_empty + 1 if gained == 0 else 0
        if consecutive_empty >= 3:
            break
        page += 1
    return list(discovered.keys())

# ==========================
# Parsing artikel
# ==========================
def extract_publish_date(soup):
    meta = soup.find("meta", attrs={"name": "content_PublishedDate"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    meta2 = soup.select_one('meta[property="article:published_time"]')
    if meta2 and meta2.get("content"):
        return meta2["content"].strip()
    return ""

def extract_tags(soup):
    meta = soup.find("meta", attrs={"name": "content_tags"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    tag_links = soup.select("a.tag__article__link, .tag__item a")
    if tag_links:
        tags = [normalize_ws(a.get_text()) for a in tag_links if a.get_text().strip()]
        return ", ".join(tags)
    return ""

def parse_article(url):
    res = safe_get(url)
    soup = BeautifulSoup(res.text, "lxml")
    title_el = soup.select_one("h1.read__title") or soup.select_one("h1.article__title") or soup.select_one("h1")
    judul = normalize_ws(title_el.get_text()) if title_el else normalize_ws(soup.title.get_text() if soup.title else "")
    tanggal_publish = extract_publish_date(soup)
    tags = extract_tags(soup)
    wrapper = soup.select_one(".read__content") or soup.select_one(".article__content") or soup
    ps = wrapper.find_all("p")
    paragraphs = []
    for p in ps:
        txt = p.get_text(" ").strip()
        if not txt:
            continue
        if looks_like_baca_juga(txt):
            continue
        if "KOMPAS.com" in txt and "Download" in txt:
            continue
        paragraphs.append(normalize_ws(txt))
    konten = " ".join(paragraphs)
    return {"judul": judul, "tanggal_publish": tanggal_publish, "tags": tags, "konten": konten}

# ==========================
# Pipeline utama
# ==========================
def run_scrape_travel(output_csv_path, max_pages_per_sub=None):
    ensure_csv(output_csv_path)
    done = load_done_urls(output_csv_path)
    print(f"URL sudah ada di CSV: {len(done)}")

    submenus = discover_menu_links()
    print("Sub-kategori terdeteksi:")
    for href, name in submenus:
        print(f" - {name}: {href}")

    all_urls = OrderedDict()
    for href, name in submenus:
        print(f"\nCrawl sub-kategori: {name}")
        urls = crawl_subcategory_urls(href, max_pages=max_pages_per_sub)
        for u in urls:
            all_urls.setdefault((u, name), None)

    print(f"\nTotal kandidat (URL unik per sub): {len(all_urls)}")

    category = "Travel"
    saved = 0
    for i, ((url, subcat), _) in enumerate(tqdm(all_urls.items(), desc="Scraping", unit="url"), 1):
        if url in done:
            continue
        try:
            data = parse_article(url)
            if not data or not data.get("judul") or not data.get("konten"):
                continue
            row = {
                "url": url,
                "category": category,
                "subcategory": subcat,
                "tanggal_publish": data["tanggal_publish"],
                "judul": data["judul"],
                "tags": data["tags"],
                "konten": data["konten"]
            }
            append_row(output_csv_path, row)
            done.add(url)
            saved += 1
        except KeyboardInterrupt:
            print("\n⛔ Dihentikan manual, progres tersimpan.")
            break
        except Exception as e:
            print(f"\nERROR [{i}] {url} -> {e.__class__.__name__}: {e}")
        finally:
            time.sleep(random.uniform(*REQUEST_DELAY))

    print(f"Selesai. Artikel baru tersimpan: {saved}.")
    print("CSV:", output_csv_path)

# ==========================
# Jalankan
# ==========================
run_scrape_travel(OUTPUT_CSV, max_pages_per_sub=MAX_PAGES_PER_SUB)


URL sudah ada di CSV: 0
Sub-kategori terdeteksi:
 - Travel News: https://travel.kompas.com/travel-news
 - Travel Ideas: https://travel.kompas.com/travel-ideas
 - Hotel Story: https://travel.kompas.com/hotel-story
 - Travelpedia: https://travel.kompas.com/travelpedia
 - Food: https://www.kompas.com/food
 - Ohayo Jepang: https://ohayojepang.kompas.com/
 - Indeks: https://indeks.kompas.com/?site=travel

Crawl sub-kategori: Travel News
  [page=1] +47 URL (total 47)
  [page=2] +20 URL (total 67)
  [page=3] +19 URL (total 86)
  [page=4] +20 URL (total 106)
  [page=5] +20 URL (total 126)
  [page=6] +20 URL (total 146)
  [page=7] +20 URL (total 166)
  [page=8] +19 URL (total 185)
  [page=9] +20 URL (total 205)
  [page=10] +12 URL (total 217)
  [page=11] +0 URL (total 217)
  [page=12] +0 URL (total 217)
  [page=13] +0 URL (total 217)

Crawl sub-kategori: Travel Ideas
  [page=1] +51 URL (total 51)
  [page=2] +17 URL (total 68)
  [page=3] +19 URL (total 87)
  [page=4] +20 URL (total 107)
  [page=

Scraping: 100%|██████████| 1117/1117 [35:29<00:00,  1.91s/url]

Selesai. Artikel baru tersimpan: 1042.
CSV: /content/drive/MyDrive/KompasTravel/kompas_travel_scraped.csv





# Scraping Sains

In [None]:
from google.colab import drive
import os

# Tentukan folder dan file output di Drive
DRIVE_FOLDER = "/content/drive/MyDrive/KompasSains"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

OUTPUT_CSV = os.path.join(DRIVE_FOLDER, "kompas_sains_scraped.csv")
print("Folder Drive:", DRIVE_FOLDER)
print("File output :", OUTPUT_CSV)

Folder Drive: /content/drive/MyDrive/KompasSains
File output : /content/drive/MyDrive/KompasSains/kompas_sains_scraped.csv


In [None]:
# Scraper Kompas Money (menu-aware + subcategory + tags + resumeable)

import os, re, time, csv, random, json
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
from tqdm import tqdm

# ==========================
# Konfigurasi umum
# ==========================
BASE = "https://www.kompas.com/sains"
REQUEST_DELAY = (0.8, 1.6)
TIMEOUT = 20
MAX_PAGES_PER_SUB = None  # None = lanjut sampai 3 halaman kosong berturut-turut
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# ==========================
# Session HTTP dengan retry
# ==========================
from requests.adapters import HTTPAdapter, Retry

def make_session():
    s = requests.Session()
    retries = Retry(
        total=5, connect=5, read=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"])
    )
    s.headers.update(HEADERS)
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.mount("http://", HTTPAdapter(max_retries=retries))
    return s

SESSION = make_session()

# ==========================
# CSV helper
# ==========================
FIELDNAMES = ["url", "category", "subcategory", "tanggal_publish", "judul", "tags", "konten"]

def ensure_csv(path):
    if not os.path.exists(path):
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()

def load_done_urls(path):
    done = set()
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            r = csv.DictReader(f)
            for row in r:
                u = (row.get("url") or "").strip()
                if u:
                    done.add(u)
    return done

def append_row(path, row: dict):
    clean = {k: (row.get(k) or "") for k in FIELDNAMES}
    with open(path, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=FIELDNAMES)
        w.writerow(clean)

# ==========================
# Utilities
# ==========================
def normalize_ws(text):
    return re.sub(r"\s+", " ", (text or "").strip())

def looks_like_baca_juga(text):
    return "baca juga" in (text or "").lower()

def safe_get(url):
    r = SESSION.get(url, timeout=TIMEOUT)
    r.raise_for_status()
    return r

def absolutize(base_url, href):
    if href.startswith("//"):
        return "https:" + href
    if href.startswith("/"):
        parsed = urlparse(base_url)
        return f"{parsed.scheme}://{parsed.netloc}{href}"
    return href

# ==========================
# Dapatkan daftar sub-kategori dari menu
# ==========================
def discover_menu_links():
    found = OrderedDict()
    try:
        r = safe_get(BASE)
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select(".kanalHeader[data-kanal='sains'] .kanalMenu a[href]"):
            name = normalize_ws(a.get_text())
            href = absolutize(BASE, a.get("href","").strip())
            if not href or not name:
                continue
            found[href] = name
    except Exception:
        pass

    # fallback kalau parsing gagal
    SEED = OrderedDict([
        ("https://www.kompas.com/sains/fenomena", "Fenomena"),
        ("https://www.kompas.com/sains/kita", "Kita"),
        ("https://www.kompas.com/sains/oh-begitu", "Oh Begitu"),
        ("https://www.kompas.com/sains/halo-prof", "Halo Prof"),
        ("https://www.kompas.com/sains/baru-jadi-ortu", "Baru Jadi Ortu"),
        ("https://www.kompas.com/sains/prof-cilik", "Halo Prof"),
    ])
    for href, name in SEED.items():
        found.setdefault(href, name)

    return list(found.items())

# ==========================
# Ambil daftar URL artikel per subcategory
# ==========================
def is_article_url(href: str):
    if not href or "kompas.com" not in href:
        return False
    if "/read/" not in href:
        return False
    if any(b in href for b in ["/komentar/", "/copy/"]):
        return False
    return True

def collect_urls_from_list(listing_url):
    try:
        res = safe_get(listing_url)
    except Exception:
        return set()
    soup = BeautifulSoup(res.text, "lxml")
    urls = set()
    for a in soup.select("a[href*='/read/']"):
        href = a.get("href")
        if not href:
            continue
        full = absolutize(listing_url, href)
        if is_article_url(full):
            urls.add(full)
    return urls

def page_url(base, page):
    if page <= 1:
        return base
    parsed = urlparse(base)
    joiner = "&" if parsed.query else "?"
    return f"{base}{joiner}page={page}"

def crawl_subcategory_urls(sub_url, max_pages=None):
    discovered = OrderedDict()
    consecutive_empty = 0
    page = 1
    while True:
        if max_pages is not None and page > max_pages:
            break
        url = page_url(sub_url, page)
        urls = collect_urls_from_list(url)
        before = len(discovered)
        for u in sorted(urls):
            discovered.setdefault(u, None)
        gained = len(discovered) - before
        print(f"  [page={page}] +{gained} URL (total {len(discovered)})")
        time.sleep(random.uniform(*REQUEST_DELAY))
        consecutive_empty = consecutive_empty + 1 if gained == 0 else 0
        if consecutive_empty >= 3:
            break
        page += 1
    return list(discovered.keys())

# ==========================
# Parsing artikel
# ==========================
def extract_publish_date(soup):
    meta = soup.find("meta", attrs={"name": "content_PublishedDate"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    meta2 = soup.select_one('meta[property="article:published_time"]')
    if meta2 and meta2.get("content"):
        return meta2["content"].strip()
    return ""

def extract_tags(soup):
    meta = soup.find("meta", attrs={"name": "content_tags"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    tag_links = soup.select("a.tag__article__link, .tag__item a")
    if tag_links:
        tags = [normalize_ws(a.get_text()) for a in tag_links if a.get_text().strip()]
        return ", ".join(tags)
    return ""

def parse_article(url):
    res = safe_get(url)
    soup = BeautifulSoup(res.text, "lxml")
    title_el = soup.select_one("h1.read__title") or soup.select_one("h1.article__title") or soup.select_one("h1")
    judul = normalize_ws(title_el.get_text()) if title_el else normalize_ws(soup.title.get_text() if soup.title else "")
    tanggal_publish = extract_publish_date(soup)
    tags = extract_tags(soup)
    wrapper = soup.select_one(".read__content") or soup.select_one(".article__content") or soup
    ps = wrapper.find_all("p")
    paragraphs = []
    for p in ps:
        txt = p.get_text(" ").strip()
        if not txt:
            continue
        if looks_like_baca_juga(txt):
            continue
        if "KOMPAS.com" in txt and "Download" in txt:
            continue
        paragraphs.append(normalize_ws(txt))
    konten = " ".join(paragraphs)
    return {"judul": judul, "tanggal_publish": tanggal_publish, "tags": tags, "konten": konten}

# ==========================
# Pipeline utama
# ==========================
def run_scrape_sains(output_csv_path, max_pages_per_sub=None):
    ensure_csv(output_csv_path)
    done = load_done_urls(output_csv_path)
    print(f"URL sudah ada di CSV: {len(done)}")

    submenus = discover_menu_links()
    print("Sub-kategori terdeteksi:")
    for href, name in submenus:
        print(f" - {name}: {href}")

    all_urls = OrderedDict()
    for href, name in submenus:
        print(f"\nCrawl sub-kategori: {name}")
        urls = crawl_subcategory_urls(href, max_pages=max_pages_per_sub)
        for u in urls:
            all_urls.setdefault((u, name), None)

    print(f"\nTotal kandidat (URL unik per sub): {len(all_urls)}")

    category = "Sains"
    saved = 0
    for i, ((url, subcat), _) in enumerate(tqdm(all_urls.items(), desc="Scraping", unit="url"), 1):
        if url in done:
            continue
        try:
            data = parse_article(url)
            if not data or not data.get("judul") or not data.get("konten"):
                continue
            row = {
                "url": url,
                "category": category,
                "subcategory": subcat,
                "tanggal_publish": data["tanggal_publish"],
                "judul": data["judul"],
                "tags": data["tags"],
                "konten": data["konten"]
            }
            append_row(output_csv_path, row)
            done.add(url)
            saved += 1
        except KeyboardInterrupt:
            print("\n⛔ Dihentikan manual, progres tersimpan.")
            break
        except Exception as e:
            print(f"\nERROR [{i}] {url} -> {e.__class__.__name__}: {e}")
        finally:
            time.sleep(random.uniform(*REQUEST_DELAY))

    print(f"Selesai. Artikel baru tersimpan: {saved}.")
    print("CSV:", output_csv_path)

# ==========================
# Jalankan
# ==========================
run_scrape_sains(OUTPUT_CSV, max_pages_per_sub=MAX_PAGES_PER_SUB)


URL sudah ada di CSV: 0
Sub-kategori terdeteksi:
 - Fenomena: https://www.kompas.com/sains/fenomena
 - Kita: https://www.kompas.com/sains/kita
 - Oh Begitu: https://www.kompas.com/sains/oh-begitu
 - Halo Prof: https://www.kompas.com/sains/halo-prof
 - Baru Jadi Ortu: https://www.kompas.com/sains/baru-jadi-ortu
 - Prof Cilik: https://www.kompas.com/sains/prof-cilik
 - Indeks: https://www.kompas.com/sains/indeks

Crawl sub-kategori: Fenomena
  [page=1] +32 URL (total 32)
  [page=2] +0 URL (total 32)
  [page=3] +0 URL (total 32)
  [page=4] +0 URL (total 32)

Crawl sub-kategori: Kita
  [page=1] +36 URL (total 36)
  [page=2] +0 URL (total 36)
  [page=3] +0 URL (total 36)
  [page=4] +0 URL (total 36)

Crawl sub-kategori: Oh Begitu
  [page=1] +35 URL (total 35)
  [page=2] +0 URL (total 35)
  [page=3] +0 URL (total 35)
  [page=4] +0 URL (total 35)

Crawl sub-kategori: Halo Prof
  [page=1] +36 URL (total 36)
  [page=2] +0 URL (total 36)
  [page=3] +0 URL (total 36)
  [page=4] +0 URL (total 36)


Scraping: 100%|██████████| 232/232 [03:54<00:00,  1.01s/url]

Selesai. Artikel baru tersimpan: 131.
CSV: /content/drive/MyDrive/KompasSains/kompas_sains_scraped.csv





# Scraping Bola

In [None]:
from google.colab import drive
import os

# Tentukan folder dan file output di Drive
DRIVE_FOLDER = "/content/drive/MyDrive/KompasBola"
os.makedirs(DRIVE_FOLDER, exist_ok=True)

OUTPUT_CSV = os.path.join(DRIVE_FOLDER, "kompas_bola_scraped.csv")
print("Folder Drive:", DRIVE_FOLDER)
print("File output :", OUTPUT_CSV)

Folder Drive: /content/drive/MyDrive/KompasBola
File output : /content/drive/MyDrive/KompasBola/kompas_bola_scraped.csv


In [None]:
# Scraper Kompas Money (menu-aware + subcategory + tags + resumeable)

import os, re, time, csv, random, json
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
from tqdm import tqdm

# ==========================
# Konfigurasi umum
# ==========================
BASE = "https://bola.kompas.com/"
REQUEST_DELAY = (0.8, 1.6)
TIMEOUT = 20
MAX_PAGES_PER_SUB = None  # None = lanjut sampai 3 halaman kosong berturut-turut
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# ==========================
# Session HTTP dengan retry
# ==========================
from requests.adapters import HTTPAdapter, Retry

def make_session():
    s = requests.Session()
    retries = Retry(
        total=5, connect=5, read=5,
        backoff_factor=1.2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(["GET"])
    )
    s.headers.update(HEADERS)
    s.mount("https://", HTTPAdapter(max_retries=retries))
    s.mount("http://", HTTPAdapter(max_retries=retries))
    return s

SESSION = make_session()

# ==========================
# CSV helper
# ==========================
FIELDNAMES = ["url", "category", "subcategory", "tanggal_publish", "judul", "tags", "konten"]

def ensure_csv(path):
    if not os.path.exists(path):
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
            writer.writeheader()

def load_done_urls(path):
    done = set()
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            r = csv.DictReader(f)
            for row in r:
                u = (row.get("url") or "").strip()
                if u:
                    done.add(u)
    return done

def append_row(path, row: dict):
    clean = {k: (row.get(k) or "") for k in FIELDNAMES}
    with open(path, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=FIELDNAMES)
        w.writerow(clean)

# ==========================
# Utilities
# ==========================
def normalize_ws(text):
    return re.sub(r"\s+", " ", (text or "").strip())

def looks_like_baca_juga(text):
    return "baca juga" in (text or "").lower()

def safe_get(url):
    r = SESSION.get(url, timeout=TIMEOUT)
    r.raise_for_status()
    return r

def absolutize(base_url, href):
    if href.startswith("//"):
        return "https:" + href
    if href.startswith("/"):
        parsed = urlparse(base_url)
        return f"{parsed.scheme}://{parsed.netloc}{href}"
    return href

# ==========================
# Dapatkan daftar sub-kategori dari menu
# ==========================
def discover_menu_links():
    found = OrderedDict()
    try:
        r = safe_get(BASE)
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select(".kanalHeader[data-kanal='bola'] .kanalMenu a[href]"):
            name = normalize_ws(a.get_text())
            href = absolutize(BASE, a.get("href","").strip())
            if not href or not name:
                continue
            found[href] = name
    except Exception:
        pass

    #fallback kalau parsing gagal
    SEED = OrderedDict([
        ("https://bola.kompas.com/timnas-indonesia", "Timnas Indonesia"),
        ("https://bola.kompas.com/liga-indonesia", "Liga Indonesia"),
        ("https://bola.kompas.com/liga-inggris", "Liga Inggris"),
        ("https://bola.kompas.com/liga-italia", "Liga Italia"),
        ("https://bola.kompas.com/liga-champions", "Liga Champions"),
        ("https://www.kompas.com/sports", "Sports"),
        ("https://www.kompas.com/motogp", "Motogp"),
        ("https://www.kompas.com/badminton", "Badminton"),
    ])
    for href, name in SEED.items():
        found.setdefault(href, name)

    return list(found.items())

# ==========================
# Ambil daftar URL artikel per subcategory
# ==========================
def is_article_url(href: str):
    if not href or "kompas.com" not in href:
        return False
    if "/read/" not in href:
        return False
    if any(b in href for b in ["/komentar/", "/copy/"]):
        return False
    return True

def collect_urls_from_list(listing_url):
    try:
        res = safe_get(listing_url)
    except Exception:
        return set()
    soup = BeautifulSoup(res.text, "lxml")
    urls = set()
    for a in soup.select("a[href*='/read/']"):
        href = a.get("href")
        if not href:
            continue
        full = absolutize(listing_url, href)
        if is_article_url(full):
            urls.add(full)
    return urls

def page_url(base, page):
    if page <= 1:
        return base
    parsed = urlparse(base)
    joiner = "&" if parsed.query else "?"
    return f"{base}{joiner}page={page}"

def crawl_subcategory_urls(sub_url, max_pages=None):
    discovered = OrderedDict()
    consecutive_empty = 0
    page = 1
    while True:
        if max_pages is not None and page > max_pages:
            break
        url = page_url(sub_url, page)
        urls = collect_urls_from_list(url)
        before = len(discovered)
        for u in sorted(urls):
            discovered.setdefault(u, None)
        gained = len(discovered) - before
        print(f"  [page={page}] +{gained} URL (total {len(discovered)})")
        time.sleep(random.uniform(*REQUEST_DELAY))
        consecutive_empty = consecutive_empty + 1 if gained == 0 else 0
        if consecutive_empty >= 3:
            break
        page += 1
    return list(discovered.keys())

# ==========================
# Parsing artikel
# ==========================
def extract_publish_date(soup):
    meta = soup.find("meta", attrs={"name": "content_PublishedDate"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    meta2 = soup.select_one('meta[property="article:published_time"]')
    if meta2 and meta2.get("content"):
        return meta2["content"].strip()
    return ""

def extract_tags(soup):
    meta = soup.find("meta", attrs={"name": "content_tags"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    tag_links = soup.select("a.tag__article__link, .tag__item a")
    if tag_links:
        tags = [normalize_ws(a.get_text()) for a in tag_links if a.get_text().strip()]
        return ", ".join(tags)
    return ""

def parse_article(url):
    res = safe_get(url)
    soup = BeautifulSoup(res.text, "lxml")
    title_el = soup.select_one("h1.read__title") or soup.select_one("h1.article__title") or soup.select_one("h1")
    judul = normalize_ws(title_el.get_text()) if title_el else normalize_ws(soup.title.get_text() if soup.title else "")
    tanggal_publish = extract_publish_date(soup)
    tags = extract_tags(soup)
    wrapper = soup.select_one(".read__content") or soup.select_one(".article__content") or soup
    ps = wrapper.find_all("p")
    paragraphs = []
    for p in ps:
        txt = p.get_text(" ").strip()
        if not txt:
            continue
        if looks_like_baca_juga(txt):
            continue
        if "KOMPAS.com" in txt and "Download" in txt:
            continue
        paragraphs.append(normalize_ws(txt))
    konten = " ".join(paragraphs)
    return {"judul": judul, "tanggal_publish": tanggal_publish, "tags": tags, "konten": konten}

# ==========================
# Pipeline utama
# ==========================
def run_scrape_bola(output_csv_path, max_pages_per_sub=None):
    ensure_csv(output_csv_path)
    done = load_done_urls(output_csv_path)
    print(f"URL sudah ada di CSV: {len(done)}")

    submenus = discover_menu_links()
    print("Sub-kategori terdeteksi:")
    for href, name in submenus:
        print(f" - {name}: {href}")

    all_urls = OrderedDict()
    for href, name in submenus:
        print(f"\nCrawl sub-kategori: {name}")
        urls = crawl_subcategory_urls(href, max_pages=max_pages_per_sub)
        for u in urls:
            all_urls.setdefault((u, name), None)

    print(f"\nTotal kandidat (URL unik per sub): {len(all_urls)}")

    category = "Bola"
    saved = 0
    for i, ((url, subcat), _) in enumerate(tqdm(all_urls.items(), desc="Scraping", unit="url"), 1):
        if url in done:
            continue
        try:
            data = parse_article(url)
            if not data or not data.get("judul") or not data.get("konten"):
                continue
            row = {
                "url": url,
                "category": category,
                "subcategory": subcat,
                "tanggal_publish": data["tanggal_publish"],
                "judul": data["judul"],
                "tags": data["tags"],
                "konten": data["konten"]
            }
            append_row(output_csv_path, row)
            done.add(url)
            saved += 1
        except KeyboardInterrupt:
            print("\n⛔ Dihentikan manual, progres tersimpan.")
            break
        except Exception as e:
            print(f"\nERROR [{i}] {url} -> {e.__class__.__name__}: {e}")
        finally:
            time.sleep(random.uniform(*REQUEST_DELAY))

    print(f"Selesai. Artikel baru tersimpan: {saved}.")
    print("CSV:", output_csv_path)

# ==========================
# Jalankan
# ==========================
run_scrape_bola(OUTPUT_CSV, max_pages_per_sub=MAX_PAGES_PER_SUB)


URL sudah ada di CSV: 0
Sub-kategori terdeteksi:
 - Timnas Indonesia: https://bola.kompas.com/timnas-indonesia
 - Liga Indonesia: https://bola.kompas.com/liga-indonesia
 - Liga Inggris: https://bola.kompas.com/liga-inggris
 - Liga Italia: https://bola.kompas.com/liga-italia
 - Liga Champions: https://bola.kompas.com/liga-champions
 - Klasemen: https://bola.kompas.com/klasemen
 - Liga Indonesia: https://bola.kompas.com/klasemen/liga-indonesia
 - Liga Inggris: https://bola.kompas.com/klasemen/liga-inggris
 - Liga Italia: https://bola.kompas.com/klasemen/liga-italia
 - Liga Champions: https://bola.kompas.com/klasemen/liga-champions
 - Sports: https://www.kompas.com/sports
 - Motogp: https://www.kompas.com/motogp
 - Badminton: https://www.kompas.com/badminton
 - Indeks: https://indeks.kompas.com/?site=bola

Crawl sub-kategori: Timnas Indonesia
  [page=1] +30 URL (total 30)
  [page=2] +0 URL (total 30)
  [page=3] +0 URL (total 30)
  [page=4] +0 URL (total 30)

Crawl sub-kategori: Liga Indon

Scraping: 100%|██████████| 365/365 [06:56<00:00,  1.14s/url]

Selesai. Artikel baru tersimpan: 218.
CSV: /content/drive/MyDrive/KompasBola/kompas_bola_scraped.csv





# Preprocessing Money

In [None]:
import pandas as pd
import re
import os

In [None]:
file_id = "1nk0y7u_usHXum3HISicsVuCMyqDrAzVT"
url = f"https://drive.google.com/uc?id={file_id}"

df = pd.read_csv(url)
df.head()

Unnamed: 0,url,category,subcategory,tanggal_publish,judul,tags,konten
0,https://money.kompas.com/read/2019/08/01/12421...,Money,Ekbis,2019-08-01 12:42:15,Kompas.com Kembali Jadi Pemenang Kategori Medi...,"media online, Kompas.com, Superbrands","JAKARTA, KOMPAS.com — Superbrands, lembaga arb..."
1,https://money.kompas.com/read/2025/11/09/08100...,Money,Ekbis,2025-11-09 21:53:45,"Warren Buffett Siap Pamit, Kas Berkshire Sentu...","Berkshire Hathaway, warren buffet, kas berkshi...","KOMPAS.com – Saham Berkshire Hathaway naik 4,5..."
2,https://money.kompas.com/read/2025/11/09/22024...,Money,Ekbis,2025-11-09 22:02:45,"Emiten Ramai-ramai Lakukan Buyback Saham, Apa ...","pasar modal, aksi korporasi, investasi saham, ...","JAKARTA, KOMPAS.com — Sejumlah emiten melakuka..."
3,https://money.kompas.com/read/2025/11/10/06280...,Money,Ekbis,2025-11-10 06:28:00,"Soal Redenominasi Rupiah, Pemerintah Perlu Bua...","redenominasi, uang rupiah, RUU Redenominasi, r...","JAKARTA, KOMPAS.com - Chief Economist Permata ..."
4,https://money.kompas.com/read/2025/11/10/06572...,Money,Ekbis,2025-11-10 06:57:27,"Wall Street Sepekan Ini Diproyeksi Goyang, Inv...","Wall Street, bursa saham, pasar saham","NEW YORK, KOMPAS.com - Bursa saham Amerika Ser..."


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              354 non-null    object
 1   category         354 non-null    object
 2   subcategory      354 non-null    object
 3   tanggal_publish  354 non-null    object
 4   judul            354 non-null    object
 5   tags             354 non-null    object
 6   konten           354 non-null    object
dtypes: object(7)
memory usage: 19.5+ KB


In [None]:
df.rename(columns={
    'judul': 'judul_berita',
    'konten': 'konten_berita'
}, inplace=True)


In [None]:
df.columns.tolist()

['url',
 'category',
 'subcategory',
 'tanggal_publish',
 'judul_berita',
 'tags',
 'konten_berita']

## Null Handling

In [None]:
df['konten_berita'].isna().sum()

np.int64(0)

## Duplicate

In [None]:
df.duplicated().sum()

np.int64(0)

## Perbaiki Penulisan

In [None]:
def clean_text(text):
    # mengganti newline dengan spasi
    text = text.replace("\n", " ")
    # hapus spasi berlebih
    text = re.sub(r'\s+', ' ', text).strip()
    # spasi setelah tanda baca jika belum ada
    text = re.sub(r'([.,;:!?])(?!\s)', r'\1 ', text)
    # domain
    #text = re.sub(r'\.(?!\s|[a-zA-Z])', r'. ', text)
    # hapus spasi berlebih setelah penambahan
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  JAKARTA, KOMPAS. com — Superbrands, lembaga ar...
1  KOMPAS. com – Saham Berkshire Hathaway naik 4,...
2  JAKARTA, KOMPAS. com — Sejumlah emiten melakuk...
3  JAKARTA, KOMPAS. com - Chief Economist Permata...
4  NEW YORK, KOMPAS. com - Bursa saham Amerika Se...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Kompas. com Kembali Jadi Pemenang Kategori Med...
1  Warren Buffett Siap Pamit, Kas Berkshire Sentu...
2  Emiten Ramai-ramai Lakukan Buyback Saham, Apa ...
3  Soal Redenominasi Rupiah, Pemerintah Perlu Bua...
4  Wall Street Sepekan Ini Diproyeksi Goyang, Inv...


## Hapus Atribut Teks Tidak Penting

In [None]:
import re

def clean_kompas_text(text: str) -> str:
    # ubah kompas. com menjadi kompas.com
    text = re.sub(r'(?i)kompas\s*\.\s*com', 'kompas.com', text)

    # hapus lokasi di awal kalimat seperti "JAKARTA, —", "NEW YORK, -", dll
    text = re.sub(
        r'^[A-ZÀ-Ý][A-ZÀ-Ý\s\(\)\/\-\.]*,\s*[—–-]\s*',
        '',
        text.strip()
    )

    # hapus kompas doc
    text = re.sub(r'(?is)kompas\.com/\s*dok\.[^.]*\.', '', text)

    # hapus kompas.com
    text = re.sub(r'(?i)(?:[A-Za-z]*\s*[-–\\\/]*)?kompas\.com[\\/A-Za-z0-9\-]*', '', text)

    # gabungan
    text = re.sub(r'^[A-Za-z\s]*?[a-z](?=[A-Z])', '', text)

    # hapus baca, baca juga, dan kombinasinya
    #text = re.sub(r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n])', '', text)
    text = re.sub(
    r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n]|$)',
    '',
    text)

    # mention
    text = re.sub(r'\(@[^)]+\)', '', text)
    text = re.sub(r'@\w+', '', text)

    # kredit kompas.com\
    text = re.sub(r'(?i)\bkompas\.com/[\w\s.]+?[a-z](?=\s*[A-Z]{2,}|[A-Z][a-z]*[A-Z])', '', text)

    # hapus .com yang mungkin tersisa
    text = re.sub(r'\b\w*\.com\b', '', text, flags=re.IGNORECASE)

    # tanda baca dan spasi
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    text = re.sub(r'([.,!?;:])([^\s"”’])', r'\1 \2', text)
    text = re.sub(r'(?<=\w)\(', r' (', text)
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'\)(?=[A-Za-z0-9])', r') ', text)
    text = re.sub(r'\s+(")', r'\1', text)
    text = re.sub(r'\.\s*"', '. "', text)

    # tanda baca berulang
    text = re.sub(r'([.,!?;:])(\s*\1)+', r'\1', text)
    text = re.sub(r'([.,!?;:])\s*[.,!?;:]+', r'\1', text)
    text = re.sub(r'"\s*"', '"', text)
    text = re.sub(r'""+', '"', text)

    # non huruf awal karakter
    text = re.sub(r'^[^A-Za-z0-9]+', '', text)

    # spasi ganda
    text = re.sub(r'\s{2,}', ' ', text)

    # trim spasi
    text = text.strip()

    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_kompas_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  Superbrands, lembaga arbiter internasional unt...
1  Saham Berkshire Hathaway naik 4, 5 persen pada...
2  Sejumlah emiten melakukan aksi korporasi berup...
3  Chief Economist Permata Bank, Josua Pardede, m...
4  Bursa saham Amerika Serikat (AS) alias Wall St...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_kompas_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Kembali Jadi Pemenang Kategori Media Online Te...
1  Warren Buffett Siap Pamit, Kas Berkshire Sentu...
2  Emiten Ramai-ramai Lakukan Buyback Saham, Apa ...
3  Soal Redenominasi Rupiah, Pemerintah Perlu Bua...
4  Wall Street Sepekan Ini Diproyeksi Goyang, Inv...


In [None]:
df["judul_berita"] = df["judul_berita"].apply(
    lambda x: x.strip() + "." if not re.search(r'[.!?…]$', str(x).strip()) else x.strip()
)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              354 non-null    object
 1   category         354 non-null    object
 2   subcategory      354 non-null    object
 3   tanggal_publish  354 non-null    object
 4   judul_berita     354 non-null    object
 5   tags             354 non-null    object
 6   konten_berita    354 non-null    object
dtypes: object(7)
memory usage: 19.5+ KB


In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
df.to_csv("kompas_money_cleaned.csv", index=False)

# Preprocessing Tekno

In [None]:
file_id = "1W7Tp9nhryglg7I0FIEA5nXg2dQSgwvLx"
url = f"https://drive.google.com/uc?id={file_id}"

df = pd.read_csv(url)
df.head()

Unnamed: 0,url,category,subcategory,tanggal_publish,judul,tags,konten
0,http://tekno.kompas.com/read/2025/10/13/112200...,Tekno,Apps & OS,2025-10-13 11:22:00,"Menjajal Dyson OnTrac, Headphone Futuristik Ha...","musik, headphone, dyson, ANC, Dyson OnTrac, he...",Ringkasan artikel: KOMPAS.com - Dyson selama i...
1,http://tekno.kompas.com/read/2025/10/14/160300...,Tekno,Apps & OS,2025-10-14 16:03:00,"Menjajal Samsung Galaxy A17 4G: Kamera Stabil,...","Samsung, hands on, Samsung Galaxy A17 4G, Harg...",KOMPAS.com – Samsung resmi meluncurkan Galaxy ...
2,http://tekno.kompas.com/read/2025/10/26/130300...,Tekno,Apps & OS,2025-10-26 13:03:00,Hands-on Samsung Galaxy Tab A11: Tablet Rp 2 J...,"Samsung, Indonesia, hands on, samsung Galaxy T...","Rangkuman berita: KOMPAS.com - Pekan lalu, Sam..."
3,https://money.kompas.com/read/2019/08/01/12421...,Tekno,Apps & OS,2019-08-01 12:42:15,Kompas.com Kembali Jadi Pemenang Kategori Medi...,"media online, Kompas.com, Superbrands","JAKARTA, KOMPAS.com — Superbrands, lembaga arb..."
4,https://tekno.kompas.com/read/2025/06/14/18070...,Tekno,Apps & OS,2025-11-04 14:50:47,Mengapa HP Android dan iPhone Wajib Restart Ru...,"Samsung, restart, restart iPhone",KOMPAS.com – Vendor ponsel Samsung menganjurka...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401 entries, 0 to 400
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              401 non-null    object
 1   category         401 non-null    object
 2   subcategory      401 non-null    object
 3   tanggal_publish  401 non-null    object
 4   judul            401 non-null    object
 5   tags             401 non-null    object
 6   konten           401 non-null    object
dtypes: object(7)
memory usage: 22.1+ KB


In [None]:
df.rename(columns={
    'judul': 'judul_berita',
    'konten': 'konten_berita'
}, inplace=True)

In [None]:
df.columns.tolist()

['url',
 'category',
 'subcategory',
 'tanggal_publish',
 'judul_berita',
 'tags',
 'konten_berita']

## Null Handling

In [None]:
df['konten_berita'].isna().sum()

np.int64(0)

## Duplicate

In [None]:
df.duplicated().sum()

np.int64(0)

## Perbaiki Penulisan

In [None]:
def clean_text(text):
    # mengganti newline dengan spasi
    text = text.replace("\n", " ")
    # hapus spasi berlebih
    text = re.sub(r'\s+', ' ', text).strip()
    # spasi setelah tanda baca jika belum ada
    text = re.sub(r'([.,;:!?])(?!\s)', r'\1 ', text)
    # domain
    #text = re.sub(r'\.(?!\s|[a-zA-Z])', r'. ', text)
    # hapus spasi berlebih setelah penambahan
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  Ringkasan artikel: KOMPAS. com - Dyson selama ...
1  KOMPAS. com – Samsung resmi meluncurkan Galaxy...
2  Rangkuman berita: KOMPAS. com - Pekan lalu, Sa...
3  JAKARTA, KOMPAS. com — Superbrands, lembaga ar...
4  KOMPAS. com – Vendor ponsel Samsung menganjurk...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Menjajal Dyson OnTrac, Headphone Futuristik Ha...
1  Menjajal Samsung Galaxy A17 4G: Kamera Stabil,...
2  Hands-on Samsung Galaxy Tab A11: Tablet Rp 2 J...
3  Kompas. com Kembali Jadi Pemenang Kategori Med...
4  Mengapa HP Android dan iPhone Wajib Restart Ru...


## Hapus Atribut Teks Tidak Penting

In [None]:
import re

def clean_kompas_text(text: str) -> str:
    # ubah kompas. com menjadi kompas.com
    text = re.sub(r'(?i)kompas\s*\.\s*com', 'kompas.com', text)

    # hapus lokasi di awal kalimat seperti "JAKARTA, —", "NEW YORK, -", dll
    text = re.sub(
    r'^\s*[A-ZÀ-Ý][A-ZÀ-Ý\s\(\)\/\-\.]*,\s*[–—-]?\s*',
    '',
    text.strip()
    )

    # hapus frasa "Ringkasan berita", "Ringkasan artikel", "Rangkuman berita"
    text = re.sub(
    r'(?i)^(ringkasan|rangkuman)\s+(berita|artikel)\s*[:：–—-]\s*',
    '',
    text.strip()
    )

    # hapus kompas doc
    text = re.sub(r'(?is)kompas\.com/\s*dok\.[^.]*\.', '', text)

    # hapus kompas.com
    text = re.sub(r'(?i)(?:[A-Za-z]*\s*[-–\\\/]*)?kompas\.com[\\/A-Za-z0-9\-]*', '', text)

    # gabungan
    text = re.sub(r'^[A-Za-z\s]*?[a-z](?=[A-Z])', '', text)

    # hapus baca, baca juga, dan kombinasinya
    #text = re.sub(r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n])', '', text)
    text = re.sub(
    r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n]|$)',
    '',
    text)

    # mention
    text = re.sub(r'\(@[^)]+\)', '', text)
    text = re.sub(r'@\w+', '', text)

    # kredit kompas.com\
    text = re.sub(r'(?i)\bkompas\.com/[\w\s.]+?[a-z](?=\s*[A-Z]{2,}|[A-Z][a-z]*[A-Z])', '', text)

    # hapus .com yang mungkin tersisa
    text = re.sub(r'\b\w*\.com\b', '', text, flags=re.IGNORECASE)

    # tanda baca dan spasi
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    text = re.sub(r'([.,!?;:])([^\s"”’])', r'\1 \2', text)
    text = re.sub(r'(?<=\w)\(', r' (', text)
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'\)(?=[A-Za-z0-9])', r') ', text)
    text = re.sub(r'\s+(")', r'\1', text)
    text = re.sub(r'\.\s*"', '. "', text)

    # tanda baca berulang
    text = re.sub(r'([.,!?;:])(\s*\1)+', r'\1', text)
    text = re.sub(r'([.,!?;:])\s*[.,!?;:]+', r'\1', text)
    text = re.sub(r'"\s*"', '"', text)
    text = re.sub(r'""+', '"', text)

    # non huruf awal karakter
    text = re.sub(r'^[^A-Za-z0-9]+', '', text)

    # spasi ganda
    text = re.sub(r'\s{2,}', ' ', text)

    # trim spasi
    text = text.strip()

    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_kompas_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  Dyson selama ini dikenal dengan produk-produk ...
1  Samsung resmi meluncurkan Galaxy A17 4G di Ind...
2  Pekan lalu, Samsung resmi meluncurkan tablet e...
3  Superbrands, lembaga arbiter internasional unt...
4  Vendor ponsel Samsung menganjurkan pengguna un...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_kompas_text)
print(df[['judul_berita']].head())

                                        judul_berita
0         Trac, Headphone Futuristik Harga Rp10 Juta
1  Menjajal Samsung Galaxy A17 4G: Kamera Stabil,...
2  Hands-on Samsung Galaxy Tab A11: Tablet Rp 2 J...
3  Kembali Jadi Pemenang Kategori Media Online Te...
4                         Phone Wajib Restart Rutin?


In [None]:
df["judul_berita"] = df["judul_berita"].apply(
    lambda x: x.strip() + "." if not re.search(r'[.!?…]$', str(x).strip()) else x.strip()
)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401 entries, 0 to 400
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              401 non-null    object
 1   category         401 non-null    object
 2   subcategory      401 non-null    object
 3   tanggal_publish  401 non-null    object
 4   judul_berita     401 non-null    object
 5   tags             401 non-null    object
 6   konten_berita    401 non-null    object
dtypes: object(7)
memory usage: 22.1+ KB


In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
df.to_csv("kompas_tekno_cleaned.csv", index=False)

# Preprocessing Otomotif

In [None]:
file_id = "1cvmmcIAI6vkInwqIUvu9vyai-g1JiUF4"
url = f"https://drive.google.com/uc?id={file_id}"

df = pd.read_csv(url)
df.head()

Unnamed: 0,url,category,subcategory,tanggal_publish,judul,tags,konten
0,https://money.kompas.com/read/2019/08/01/12421...,Otomotif,News,2019-08-01 12:42:15,Kompas.com Kembali Jadi Pemenang Kategori Medi...,"media online, Kompas.com, Superbrands","JAKARTA, KOMPAS.com — Superbrands, lembaga arb..."
1,https://otomotif.kompas.com/read/2025/11/09/20...,Otomotif,News,2025-11-09 20:45:28,"Hasil MotoGP Portugal 2025: Bezzecchi Juara, M...","MotoGP, balap motor, MotoGP Portugal, motogp 2...","JAKARTA, KOMPAS.com - MotoGP Portugal 2025 yan..."
2,https://otomotif.kompas.com/read/2025/11/09/21...,Otomotif,News,2025-11-09 21:10:40,Klasemen MotoGP 2025 Usai GP Portugal: Duo Már...,"MotoGP, klasemen, MotoGP Portugal","JAKARTA, KOMPAS.com - Marco Bezzecchi tampil g..."
3,https://otomotif.kompas.com/read/2025/11/10/07...,Otomotif,News,2025-11-10 07:42:00,"Pemulihan Lancar, Marc Marquez Segera Kembali ...","MotoGP, Marc Marquez, Ducati, Marquez","JAKARTA, KOMPAS.com – Manajer, Marc Marquez ya..."
4,https://otomotif.kompas.com/read/2025/11/10/08...,Otomotif,News,2025-11-10 08:42:00,Tantangan Suzuki Menjual Satria Pro di Tengah ...,"motor matik, Satria F150, Teuku Agha, Satria Pro","BOGOR, KOMPAS.com - Suzuki Indomobil Sales (SI..."


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              362 non-null    object
 1   category         362 non-null    object
 2   subcategory      362 non-null    object
 3   tanggal_publish  354 non-null    object
 4   judul            362 non-null    object
 5   tags             362 non-null    object
 6   konten           362 non-null    object
dtypes: object(7)
memory usage: 19.9+ KB


In [None]:
df.rename(columns={
    'judul': 'judul_berita',
    'konten': 'konten_berita'
}, inplace=True)

In [None]:
df.columns.tolist()

['url',
 'category',
 'subcategory',
 'tanggal_publish',
 'judul_berita',
 'tags',
 'konten_berita']

## Null Handling

In [None]:
df['konten_berita'].isna().sum()

np.int64(0)

## Duplicate

In [None]:
df.duplicated().sum()

np.int64(0)

## Perbaiki Penulisan

In [None]:
def clean_text(text):
    # mengganti newline dengan spasi
    text = text.replace("\n", " ")
    # hapus spasi berlebih
    text = re.sub(r'\s+', ' ', text).strip()
    # spasi setelah tanda baca jika belum ada
    text = re.sub(r'([.,;:!?])(?!\s)', r'\1 ', text)
    # domain
    #text = re.sub(r'\.(?!\s|[a-zA-Z])', r'. ', text)
    # hapus spasi berlebih setelah penambahan
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  JAKARTA, KOMPAS. com — Superbrands, lembaga ar...
1  JAKARTA, KOMPAS. com - MotoGP Portugal 2025 ya...
2  JAKARTA, KOMPAS. com - Marco Bezzecchi tampil ...
3  JAKARTA, KOMPAS. com – Manajer, Marc Marquez y...
4  BOGOR, KOMPAS. com - Suzuki Indomobil Sales (S...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Kompas. com Kembali Jadi Pemenang Kategori Med...
1  Hasil MotoGP Portugal 2025: Bezzecchi Juara, M...
2  Klasemen MotoGP 2025 Usai GP Portugal: Duo Már...
3  Pemulihan Lancar, Marc Marquez Segera Kembali ...
4  Tantangan Suzuki Menjual Satria Pro di Tengah ...


## Hapus Atribut Teks Tidak Penting

In [None]:
import re

def clean_kompas_text(text: str) -> str:
    # ubah "kompas. com" menjadi "kompas.com"
    text = re.sub(r'(?i)\bkompas\s*\.\s*com\b', 'kompas.com', text.strip())

    # hapus jika kompas.com muncul di awal kalimat
    text = re.sub(r'(?i)^\s*kompas\.com[:,\-\s]*', '', text.strip())

    # hapus lokasi di awal kalimat seperti "JAKARTA, —", "NEW YORK, -", dll
    text = re.sub(
    r'^\s*[A-ZÀ-Ý][A-ZÀ-Ý\s\(\)\/\-\.]*,\s*[–—-]?\s*',
    '',
    text.strip()
    )

    # hapus frasa "Ringkasan berita", "Ringkasan artikel", "Rangkuman berita"
    text = re.sub(
    r'(?i)^(ringkasan|rangkuman)\s+(berita|artikel)\s*[:：–—-]\s*',
    '',
    text.strip()
    )

    # hapus kompas doc
    text = re.sub(r'(?is)kompas\.com/\s*dok\.[^.]*\.', '', text)

    # hapus kompas.com
    text = re.sub(r'(?i)(?:[A-Za-z]*\s*[-–\\\/]*)?kompas\.com[\\/A-Za-z0-9\-]*', '', text)

    # gabungan
    text = re.sub(r'^[A-Za-z\s]*?[a-z](?=[A-Z])', '', text)

    # hapus baca, baca juga, dan kombinasinya
    #text = re.sub(r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n])', '', text)
    text = re.sub(
    r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n]|$)',
    '',
    text)

    # mention
    text = re.sub(r'\(@[^)]+\)', '', text)
    text = re.sub(r'@\w+', '', text)

    # kredit kompas.com\
    text = re.sub(r'(?i)\bkompas\.com/[\w\s.]+?[a-z](?=\s*[A-Z]{2,}|[A-Z][a-z]*[A-Z])', '', text)

    # hapus .com yang mungkin tersisa
    text = re.sub(r'\b\w*\.com\b', '', text, flags=re.IGNORECASE)

    # tanda baca dan spasi
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    text = re.sub(r'([.,!?;:])([^\s"”’])', r'\1 \2', text)
    text = re.sub(r'(?<=\w)\(', r' (', text)
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'\)(?=[A-Za-z0-9])', r') ', text)
    text = re.sub(r'\s+(")', r'\1', text)
    text = re.sub(r'\.\s*"', '. "', text)

    # tanda baca berulang
    text = re.sub(r'([.,!?;:])(\s*\1)+', r'\1', text)
    text = re.sub(r'([.,!?;:])\s*[.,!?;:]+', r'\1', text)
    text = re.sub(r'"\s*"', '"', text)
    text = re.sub(r'""+', '"', text)

    # non huruf awal karakter
    text = re.sub(r'^[^A-Za-z0-9]+', '', text)

    # spasi ganda
    text = re.sub(r'\s{2,}', ' ', text)

    # trim spasi
    text = text.strip()

    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_kompas_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  Superbrands, lembaga arbiter internasional unt...
1  MotoGP Portugal 2025 yang digelar di Sirkuit A...
2  Marco Bezzecchi tampil gemilang di Grand Prix ...
3  Manajer, Marc Marquez yaitu Jaime Martinez men...
4  Suzuki Indomobil Sales (SIS) resmi meluncurkan...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_kompas_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Kembali Jadi Pemenang Kategori Media Online Te...
1   GP Portugal 2025: Bezzecchi Juara, Marquez Kedua
2  GP 2025 Usai GP Portugal: Duo Márquez di Posis...
3  Pemulihan Lancar, Marc Marquez Segera Kembali ...
4  Tantangan Suzuki Menjual Satria Pro di Tengah ...


In [None]:
df["judul_berita"] = df["judul_berita"].apply(
    lambda x: x.strip() + "." if not re.search(r'[.!?…]$', str(x).strip()) else x.strip()
)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              362 non-null    object
 1   category         362 non-null    object
 2   subcategory      362 non-null    object
 3   tanggal_publish  354 non-null    object
 4   judul_berita     362 non-null    object
 5   tags             362 non-null    object
 6   konten_berita    362 non-null    object
dtypes: object(7)
memory usage: 19.9+ KB


In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
df.to_csv("kompas_otomotif_cleaned.csv", index=False)

# Preprocessing Nusaraya

In [None]:
file_id = "1YLosd1sCRZJAWIvzGfreHMeEXomdpTk7"
url = f"https://drive.google.com/uc?id={file_id}"

df = pd.read_csv(url)
df.head()

Unnamed: 0,url,category,subcategory,tanggal_publish,judul,tags,konten
0,http://medan.kompas.com/read/2025/11/09/125603...,Nusaraya,Sumatera Utara,2025-11-09 12:56:03,Detik-detik Wanita di Deli Serdang Tewas Usai ...,"kriminal, bunuh diri, Sumatera Utara, pengania...","MEDAN, KOMPAS.com - Seorang perempuan berinisi..."
1,http://medan.kompas.com/read/2025/11/09/193503...,Nusaraya,Sumatera Utara,2025-11-09 19:35:03,"Datangi Pusat Pasar Medan, Menteri PU: Bocor S...","Medan, sumut, Menteri PU Dody Hanggodo, pasar ...","MEDAN,KOMPAS.com- Menteri Pekerjaan Umum (PU) ..."
2,http://medan.kompas.com/read/2025/11/09/201230...,Nusaraya,Sumatera Utara,2025-11-09 20:12:30,Wanita Penagih Utang di Medan Dikeroyok Keluar...,"penagih utang medan dikeroyok, video penagih u...","MEDAN, KOMPAS.com- Seorang pegawai koperasi di..."
3,http://medan.kompas.com/read/2025/11/10/052336...,Nusaraya,Sumatera Utara,2025-11-10 5:23:36,Siswa di Nias Lewati Sungai ke Sekolah karena ...,"Menteri PU, pembangunan jembatan, Bupati Nias,...","MEDAN, KOMPAS.com - Menteri Pekerjaan Umum (PU..."
4,http://medan.kompas.com/read/2025/11/10/053000...,Nusaraya,Sumatera Utara,2025-11-10 5:30:00,"Firasat Driver Ojol Benar, Tangannya sampai Be...","Medan, sumut, paket narkotika driver ojol medan","MEDAN, KOMPAS.com – Seorang driver ojek online..."


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3777 entries, 0 to 3776
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              3777 non-null   object
 1   category         3777 non-null   object
 2   subcategory      3777 non-null   object
 3   tanggal_publish  3777 non-null   object
 4   judul            3777 non-null   object
 5   tags             3774 non-null   object
 6   konten           3777 non-null   object
dtypes: object(7)
memory usage: 206.7+ KB


In [None]:
df.rename(columns={
    'judul': 'judul_berita',
    'konten': 'konten_berita'
}, inplace=True)

In [None]:
df.columns.tolist()

['url',
 'category',
 'subcategory',
 'tanggal_publish',
 'judul_berita',
 'tags',
 'konten_berita']

## Null Handling

In [None]:
df['konten_berita'].isna().sum()

np.int64(0)

## Duplicate

In [None]:
df.duplicated().sum()

np.int64(0)

## Perbaiki Penulisan

In [None]:
def clean_text(text):
    # mengganti newline dengan spasi
    text = text.replace("\n", " ")
    # hapus spasi berlebih
    text = re.sub(r'\s+', ' ', text).strip()
    # spasi setelah tanda baca jika belum ada
    text = re.sub(r'([.,;:!?])(?!\s)', r'\1 ', text)
    # domain
    #text = re.sub(r'\.(?!\s|[a-zA-Z])', r'. ', text)
    # hapus spasi berlebih setelah penambahan
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  MEDAN, KOMPAS. com - Seorang perempuan berinis...
1  MEDAN, KOMPAS. com- Menteri Pekerjaan Umum (PU...
2  MEDAN, KOMPAS. com- Seorang pegawai koperasi d...
3  MEDAN, KOMPAS. com - Menteri Pekerjaan Umum (P...
4  MEDAN, KOMPAS. com – Seorang driver ojek onlin...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Detik-detik Wanita di Deli Serdang Tewas Usai ...
1  Datangi Pusat Pasar Medan, Menteri PU: Bocor S...
2  Wanita Penagih Utang di Medan Dikeroyok Keluar...
3  Siswa di Nias Lewati Sungai ke Sekolah karena ...
4  Firasat Driver Ojol Benar, Tangannya sampai Be...


## Hapus Atribut Teks Tidak Penting

In [None]:
import re

def clean_kompas_text(text: str) -> str:
    # ubah kompas. com menjadi kompas.com
    text = re.sub(r'(?i)kompas\s*\.\s*com', 'kompas.com', text)

    # hapus lokasi di awal kalimat seperti "JAKARTA, —", "NEW YORK, -", dll
    text = re.sub(
    r'^\s*[A-ZÀ-Ý][A-ZÀ-Ý\s\(\)\/\-\.]*,\s*[–—-]?\s*',
    '',
    text.strip()
    )

    # hapus frasa "Ringkasan berita", "Ringkasan artikel", "Rangkuman berita"
    text = re.sub(
    r'(?i)^(ringkasan|rangkuman)\s+(berita|artikel)\s*[:：–—-]\s*',
    '',
    text.strip()
    )

    # hapus kompas doc
    text = re.sub(r'(?is)kompas\.com/\s*dok\.[^.]*\.', '', text)

    # hapus kompas.com
    text = re.sub(r'(?i)(?:[A-Za-z]*\s*[-–\\\/]*)?kompas\.com[\\/A-Za-z0-9\-]*', '', text)

    # gabungan
    text = re.sub(r'^[A-Za-z\s]*?[a-z](?=[A-Z])', '', text)

    # hapus baca, baca juga, dan kombinasinya
    #text = re.sub(r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n])', '', text)
    text = re.sub(
    r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n]|$)',
    '',
    text)

    # mention
    text = re.sub(r'\(@[^)]+\)', '', text)
    text = re.sub(r'@\w+', '', text)

    # kredit kompas.com\
    text = re.sub(r'(?i)\bkompas\.com/[\w\s.]+?[a-z](?=\s*[A-Z]{2,}|[A-Z][a-z]*[A-Z])', '', text)

    # hapus .com yang mungkin tersisa
    text = re.sub(r'\b\w*\.com\b', '', text, flags=re.IGNORECASE)

    # tanda baca dan spasi
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    text = re.sub(r'([.,!?;:])([^\s"”’])', r'\1 \2', text)
    text = re.sub(r'(?<=\w)\(', r' (', text)
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'\)(?=[A-Za-z0-9])', r') ', text)
    text = re.sub(r'\s+(")', r'\1', text)
    text = re.sub(r'\.\s*"', '. "', text)

    # tanda baca berulang
    text = re.sub(r'([.,!?;:])(\s*\1)+', r'\1', text)
    text = re.sub(r'([.,!?;:])\s*[.,!?;:]+', r'\1', text)
    text = re.sub(r'"\s*"', '"', text)
    text = re.sub(r'""+', '"', text)

    # non huruf awal karakter
    text = re.sub(r'^[^A-Za-z0-9]+', '', text)

    # spasi ganda
    text = re.sub(r'\s{2,}', ' ', text)

    # trim spasi
    text = text.strip()

    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_kompas_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  Seorang perempuan berinisial AS, tiga puluh li...
1  Menteri Pekerjaan Umum (PU) RI Dody Hanggodo m...
2  Seorang pegawai koperasi di Medan, Sumatera Ut...
3  Menteri Pekerjaan Umum (PU) Doddy Hanggodo mer...
4  Seorang driver ojek online berinisial J melapo...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_kompas_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Detik-detik Wanita di Deli Serdang Tewas Usai ...
1  Datangi Pusat Pasar Medan, Menteri PU: Bocor S...
2  Wanita Penagih Utang di Medan Dikeroyok Keluar...
3  Siswa di Nias Lewati Sungai ke Sekolah karena ...
4  Firasat Driver Ojol Benar, Tangannya sampai Be...


In [None]:
df["judul_berita"] = df["judul_berita"].apply(
    lambda x: x.strip() + "." if not re.search(r'[.!?…]$', str(x).strip()) else x.strip()
)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3777 entries, 0 to 3776
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              3777 non-null   object
 1   category         3777 non-null   object
 2   subcategory      3777 non-null   object
 3   tanggal_publish  3777 non-null   object
 4   judul_berita     3777 non-null   object
 5   tags             3774 non-null   object
 6   konten_berita    3777 non-null   object
dtypes: object(7)
memory usage: 206.7+ KB


In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
df.to_csv("kompas_nusaraya_cleaned.csv", index=False)

# Preprocessing Lifestyle

In [None]:
file_id = "173bnAoXlH9wWQxcqQUtt17wQVH2dLyRr"
url = f"https://drive.google.com/uc?id={file_id}"

df = pd.read_csv(url)
df.head()

Unnamed: 0,url,category,subcategory,tanggal_publish,judul,tags,konten
0,https://buku.kompas.com/read/5366/arti-the-sun...,Lifestyle,Beauty & Grooming,2025-10-22 09:00:00,"Arti The Sunset Is Beautiful, Isn’t It?","Trivia, The Sunset Is Beautiful, Isn’t It, Re...","Arti "" The sunset is beautiful, isn't it?"" bel..."
1,https://buku.kompas.com/read/5368/apa-arti-goa...,Lifestyle,Beauty & Grooming,2025-11-05 11:00:00,"Apa Arti GOAT dalam Sepak Bola? Yuk, Cari Tahu...","Trivia, Arti GOAT, Sepak Bola, Rekomendasi Buk...",Penggemar sepak bola pasti sudah tidak asing d...
2,https://buku.kompas.com/read/5435/ciri-cowok-t...,Lifestyle,Beauty & Grooming,2025-10-21 17:00:00,"Ciri Cowok Temperamental, Penyebab, dan Cara M...","Trivia, Cowok Tempramental, Rekomendasi Buku, ...",Pernahkah kamu bertemu seseorang dengan emosi ...
3,https://buku.kompas.com/read/5451/10-olahan-ma...,Lifestyle,Beauty & Grooming,2025-10-21 15:00:00,10 Olahan Makanan dari Sayuran yang Mudah Dimasak,"Trivia, Olahan Makanan dari Sayuran, Resep Mas...",Olahan makanan dari sayur merupakan pilihan te...
4,https://buku.kompas.com/read/5462/berani-berge...,Lifestyle,Beauty & Grooming,2025-10-22 09:00:00,"Berani Bergeser, Berani Memimpin: Menyelami Ga...","Leadershift, Pengembangan Diri, Elex Media Kom...",Kepemimpinan selalu menjadi topik yang tak per...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              137 non-null    object
 1   category         137 non-null    object
 2   subcategory      137 non-null    object
 3   tanggal_publish  137 non-null    object
 4   judul            137 non-null    object
 5   tags             128 non-null    object
 6   konten           137 non-null    object
dtypes: object(7)
memory usage: 7.6+ KB


In [None]:
df.rename(columns={
    'judul': 'judul_berita',
    'konten': 'konten_berita'
}, inplace=True)

In [None]:
df.columns.tolist()

['url',
 'category',
 'subcategory',
 'tanggal_publish',
 'judul_berita',
 'tags',
 'konten_berita']

## Null Handling

In [None]:
df['konten_berita'].isna().sum()

np.int64(0)

## Duplicate

In [None]:
df.duplicated().sum()

np.int64(0)

## Perbaiki Penulisan

In [None]:
def clean_text(text):
    # mengganti newline dengan spasi
    text = text.replace("\n", " ")
    # hapus spasi berlebih
    text = re.sub(r'\s+', ' ', text).strip()
    # spasi setelah tanda baca jika belum ada
    text = re.sub(r'([.,;:!?])(?!\s)', r'\1 ', text)
    # domain
    #text = re.sub(r'\.(?!\s|[a-zA-Z])', r'. ', text)
    # hapus spasi berlebih setelah penambahan
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  Arti " The sunset is beautiful, isn't it? " be...
1  Penggemar sepak bola pasti sudah tidak asing d...
2  Pernahkah kamu bertemu seseorang dengan emosi ...
3  Olahan makanan dari sayur merupakan pilihan te...
4  Kepemimpinan selalu menjadi topik yang tak per...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_text)
print(df[['judul_berita']].head())

                                        judul_berita
0            Arti The Sunset Is Beautiful, Isn’t It?
1  Apa Arti GOAT dalam Sepak Bola? Yuk, Cari Tahu...
2  Ciri Cowok Temperamental, Penyebab, dan Cara M...
3  10 Olahan Makanan dari Sayuran yang Mudah Dimasak
4  Berani Bergeser, Berani Memimpin: Menyelami Ga...


## Hapus Atribut Teks Tidak Penting

In [None]:
import re

def clean_kompas_text(text: str) -> str:
    # ubah kompas. com menjadi kompas.com
    text = re.sub(r'(?i)kompas\s*\.\s*com', 'kompas.com', text)

    # hapus lokasi di awal kalimat seperti "JAKARTA, —", "NEW YORK, -", dll
    text = re.sub(
    r'^\s*[A-ZÀ-Ý][A-ZÀ-Ý\s\(\)\/\-\.]*,\s*[–—-]?\s*',
    '',
    text.strip()
    )

    # hapus frasa "Ringkasan berita", "Ringkasan artikel", "Rangkuman berita"
    text = re.sub(
    r'(?i)^(ringkasan|rangkuman)\s+(berita|artikel)\s*[:：–—-]\s*',
    '',
    text.strip()
    )

    # hapus kompas doc
    text = re.sub(r'(?is)kompas\.com/\s*dok\.[^.]*\.', '', text)

    # hapus kompas.com
    text = re.sub(r'(?i)(?:[A-Za-z]*\s*[-–\\\/]*)?kompas\.com[\\/A-Za-z0-9\-]*', '', text)

    # gabungan
    text = re.sub(r'^[A-Za-z\s]*?[a-z](?=[A-Z])', '', text)

    # hapus baca, baca juga, dan kombinasinya
    #text = re.sub(r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n])', '', text)
    text = re.sub(
    r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n]|$)',
    '',
    text)

    # mention
    text = re.sub(r'\(@[^)]+\)', '', text)
    text = re.sub(r'@\w+', '', text)

    # kredit kompas.com\
    text = re.sub(r'(?i)\bkompas\.com/[\w\s.]+?[a-z](?=\s*[A-Z]{2,}|[A-Z][a-z]*[A-Z])', '', text)

    # hapus .com yang mungkin tersisa
    text = re.sub(r'\b\w*\.com\b', '', text, flags=re.IGNORECASE)

    # tanda baca dan spasi
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    text = re.sub(r'([.,!?;:])([^\s"”’])', r'\1 \2', text)
    text = re.sub(r'(?<=\w)\(', r' (', text)
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'\)(?=[A-Za-z0-9])', r') ', text)
    text = re.sub(r'\s+(")', r'\1', text)
    text = re.sub(r'\.\s*"', '. "', text)

    # tanda baca berulang
    text = re.sub(r'([.,!?;:])(\s*\1)+', r'\1', text)
    text = re.sub(r'([.,!?;:])\s*[.,!?;:]+', r'\1', text)
    text = re.sub(r'"\s*"', '"', text)
    text = re.sub(r'""+', '"', text)

    # non huruf awal karakter
    text = re.sub(r'^[^A-Za-z0-9]+', '', text)

    # spasi ganda
    text = re.sub(r'\s{2,}', ' ', text)

    # trim spasi
    text = text.strip()

    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_kompas_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  Arti" The sunset is beautiful, isn't it?" bela...
1  Penggemar sepak bola pasti sudah tidak asing d...
2  Pernahkah kamu bertemu seseorang dengan emosi ...
3  Olahan makanan dari sayur merupakan pilihan te...
4  Kepemimpinan selalu menjadi topik yang tak per...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_kompas_text)
print(df[['judul_berita']].head())

                                        judul_berita
0            Arti The Sunset Is Beautiful, Isn’t It?
1  Apa Arti GOAT dalam Sepak Bola? Yuk, Cari Tahu...
2  Ciri Cowok Temperamental, Penyebab, dan Cara M...
3  10 Olahan Makanan dari Sayuran yang Mudah Dimasak
4  Berani Bergeser, Berani Memimpin: Menyelami Ga...


In [None]:
df["judul_berita"] = df["judul_berita"].apply(
    lambda x: x.strip() + "." if not re.search(r'[.!?…]$', str(x).strip()) else x.strip()
)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137 entries, 0 to 136
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              137 non-null    object
 1   category         137 non-null    object
 2   subcategory      137 non-null    object
 3   tanggal_publish  137 non-null    object
 4   judul_berita     137 non-null    object
 5   tags             128 non-null    object
 6   konten_berita    137 non-null    object
dtypes: object(7)
memory usage: 7.6+ KB


In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
df.to_csv("kompas_lifestyle_cleaned.csv", index=False)

# Preprocessing Travel

In [None]:
file_id = "1uk90zJF1UddTu9DLoxQaDr9MnNfoFNO7"
url = f"https://drive.google.com/uc?id={file_id}"

df = pd.read_csv(url)
df.head()

Unnamed: 0,url,category,subcategory,tanggal_publish,judul,tags,konten
0,https://money.kompas.com/read/2019/08/01/12421...,Travel,Travel News,2019-08-01 12:42:15,Kompas.com Kembali Jadi Pemenang Kategori Medi...,"media online, Kompas.com, Superbrands","JAKARTA, KOMPAS.com — Superbrands, lembaga arb..."
1,https://travel.kompas.com/read/2025/09/30/1201...,Travel,Travel News,2025-09-30 12:01:00,Ekspektasi Vs Realita Wisata Bali yang Disorot...,"Bali, Pariwisata Bali, Wisata bali disorot med...",KOMPAS.com - Bali selama ini memang dikenal se...
2,https://travel.kompas.com/read/2025/10/02/1452...,Travel,Travel News,2025-10-02 14:52:07,"Permudah Turis Asing Datang, Aplikasi All Indo...","Kemenpar, bandara soekarno hatta, Wamenpar Ni ...","KOMPAS.com - Aplikasi ""All Indonesia"" resmi di..."
3,https://travel.kompas.com/read/2025/10/10/1201...,Travel,Travel News,2025-10-10 12:01:00,Kunjungan Wisman Januari–Agustus 2025 Tembus 1...,"kunjungan wisman, kunjungan wisman 2025, kunju...",KOMPAS.com – Kementerian Pariwisata (Kemenpar)...
4,https://travel.kompas.com/read/2025/10/15/1631...,Travel,Travel News,2025-11-07 07:49:37,"Bali Jadi Pulau Terbaik di Asia 2025, Kalahkan...","Bali, Bali Jadi Pulau Terbaik di Asia 2025",KOMPAS.com — Pulau Bali kembali menegaskan rep...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042 entries, 0 to 1041
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              1042 non-null   object
 1   category         1042 non-null   object
 2   subcategory      1042 non-null   object
 3   tanggal_publish  1042 non-null   object
 4   judul            1042 non-null   object
 5   tags             1042 non-null   object
 6   konten           1042 non-null   object
dtypes: object(7)
memory usage: 57.1+ KB


In [None]:
df.rename(columns={
    'judul': 'judul_berita',
    'konten': 'konten_berita'
}, inplace=True)

In [None]:
df.columns.tolist()

['url',
 'category',
 'subcategory',
 'tanggal_publish',
 'judul_berita',
 'tags',
 'konten_berita']

## Null Handling

In [None]:
df['konten_berita'].isna().sum()

np.int64(0)

## Duplicate

In [None]:
df.duplicated().sum()

np.int64(0)

## Perbaiki Penulisan

In [None]:
def clean_text(text):
    # mengganti newline dengan spasi
    text = text.replace("\n", " ")
    # hapus spasi berlebih
    text = re.sub(r'\s+', ' ', text).strip()
    # spasi setelah tanda baca jika belum ada
    text = re.sub(r'([.,;:!?])(?!\s)', r'\1 ', text)
    # domain
    #text = re.sub(r'\.(?!\s|[a-zA-Z])', r'. ', text)
    # hapus spasi berlebih setelah penambahan
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  JAKARTA, KOMPAS. com — Superbrands, lembaga ar...
1  KOMPAS. com - Bali selama ini memang dikenal s...
2  KOMPAS. com - Aplikasi "All Indonesia" resmi d...
3  KOMPAS. com – Kementerian Pariwisata (Kemenpar...
4  KOMPAS. com — Pulau Bali kembali menegaskan re...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Kompas. com Kembali Jadi Pemenang Kategori Med...
1  Ekspektasi Vs Realita Wisata Bali yang Disorot...
2  Permudah Turis Asing Datang, Aplikasi All Indo...
3  Kunjungan Wisman Januari–Agustus 2025 Tembus 1...
4  Bali Jadi Pulau Terbaik di Asia 2025, Kalahkan...


## Hapus Atribut Teks Tidak Penting

In [None]:
import re

def clean_kompas_text(text: str) -> str:
    # ubah kompas. com menjadi kompas.com
    text = re.sub(r'(?i)kompas\s*\.\s*com', 'kompas.com', text)

    # hapus lokasi di awal kalimat seperti "JAKARTA, —", "NEW YORK, -", dll
    text = re.sub(
    r'^\s*[A-ZÀ-Ý][A-ZÀ-Ý\s\(\)\/\-\.]*,\s*[–—-]?\s*',
    '',
    text.strip()
    )

    # hapus frasa "Ringkasan berita", "Ringkasan artikel", "Rangkuman berita"
    text = re.sub(
    r'(?i)^(ringkasan|rangkuman)\s+(berita|artikel)\s*[:：–—-]\s*',
    '',
    text.strip()
    )

    # hapus kompas doc
    text = re.sub(r'(?is)kompas\.com/\s*dok\.[^.]*\.', '', text)

    # hapus kompas.com
    text = re.sub(r'(?i)(?:[A-Za-z]*\s*[-–\\\/]*)?kompas\.com[\\/A-Za-z0-9\-]*', '', text)

    # gabungan
    text = re.sub(r'^[A-Za-z\s]*?[a-z](?=[A-Z])', '', text)

    # hapus baca, baca juga, dan kombinasinya
    #text = re.sub(r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n])', '', text)
    text = re.sub(
    r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n]|$)',
    '',
    text)

    # mention
    text = re.sub(r'\(@[^)]+\)', '', text)
    text = re.sub(r'@\w+', '', text)

    # kredit kompas.com\
    text = re.sub(r'(?i)\bkompas\.com/[\w\s.]+?[a-z](?=\s*[A-Z]{2,}|[A-Z][a-z]*[A-Z])', '', text)

    # hapus .com yang mungkin tersisa
    text = re.sub(r'\b\w*\.com\b', '', text, flags=re.IGNORECASE)

    # tanda baca dan spasi
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    text = re.sub(r'([.,!?;:])([^\s"”’])', r'\1 \2', text)
    text = re.sub(r'(?<=\w)\(', r' (', text)
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'\)(?=[A-Za-z0-9])', r') ', text)
    text = re.sub(r'\s+(")', r'\1', text)
    text = re.sub(r'\.\s*"', '. "', text)

    # tanda baca berulang
    text = re.sub(r'([.,!?;:])(\s*\1)+', r'\1', text)
    text = re.sub(r'([.,!?;:])\s*[.,!?;:]+', r'\1', text)
    text = re.sub(r'"\s*"', '"', text)
    text = re.sub(r'""+', '"', text)

    # non huruf awal karakter
    text = re.sub(r'^[^A-Za-z0-9]+', '', text)

    # spasi ganda
    text = re.sub(r'\s{2,}', ' ', text)

    # trim spasi
    text = text.strip()

    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_kompas_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  Superbrands, lembaga arbiter internasional unt...
1  Bali selama ini memang dikenal sebagai destina...
2  Aplikasi"All Indonesia" resmi diluncurkan untu...
3  Kementerian Pariwisata (Kemenpar) melaporkan b...
4  Pulau Bali kembali menegaskan reputasinya seba...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_kompas_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Kembali Jadi Pemenang Kategori Media Online Te...
1  Ekspektasi Vs Realita Wisata Bali yang Disorot...
2  Permudah Turis Asing Datang, Aplikasi All Indo...
3  Kunjungan Wisman Januari–Agustus 2025 Tembus 1...
4  Bali Jadi Pulau Terbaik di Asia 2025, Kalahkan...


In [None]:
df["judul_berita"] = df["judul_berita"].apply(
    lambda x: x.strip() + "." if not re.search(r'[.!?…]$', str(x).strip()) else x.strip()
)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042 entries, 0 to 1041
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              1042 non-null   object
 1   category         1042 non-null   object
 2   subcategory      1042 non-null   object
 3   tanggal_publish  1042 non-null   object
 4   judul_berita     1042 non-null   object
 5   tags             1042 non-null   object
 6   konten_berita    1042 non-null   object
dtypes: object(7)
memory usage: 57.1+ KB


In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
df.to_csv("kompas_travel_cleaned.csv", index=False)

# Preprocessing Sains

In [None]:
file_id = "1LDrZ0xhjd6oPfaYwklqBELfk0cIqVer9"
url = f"https://drive.google.com/uc?id={file_id}"

df = pd.read_csv(url)
df.head()

Unnamed: 0,url,category,subcategory,tanggal_publish,judul,tags,konten
0,https://money.kompas.com/read/2019/08/01/12421...,Sains,Fenomena,2019-08-01 12:42:15,Kompas.com Kembali Jadi Pemenang Kategori Medi...,"media online, Kompas.com, Superbrands","JAKARTA, KOMPAS.com — Superbrands, lembaga arb..."
1,https://www.kompas.com/sains/read/2024/01/27/2...,Sains,Fenomena,2024-01-27 20:13:40,Apa Itu Debu Kosmik?,"debu kosmik, debu luar angkasa, ruang angkasa,...",KOMPAS.com - Saat kita melihat ke langit malam...
2,https://www.kompas.com/sains/read/2024/01/28/1...,Sains,Fenomena,2024-01-28 10:36:47,Danau Singkarak Sumatera Barat Diguncang 5 Ren...,"gempa, Danau Singkarak","KOMPAS.com - Area Danau Singkarak, Sumatera Ba..."
3,https://www.kompas.com/sains/read/2024/01/28/1...,Sains,Fenomena,2024-01-28 14:32:00,Berapa Lama Mahluk Nyari Abadi Tardigrade Bisa...,"Tardigrade, Mahluk Nyari Abadi Tardigrade , be...",KOMPAS.com - Tardigrade mungkin adalah hewan y...
4,https://www.kompas.com/sains/read/2024/01/29/1...,Sains,Fenomena,2024-01-29 16:10:17,5 Manfaat Campuran Bawang Putih dan Jahe untuk...,"manfaat jahe, manfaat bawang putih, manfaat ca...",KOMPAS.com - Bawang putih dan jahe sama-sama t...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              131 non-null    object
 1   category         131 non-null    object
 2   subcategory      131 non-null    object
 3   tanggal_publish  131 non-null    object
 4   judul            131 non-null    object
 5   tags             131 non-null    object
 6   konten           131 non-null    object
dtypes: object(7)
memory usage: 7.3+ KB


In [None]:
df.rename(columns={
    'judul': 'judul_berita',
    'konten': 'konten_berita'
}, inplace=True)

In [None]:
df.columns.tolist()

['url',
 'category',
 'subcategory',
 'tanggal_publish',
 'judul_berita',
 'tags',
 'konten_berita']

## Null Handling

In [None]:
df['konten_berita'].isna().sum()

np.int64(0)

## Duplicate

In [None]:
df.duplicated().sum()

np.int64(0)

## Perbaiki Penulisan

In [None]:
def clean_text(text):
    # mengganti newline dengan spasi
    text = text.replace("\n", " ")
    # hapus spasi berlebih
    text = re.sub(r'\s+', ' ', text).strip()
    # spasi setelah tanda baca jika belum ada
    text = re.sub(r'([.,;:!?])(?!\s)', r'\1 ', text)
    # domain
    #text = re.sub(r'\.(?!\s|[a-zA-Z])', r'. ', text)
    # hapus spasi berlebih setelah penambahan
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  JAKARTA, KOMPAS. com — Superbrands, lembaga ar...
1  KOMPAS. com - Saat kita melihat ke langit mala...
2  KOMPAS. com - Area Danau Singkarak, Sumatera B...
3  KOMPAS. com - Tardigrade mungkin adalah hewan ...
4  KOMPAS. com - Bawang putih dan jahe sama-sama ...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Kompas. com Kembali Jadi Pemenang Kategori Med...
1                               Apa Itu Debu Kosmik?
2  Danau Singkarak Sumatera Barat Diguncang 5 Ren...
3  Berapa Lama Mahluk Nyari Abadi Tardigrade Bisa...
4  5 Manfaat Campuran Bawang Putih dan Jahe untuk...


## Hapus Atribut Teks Tidak Penting

In [None]:
import re

def clean_kompas_text(text: str) -> str:
    # ubah kompas. com menjadi kompas.com
    text = re.sub(r'(?i)kompas\s*\.\s*com', 'kompas.com', text)

    # hapus lokasi di awal kalimat seperti "JAKARTA, —", "NEW YORK, -", dll
    text = re.sub(
    r'^\s*[A-ZÀ-Ý][A-ZÀ-Ý\s\(\)\/\-\.]*,\s*[–—-]?\s*',
    '',
    text.strip()
    )

    # hapus frasa "Ringkasan berita", "Ringkasan artikel", "Rangkuman berita"
    text = re.sub(
    r'(?i)^(ringkasan|rangkuman)\s+(berita|artikel)\s*[:：–—-]\s*',
    '',
    text.strip()
    )

    # hapus kompas doc
    text = re.sub(r'(?is)kompas\.com/\s*dok\.[^.]*\.', '', text)

    # hapus kompas.com
    text = re.sub(r'(?i)(?:[A-Za-z]*\s*[-–\\\/]*)?kompas\.com[\\/A-Za-z0-9\-]*', '', text)

    # gabungan
    text = re.sub(r'^[A-Za-z\s]*?[a-z](?=[A-Z])', '', text)

    # hapus baca, baca juga, dan kombinasinya
    #text = re.sub(r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n])', '', text)
    text = re.sub(
    r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n]|$)',
    '',
    text)

    # mention
    text = re.sub(r'\(@[^)]+\)', '', text)
    text = re.sub(r'@\w+', '', text)

    # kredit kompas.com\
    text = re.sub(r'(?i)\bkompas\.com/[\w\s.]+?[a-z](?=\s*[A-Z]{2,}|[A-Z][a-z]*[A-Z])', '', text)

    # hapus .com yang mungkin tersisa
    text = re.sub(r'\b\w*\.com\b', '', text, flags=re.IGNORECASE)

    # tanda baca dan spasi
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    text = re.sub(r'([.,!?;:])([^\s"”’])', r'\1 \2', text)
    text = re.sub(r'(?<=\w)\(', r' (', text)
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'\)(?=[A-Za-z0-9])', r') ', text)
    text = re.sub(r'\s+(")', r'\1', text)
    text = re.sub(r'\.\s*"', '. "', text)

    # tanda baca berulang
    text = re.sub(r'([.,!?;:])(\s*\1)+', r'\1', text)
    text = re.sub(r'([.,!?;:])\s*[.,!?;:]+', r'\1', text)
    text = re.sub(r'"\s*"', '"', text)
    text = re.sub(r'""+', '"', text)

    # non huruf awal karakter
    text = re.sub(r'^[^A-Za-z0-9]+', '', text)

    # spasi ganda
    text = re.sub(r'\s{2,}', ' ', text)

    # trim spasi
    text = text.strip()

    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_kompas_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  Superbrands, lembaga arbiter internasional unt...
1  Saat kita melihat ke langit malam, langit terl...
2  Area Danau Singkarak, Sumatera Barat diguncang...
3  Tardigrade mungkin adalah hewan yang paling un...
4  Bawang putih dan jahe sama-sama terkenal denga...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_kompas_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Kembali Jadi Pemenang Kategori Media Online Te...
1                               Apa Itu Debu Kosmik?
2  Danau Singkarak Sumatera Barat Diguncang 5 Ren...
3  Berapa Lama Mahluk Nyari Abadi Tardigrade Bisa...
4  5 Manfaat Campuran Bawang Putih dan Jahe untuk...


In [None]:
df["judul_berita"] = df["judul_berita"].apply(
    lambda x: x.strip() + "." if not re.search(r'[.!?…]$', str(x).strip()) else x.strip()
)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              131 non-null    object
 1   category         131 non-null    object
 2   subcategory      131 non-null    object
 3   tanggal_publish  131 non-null    object
 4   judul_berita     131 non-null    object
 5   tags             131 non-null    object
 6   konten_berita    131 non-null    object
dtypes: object(7)
memory usage: 7.3+ KB


In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
df.to_csv("kompas_sains_cleaned.csv", index=False)

# Preprocessing Bola

In [None]:
import pandas as pd
import re
import os

In [None]:
file_id = "1eOpacOsUADGvqNprtC_D29cZRXFeQPWu"
url = f"https://drive.google.com/uc?id={file_id}"

df = pd.read_csv(url)
df.head()

Unnamed: 0,url,category,subcategory,tanggal_publish,judul,tags,konten
0,https://bola.kompas.com/read/2025/11/10/180917...,Bola,Timnas Indonesia,2025-11-10 18:09:17,Link Live Streaming Timnas U17 Indonesia Vs Ho...,"Honduras, Link Live Streaming, timnas u17 Indo...",KOMPAS.com - Timnas U17 Indonesia akan melakon...
1,https://bola.kompas.com/read/2025/11/10/231905...,Bola,Timnas Indonesia,2025-11-10 23:19:05,Live Skor Timnas U17 Indonesia Vs Honduras 2-1...,"Honduras, Piala Dunia U17, timnas u17, Piala D...",KOMPAS.com - Timnas U17 Indonesia unggul 2-1 a...
2,https://bola.kompas.com/read/2025/11/10/234616...,Bola,Timnas Indonesia,2025-11-10 23:46:16,"Hasil Timnas U17 Indonesia Vs Honduras 2-1, Ga...","Nova Arianto, timnas u17 Indonesia, Hasil Timn...",KOMPAS.com - Timnas U17 Indonesia berhasil mer...
3,https://bola.kompas.com/read/2025/11/10/235440...,Bola,Timnas Indonesia,2025-11-10 23:54:40,Klasemen Akhir Grup H Piala Dunia U17 Usai Tim...,"klasemen, timnas u17 Indonesia, Piala Dunia U1...",KOMPAS.com - Timnas U17 Indonesia memainkan pe...
4,https://bola.kompas.com/read/2025/11/11/001045...,Bola,Timnas Indonesia,2025-11-11 00:10:45,Timnas U17 Indonesia Vs Honduras 2-1: 3 Syarat...,"Honduras, timnas u17 Indonesia, Piala Dunia U1...",KOMPAS.com - Timnas U17 Indonesia sukses merai...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              218 non-null    object
 1   category         218 non-null    object
 2   subcategory      218 non-null    object
 3   tanggal_publish  218 non-null    object
 4   judul            218 non-null    object
 5   tags             218 non-null    object
 6   konten           218 non-null    object
dtypes: object(7)
memory usage: 12.1+ KB


In [None]:
df.rename(columns={
    'judul': 'judul_berita',
    'konten': 'konten_berita'
}, inplace=True)


In [None]:
df.columns.tolist()

['url',
 'category',
 'subcategory',
 'tanggal_publish',
 'judul_berita',
 'tags',
 'konten_berita']

## Null Handling

In [None]:
df['konten_berita'].isna().sum()

np.int64(0)

## Duplicate

In [None]:
df.duplicated().sum()

np.int64(0)

## Perbaiki Penulisan

In [None]:
def clean_text(text):
    # mengganti newline dengan spasi
    text = text.replace("\n", " ")
    # hapus spasi berlebih
    text = re.sub(r'\s+', ' ', text).strip()
    # spasi setelah tanda baca jika belum ada
    text = re.sub(r'([.,;:!?])(?!\s)', r'\1 ', text)
    # domain
    #text = re.sub(r'\.(?!\s|[a-zA-Z])', r'. ', text)
    # hapus spasi berlebih setelah penambahan
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  KOMPAS. com - Timnas U17 Indonesia akan melako...
1  KOMPAS. com - Timnas U17 Indonesia unggul 2-1 ...
2  KOMPAS. com - Timnas U17 Indonesia berhasil me...
3  KOMPAS. com - Timnas U17 Indonesia memainkan p...
4  KOMPAS. com - Timnas U17 Indonesia sukses mera...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Link Live Streaming Timnas U17 Indonesia Vs Ho...
1  Live Skor Timnas U17 Indonesia Vs Honduras 2-1...
2  Hasil Timnas U17 Indonesia Vs Honduras 2-1, Ga...
3  Klasemen Akhir Grup H Piala Dunia U17 Usai Tim...
4  Timnas U17 Indonesia Vs Honduras 2-1: 3 Syarat...


## Hapus Atribut Teks Tidak Penting

In [None]:
import re

def clean_kompas_text(text: str) -> str:
    # ubah kompas. com menjadi kompas.com
    text = re.sub(r'(?i)kompas\s*\.\s*com', 'kompas.com', text)

    # hapus kompas doc
    text = re.sub(r'(?is)kompas\.com/\s*dok\.[^.]*\.', '', text)

    # hapus kompas.com
    text = re.sub(r'(?i)(?:[A-Za-z]*\s*[-–\\\/]*)?kompas\.com[\\/A-Za-z0-9\-]*', '', text)

    # gabungan
    text = re.sub(r'^[A-Za-z\s]*?[a-z](?=[A-Z])', '', text)

    # hapus baca, baca juga, dan kombinasinya
    #text = re.sub(r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n])', '', text)
    text = re.sub(
    r'(?i)\(?baca(?:\s+(?:juga|selengkapnya))?\s*[:：]\s*[^.!?\n]*(?=[.!?\n]|$)',
    '',
    text)

    # mention
    text = re.sub(r'\(@[^)]+\)', '', text)
    text = re.sub(r'@\w+', '', text)

    # kredit kompas.com\
    text = re.sub(r'(?i)\bkompas\.com/[\w\s.]+?[a-z](?=\s*[A-Z]{2,}|[A-Z][a-z]*[A-Z])', '', text)

    # hapus .com yang mungkin tersisa
    text = re.sub(r'\b\w*\.com\b', '', text, flags=re.IGNORECASE)

    # tanda baca dan spasi
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)
    text = re.sub(r'([.,!?;:])([^\s"”’])', r'\1 \2', text)
    text = re.sub(r'(?<=\w)\(', r' (', text)
    text = re.sub(r'\(\s+', '(', text)
    text = re.sub(r'\s+\)', ')', text)
    text = re.sub(r'\)(?=[A-Za-z0-9])', r') ', text)
    text = re.sub(r'\s+(")', r'\1', text)
    text = re.sub(r'\.\s*"', '. "', text)

    # tanda baca berulang
    text = re.sub(r'([.,!?;:])(\s*\1)+', r'\1', text)
    text = re.sub(r'([.,!?;:])\s*[.,!?;:]+', r'\1', text)
    text = re.sub(r'"\s*"', '"', text)
    text = re.sub(r'""+', '"', text)

    # non huruf awal karakter
    text = re.sub(r'^[^A-Za-z0-9]+', '', text)

    # spasi ganda
    text = re.sub(r'\s{2,}', ' ', text)

    # trim spasi
    text = text.strip()

    return text

In [None]:
df['konten_berita'] = df['konten_berita'].apply(clean_kompas_text)
print(df[['konten_berita']].head())

                                       konten_berita
0  Timnas U17 Indonesia akan melakoni partai pamu...
1  Timnas U17 Indonesia unggul 2-1 atas Honduras ...
2  Timnas U17 Indonesia berhasil meraih kemenanga...
3  Timnas U17 Indonesia memainkan pertandingan te...
4  Timnas U17 Indonesia sukses meraih kemenangan ...


In [None]:
df['judul_berita'] = df['judul_berita'].apply(clean_kompas_text)
print(df[['judul_berita']].head())

                                        judul_berita
0  Link Live Streaming Timnas U17 Indonesia Vs Ho...
1  Live Skor Timnas U17 Indonesia Vs Honduras 2-1...
2  Hasil Timnas U17 Indonesia Vs Honduras 2-1, Ga...
3  Klasemen Akhir Grup H Piala Dunia U17 Usai Tim...
4  Timnas U17 Indonesia Vs Honduras 2-1: 3 Syarat...


In [None]:
df["judul_berita"] = df["judul_berita"].apply(
    lambda x: x.strip() + "." if not re.search(r'[.!?…]$', str(x).strip()) else x.strip()
)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              218 non-null    object
 1   category         218 non-null    object
 2   subcategory      218 non-null    object
 3   tanggal_publish  218 non-null    object
 4   judul_berita     218 non-null    object
 5   tags             218 non-null    object
 6   konten_berita    218 non-null    object
dtypes: object(7)
memory usage: 12.1+ KB


In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
df.to_csv("kompas_bola_cleaned.csv", index=False)
