# Web Scraping Artikel Berita Transjakarta

## Instalasi library yang dibutuhkan (cukup jalankan sekali)
## Pastikan Anda menjalankan ini di terminal atau command prompt:
## pip install pandas requests beautifulsoup4

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import os

In [4]:
# --- 1. Read the CSV file ---
# Relative path to the input CSV; the scraping cell reads article URLs from
# its 'Link' column. Being relative, it is resolved against the current
# working directory.
csv_path = '../Data/Kelompok2_Artikel_TransJakarta - Artikel Berita.csv'

In [None]:
# Guard on the input file: the whole scraping pipeline lives in the else
# branch, so nothing else runs when the CSV is missing.
if not os.path.exists(csv_path):
    print("Error: File tidak ditemukan.")
else:
    print("File CSV berhasil ditemukan. Memulai proses scraping...")
    # Load the article list into a DataFrame; the loop further down reads
    # the URL of each row from its 'Link' column.
    df = pd.read_csv(csv_path)

    # --- 2. Scraping function ---
    def scrape_article(url):
        """Download one news article and extract its title and body text.

        Args:
            url: Article URL. The site-specific rule is chosen by checking
                whether a known domain string occurs anywhere in the URL;
                the first matching rule wins.

        Returns:
            A ``(title, content, status)`` tuple of strings. ``status`` is:

            * ``"success"`` - a body container was found and yielded text.
            * ``"konten_tidak_ditemukan"`` - the page loaded but no article
              text could be extracted (no rule for the site, the expected
              container is absent, or it held no text). ``title`` and
              ``content`` may then be the "tidak ditemukan" placeholders.
            * ``"request_gagal"`` - the server answered with a non-200 code.
            * ``"koneksi_gagal"`` - ``requests`` raised a RequestException.
            * ``"exception"`` - any other error.

            For the last three statuses ``title`` is ``"Gagal"`` and
            ``content`` carries the error description.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Ordered extraction rules, one per news source.
        #   domains    - substrings that identify the source in the URL
        #   title      - keyword filters for the <h1> title, tried in order
        #   body       - keyword filters for the <div> body, tried in order
        #   remove     - class of <div>s stripped from the body before reading
        #   skip_empty - whether paragraphs without text are left out
        site_rules = (
            {
                'domains': ('news.detik.com', 'oto.detik.com', 'finance.detik.com', 'travel.detik.com'),
                'title': ({'class_': 'detail__title'},),
                'body': ({'class_': 'detail__body-text'},),
                'remove': None,
                'skip_empty': False,
            },
            {
                'domains': ('kompas.id',),
                'title': ({'class_': 'title-read'},),
                'body': ({'class_': 'paywall'},),
                'remove': None,
                'skip_empty': False,
            },
            {
                'domains': ('liputan6.com',),
                'title': ({'class_': 'article-header__title'}, {'class_': 'read-page--header--title'}),
                'body': ({'class_': 'article-body__content-aside'}, {'class_': 'read-page--content'}),
                'remove': 'baca-juga',
                'skip_empty': True,
            },
            {
                'domains': ('tempo.co',),
                'title': ({'itemprop': 'headline'},),
                'body': ({'itemprop': 'articleBody'},),
                'remove': None,
                'skip_empty': False,
            },
            {
                'domains': ('cnnindonesia.com',),
                'title': ({'class_': 'text-cnn_black'},),
                'body': ({'class_': 'detail-text'},),
                'remove': 'transcript-box',
                'skip_empty': False,
            },
        )

        def first_match(soup, tag_name, selectors):
            """Return the first element found by any selector, else None."""
            for selector in selectors:
                found = soup.find(tag_name, **selector)
                if found is not None:
                    return found
            return None

        try:
            response = requests.get(url, headers=headers, timeout=15)
            if response.status_code != 200:
                return "Gagal", f"Status Code: {response.status_code}", "request_gagal"

            soup = BeautifulSoup(response.content, 'html.parser')
            title = "Judul tidak ditemukan"
            content = "Konten tidak ditemukan"
            body_found = False

            rule = next(
                (r for r in site_rules if any(domain in url for domain in r['domains'])),
                None,
            )
            if rule is not None:
                title_tag = first_match(soup, 'h1', rule['title'])
                if title_tag is not None:
                    title = title_tag.get_text(strip=True)

                article_body = first_match(soup, 'div', rule['body'])
                if article_body is not None:
                    body_found = True
                    # Strip embedded non-article boxes before collecting text.
                    if rule['remove']:
                        for unwanted in article_body.find_all('div', class_=rule['remove']):
                            unwanted.decompose()
                    texts = [p.get_text(strip=True) for p in article_body.find_all('p')]
                    if rule['skip_empty']:
                        texts = [text for text in texts if text]
                    content = "\n".join(texts)

            # Remove leftover promotional text: everything from the first
            # occurrence of a marker onward is discarded.
            # NOTE(review): an inline "Baca juga:" in the middle of an article
            # also cuts all text after it - confirm this is intended.
            for marker in ("Baca juga:", "Simak video"):
                content = content.partition(marker)[0]
            content = content.strip()

            # Only report success when article text was really extracted, so
            # placeholder or empty rows are not counted as scraped articles.
            if not (body_found and content):
                return title, content, "konten_tidak_ditemukan"
            return title, content, "success"

        except requests.RequestException as e:
            return "Gagal", f"Error Koneksi: {e}", "koneksi_gagal"
        except Exception as e:
            # Best-effort boundary: one broken page must not abort the batch.
            return "Gagal", f"Error Lainnya: {e}", "exception"

    # --- 3. Scrape every link ---
    results = []
    total_links = len(df)
    for position, link in enumerate(df['Link'], start=1):
        # Require a real string before calling str methods: pandas reads
        # empty cells as NaN (a float), and a cell may hold a non-text value.
        if isinstance(link, str) and link.startswith('http'):
            print(f"Scraping ({position}/{total_links}): {link}")
            judul, isi_berita, status = scrape_article(link)
            results.append({'Judul': judul, 'Isi Berita': isi_berita, 'Status': status})
            time.sleep(1)  # Pause 1 second between requests
        else:
            results.append({'Judul': 'Tidak ada link', 'Isi Berita': '', 'Status': 'link_kosong'})

    # --- 4. Merge the results and save the file ---
    # Explicit columns keep the frame well-formed even when there are no rows.
    df_hasil = pd.DataFrame(results, columns=['Judul', 'Isi Berita', 'Status'])
    # results holds exactly one entry per input row, in order, so a
    # positional side-by-side concat lines each result up with its source row.
    df_final = pd.concat([df.reset_index(drop=True), df_hasil], axis=1)

    output_path = '../Data/hasil_scraping_transjakarta.csv'
    df_final.to_csv(output_path, index=False, encoding='utf-8', sep=';')

    print(f"\n✅ Proses scraping selesai. Hasil disimpan di file: {output_path}")
    print("\nCuplikan 5 baris pertama dari data hasil scraping:")
    print(df_final[['Link', 'Judul', 'Status']].head())

    # Count successful vs. failed rows; every row whose status is not
    # 'success' (including rows without a usable link) counts as failed.
    sukses_count = df_final[df_final['Status'] == 'success'].shape[0]
    gagal_count = total_links - sukses_count
    print(f"Total link: {total_links} | Berhasil: {sukses_count} | Gagal: {gagal_count}")

File CSV berhasil ditemukan. Memulai proses scraping...
Scraping (1/150): https://news.detik.com/berita/d-8114173/hore-naik-transjakarta-mrt-lrt-jakarta-rp-1-di-17-19-september
Scraping (2/150): https://news.detik.com/berita/d-8084934/sejumlah-layanan-transjakarta-terganggu-imbas-demo-ini-daftar-rute-yang-dialihkan
Scraping (3/150): https://news.detik.com/berita/d-8108095/truk-mogok-di-jalan-gatsu-arah-cawang-sejumlah-rute-transj-terganggu
Scraping (4/150): https://news.detik.com/berita/d-8018912/rano-karno-rute-transjakarta-ancol-blok-m-akan-meluncur-agustus
Scraping (5/150): https://oto.detik.com/berita/d-8127426/bus-listrik-transjakarta-kecelakaan-di-setiabudi-bukan-karena-rem-blong
Scraping (6/150): https://news.detik.com/berita/d-8099155/transjakarta-investigasi-internal-usai-bus-seruduk-warung-di-jaksel
Scraping (7/150): https://news.detik.com/berita/d-8125150/investigasi-3-kali-kecelakaan-bus-dalam-sebulan-transj-gandeng-knkt
Scraping (8/150): https://news.detik.com/berita/d-805