# Crawling Artikel Berita

## Anggota Kelompok
- **Nama:** Aulia Muzhaffar
  - **NPM:** 2108107010033
- **Nama:** Muhammad Ghufran
  - **NPM:** 2108107010080

## Deskripsi Tugas
Tugas ini bertujuan untuk melakukan crawling artikel berita dengan menggunakan Python. Kami akan menggunakan library seperti BeautifulSoup dan requests untuk mengumpulkan data dari situs berita.

## Tujuan
- Mengumpulkan 50000000 artikel berita terbaru.
- Menyimpan data yang diperoleh dalam format yang sesuai untuk analisis lebih lanjut.

## Alat dan Teknologi
- Python
- BeautifulSoup
- Requests

## Langkah-Langkah
1. **Persiapan Lingkungan**
   - Instalasi library yang diperlukan.
   
   ```bash
   pip install requests beautifulsoup4


In [9]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# Function to get links from Tribunnews for a specific date
def get_links_for_date(date_str):
    url = f'https://www.tribunnews.com/indeks/lifestyle?date={date_str}'
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to fetch page for {date_str}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Attempt to find article links (adjust based on HTML structure)
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Filter out non-article links (e.g., ads, navigation)
        if '/lifestyle/' in href:
            # Add ?page=all to the end of the URL
            if not href.endswith('?page=all'):
                href = href + '?page=all'
            links.append(href)

    return links

# Start from the current date
start_date = datetime.now()

all_links = []
date = start_date

# Looping from the current date backwards, until we collect 5000 links
while len(all_links) < 5000:
    date_str = date.strftime('%Y-%m-%d')  # Format date as 'YYYY-MM-DD'
    print(f"Fetching links for {date_str}...")

    links = get_links_for_date(date_str)

    all_links.extend(links)

    # Stop if 5000 links are reached
    if len(all_links) >= 5000:
        break

    # Calculate percentage progress
    percentage = (len(all_links) / 5000) * 100
    print(f"Progress: {len(all_links)} links fetched ({percentage:.2f}% completed)")

    # Move to the previous day
    date -= timedelta(days=1)

# Save the links to 'tribunnews_regional_links.txt'
with open('data/link/tribunnews_lifestyle_links.txt', 'w') as file:
    for link in all_links[:5000]:  # Ensure only 5000 links are saved
        file.write(link + '\n')

print(f"Total links saved: {len(all_links[:5000])}")

Fetching links for 2024-12-06...
Progress: 8 links fetched (0.16% completed)
Fetching links for 2024-12-05...
Progress: 21 links fetched (0.42% completed)
Fetching links for 2024-12-04...
Progress: 28 links fetched (0.56% completed)
Fetching links for 2024-12-03...
Progress: 40 links fetched (0.80% completed)
Fetching links for 2024-12-02...
Progress: 46 links fetched (0.92% completed)
Fetching links for 2024-12-01...
Progress: 53 links fetched (1.06% completed)
Fetching links for 2024-11-30...
Progress: 63 links fetched (1.26% completed)
Fetching links for 2024-11-29...
Progress: 72 links fetched (1.44% completed)
Fetching links for 2024-11-28...
Progress: 81 links fetched (1.62% completed)
Fetching links for 2024-11-27...
Progress: 89 links fetched (1.78% completed)
Fetching links for 2024-11-26...
Progress: 97 links fetched (1.94% completed)
Fetching links for 2024-11-25...
Progress: 106 links fetched (2.12% completed)
Fetching links for 2024-11-24...
Progress: 112 links fetched (2.

In [None]:
import os
import requests
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

# Folder untuk menyimpan file HTML
folder_name = 'artikel_html_regional'
os.makedirs(folder_name, exist_ok=True)  # Buat folder jika belum ada

# Lock untuk menghindari konflik saat menulis ke konsol
lock = threading.Lock()

# Function untuk mendownload artikel dan menyimpannya sebagai file HTML
def download_article(link, index, total_links):
    try:
        response = requests.get(link, timeout=10)  # Timeout 10 detik
        if response.status_code == 200:
            # Nama file HTML dengan format artikel_{index}.html
            file_name = os.path.join(folder_name, f'artikel_{index}.html')
            with open(file_name, 'w', encoding='utf-8') as file:
                file.write(response.text)

            # Menggunakan Lock untuk memastikan hanya satu thread yang menulis ke konsol
            with lock:
                print(f"Artikel {index} berhasil diunduh: {link}")
        else:
            with lock:
                print(f"Gagal mengunduh artikel {index}: {link}, Status code: {response.status_code}")
        
        # Hitung persentase progres
        percentage = (index / total_links) * 100
        return index, percentage

    except requests.exceptions.Timeout:
        with lock:
            print(f"Timeout saat mengunduh artikel {index}: {link}")
        return index, None
    except Exception as e:
        with lock:
            print(f"Error saat mengunduh artikel {index}: {link} - {str(e)}")
        return index, None

# Baca semua link dari file 'tribunnews_nasional_links.txt'
with open('data/link/tribunnews_regional_links.txt', 'r') as f:
    links = f.readlines()

# Hitung total link
total_links = len(links)

# Fungsi untuk menampilkan progres secara manual
def update_progress(futures, total_links):
    for future in as_completed(futures):
        index, percentage = future.result()
        if percentage is not None:
            print(f"Progres: {index}/{total_links} ({percentage:.2f}%)")

# Gunakan ThreadPoolExecutor untuk menjalankan download artikel secara paralel
with ThreadPoolExecutor(max_workers=20) as executor:  # Menggunakan 20 thread
    futures = []

    # Mengirim semua tugas download artikel ke executor
    for index, link in enumerate(links, start=1):
        link = link.strip()  # Menghapus karakter newline di akhir setiap link
        future = executor.submit(download_article, link, index, total_links)
        futures.append(future)

    # Tunggu semua pekerjaan selesai dan update progres
    update_progress(futures, total_links)

print("Proses download selesai.")


Artikel 8 berhasil diunduh: https://www.tribunnews.com/regional/2024/12/06/video-terpidana-kasus-vina-lepas-kepergian-sang-ibunda-sudirman-hadir-dikawal-petugas-bersenjata?page=all
Progres: 8/5000 (0.16%)
Artikel 17 berhasil diunduh: https://www.tribunnews.com/regional/2024/12/06/sosok-guru-korban-pembunuhan-sekeluarga-di-kediri-jadi-pns-pada-2019-berdedikasi-dan-kinerja-baik?page=all
Progres: 17/5000 (0.34%)
Artikel 3 berhasil diunduh: https://www.tribunnews.com/regional/2024/12/06/yandri-susanto-hadiri-penyerahan-bantuan-alat-berat-ke-warga-desa-tepian-langsat-kalimantan-timur?page=all
Artikel 11 berhasil diunduh: https://www.tribunnews.com/regional/2024/12/06/belum-ada-tersangka-dalam-kasus-penembakan-gamma-di-semarang-ini-kata-polda?page=all
Artikel 2 berhasil diunduh: https://www.tribunnews.com/regional/2024/12/06/kemenkes-kirim-obat-obatan-untuk-korban-terdampak-banjir-di-sukabumi?page=all
Artikel 16 berhasil diunduh: https://www.tribunnews.com/regional/2024/12/06/duka-sudirman-t

In [1]:
import os
import csv
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor  # Menggunakan ThreadPoolExecutor
from tqdm import tqdm

# Daftar folder input yang berisi artikel HTML dan kategori terkait
input_folders = [
    {'folder': 'artikel_html_nasional', 'kategori': 'nasional'},
    {'folder': 'artikel_html_metropolitan', 'kategori': 'metropolitan'},
    {'folder': 'artikel_html_lifestyle', 'kategori': 'lifestyle'},
    {'folder': 'artikel_html_regional', 'kategori': 'regional'}
]

output_file = 'artikel.csv'

# Function untuk men-scrap konten dari file HTML
def scrap_article_from_html(file_path, kategori):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        soup = BeautifulSoup(content, 'html.parser')

        # Mengambil judul
        judul = soup.find("h1", class_="f50 black2 f400 crimson")
        judul_text = judul.text.strip() if judul else "Judul tidak ditemukan"

        # Mengambil tanggal
        div_element = soup.find("div", class_="grey bdr3 pb10 pt10")
        tanggal = div_element.find("span").text.strip() if div_element and div_element.find("span") else "Tanggal tidak ditemukan"

        # Mengambil penulis
        penulis = soup.find("div", id="penulis")
        nama_penulis = penulis.find("a").text.strip() if penulis and penulis.find("a") else "Penulis tidak ditemukan"

        # Mengambil editor
        editor = soup.find("div", id="editor")
        nama_editor = editor.find("a").text.strip() if editor and editor.find("a") else "Editor tidak ditemukan"

        # Mengambil isi artikel
        artikel = soup.find("div", class_="side-article txt-article multi-fontsize")
        isi_artikel = artikel.text.strip() if artikel else "Isi artikel tidak ditemukan"

        # Kategori artikel sudah diteruskan
        return [judul_text, tanggal, nama_penulis, nama_editor, isi_artikel, kategori]
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None  # Kembalikan None jika terjadi error

# Function untuk pemrosesan paralel
def process_file(file_name, folder, kategori):
    file_path = os.path.join(folder, file_name)
    return scrap_article_from_html(file_path, kategori)

# Menjalankan scraping dengan paralelisme menggunakan ThreadPoolExecutor
with open(output_file, 'w', encoding='utf-8', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["Judul", "Tanggal", "Penulis", "Editor", "Isi Artikel", "Kategori"])  # Header kolom

    # Menelusuri setiap folder dan file HTML yang ada di dalamnya
    for folder_info in input_folders:
        folder = folder_info['folder']
        kategori = folder_info['kategori']
        file_list = [file_name for file_name in os.listdir(folder) if file_name.endswith('.html')]

        with ThreadPoolExecutor(max_workers=3) as executor:  # Menggunakan ThreadPoolExecutor
            results = list(tqdm(executor.map(lambda file_name: process_file(file_name, folder, kategori), file_list), 
                                total=len(file_list), desc=f"Scraping artikel {kategori}", unit="file"))

        # Filter hasil None
        results = [result for result in results if result is not None]

        # Menulis hasil scraping ke file CSV
        csvwriter.writerows(results)  # Menulis seluruh hasil secara sekaligus

print(f"Scraping selesai dan hasil disimpan di file '{output_file}'.")


Scraping artikel nasional: 100%|██████████| 4854/4854 [07:34<00:00, 10.69file/s]
Scraping artikel metropolitan: 100%|██████████| 4989/4989 [08:04<00:00, 10.30file/s]
Scraping artikel lifestyle: 100%|██████████| 2915/2915 [04:28<00:00, 10.85file/s]
Scraping artikel regional: 100%|██████████| 5000/5000 [08:01<00:00, 10.38file/s]


Scraping selesai dan hasil disimpan di file 'artikel.csv'.
