# Crawling Data pada Website Tribunnews

## 1. crawling data pada beberapa kategori 

### Kategori Internasional

In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# Function to get links from Tribunnews for a specific date
def get_links_for_date(date_str):
    url = f'https://www.tribunnews.com/indeks/internasional?date={date_str}'
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to fetch page for {date_str}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Attempt to find article links (adjust based on HTML structure)
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Filter out non-article links (e.g., ads, navigation)
        if '/internasional/' in href:
            # Add ?page=all to the end of the URL
            if not href.endswith('?page=all'):
                href = href + '?page=all'
            links.append(href)

    return links

# Start from the current date
start_date = datetime.now()

all_links = []
date = start_date

# Looping from the current date backwards, until we collect 5000 links
while len(all_links) < 5000:
    date_str = date.strftime('%Y-%m-%d')  # Format date as 'YYYY-MM-DD'
    print(f"Fetching links for {date_str}...")

    links = get_links_for_date(date_str)

    all_links.extend(links)

    # Stop if 5000 links are reached
    if len(all_links) >= 5000:
        break

    # Calculate percentage progress
    percentage = (len(all_links) / 5000) * 100
    print(f"Progress: {len(all_links)} links fetched ({percentage:.2f}% completed)")

    # Move to the previous day
    date -= timedelta(days=1)

# Save the links to 'tribunnews_regional_links.txt'
with open('tribunnews_internasional_links.txt', 'w') as file:
    for link in all_links[:5000]:  # Ensure only 5000 links are saved
        file.write(link + '\n')

print(f"Total links saved: {len(all_links[:5000])}")

Fetching links for 2024-12-06...
Progress: 20 links fetched (0.40% completed)
Fetching links for 2024-12-05...
Progress: 40 links fetched (0.80% completed)
Fetching links for 2024-12-04...
Progress: 60 links fetched (1.20% completed)
Fetching links for 2024-12-03...
Progress: 80 links fetched (1.60% completed)
Fetching links for 2024-12-02...
Progress: 100 links fetched (2.00% completed)
Fetching links for 2024-12-01...
Progress: 120 links fetched (2.40% completed)
Fetching links for 2024-11-30...
Progress: 140 links fetched (2.80% completed)
Fetching links for 2024-11-29...
Progress: 160 links fetched (3.20% completed)
Fetching links for 2024-11-28...
Progress: 180 links fetched (3.60% completed)
Fetching links for 2024-11-27...
Progress: 200 links fetched (4.00% completed)
Fetching links for 2024-11-26...
Progress: 220 links fetched (4.40% completed)
Fetching links for 2024-11-25...
Progress: 240 links fetched (4.80% completed)
Fetching links for 2024-11-24...
Progress: 260 links fet

### Kategori Pendidikan

In [2]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# Function to get links from Tribunnews for a specific date
def get_links_for_date(date_str):
    url = f'https://www.tribunnews.com/indeks/pendidikan?date={date_str}'
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to fetch page for {date_str}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Attempt to find article links (adjust based on HTML structure)
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Filter out non-article links (e.g., ads, navigation)
        if '/pendidikan/' in href:
            # Add ?page=all to the end of the URL
            if not href.endswith('?page=all'):
                href = href + '?page=all'
            links.append(href)

    return links

# Start from the current date
start_date = datetime.now()

all_links = []
date = start_date

# Looping from the current date backwards, until we collect 5000 links
while len(all_links) < 5000:
    date_str = date.strftime('%Y-%m-%d')  # Format date as 'YYYY-MM-DD'
    print(f"Fetching links for {date_str}...")

    links = get_links_for_date(date_str)

    all_links.extend(links)

    # Stop if 5000 links are reached
    if len(all_links) >= 5000:
        break

    # Calculate percentage progress
    percentage = (len(all_links) / 5000) * 100
    print(f"Progress: {len(all_links)} links fetched ({percentage:.2f}% completed)")

    # Move to the previous day
    date -= timedelta(days=1)

# Save the links to 'tribunnews_regional_links.txt'
with open('tribunnews_pendidikan_links.txt', 'w') as file:
    for link in all_links[:5000]:  # Ensure only 5000 links are saved
        file.write(link + '\n')

print(f"Total links saved: {len(all_links[:5000])}")

Fetching links for 2024-12-06...
Progress: 20 links fetched (0.40% completed)
Fetching links for 2024-12-05...
Progress: 40 links fetched (0.80% completed)
Fetching links for 2024-12-04...
Progress: 60 links fetched (1.20% completed)
Fetching links for 2024-12-03...
Progress: 80 links fetched (1.60% completed)
Fetching links for 2024-12-02...
Progress: 91 links fetched (1.82% completed)
Fetching links for 2024-12-01...
Progress: 100 links fetched (2.00% completed)
Fetching links for 2024-11-30...
Progress: 115 links fetched (2.30% completed)
Fetching links for 2024-11-29...
Progress: 134 links fetched (2.68% completed)
Fetching links for 2024-11-28...
Progress: 154 links fetched (3.08% completed)
Fetching links for 2024-11-27...
Progress: 162 links fetched (3.24% completed)
Fetching links for 2024-11-26...
Progress: 182 links fetched (3.64% completed)
Fetching links for 2024-11-25...
Progress: 202 links fetched (4.04% completed)
Fetching links for 2024-11-24...
Progress: 209 links fetc

### Download Struktur HTML setiap artikel

In [1]:
## Download html file for internasional

import os
import requests
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

# Folder untuk menyimpan file HTML
folder_name = 'artikel_html_internasional'
os.makedirs(folder_name, exist_ok=True)  # Buat folder jika belum ada

# Lock untuk menghindari konflik saat menulis ke konsol
lock = threading.Lock()

# Function untuk mendownload artikel dan menyimpannya sebagai file HTML
def download_article(link, index, total_links):
    try:
        response = requests.get(link, timeout=10)  # Timeout 10 detik
        if response.status_code == 200:
            # Nama file HTML dengan format artikel_{index}.html
            file_name = os.path.join(folder_name, f'artikel_{index}.html')
            with open(file_name, 'w', encoding='utf-8') as file:
                file.write(response.text)

            # Menggunakan Lock untuk memastikan hanya satu thread yang menulis ke konsol
            with lock:
                print(f"Artikel {index} berhasil diunduh: {link}")
        else:
            with lock:
                print(f"Gagal mengunduh artikel {index}: {link}, Status code: {response.status_code}")
        
        # Hitung persentase progres
        percentage = (index / total_links) * 100
        return index, percentage

    except requests.exceptions.Timeout:
        with lock:
            print(f"Timeout saat mengunduh artikel {index}: {link}")
        return index, None
    except Exception as e:
        with lock:
            print(f"Error saat mengunduh artikel {index}: {link} - {str(e)}")
        return index, None

# Baca semua link dari file 'tribunnews_nasional_links.txt'
with open('data/link/tribunnews_internasional_links.txt', 'r') as f:
    links = f.readlines()

# Hitung total link
total_links = len(links)

# Fungsi untuk menampilkan progres secara manual
def update_progress(futures, total_links):
    for future in as_completed(futures):
        index, percentage = future.result()
        if percentage is not None:
            print(f"Progres: {index}/{total_links} ({percentage:.2f}%)")

# Gunakan ThreadPoolExecutor untuk menjalankan download artikel secara paralel
with ThreadPoolExecutor(max_workers=20) as executor:  # Menggunakan 20 thread
    futures = []

    # Mengirim semua tugas download artikel ke executor
    for index, link in enumerate(links, start=1):
        link = link.strip()  # Menghapus karakter newline di akhir setiap link
        future = executor.submit(download_article, link, index, total_links)
        futures.append(future)

    # Tunggu semua pekerjaan selesai dan update progres
    update_progress(futures, total_links)

print("Proses download selesai.")

Artikel 5 berhasil diunduh: https://www.tribunnews.com/internasional/2024/12/06/oposisi-sudah-berada-di-gerbang-kota-homs-panglima-perang-hts-tujuan-kami-gulingkan-rezim-assad?page=all
Progres: 5/5000 (0.10%)
Artikel 11 berhasil diunduh: https://www.tribunnews.com/internasional/2024/12/06/hizbullah-akan-mendukung-suriah-dalam-menggagalkan-tujuan-teroris-kata-sheikh-naim-qassem?page=all
Progres: 11/5000 (0.22%)
Artikel 16 berhasil diunduh: https://www.tribunnews.com/internasional/2024/12/06/oposisi-suriah-rebut-kota-penting-hama-peneliti-internasional-pukulan-telak-rezim-bashar-al-assad?page=all
Progres: 16/5000 (0.32%)
Artikel 10 berhasil diunduh: https://www.tribunnews.com/internasional/2024/12/06/44612-jiwa-melayang-akibat-perang-israel-hamas-di-gaza?page=all
Progres: 10/5000 (0.20%)
Artikel 15 berhasil diunduh: https://www.tribunnews.com/internasional/2024/12/06/brigade-al-quds-brigade-nablus-sergap-pasukan-infanteri-israel-di-poros-hashashin-di-kamp-balata?page=all
Progres: 15/5000

In [2]:
## Download html file for Pendidikan

import os
import requests
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

# Folder untuk menyimpan file HTML
folder_name = 'artikel_html_pendidikan'
os.makedirs(folder_name, exist_ok=True)  # Buat folder jika belum ada

# Lock untuk menghindari konflik saat menulis ke konsol
lock = threading.Lock()

# Function untuk mendownload artikel dan menyimpannya sebagai file HTML
def download_article(link, index, total_links):
    try:
        response = requests.get(link, timeout=10)  # Timeout 10 detik
        if response.status_code == 200:
            # Nama file HTML dengan format artikel_{index}.html
            file_name = os.path.join(folder_name, f'artikel_{index}.html')
            with open(file_name, 'w', encoding='utf-8') as file:
                file.write(response.text)

            # Menggunakan Lock untuk memastikan hanya satu thread yang menulis ke konsol
            with lock:
                print(f"Artikel {index} berhasil diunduh: {link}")
        else:
            with lock:
                print(f"Gagal mengunduh artikel {index}: {link}, Status code: {response.status_code}")
        
        # Hitung persentase progres
        percentage = (index / total_links) * 100
        return index, percentage

    except requests.exceptions.Timeout:
        with lock:
            print(f"Timeout saat mengunduh artikel {index}: {link}")
        return index, None
    except Exception as e:
        with lock:
            print(f"Error saat mengunduh artikel {index}: {link} - {str(e)}")
        return index, None

# Baca semua link dari file 'tribunnews_nasional_links.txt'
with open('data/link/tribunnews_pendidikan_links.txt', 'r') as f:
    links = f.readlines()

# Hitung total link
total_links = len(links)

# Fungsi untuk menampilkan progres secara manual
def update_progress(futures, total_links):
    for future in as_completed(futures):
        index, percentage = future.result()
        if percentage is not None:
            print(f"Progres: {index}/{total_links} ({percentage:.2f}%)")

# Gunakan ThreadPoolExecutor untuk menjalankan download artikel secara paralel
with ThreadPoolExecutor(max_workers=20) as executor:  # Menggunakan 20 thread
    futures = []

    # Mengirim semua tugas download artikel ke executor
    for index, link in enumerate(links, start=1):
        link = link.strip()  # Menghapus karakter newline di akhir setiap link
        future = executor.submit(download_article, link, index, total_links)
        futures.append(future)

    # Tunggu semua pekerjaan selesai dan update progres
    update_progress(futures, total_links)

print("Proses download selesai.")

Artikel 17 berhasil diunduh: https://www.tribunnews.com/pendidikan/2024/12/06/kumpulan-25-soal-pas-bahasa-indonesia-kelas-7-semester-1-lengkap-dengan-kunci-jawabannya?page=all
Progres: 17/5000 (0.34%)
Artikel 5 berhasil diunduh: https://www.tribunnews.com/pendidikan/2024/12/06/inisiatif-tanoto-foundation-siapkan-generasi-berkualitas?page=all
Progres: 5/5000 (0.10%)
Artikel 12 berhasil diunduh: https://www.tribunnews.com/pendidikan/2024/12/06/kunci-jawaban-pmm-mengapa-pemahaman-perbedaan-intelektual-penting-dalam-konteks-pendidikan?page=all
Progres: 12/5000 (0.24%)
Artikel 10 berhasil diunduh: https://www.tribunnews.com/pendidikan/2024/12/06/pintu-mahasiswa-raih-gelar-ganda-internasional-dari-kampus-ternama?page=all
Artikel 4 berhasil diunduh: https://www.tribunnews.com/pendidikan/2024/12/06/25-soal-pas-bahasa-jawa-kelas-6-sd-semester-1-kurikulum-merdeka-lengkap-dengan-kunci-jawaban?page=all
Progres: 10/5000 (0.20%)
Progres: 4/5000 (0.08%)
Artikel 20 berhasil diunduh: https://www.tribun

In [3]:
%pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### Scraping data yanng penting dari teks

In [4]:
## Ubah beberapa folder html ke csv tabel

import os
import csv
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor  # Menggunakan ThreadPoolExecutor
from tqdm import tqdm

# Daftar folder input yang berisi artikel HTML dan kategori terkait
input_folders = [
    {'folder': 'artikel_html_internasional', 'kategori': 'internasional'},
    {'folder': 'artikel_html_pendidikan', 'kategori': 'pendidikan'}
    
]

output_file = 'artikel.csv'

# Function untuk men-scrap konten dari file HTML
def scrap_article_from_html(file_path, kategori):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        soup = BeautifulSoup(content, 'html.parser')

        # Mengambil judul
        judul = soup.find("h1", class_="f50 black2 f400 crimson")
        judul_text = judul.text.strip() if judul else "Judul tidak ditemukan"

        # Mengambil tanggal
        div_element = soup.find("div", class_="grey bdr3 pb10 pt10")
        tanggal = div_element.find("span").text.strip() if div_element and div_element.find("span") else "Tanggal tidak ditemukan"

        # Mengambil penulis
        penulis = soup.find("div", id="penulis")
        nama_penulis = penulis.find("a").text.strip() if penulis and penulis.find("a") else "Penulis tidak ditemukan"

        # Mengambil editor
        editor = soup.find("div", id="editor")
        nama_editor = editor.find("a").text.strip() if editor and editor.find("a") else "Editor tidak ditemukan"

        # Mengambil isi artikel
        artikel = soup.find("div", class_="side-article txt-article multi-fontsize")
        isi_artikel = artikel.text.strip() if artikel else "Isi artikel tidak ditemukan"

        # Kategori artikel sudah diteruskan
        return [judul_text, tanggal, nama_penulis, nama_editor, isi_artikel, kategori]
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None  # Kembalikan None jika terjadi error

# Function untuk pemrosesan paralel
def process_file(file_name, folder, kategori):
    file_path = os.path.join(folder, file_name)
    return scrap_article_from_html(file_path, kategori)

# Menjalankan scraping dengan paralelisme menggunakan ThreadPoolExecutor
with open(output_file, 'w', encoding='utf-8', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["Judul", "Tanggal", "Penulis", "Editor", "Isi Artikel", "Kategori"])  # Header kolom

    # Menelusuri setiap folder dan file HTML yang ada di dalamnya
    for folder_info in input_folders:
        folder = folder_info['folder']
        kategori = folder_info['kategori']
        file_list = [file_name for file_name in os.listdir(folder) if file_name.endswith('.html')]

        with ThreadPoolExecutor(max_workers=2) as executor:  # Menggunakan ThreadPoolExecutor
            results = list(tqdm(executor.map(lambda file_name: process_file(file_name, folder, kategori), file_list), 
                                total=len(file_list), desc=f"Scraping artikel {kategori}", unit="file"))

        # Filter hasil None
        results = [result for result in results if result is not None]

        # Menulis hasil scraping ke file CSV
        csvwriter.writerows(results)  # Menulis seluruh hasil secara sekaligus

print(f"Scraping selesai dan hasil disimpan di file '{output_file}'.")

Scraping artikel internasional: 100%|██████████| 4998/4998 [18:57<00:00,  4.39file/s]
Scraping artikel pendidikan: 100%|██████████| 4998/4998 [18:39<00:00,  4.47file/s]


Scraping selesai dan hasil disimpan di file 'artikel.csv'.
