In [1]:
import csv
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [114]:
def _collect_article_links(soup, base_url):
    """Return the unique article URLs linked from an index page, in page order.

    Args:
        soup: Parsed index page (BeautifulSoup object).
        base_url: URL the index page was fetched from; relative hrefs are
            resolved against it.

    Returns:
        list[str]: Absolute URLs of every anchor whose href contains
        '/read/', with duplicates removed.
    """
    links = []
    seen = set()
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if '/read/' not in href:
            continue
        # Normalise BEFORE the duplicate check: comparing the raw href against
        # the stored absolute URLs would never match for relative links.
        full_url = urljoin(base_url, href)
        if full_url not in seen:
            seen.add(full_url)
            links.append(full_url)
    return links


def _parse_article(article_soup, article_url):
    """Extract one CSV row (title, url, content) from a parsed article page.

    Args:
        article_soup: Parsed article page (BeautifulSoup object).
        article_url: URL the page was fetched from; stored verbatim in the row.

    Returns:
        dict: Keys 'title', 'url', 'content'. 'title' falls back to
        "No title" when the page has no <h1>; 'content' is an empty string
        when neither known content container is present.
    """
    heading = article_soup.find('h1')
    title = heading.text.strip() if heading else "No title"

    # Try the primary content container first, then the alternative class name.
    content_div = article_soup.find('div', class_='article-content-body__item-content')
    if content_div is None:
        content_div = article_soup.find('div', class_='read-page--content')

    content = ""
    if content_div is not None:
        content = ' '.join(p.text.strip() for p in content_div.find_all('p'))

    return {'title': title, 'url': article_url, 'content': content}


def scrape_liputan6(url="https://www.liputan6.com/tekno/berita",
                    output_file='l6t_technews.csv',
                    delay=3,
                    timeout=30):
    """Scrape the articles linked from a Liputan6 index page into a CSV file.

    Fetches the index page, collects every link containing '/read/', downloads
    each article, and writes the rows to ``output_file`` with the columns
    title, url, content. Progress and errors are printed; nothing is raised
    to the caller (best-effort scraping).

    Args:
        url: Index page to crawl. Defaults to the tekno/berita index.
        output_file: Path of the CSV file to write (overwritten if present).
        delay: Seconds to wait after every article request, successful or not.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        None
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        # Fetch the index page; an HTTP error here aborts the whole run.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        articles = _collect_article_links(soup, url)

        print(f"Found {len(articles)} articles")

        # Scrape every article; a failure on one article skips only that one.
        data = []
        for article_url in articles:
            print(f"Scraping: {article_url}")

            try:
                article_response = requests.get(
                    article_url, headers=headers, timeout=timeout
                )
                # Do not store 4xx/5xx error pages as if they were articles.
                article_response.raise_for_status()
                article_soup = BeautifulSoup(article_response.text, 'html.parser')
                data.append(_parse_article(article_soup, article_url))
            except Exception as e:
                print(f"Error scraping {article_url}: {e}")
            finally:
                # Be polite to the site on failures too, not only on success.
                time.sleep(delay)

        # Save to CSV (a header-only file is written when nothing was scraped).
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=['title', 'url', 'content'])
            writer.writeheader()
            writer.writerows(data)

    except Exception as e:
        print(f"Error: {e}")

In [115]:
# Run the scraper only when this code executes as the main module
# (which is the case when the cell is run in a notebook kernel), not on import.
if __name__ == "__main__":
    scrape_liputan6()

Found 82 articles
Scraping: https://www.liputan6.com/tekno/read/6141666/apple-dan-google-jajaki-integrasi-gemini-untuk-otak-baru-siri
Scraping: https://www.liputan6.com/tekno/read/6140904/tak-mau-kalah-dengan-apple-5-raksasa-hp-china-bersatu-perketat-sistem-keamanan-pengguna
Scraping: https://www.liputan6.com/tekno/read/6136673/google-photos-bakal-bisa-bedakan-foto-asli-vs-buatan-ai-begini-cara-kerjanya
Scraping: https://www.liputan6.com/tekno/read/6140303/hp-redmi-note-15-pro-dan-redmi-note-15-pro-plus-lolos-tkdn-siap-meluncur-di-indonesia
Scraping: https://www.liputan6.com/tekno/read/6140153/harga-samsung-galaxy-a07-mulai-rp1-jutaan-dapat-update-os-hingga-6-tahun
Scraping: https://www.liputan6.com/tekno/read/6139396/infinix-hot-60-pro-hadir-di-indonesia-hp-android-rp-2-jutaan-dengan-spesifikasi-gahar
Scraping: https://www.liputan6.com/tekno/read/6139496/vivo-v60-meluncur-28-agustus-hadir-dengan-kamera-telephoto-pertama-di-seri-v
Scraping: https://www.liputan6.com/tekno/read/6137809/s

In [2]:
# Per-category CSV exports that are merged into one dataset
file_paths = [
    'l6t_gadget.csv',
    'l6t_internet.csv',
    'l6t_games.csv',
    'l6t_apps.csv',
    'l6t_technews.csv',
    'l6t_tips.csv',
]

# Load every file into its own DataFrame and stack them, renumbering the
# index from 0 so rows from different files do not share index labels
all_data = pd.concat(
    [pd.read_csv(csv_path) for csv_path in file_paths],
    ignore_index=True,
)

# Write the merged table to a new file, without the index column
all_data.to_csv('l6_teknologi.csv', index=False, encoding='utf-8-sig')