In [3]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
import pandas as pd

# Define headers for web requests (to mimic a browser request)
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Function to scrape the Bugis dictionary website
def scrape_bahasa_bugis_words(url):
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()  # Raise an exception for any non-200 responses
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Modify these according to the structure of the Bugis website
    word_entries = []
    entries = soup.find_all('div', class_='word-entry')  # Example: you might need to adjust this selector

    for entry in entries:
        bugis_word = entry.find('span', class_='bugis-word').get_text(strip=True)
        translation = entry.find('span', class_='translation').get_text(strip=True)
        word_entries.append({'bugis_word': bugis_word, 'translation': translation})

    return word_entries

# Function to save words to a CSV file
def save_words_to_csv(words, filename):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['bugis_word', 'translation'])
        writer.writeheader()
        for word in words:
            writer.writerow(word)

# Function to convert words to a DataFrame for easy inspection
def words_to_dataframe(words):
    return pd.DataFrame(words)

print("Setup Complete")

# Notebook code execution example
base_url = "https://www.liputan6.com/hot/read/5187403/70-bahasa-bugis-dan-artinya-dari-kosakata-hingga-contoh-kalimatnya?page="  # Replace with the actual URL
all_words = []

# Scrape multiple pages if needed
for page_num in range(1, 8):  # Adjust range according to the number of pages available
    print(f"Scraping page {page_num}...")
    url = f"{base_url}{page_num}"
    words = scrape_bahasa_bugis_words(url)
    all_words.extend(words)
    time.sleep(random.uniform(1, 3))  # Sleep between requests to avoid being blocked

Setup Complete
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...


In [4]:
# Convert scraped words to DataFrame for inspection
words_df = words_to_dataframe(all_words)
display(words_df)  # Use this to visually inspect the DataFrame in a notebook

# Save all the collected words to a CSV file
save_words_to_csv(all_words, 'bahasa_bugis_words.csv')
print("Scraping complete and saved to bahasa_bugis_words.csv!")

Scraping complete and saved to bahasa_bugis_words.csv!
