In [13]:
import requests
from bs4 import BeautifulSoup

def scrape_bangkok_post_article(url):
    # Set a user-agent header to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; scraperbot/1.0)'
    }
    
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve the page, status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the article title from the <h1> tag (usually inside article-headline)
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else 'No title found'
    
    # Extract publication date and writer info from the article-info div
    published_date = None
    writer = None
    info_div = soup.find('div', class_='article-info')
    if info_div:
        # Look for paragraphs that contain PUBLISHED or WRITER text
        for p in info_div.find_all('p'):
            text = p.get_text(strip=True)
            if text.startswith("PUBLISHED"):
                # Remove the label and extra spaces
                published_date = text.replace("PUBLISHED :", "").strip()
            elif text.startswith("WRITER"):
                writer = text.replace("WRITER:", "").strip()

    # Extract article content from the article-content div
    content_div = soup.find('div', class_='article-content')
    content = content_div.get_text(separator="\n", strip=True) if content_div else 'No content found'
    
    article_data = {
        'title': title,
        'published_date': published_date,
        'writer': writer,
        'content': content
    }
    
    return article_data

# Example usage
if __name__ == '__main__':
    # Replace the URL below with the actual Bangkok Post Business section article URL you want to scrape.
    url = "https://www.bangkokpost.com/business/investment/2970513/set-index-slides-below-1-200-points"
    article = scrape_bangkok_post_article(url)
    if article:
        for key, value in article.items():
            print(f"{key}: {value}\n")


title: SET index slides below 1,200 points

published_date: 1 Mar 2025 at 05:09

writer: Nareerat Wiriyapong & Nuntawun Polkuamdee

content: Analysts say a significant rebound is unlikely for the Thai stock market after another decline. (File photo)
Pressured by weak economic growth prospects, lower than expected company earnings and concerns over the impact of US President Donald Trump's tariffs, the Stock Exchange of Thailand (SET) index slid below 1,200 points on Friday, with analysts saying no significant rebound is in sight.
The Thai index dipped by 2.4% to a low of 1,186.66 points in early morning trade on Friday before recovering to finish at 1,195.26 points by midday, down 1.68%. Heavyweight Delta Electronics Thailand (DELTA) plunged 4.95% in the morning session to 76.75 baht, while Airports of Thailand (AOT) lost 3.45% to 42 baht.
A decline of more than 20% from a peak in October makes the Thai bourse Asia's worst performing equity market in 2025 after foreign fund outflows of

In [21]:
import requests
from bs4 import BeautifulSoup
import datetime
import time
import csv

def scrape_bangkok_post_article(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; scraperbot/1.0)'
    }
    
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve the page, status code: {response.status_code} for URL: {url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract title from the <h1> tag.
    title_tag = soup.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else 'No title found'
    
    # Initialize published_date and writer.
    published_date = None
    writer = None
    info_div = soup.find('div', class_='article-info')
    if info_div:
        for p in info_div.find_all('p'):
            text = p.get_text(strip=True)
            if text.upper().startswith("PUBLISHED"):
                # Expected format: "PUBLISHED : 1 Mar 2025 at 05:09"
                try:
                    date_part = text.replace("PUBLISHED :", "").split("at")[0].strip()
                    published_date = datetime.datetime.strptime(date_part, "%d %b %Y")
                except Exception as e:
                    print(f"Error parsing date from text '{text}': {e}")
                    published_date = text  # fallback to raw text
            elif text.upper().startswith("WRITER"):
                writer = text.replace("WRITER:", "").strip()

    # Extract article content from the article-content div.
    content_div = soup.find('div', class_='article-content')
    content = content_div.get_text(separator="\n", strip=True) if content_div else 'No content found'
    
    return {
        'url': url,
        'title': title,
        'published_date': published_date,
        'writer': writer,
        'content': content
    }

def get_article_links_from_listing(page_number):
    base_url = "https://www.bangkokpost.com/business"
    url = base_url if page_number == 1 else f"{base_url}?page={page_number}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; scraperbot/1.0)'
    }
    print(f"Fetching listing page: {url}")
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve listing page {url} (status: {response.status_code})")
        return []
    
    soup = BeautifulSoup(response.content, 'html.parser')
    links = set()
    for a in soup.find_all('a', href=True):
        href = a['href']
        if '/business/' in href and any(char.isdigit() for char in href):
            full_url = href if href.startswith('http') else "https://www.bangkokpost.com" + href
            links.add(full_url)
    return list(links)

def scrape_articles_from_date(start_date=datetime.datetime(2024, 9, 1)):
    articles = []
    page = 1
    while True:
        article_links = get_article_links_from_listing(page)
        if not article_links:
            print("No more article links found on the listing page.")
            break
        
        stop_loop = False
        print(f"Found {len(article_links)} article links on page {page}.")
        for link in article_links:
            print(f"Scraping article: {link}")
            article = scrape_bangkok_post_article(link)
            if article and isinstance(article['published_date'], datetime.datetime):
                if article['published_date'] >= start_date:
                    articles.append(article)
                else:
                    # Since listing pages are assumed to be in reverse chronological order,
                    # encountering an older article means subsequent articles are too.
                    print(f"Encountered article from {article['published_date'].strftime('%d %b %Y')} which is older than {start_date.strftime('%d %b %Y')}. Stopping further scraping.")
                    stop_loop = True
                    break
            else:
                print(f"Skipping article with unrecognized publication date: {link}")
            time.sleep(1)
        
        if stop_loop:
            break
        page += 1
        time.sleep(2)
        
    return articles

def save_articles_to_csv(articles, filename="bangkokpost_business_articles.csv"):
    fieldnames = ['url', 'title', 'published_date', 'writer', 'content']
    with open(filename, mode='w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        for article in articles:
            if isinstance(article['published_date'], datetime.datetime):
                article['published_date'] = article['published_date'].strftime("%d %b %Y")
            writer.writerow(article)
    print(f"Saved {len(articles)} articles to {filename}")

# Main execution
if __name__ == '__main__':
    # Default start date is 1 Sep 2024; change if needed.
    start_date = datetime.datetime(2024, 9, 1)
    scraped_articles = scrape_articles_from_date(start_date=start_date)
    print(f"\nScraped {len(scraped_articles)} articles published on or after {start_date.strftime('%d %b %Y')} from the Business section.\n")
    
    # Save the scraped articles to a CSV file
    save_articles_to_csv(scraped_articles)
    
    # Optionally, print a summary of each article
    for article in scraped_articles:
        print(f"Title: {article['title']}")
        print(f"Published Date: {article['published_date']}")
        print(f"URL: {article['url']}\n")


Fetching listing page: https://www.bangkokpost.com/business
Found 30 article links on page 1.
Scraping article: https://www.bangkokpost.com/business/general/2971915/central-retail-corporation-sees-revenue-growth-of-6-in-2024
Scraping article: https://www.bangkokpost.com/business/general/2971636/crackdown-urged-on-corruption-foreigners-working-illegally-in-thailand
Scraping article: https://www.bangkokpost.com/business/general/2971915/central-retail-corporation-sees-revenue-growth-of-6-in-2024?view_comment=1
Scraping article: https://www.bangkokpost.com/business/general/2970683/the-lowdown-on-long-stay-visas?view_comment=1
Scraping article: https://www.bangkokpost.com/business/general/2970231/thai-economy-improved-in-january-from-previous-month-says-bank-of-thailand-bot-
Scraping article: https://www.bangkokpost.com/business/general/2967101/welcome-change
Scraping article: https://www.bangkokpost.com/business/general/2967381/baht-expected-to-dip-to-35-by-mid-year
Scraping article: https