In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# PTT board to scrape
board = 'Stock'

# One shared session so cookies persist across every request
session = requests.Session()
# The over18 cookie is what lets requests through the age-verification check
session.cookies.set(name='over18', value='1')

# Work out the number of the board's newest index page
def get_latest_page_number():
    """Return the page number of the board's newest index page.

    The newest page is fetched as ``index.html``; its number is derived from
    the "previous page" link (``index<N>.html``) as ``N + 1``.

    Returns:
        int: the number of the newest index page.

    Raises:
        Exception: if the page does not answer with HTTP 200, or if the
            previous-page link is missing, has no href, or does not have the
            ``index<N>.html`` form.
        requests.RequestException: if the request itself fails or times out.
    """
    url = f'https://www.ptt.cc/bbs/{board}/index.html'
    # A timeout keeps a stalled connection from hanging the scraper forever.
    response = session.get(url, timeout=10)
    if response.status_code != 200:
        raise Exception(f'Failed to load page {url}')
    soup = BeautifulSoup(response.text, 'html.parser')
    prev_page_link = soup.find('a', string='‹ 上頁')
    # .get() rather than ['href']: a matching anchor without an href should
    # produce the descriptive error below instead of a bare KeyError.
    href = prev_page_link.get('href') if prev_page_link else None
    if not href:
        raise Exception('Failed to find the latest page number')
    # Parse only the file name so that a board name containing "index"
    # cannot confuse the extraction.
    filename = href.rsplit('/', 1)[-1]
    page_digits = filename[len('index'):-len('.html')]
    well_formed = (
        filename.startswith('index')
        and filename.endswith('.html')
        and page_digits.isdecimal()
    )
    if not well_formed:
        raise Exception('Failed to find the latest page number')
    return int(page_digits) + 1

# List the articles (title + absolute URL) found on one index page
def get_article_urls_and_titles(page):
    """Collect the title and absolute URL of each article on an index page.

    Args:
        page: index page number, interpolated into ``index{page}.html``.

    Returns:
        list[dict]: one ``{'title': ..., 'url': ...}`` dict per ``r-ent``
        entry that contains a link; entries without a link are skipped.

    Raises:
        Exception: if the index page does not answer with HTTP 200.
    """
    url = f'https://www.ptt.cc/bbs/{board}/index{page}.html'
    response = session.get(url)
    if response.status_code != 200:
        raise Exception(f'Failed to load page {url}')

    soup = BeautifulSoup(response.text, 'html.parser')
    article_info = []
    for entry in soup.find_all('div', class_='r-ent'):
        link = entry.find('a')
        if not link:
            # No anchor in this entry, so there is nothing to fetch.
            continue
        link_text = link.text
        if link_text:
            title = link_text.strip()
        else:
            title = 'No Title'
        article_info.append({'title': title, 'url': 'https://www.ptt.cc' + link['href']})
    return article_info

# Fetch the body text of a single article
def get_article_content(url):
    """Fetch an article page and return the text of its main content.

    All ``div``, ``span``, ``script`` and ``a`` elements inside the
    ``#main-content`` element are removed before the text is extracted.

    Args:
        url: absolute URL of the article page.

    Returns:
        str | None: the stripped text; ``'No main content found'`` when the
        ``#main-content`` element is missing or has no text; ``None`` when
        the page could not be loaded (network error, timeout, or a non-200
        status).
    """
    try:
        # A timeout keeps one stalled article from hanging the whole crawl.
        response = session.get(url, timeout=10)
    except requests.RequestException as exc:
        # Treat network failures like a non-200 answer: report and skip,
        # instead of aborting the crawl and losing what was collected.
        print(f'Failed to load article {url}: {exc}')
        return None
    if response.status_code != 200:
        print(f'Failed to load article {url}')
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    main_content = soup.find(id='main-content')

    if not main_content:
        return 'No main content found'

    # Remove the nested elements so only the remaining text is returned.
    for tag in main_content.find_all(['div', 'span', 'script', 'a']):
        tag.decompose()
    remaining_text = main_content.text
    return remaining_text.strip() if remaining_text else 'No main content found'

# Fetch the comments (pushes) of a single article
def get_article_comments(url):
    """Fetch an article page and return its comments as a DataFrame.

    Args:
        url: absolute URL of the article page.

    Returns:
        pandas.DataFrame | None: one row per ``div.push`` element with the
        columns ``URL``, ``push_tag``, ``push_userid``, ``push_content`` and
        ``push_time``; a missing field is filled with a ``'No ...'``
        placeholder. ``None`` when the page could not be loaded (network
        error, timeout, or a non-200 status).
    """
    try:
        # A timeout keeps one stalled article from hanging the whole crawl.
        response = session.get(url, timeout=10)
    except requests.RequestException as exc:
        # Treat network failures like a non-200 answer: report and skip.
        print(f'Failed to load article {url}: {exc}')
        return None
    if response.status_code != 200:
        print(f'Failed to load article {url}')
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    pushes = soup.find_all('div', class_='push')
    push_tag = []
    push_userid = []
    push_content = []
    push_time = []

    for push in pushes:
        tag_span = push.find('span', class_='push-tag')
        userid_span = push.find('span', class_='push-userid')
        content_span = push.find('span', class_='push-content')
        time_span = push.find('span', class_='push-ipdatetime')

        if content_span:
            content = content_span.text.strip()
            # Drop only the single leading ':' marker. lstrip(': ') would
            # strip a whole character set and eat text that itself starts
            # with ':' (e.g. ": :D" would become "D").
            if content.startswith(':'):
                content = content[1:]
            content = content.lstrip(' ')
        else:
            content = 'No Content'

        push_tag.append(tag_span.text.strip() if tag_span else 'No Tag')
        push_userid.append(userid_span.text.strip() if userid_span else 'No User ID')
        push_content.append(content)
        push_time.append(time_span.text.strip() if time_span else 'No Time')

    # The scalar URL is broadcast to every row of the frame.
    comments = pd.DataFrame({
        "URL": url,
        "push_tag": push_tag,
        "push_userid": push_userid,
        "push_content": push_content,
        "push_time": push_time
    })
    return comments

# Accumulators for the crawl: one dict per article, one DataFrame per article's comments
all_articles = []
all_comments = []

# Look up the number of the newest index page
latest_page = get_latest_page_number()
print(f'Latest page number: {latest_page}')

# Page range to crawl (adjust as needed): from the newest page back to the one before it
start_page = latest_page
end_page = latest_page - 1

# Walk the pages newest-first; end_page itself is included in the range.
for page in range(start_page, end_page - 1, -1):
    print(f'Fetching page {page}')
    articles_info = get_article_urls_and_titles(page)
    for article_info in articles_info:
        url, title = article_info['url'], article_info['title']
        print(f'Fetching article {url}')
        article_text = get_article_content(url)
        comments = get_article_comments(url)
        if article_text:
            all_articles.append({'URL': url, 'Title': title, 'Content': article_text})
        # Comments are kept only for articles whose content was retrieved.
        if article_text and comments is not None:
            all_comments.append(comments)
        time.sleep(0.5)  # pause between articles to avoid hitting the site too quickly

# Turn the collected results into DataFrames, then print and save them
df_articles = pd.DataFrame(all_articles)
df_comments = (
    pd.concat(all_comments).reset_index(drop=True)
    if all_comments
    else pd.DataFrame()
)

print(df_articles)
print(df_comments)

# Optional: save the article contents and the comments to CSV files
df_articles.to_csv('./ptt_stock_articles.csv', index=False, encoding='utf-8-sig')
df_comments.to_csv('./ptt_stock_comments.csv', index=False, encoding='utf-8-sig')

Latest page number: 7549
Fetching page 7549
Fetching article https://www.ptt.cc/bbs/Stock/M.1721364282.A.ACB.html
Fetching article https://www.ptt.cc/bbs/Stock/M.1721364406.A.4FB.html
Fetching article https://www.ptt.cc/bbs/Stock/M.1721365467.A.4E9.html
Fetching article https://www.ptt.cc/bbs/Stock/M.1721365814.A.7AF.html
Fetching article https://www.ptt.cc/bbs/Stock/M.1719872231.A.9BA.html
Fetching article https://www.ptt.cc/bbs/Stock/M.1721349002.A.3F4.html
Fetching page 7548
Fetching article https://www.ptt.cc/bbs/Stock/M.1721356359.A.52C.html
Fetching article https://www.ptt.cc/bbs/Stock/M.1721356684.A.291.html
Fetching article https://www.ptt.cc/bbs/Stock/M.1721357100.A.14C.html
Fetching article https://www.ptt.cc/bbs/Stock/M.1721357144.A.38E.html
Fetching article https://www.ptt.cc/bbs/Stock/M.1721357505.A.227.html
Fetching article https://www.ptt.cc/bbs/Stock/M.1721358283.A.C23.html
Fetching article https://www.ptt.cc/bbs/Stock/M.1721358692.A.953.html
Fetching article https://ww