In [2]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [12]:
import requests
import time
import json
from bs4 import BeautifulSoup


PTT_URL = 'https://www.ptt.cc'


def get_web_page(url, timeout=10):
    """Download a page and return its HTML text.

    The request carries an ``over18=1`` cookie (presumably so that
    age-restricted boards are served directly instead of a confirmation
    page -- verify against the site if this matters).

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text, or ``None`` when the request fails or the
        server answers with a status code other than 200.
    """
    try:
        resp = requests.get(
            url=url,
            cookies={'over18': '1'},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Network-level failures are reported the same way as HTTP errors so
        # callers only have to check for None.
        print('Request failed:', url, exc)
        return None

    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text


def _parse_push_count(push_str):
    """Turn the text of an entry's ``nrec`` cell into an integer score.

    Numeric text is converted directly. The marker '爆' maps to 99 and any
    text starting with 'X' maps to -10; both are coarse stand-ins rather than
    exact counts. Empty or unrecognised text yields 0.
    """
    if not push_str:
        return 0
    try:
        return int(push_str)
    except ValueError:
        if push_str == '爆':
            return 99
        if push_str.startswith('X'):
            return -10
        return 0


def get_articles(dom, date):
    """Extract the articles posted on ``date`` from one board index page.

    Args:
        dom: HTML text of the index page. ``None`` or an empty string (e.g.
            the result of a failed download) is accepted and produces an
            empty result.
        date: Date string compared against the stripped text of each entry's
            ``date`` cell, e.g. ``'3/10'``.

    Returns:
        A tuple ``(articles, prev_url)``. ``articles`` is a list of dicts
        with the keys ``title``, ``href``, ``push_count`` and ``author``.
        ``prev_url`` is the ``href`` of the second link in the paging
        button group, or ``None`` when that link is missing or carries no
        ``href``.
    """
    if not dom:
        return [], None

    soup = BeautifulSoup(dom, 'html5lib')

    # The second anchor in the paging group is used as the "previous page"
    # link. It may be absent or lack an href, so look it up defensively
    # instead of indexing blindly.
    prev_url = None
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    if paging_div is not None:
        paging_links = paging_div.find_all('a')
        if len(paging_links) > 1:
            prev_url = paging_links[1].get('href')

    articles = []
    for entry in soup.find_all('div', 'r-ent'):
        date_div = entry.find('div', 'date')
        if date_div is None or date_div.text.strip() != date:
            continue

        # Entries without any link are skipped (presumably deleted posts).
        link = entry.find('a')
        if not link:
            continue

        nrec_div = entry.find('div', 'nrec')
        author_div = entry.find('div', 'author')
        articles.append({
            'title': link.text,
            'href': link['href'],
            'push_count': _parse_push_count(nrec_div.text if nrec_div else ''),
            'author': author_div.text if author_div else ''
        })
    return articles, prev_url


def get_author_ids(posts, pattern):
    """Collect the distinct authors whose id contains ``pattern``.

    Args:
        posts: Iterable of dicts, each providing an ``'author'`` string.
        pattern: Substring looked for inside every author id.

    Returns:
        A set of the matching author ids.
    """
    return {
        entry['author']
        for entry in posts
        if pattern in entry['author']
    }


In [13]:
if __name__ == '__main__':
    # Crawl the Gossiping board backwards from the newest index page,
    # collecting every article dated today, then report the popular ones and
    # dump everything to gossiping.json.
    current_page = get_web_page(PTT_URL + '/bbs/Gossiping/index.html')
    if current_page:
        articles = []
        # Month/day with the month's leading zero stripped, e.g. '3/10'.
        today = time.strftime("%m/%d").lstrip('0')
        current_articles, prev_url = get_articles(current_page, today)
        # Keep walking to older pages for as long as the page just parsed
        # still contained at least one article from today.
        while current_articles:
            articles += current_articles
            if not prev_url:
                # No older page to follow.
                break
            current_page = get_web_page(PTT_URL + prev_url)
            if not current_page:
                # Download failed; keep what was collected so far instead of
                # crashing before the results are written out.
                break
            current_articles, prev_url = get_articles(current_page, today)

        print('今天有', len(articles), '篇文章')
        threshold = 50
        print('熱門文章(> %d 推):' % (threshold))
        for a in articles:
            if a['push_count'] > threshold:
                print(a)
        with open('gossiping.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, sort_keys=True, ensure_ascii=False)

今天有 2130 篇文章
熱門文章(> 50 推):
{'title': '[問卦] 人在德國 還有幾集可以逃？', 'href': '/bbs/Gossiping/M.1583848301.A.7F9.html', 'push_count': 54, 'author': 'jim18324'}
{'title': '[爆卦] 第二波武漢台商華航包機已起飛', 'href': '/bbs/Gossiping/M.1583848187.A.98A.html', 'push_count': 52, 'author': 'huangjyuan'}
{'title': '[問卦] 最想宮崎駿裡的誰當你女友？', 'href': '/bbs/Gossiping/M.1583847523.A.70D.html', 'push_count': 73, 'author': 'ericgary'}
{'title': '[新聞] 媽媽體溫37.1度未過關！武漢華航包機重開', 'href': '/bbs/Gossiping/M.1583846938.A.3FC.html', 'push_count': 98, 'author': 'kimiboy'}
{'title': '[問卦] 大家看過最詭異的書是什麼？', 'href': '/bbs/Gossiping/M.1583845920.A.EA1.html', 'push_count': 72, 'author': 'pennyleo'}
{'title': '[新聞] 第二批武漢包機延誤！旅客體溫過高 又滑', 'href': '/bbs/Gossiping/M.1583844688.A.FFC.html', 'push_count': 99, 'author': 'cheinshin'}
{'title': '[新聞] 華航專機旅客體溫過高 飛機後推後又滑回', 'href': '/bbs/Gossiping/M.1583844423.A.76C.html', 'push_count': 75, 'author': 'KKmex'}
{'title': '[問卦] 宮崎駿哪部必看啊？', 'href': '/bbs/Gossiping/M.1583843618.A.D96.html', 'push_count': 66, 'au