In [73]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import time
import os

In [122]:
class ScraperServer():
    def __init__(self, 
                 main_url: str = None,
                 headers: dict = None,
                 columns: dict = None,
                 ):
        self.main_url = main_url
        self.headers  = headers
        self.columns  = columns
        self.articles_report_columns = ['URL', 'Title', 'Status', 'RepNo.', 'Date', 'Summary', 'Background', 'Check', 'Result']
        self.articles_report_csvfile = './articles_report.csv'
        if os.path.exists(self.articles_report_csvfile):
            self.articles_report_df = pd.read_csv(self.articles_report_csvfile, index_col = 0)
            self.empty_df = False
        else:
            self.articles_report_df = pd.DataFrame(columns=self.articles_report_columns)
            self.empty_df = True

        self.special_urls = ['https://tfc-taiwan.org.tw/articles/1718', 'https://tfc-taiwan.org.tw/articles/349', 'https://tfc-taiwan.org.tw/articles/category/26/28']
        

    def get_articles(self):
        """Scrape every listing page of the report section and save new reports to CSV.

        The first listing page is fetched to read the index of the last page
        from the pager; every listing page is then visited and each linked
        article that is not already in ``self.articles_report_df`` is fetched
        with ``get_report``.

        Rows collected before a failure are still written to the CSV file, so
        an aborted run can be resumed later (already-saved URLs are skipped).

        Returns:
            None. Errors are printed, not raised.
        """
        request_timeout = 30  # seconds; without it a stalled connection blocks forever
        articles_url = self.main_url + self.columns['articles_report']
        try:
            # Rows scraped in this run when a CSV file already existed.
            new_df = pd.DataFrame(columns=self.articles_report_columns)

            time.sleep(1)
            res = requests.get(articles_url, headers=self.headers, timeout=request_timeout)
            soup = BeautifulSoup(res.text, 'html.parser')

            # The "last page" pager link ends in "?page=<index>".
            last_page_url = soup.find('div', attrs = {'class': 'item-list'}).\
                find('li', attrs = {'class': 'pager-last last'}).\
                    find('a').get('href')
            total_pages = int(last_page_url.split('=')[1])

            print('Total {} pages'.format(total_pages))

            try:
                self._scrape_listing_pages(articles_url, total_pages, new_df, request_timeout)
            finally:
                # Runs on normal completion, on an early return and on Ctrl-C,
                # so partial progress is never thrown away.
                self._save_reports(new_df)

        except Exception as e:
            print("Get articles error: {})".format(e))
            return

    def _scrape_listing_pages(self, articles_url, total_pages, new_df, request_timeout=30):
        """Visit listing pages 0..total_pages and collect rows for unseen articles.

        Rows go into ``self.articles_report_df`` when the instance started
        without a CSV file, otherwise into ``new_df``. Stops at the first page
        or report error after printing it.
        """
        for page in range(0, total_pages + 1):
            # Page 0 is the bare section URL; later pages use "?page=<n>".
            page_url = articles_url if page == 0 else articles_url + '?page=' + str(page)
            article_url = None

            try:
                time.sleep(1)
                print('Page: {}'.format(page_url))
                res = requests.get(page_url, headers=self.headers, timeout=request_timeout)
                soup = BeautifulSoup(res.text, 'html.parser')

                articles = soup.find('div', attrs = {'class': 'view-content'}).\
                    findAll('div', attrs = {'class': 'views-row-inner entity-row-inner'})

                for a in articles:
                    # Drop the leading "/" of the href; main_url already ends with one.
                    suff_url = str(a.findAll('a')[0].get('href'))
                    article_url = self.main_url + suff_url[1:]
                    # One listing entry links to a category page; use the article URL instead.
                    if article_url == 'https://tfc-taiwan.org.tw/articles/category/26/28':
                        article_url = 'https://tfc-taiwan.org.tw/articles/84'
                    print('Article: {}'.format(article_url))

                    if self.check_report_exists_by_url(article_url):
                        continue

                    err, title, status, repno, date, summary, paragraph_dict = self.get_report(article_url)
                    if err:
                        return

                    list_row = [
                        article_url, title, status, repno, date, summary,
                        paragraph_dict.get('背景', ''),
                        paragraph_dict.get('查核', ''),
                        paragraph_dict.get('結論', ''),
                    ]
                    target_df = self.articles_report_df if self.empty_df else new_df
                    target_df.loc[len(target_df)] = list_row

            except Exception as e:
                print("Get page error: {}, page: {}, article: {})".format(e, page_url, article_url))
                return

    def _save_reports(self, new_df):
        """Write the report table to the CSV file, putting rows from ``new_df`` first."""
        if self.empty_df:
            self.articles_report_df.to_csv(self.articles_report_csvfile, encoding='utf_8_sig')
        elif len(new_df) > 0:
            # ignore_index gives a clean 0..n-1 index; concatenating the two
            # frames as-is would repeat index labels and sorting on them would
            # interleave new and old rows.
            self.articles_report_df = pd.concat([new_df, self.articles_report_df], ignore_index=True)
            self.articles_report_df.to_csv(self.articles_report_csvfile, encoding='utf_8_sig')

    def get_report(self, article_url):
        """Download one report page and extract its metadata and body sections.

        Args:
            article_url: Absolute URL of the report page.

        Returns:
            A 7-tuple ``(err, title, status, repno, date, summary,
            paragraph_dict)``. ``err`` is None on success. On any failure the
            exception is printed and returned as ``err`` and the other six
            items are None.
        """
        # Pages parsed by get_special, which puts every paragraph under one
        # section instead of looking for section headers.
        special_layout_urls = (
            'https://tfc-taiwan.org.tw/articles/1718',
            'https://tfc-taiwan.org.tw/articles/349',
        )

        try:
            time.sleep(1)
            # A timeout keeps a stalled connection from blocking the whole run.
            res = requests.get(article_url, headers=self.headers, timeout=30)
            res.encoding = 'utf-8' # make sure the fetched Chinese text is not garbled
            soup = BeautifulSoup(res.text, 'html.parser')
            content_main = soup.find('div', attrs = {'class': 'content-main'})

            # Any other URL, including a special_urls entry without a dedicated
            # parser, goes through the normal path so the result names are
            # always assigned.
            if article_url in self.special_urls and article_url in special_layout_urls:
                title, status, repno, date, summary, paragraph_dict = self.get_special(content_main)
            else:
                title, status, repno, date, summary = self.get_summary(content_main)
                paragraph_dict = self.get_content(content_main)

            print('===============================================================================')

            return None, title, status, repno, date, summary, paragraph_dict

        except Exception as e:
            print("Get report error: {})".format(e))
            return e, None, None, None, None, None, None

    def get_summary(self, content_main):
        """Read the title, kanban metadata and preface text of a report page.

        Args:
            content_main: The parsed ``content-main`` element of the page.

        Returns:
            ``(title, status, repno, date, summary)``. ``status``, ``repno``
            and ``date`` are None when the page has no ``kanban`` element;
            ``summary`` is ``''`` when it has no ``node-preface`` element.
        """
        title_tag = content_main.find('h2', attrs = {'class': 'node-title'})
        title = title_tag.get_text()
        print(title)

        status = repno = date = None
        kanban = content_main.find('div', attrs = {'id': 'kanban'})
        if kanban:
            # The first CSS class of the kanban element is used as the status.
            status = kanban.get_attribute_list('class')[0]
            repno = kanban.find('div', attrs = {'class': 'entity-list-repno'}).get_text()
            date_text = kanban.find('div', attrs = {'class': 'entity-list-date'}).get_text()
            # Keep the part after the first full-width slash.
            date = date_text.split('／')[1]
            print(status, repno, date)

        summary = ''
        preface = content_main.find('div', attrs = {'class': 'node-preface'})
        if preface:
            # One line per <p> of the preface, each terminated by a newline.
            summary = ''.join(p.get_text() + '\n' for p in preface.findAll('p'))

        return title, status, repno, date, summary
    
    def get_content(self, content_main):
        """Split the body of a report page into its named sections.

        Section boundaries come from ``<h2>`` headers when the body has any;
        otherwise from ``<p>`` elements whose whole text is 背景, 查核, 查證 or
        結論. Each section is rendered as newline-terminated lines of the form
        ``"Paragraph: <text>"`` or ``"Image: <src>"``.

        Args:
            content_main: The parsed ``content-main`` element of the page.

        Returns:
            dict mapping a section key to its text. Headers containing 背景,
            查核/查證 or 結論 map to the keys 背景, 查核 and 結論; any other
            header gets ``'其他0'``, ``'其他1'``, ... in order of appearance.
            A later section with the same key replaces an earlier one.

        Raises:
            ValueError: If no section header is found in the body.
        """
        # A <p> with exactly one of these texts starts a new section.
        section_breaks = ('查核', '查證', '結論')

        def paragraph_lines(paragraph):
            """Render one <p>: one line per image, or else one line with its text."""
            images = paragraph.find_all('img')
            if images:
                return ["Image: {}".format(image.get('src', '')) for image in images]
            return ["Paragraph: {}".format(paragraph.text)]

        paragraphs_content = content_main.\
            find('div', attrs = {'class': 'field field-name-body field-type-text-with-summary field-label-hidden'}).\
                find('div', attrs = {'class': 'field-item even'})

        paragraph_lists = [] # background, check/verification, conclusion
        headers = paragraphs_content.find_all('h2')
        if not headers:
            paragraph_list = []
            for paragraph in paragraphs_content.findAll('p'):
                if paragraph.text == '背景':
                    paragraph_list.append("Header: {}".format(paragraph.text))
                elif paragraph.text in section_breaks:
                    paragraph_lists.append(paragraph_list) # close the previous section
                    paragraph_list = ["Header: {}".format(paragraph.text)]
                elif paragraph_list:
                    # Content is kept only once a header has been seen.
                    paragraph_list.extend(paragraph_lines(paragraph))
            paragraph_lists.append(paragraph_list) # close the last section
        else:
            for header in headers:
                # find_all_next returns every later <p>, including those that
                # belong to later headers; the tidy-up pass below strips them.
                paragraph_list = ["Header: {}".format(header.text)]
                for paragraph in header.find_all_next('p'):
                    if paragraph.text in section_breaks:
                        paragraph_lists.append(paragraph_list) # close the previous section
                        paragraph_list = ["Header: {}".format(paragraph.text)]
                    else:
                        paragraph_list.extend(paragraph_lines(paragraph))

                paragraph_lists.append(paragraph_list)

        # Tidy up: drop from each section the lines that also appear in the
        # section right after it.
        for current, following in zip(paragraph_lists, paragraph_lists[1:]):
            for line in following:
                if line in current:
                    current.remove(line)

        # An empty list is only an artefact (e.g. a section break seen before
        # any header); it carries no header line to name a section with.
        paragraph_lists = [section for section in paragraph_lists if section]
        if not paragraph_lists:
            raise ValueError('no section header found in article body')

        paragraph_dict = {}
        # Counter for unrecognised headers; it must live outside the loop so
        # that each of them gets its own key.
        else_num = 0
        for section in paragraph_lists:
            # maxsplit=1 keeps header text that itself contains ': ' intact.
            header = section[0].split(': ', 1)[1]

            if '背景' in header:
                key = '背景'
            elif '查核' in header or '查證' in header:
                key = '查核'
            elif '結論' in header:
                key = '結論'
            else:
                key = '其他' + str(else_num)
                else_num += 1

            paragraph_dict[key] = ''.join(line + '\n' for line in section[1:])

        return paragraph_dict
 

    def get_special(self, content_main):
        """Parse a report page by putting every body paragraph under one section.

        Unlike ``get_content`` this does not look for section headers: all
        ``<p>`` elements of the body are rendered in order and stored under
        the key ``'查核'``.

        Args:
            content_main: The parsed ``content-main`` element of the page.

        Returns:
            ``(title, status, repno, date, summary, paragraph_dict)`` where the
            first five items come from ``get_summary``.
        """
        title, status, repno, date, summary = self.get_summary(content_main)

        body_field = content_main.find(
            'div',
            attrs = {'class': 'field field-name-body field-type-text-with-summary field-label-hidden'},
        )
        body_item = body_field.find('div', attrs = {'class': 'field-item even'})

        rendered = []
        for p_tag in body_item.findAll('p'):
            img_tags = p_tag.find_all('img')
            if img_tags:
                # A paragraph with images contributes one line per image, no text.
                rendered.extend("Image: {}".format(img.get('src', '')) for img in img_tags)
            else:
                rendered.append("Paragraph: {}".format(p_tag.text))

        paragraph_dict = {'查核': ''.join(entry + '\n' for entry in rendered)}

        return title, status, repno, date, summary, paragraph_dict

    def check_report_exists_by_url(self, url):
        if url in self.articles_report_df['URL'].values:
            return True
        else:
            return False


# Site root; the section paths in `columns` are appended to it.
main_url = 'https://tfc-taiwan.org.tw/'

# Section name -> URL path below the site root.
columns = {
    'articles_report': 'articles/report',
    'topics': 'topics',
}

# HTTP headers sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
}

# Build the scraper and run it over the report section.
scraper = ScraperServer(main_url=main_url, headers=headers, columns=columns)
scraper.get_articles()
# print(scraper.check_report_exists_by_repno('事實查核報告#2817'))

Total 280 pages
Page: https://tfc-taiwan.org.tw/articles/report
Article: https://tfc-taiwan.org.tw/articles/10216
【事實釐清】網傳「雲林開票把票倒在桌上 有作票嫌疑」？
事實釐清 事實查核報告#2819 2024年1月22日
Article: https://tfc-taiwan.org.tw/articles/10213
【錯誤】網傳影片「大選後，中國向全世界發布最新莊嚴公告，解放軍把台灣領空畫定為防空識別區」？
錯誤 事實查核報告#2818 2024年1月19日
Article: https://tfc-taiwan.org.tw/articles/10212
【錯誤】網傳圖片「習近平致電祝賀賴清德當選中華民國台灣第16任總統」？
錯誤 事實查核報告#2817 2024年1月19日
Article: https://tfc-taiwan.org.tw/articles/10211
【事實釐清】網傳影片「疑似作票，南投67投開票所一直畫6號民進黨，沒唱票、亮票」？
事實釐清 事實查核報告#2816 2024年1月19日
Article: https://tfc-taiwan.org.tw/articles/10209
【易生誤解】網傳「柯文哲的票記給賴清德，台灣的選舉作弊」？
錯誤 事實查核報告#2815 2024年1月19日
Article: https://tfc-taiwan.org.tw/articles/10205
【錯誤】網傳「在國外遺失護照不要辦新護照，申請入國證明書。之後入境時，親友須持申請者的戶口名簿或身分證，交給機場移民局，才可入境」？
錯誤 事實查核報告#2814 2024年1月18日
Article: https://tfc-taiwan.org.tw/articles/10202
【部分錯誤】網傳「國際新聞，疫情在美國又開始嚴重了...Omicron BA的毒性比 Delta 變種高 5 倍，死亡率也比 Delta 高」？
部分錯誤 事實查核報告#2813 2024年1月17日
Article: https://tfc-taiwan.org.tw/articles/10201
【錯誤】網傳「總統投票數被電腦多灌了8%，約144

In [132]:
# Load the saved reports and print every title, one per line.
reports_df = pd.read_csv('articles_report.csv')
for title in reports_df['Title']:
    print(title)

【事實釐清】網傳「雲林開票把票倒在桌上 有作票嫌疑」？
【錯誤】網傳影片「大選後，中國向全世界發布最新莊嚴公告，解放軍把台灣領空畫定為防空識別區」？
【錯誤】網傳圖片「習近平致電祝賀賴清德當選中華民國台灣第16任總統」？
【事實釐清】網傳影片「疑似作票，南投67投開票所一直畫6號民進黨，沒唱票、亮票」？
【易生誤解】網傳「柯文哲的票記給賴清德，台灣的選舉作弊」？
【錯誤】網傳「在國外遺失護照不要辦新護照，申請入國證明書。之後入境時，親友須持申請者的戶口名簿或身分證，交給機場移民局，才可入境」？
【部分錯誤】網傳「國際新聞，疫情在美國又開始嚴重了...Omicron BA的毒性比 Delta 變種高 5 倍，死亡率也比 Delta 高」？
【錯誤】網傳「總統投票數被電腦多灌了8%，約144萬票」？
【錯誤】網傳「總統票比投票人數多出248萬票」？
【易生誤解】網傳影片「唱票3號，你劃2號，選務人員很厲害」？
【錯誤】網傳開票影片「總統選票從大黑包拿出來，這是作票」？
【易生誤解】網傳「一整疊平整的選票，一直『2號賴清德一票』」？
【易生誤解】網傳圖片「昨天大選出現舞弊，這是舞弊證據，2個投開票所，民眾黨被偷了各200票」？
【易生誤解】網傳新聞影片「超扯！侯友宜票數從178萬，秒被調降變153萬」？
【易生誤解】網傳影片「水是能多深？唱票員說不用先公布總票數」？
【錯誤】網傳「總統投票數與政黨投票數差距116萬票，是總統票灌票」？
【錯誤】網傳影片「投票箱夾層有藏票，作票還擋拍」？
【事實釐清】網傳記票單擷圖「藍綠灌票？現場抓包後，藍綠才各扣回100票」?
【錯誤】網傳影片「一直喊柯文哲，結果劃票的人都沒有畫1號」？
【易生誤解】網傳影片「柯文哲75萬票變成35萬票，太離譜了」？
【事實釐清】網傳影片「這就是台灣選舉，喊一號畫二號...人工驗票就是會有很大的問題」？
【錯誤】網傳「台南投票日多處投票所突發當街連環砍人」？
【錯誤】網傳影片「捕獲中國衛星」？
【錯誤】網傳「民進黨又要做票了，小心...全台有17000多個票箱都有我的人」、「第一時間超級電腦就可以獲取選票，還可以提早宣布當選」？
【錯誤】網傳影片「我剛剛被一個畫面嚇到了...檢票員只拿他要的票，不要的又放回去...不禁懷疑選舉的公正性」？
【錯誤】網傳「美國派遣高科技團隊，接入中選會資料中心，幫助民進黨操