In [None]:
from google.colab import drive
import os
import time
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

# Mount Google Drive into the Colab filesystem so the shared team folder is reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Work from the shared project folder so the relative data paths used below resolve.
os.chdir(
    r"/content/drive/My Drive/Colab Notebooks/t-brain_2020_NLP_team_share_folder"
)


## News URLs

In [None]:
# Load the table of news URLs; the `hyperlink` column is what the crawler reads.
# Alternative input file: 'raw_data/train_data_custom_0626.csv'
NEWS_URLS_CSV = 'raw_data/tbrain_train_final_0610.csv'
news_urls = pd.read_csv(NEWS_URLS_CSV)
news_urls.head()

In [None]:
# The host name is the third "/"-separated piece of a URL ("scheme:", "", "host", ...).
hyperlink_domains = news_urls['hyperlink'].apply(lambda link: link.split('/')[2])

# np.unique yields the distinct hosts in sorted order; convert to a plain Python list.
news_sources = np.unique(hyperlink_domains).tolist()
print("共", len(news_sources), "個新聞網站來源")

## Crawl Function

In [None]:
# Helper for checking that a URL answers with status code == 200
def get_url_status_code(url, timeout=30):
    """Issue a GET request to ``url`` and return the HTTP status code.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server (connect / read) before giving
            up. Without a timeout a single unresponsive host would block the
            caller forever.

    Returns:
        The integer status code of the response.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    return requests.get(url, timeout=timeout).status_code

# Characters removed from every extracted text fragment: parentheses, braces,
# angle brackets, and layout whitespace (newline, tab, NBSP, CR, ideographic space).
# Compiled once; the raw string avoids invalid "\(" style escapes.
_NOISE_CHARS = re.compile(r'[(){}<>\n\t\xa0\r\u3000]')


def crawl_news(url, timeout=30):
    """Download a news page and extract its title and article text.

    The element selection is site specific: each known domain has its own
    rule for locating the article paragraphs, and every other domain falls
    back to collecting all ``<p>`` elements.

    Args:
        url: Full article URL; the third "/"-separated piece is used as the
            domain that selects the extraction rule.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        A ``(title, article)`` pair. ``title`` is the text of the first
        ``<h1>`` (or of the first ``<p>`` for sina.com.hk), or ``[]`` when no
        title could be found. ``article`` is the cleaned, concatenated text.
        ``([], [])`` is returned when the request fails, the status code is
        not 200, or the content elements cannot be located.
    """
    domain = url.split("/")[2]
    # When True, selected elements are kept even if they carry HTML attributes.
    special = False

    # Fetch the page exactly once and reuse the response for the status check;
    # a network error is reported as a failed crawl instead of aborting the caller.
    try:
        res = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print("URL:", url, "request failed:", e)
        return [], []

    if res.status_code != 200:
        return [], []

    # These sites need the encoding forced to UTF-8 before reading res.text.
    if domain in ["news.mingpao.com", "www.coolloud.org.tw", "hk.on.cc"]:
        res.encoding = 'UTF-8'

    if domain in ["news.tvbs.com.tw"]:
        # Turn line breaks into paragraph tags so each line becomes its own <p>.
        soup = BeautifulSoup(res.text.replace("<br>", "<p>"))
    else:
        soup = BeautifulSoup(res.text)

    # get the title
    heading = soup.find('h1')
    if heading is None:
        print("URL:", url, "title extraction failed.")
        title = []
    else:
        title = heading.text

    try:
        # get the content
        if domain == 'mops.twse.com.tw':
            contents = soup.find_all('td', attrs={'style': "text-align:left !important;"})
            special = True
        elif domain == "domestic.judicial.gov.tw":
            contents = soup.find_all('pre')
        elif domain == "udn.com":
            # The whole article lives in one section; return it directly.
            contents = soup.find('section', attrs={'class': 'article-content__editor'})
            return title, _NOISE_CHARS.sub('', contents.text)
        elif domain == "sina.com.hk":
            contents = soup.find_all('p')
            # This site's title is taken from the first paragraph, not the <h1>.
            title = contents[0].text
            special = True
        elif domain in ["www.hk01.com", "m.ctee.com.tw", "house.ettoday.net", "ec.ltn.com.tw", "www.wealth.com.tw"]:
            contents = soup.find_all('p')
            special = True
        elif domain == 'www.nownews.com':
            contents = soup.find_all('div', 'newsMsg')
            special = True
        elif domain in ["www.bnext.com.tw"]:
            try:
                contents = soup.find_all('div', attrs={"data-url": url})[0].find_all('p')
            except IndexError:
                # No container tagged with this URL: fall back to every <p>.
                contents = soup.find_all('p')
        elif domain in ["hk.on.cc"]:
            contents = soup.find_all('div', attrs={"class": "paragraph"})
            special = True
        elif domain in ["www.storm.mg"]:
            contents = soup.find_all('div', attrs={"id": "CMS_wrapper", "class": "article_content_inner"})[0].find_all('p')
            special = True
        else:
            contents = soup.find_all('p')
    except Exception:
        # Catch Exception (not a bare except) so Ctrl-C can still stop a long crawl.
        print("URL:", url, "contents extraction failed.")
        return [], []

    # join the content
    sentences = []
    for content in contents:

        # On ltn.com.tw an element with class "appE1121" ends the article body.
        if domain in ["m.ltn.com.tw", "ec.ltn.com.tw", "news.ltn.com.tw"]:
            if content.attrs.get("class") == ["appE1121"]:
                break

        # Keep only attribute-free elements unless the site was marked special.
        if not content.attrs or special:

            # Elements containing a link are skipped on these sites.
            if domain in ["m.ctee.com.tw", "news.tvbs.com.tw"]:
                if content.find('a') is not None:
                    continue

            # On wealth.com.tw the first element containing a link ends the article.
            if domain in ["www.wealth.com.tw"]:
                if content.find("a") is not None:
                    break

            # Elements containing a <span> or a link are skipped on these sites.
            if domain in ["technews.tw", "money.udn.com", "estate.ltn.com.tw", "ccc.technews.tw", "finance.technews.tw"]:
                if len(content.find_all('span')) > 0 or len(content.find_all('a')) > 0:
                    continue

            # extract text and do basic clean
            sentences.append(_NOISE_CHARS.sub('', content.text))

            # NOTE: the original per-element "udn.com" check was removed here;
            # it was unreachable because that domain returns before this loop.

        # On hbrtaiwan the first element carrying attributes ends the article.
        if domain in ["www.hbrtaiwan.com"]:
            if len(content.attrs) != 0:
                break

    article = "".join(sentences).strip()

    return title, article


In [None]:
# Crawl every URL in the table and store the extracted title / article text
# back into the same row (relies on the default 0..n-1 row index).
for i in range(news_urls.shape[0]):

    url = news_urls.loc[i, "hyperlink"]

    title, content = crawl_news(url)

    print(f"No. {i}th news with URL:{url} has been crawled.")

    # crawl_news never returns None: a failed crawl yields a list ([]) while a
    # successful one yields text. Testing `is not None` therefore overwrote the
    # row with the string "[]" on failure; only write back real text.
    if isinstance(content, str):
        news_urls.loc[i, "title"] = str(title)
        news_urls.loc[i, "content"] = str(content)
    else:
        print("content is None. Remain original content.")