In [14]:
import openai
import os
import uuid
from elasticsearch import Elasticsearch
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from selenium import webdriver
from datetime import date

#### 連接 OpenAI API

In [2]:
def translate_news(news: str) -> str:
    """Translate an article into Traditional Chinese with the OpenAI chat API.

    The API key is read from the ``CHATGPT_KEY`` environment variable on every
    call. A single user message is sent to ``gpt-3.5-turbo`` asking it to
    translate the article and answer in Traditional Chinese.

    Args:
        news: The article text to translate.

    Returns:
        The content of the first choice returned by the model, or an empty
        string when ``news`` is empty or whitespace-only (no request is sent
        in that case).
    """
    # Nothing to translate: skip the billable request instead of storing
    # whatever the model would answer to an empty article.
    if not news or not news.strip():
        return ""

    openai.api_key = os.getenv("CHATGPT_KEY")
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": f'翻譯以下文章，並用繁體中文回傳:\n{news}'}
        ]
    )
    return completion.choices[0].message.content

#### 爬取BBC新聞的URL

In [5]:
# Load variables from a .env file; other cells read their settings via os.getenv.
load_dotenv()

# Start a headless Chrome session that is reused by the article-crawling cell.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--window-size=1920,1080",
):
    chrome_options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=chrome_options)

# BBC News section slug -> list of relative article URLs collected below.
news_types = {
    "world": [],
    "business": [],
    "technology": [],
    "science_and_environment": [],
    "stories": [],
    "entertainment_and_arts": [],
    "health": []
}
# URLs already assigned to some section, so each article is listed only once
# (under the first section in which it appears).
news_temp_set = set()
for news_type in news_types:
    driver.get("https://www.bbc.com/news/" + news_type)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    title_tags = soup.select("a.gs-c-promo-heading")
    for title_tag in title_tags:
        # .get avoids a KeyError when an anchor carries no href attribute.
        href = title_tag.get("href")
        if not href:
            continue
        # Keep only relative links containing "news" that were not seen before;
        # links containing "http" are skipped.
        if "news" in href and "http" not in href and href not in news_temp_set:
            news_types[news_type].append(href)
            news_temp_set.add(href)

    print(news_type)
print(news_types)

world
business
technology
science_and_environment
stories
entertainment_and_arts
health
{'world': ['/news/world-europe-64986486', '/news/world-europe-64984374', '/news/world-middle-east-64976639', '/news/world-asia-china-64985527', '/news/business-64986520', '/news/world-latin-america-64985010', '/news/world-australia-64984949', '/news/world-europe-64976079', '/news/world-asia-india-64974627', '/news/world-africa-64981875', '/news/world-europe-64985009', '/news/world-australia-64984948', '/news/world-us-canada-64970156', '/news/business-64897827', '/news/world-64976139', '/news/business-64783843', '/news/world-africa-64976952', '/news/world-asia-64987190', '/news/world-europe-64981376', '/news/world-64977487', '/news/world-us-canada-64974825', '/news/world-europe-64974827'], 'business': ['/news/world-us-canada-64984291', '/news/technology-64973156', '/news/technology-64970062', '/news/science-environment-64973383', '/news/uk-politics-64975672', '/news/world-us-canada-64968609', '/news/

#### 爬取BBC文章內容，並寫入 Elasticsearch

In [None]:
# Elasticsearch connection settings come from the environment.
usename = os.getenv("ELASTICSEARCH_USERNAME")
password = os.getenv("ELASTICSEARCH_PASSWORD")
elastic_host = os.getenv("ELASTICSEARCH_HOST")
es = Elasticsearch(hosts=elastic_host, basic_auth=(usename, password))
base_url = "https://www.bbc.com"
try:
    for key in news_types.keys():
        num = 0  # number of articles indexed for this topic
        for sub_url in news_types[key]:
            driver.get(base_url + sub_url)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            article = soup.select("article")
            try:
                # The URL may point to a news video, in which case there is no
                # <article> element or no <h1> header to read.
                header = article[0].select("header > h1")[0].text
            except IndexError:
                continue

            contents = article[0].select("div[data-component='text-block']")
            news = " ".join([content.text for content in contents])
            word_count = len(news.split())
            # Skip pages without any text block, and keep only articles
            # shorter than 800 whitespace-separated words.
            if word_count == 0 or word_count >= 800:
                continue

            news_tw = translate_news(news)
            es.index(
                index="bbc_news",
                id=str(uuid.uuid4()),
                document={
                    "type": key,
                    "title": header,
                    "content": news,
                    "zh_tw": news_tw,
                    "date": str(date.today())
                }
            )
            num += 1
            print(key)
            # Only one article is inserted per topic.
            if num == 1:
                break
finally:
    # Close the client even when crawling, translation or indexing fails.
    es.close()
