In [1]:
import openai
import os
import uuid
import psycopg
import requests
import json
import asyncio
import aiohttp
from elasticsearch import Elasticsearch
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from selenium import webdriver
from datetime import date

#### 用OPENAI套件連接API

In [None]:
def connect_gpt_api(prompt: str)-> str:
    """Send one user message to ``gpt-3.5-turbo`` through the openai package.

    The API key is read from the ``CHATGPT_KEY`` environment variable on
    every call and assigned to ``openai.api_key``.

    Args:
        prompt: Text sent as the single ``user`` message.

    Returns:
        The ``content`` of the first choice in the completion.

    NOTE(review): a later cell defines ``connect_gpt_api`` again using
    ``requests``; whichever cell runs last wins.
    NOTE(review): ``openai.ChatCompletion`` looks like the pre-1.0 SDK
    interface -- confirm against the installed openai version.
    """
    openai.api_key = os.getenv("CHATGPT_KEY")
    user_message = {"role": "user", "content": prompt}
    reply = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[user_message],
    )
    first_choice = reply.choices[0]
    return first_choice.message.content

#### 用requests連接API

In [2]:
def connect_gpt_api(prompt: str, timeout: float = 120)-> str:
    """Send one user message to the OpenAI chat completions endpoint via ``requests``.

    Args:
        prompt: Text sent as the single ``user`` message to ``gpt-3.5-turbo``.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up, so a stalled connection cannot hang the notebook.

    Returns:
        The ``content`` of the first choice in the response.

    Raises:
        RuntimeError: If ``CHATGPT_KEY`` is not set, or if the response body
            has no ``choices`` (for example an API error payload).
    """
    api_key = os.getenv("CHATGPT_KEY")
    if not api_key:
        # Fail with a clear message instead of a TypeError from "Bearer " + None.
        raise RuntimeError(
            "CHATGPT_KEY is not set; load it (e.g. with load_dotenv()) before calling the API"
        )

    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Authorization": "Bearer " + api_key}
    data = {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": prompt}]}

    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    payload = response.json()
    if "choices" not in payload:
        # Report what the API actually said rather than a bare KeyError.
        raise RuntimeError(
            f"OpenAI API error (HTTP {response.status_code}): {payload.get('error', payload)}"
        )
    return payload["choices"][0]["message"]["content"]

In [None]:
# Smoke-test connect_gpt_api with two short prompts, one per web framework.
# The prompt asks (in Chinese) for a brief introduction to each topic.
for topic in ("flask", "Django"):
    print(connect_gpt_api(f"早安，請簡單介紹{topic}"))

#### 非同步連接API

In [None]:
async def fetch(session, url, headers, data):
    """POST ``data`` as JSON to ``url`` through ``session`` and return the body text.

    Args:
        session: Object whose ``post(url, headers=..., json=...)`` returns an
            async context manager yielding a response with an awaitable
            ``text()`` (an ``aiohttp.ClientSession`` in this notebook).
        url: Target URL.
        headers: HTTP headers passed through unchanged.
        data: JSON-serialisable request payload.

    Returns:
        The response body as text; it is not parsed here.
    """
    pending_request = session.post(url, headers=headers, json=data)
    async with pending_request as response:
        body = await response.text()
    return body

async def main(prompts):
    """Send every prompt to the chat completions endpoint concurrently and print the replies.

    Args:
        prompts: Iterable of prompt strings; each becomes one request with a
            single ``user`` message to ``gpt-3.5-turbo``.

    Returns:
        None. Each reply's content is printed, in the same order as ``prompts``.

    Raises:
        RuntimeError: If ``CHATGPT_KEY`` is not set, or if a response body has
            no ``choices`` (for example an API error payload).
    """
    api_key = os.getenv("CHATGPT_KEY")
    if not api_key:
        # Fail with a clear message instead of a TypeError from "Bearer " + None.
        raise RuntimeError(
            "CHATGPT_KEY is not set; load it (e.g. with load_dotenv()) before calling the API"
        )

    url = "https://api.openai.com/v1/chat/completions"
    headers = {"Authorization": "Bearer " + api_key}

    async with aiohttp.ClientSession() as session:
        # Build one coroutine per prompt; gather() awaits them together and
        # returns the results in the order the coroutines were passed in.
        tasks = [
            fetch(
                session,
                url,
                headers,
                {"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": prompt}]},
            )
            for prompt in prompts
        ]
        results = await asyncio.gather(*tasks)

    for result in results:
        payload = json.loads(result)
        if "choices" not in payload:
            # Report what the API actually said rather than a bare KeyError.
            raise RuntimeError(f"OpenAI API error: {payload.get('error', payload)}")
        print(payload["choices"][0]["message"]["content"])

# asyncio.run(main())
prompts = ["早安，請簡單介紹flask", "早安，請簡單介紹Djangle"]
await main(prompts)    # jupyter中使用

#### 爬取BBC新聞的URL

In [None]:
# Load environment variables from the .env file.
load_dotenv()

# Headless Chrome configuration for the Selenium driver.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--window-size=1920,1080",
):
    chrome_options.add_argument(chrome_flag)
# NOTE(review): the driver is intentionally left open -- later cells reuse
# `driver` to fetch article pages. Nothing visible here calls driver.quit().
driver = webdriver.Chrome(options=chrome_options)

# BBC section slug -> list of relative article links found on that section page.
news_set = {
    "world": [], 
    "business": [], 
    "technology": [], 
    "science_and_environment": [], 
    "stories": [], 
    "entertainment_and_arts": [], 
    "health": []
}
# Links already collected under any section, so each article is stored once.
news_temp_set = set()
for news_type, section_links in news_set.items():
    driver.get("https://www.bbc.com/news/" + news_type)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    for title_tag in soup.select("a.gs-c-promo-heading"):
        # .get() avoids a KeyError on an anchor that has no href attribute.
        href = title_tag.get("href", "")
        # Keep links that contain "news", do not contain "http", and are new.
        if "news" in href and "http" not in href and href not in news_temp_set:
            section_links.append(href)
            news_temp_set.add(href)

    print(news_type)
print(news_set)

#### 爬取BBC文章內容，並insert進postgresql

In [None]:
# For each section, fetch article pages until one usable article is found,
# have the GPT helper produce four versions of it, and insert one row into
# bbc_news.bbc_news.
base_url = "https://www.bbc.com"
max_words = 800  # articles with this many whitespace-separated words or more are skipped
insert_sql = """
    INSERT INTO bbc_news.bbc_news 
    ("news_id", "type", "title", "news_origin", "news_tw", "toeic_500", "toeic_700", "date") VALUES 
    (%(id)s, %(type)s, %(title)s, %(news_origin)s, %(news_tw)s, %(toeic_500)s, %(toeic_700)s, %(today)s)
"""

# The context managers release the cursor and connection even when a page
# fetch or an API call raises part-way through the loop.
with psycopg.connect(os.getenv("DATABASE_CONFIG")) as conn, conn.cursor() as cursor:
    for key, sub_urls in news_set.items():
        for sub_url in sub_urls:
            driver.get(base_url + sub_url)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            article = soup.select("article")
            try:
                # The URL may point at a news video page with no <article>/<h1>.
                header = article[0].select("header > h1")[0].text
            except IndexError:
                continue

            contents = article[0].select("div[data-component='text-block']")
            news = " ".join(content.text for content in contents)
            word_count = len(news.split())
            # Skip pages with no article text and articles over the word limit.
            if not 0 < word_count < max_words:
                continue

            news_origin = connect_gpt_api(f"將以下文章分段，請用英文回傳:\n{news}")
            news_tw = connect_gpt_api(f"翻譯以下文章並分段，請用繁體中文回傳:\n{news}")
            toeic_500 = connect_gpt_api(f"請用多益500的程度將文章改寫並分段，盡量不要刪減文章原意，請用英文回傳:\n{news}")
            toeic_700 = connect_gpt_api(f"請用多益700的程度將文章改寫並分段，盡量不要刪減文章原意，請用英文回傳:\n{news}")
            cursor.execute(insert_sql, {
                "id": str(uuid.uuid4()),
                "type": key,
                "title": header,
                "news_origin": news_origin,
                "news_tw": news_tw,
                "toeic_500": toeic_500,
                "toeic_700": toeic_700,
                "today": str(date.today())
            })
            # Commit per row so earlier sections are kept if a later one fails.
            conn.commit()
            print(key)
            # Only one article is inserted per section.
            break


#### 爬取BBC文章內容，並insert進elasticsearch

In [None]:
# For each section, fetch article pages until one usable article is found,
# translate it with the GPT helper, and index it into the "bbc_news" index.
username = os.getenv("ELASTICSEARCH_USERNAME")
password = os.getenv("ELASTICSEARCH_PASSWORD")
elastic_host = os.getenv("ELASTICSEARCH_HOST")
es = Elasticsearch(hosts=elastic_host, basic_auth=(username, password))
base_url = "https://www.bbc.com"
max_words = 800  # articles with this many whitespace-separated words or more are skipped

try:
    for key, sub_urls in news_set.items():
        for sub_url in sub_urls:
            driver.get(base_url + sub_url)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            article = soup.select("article")
            try:
                # The URL may point at a news video page with no <article>/<h1>.
                header = article[0].select("header > h1")[0].text
            except IndexError:
                continue

            contents = article[0].select("div[data-component='text-block']")
            news = " ".join(content.text for content in contents)
            word_count = len(news.split())
            # Skip pages with no article text and articles over the word limit.
            if not 0 < word_count < max_words:
                continue

            news_tw = connect_gpt_api(f"翻譯以下文章，並用繁體中文回傳:\n{news}")
            es.index(
                index="bbc_news",
                id=str(uuid.uuid4()),
                document={
                    "type": key,
                    "title": header,
                    "content": news,
                    "zh_tw": news_tw,
                    "date": str(date.today())
                }
            )
            print(key)
            # Only one article is indexed per section.
            break
finally:
    # Close the client even when a page fetch or an API call raises.
    es.close()
