In [28]:
import requests
import os
import datetime
import time
import json
import openai
import numpy as np

from newsapi import NewsApiClient

def LLM_Query(system_prompt, user_query, temperature=0.9, top_p=1):
    response  = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_query},
            ],
        temperature=temperature,
        top_p=top_p
    )
    print(response)
    response_content = response.choices[0]["message"]["content"]
    query_tokens = response["usage"]["total_tokens"]
    return response_content, query_tokens

newsdata_api = "https://newsdata.io/api/1/news"
newsdata_archive_api = "https://newsdata.io/api/1/archive"

categories = "business,politics,technology"
newsdata_source = "wsj,bloomberg"
full_content_source = "usatoday"#"businessinsider_us,guardian,cnn,bbc"

def read_api_keys(key_file):
    keys = None
    with open(key_file) as f:
        keys = json.load(f)
    return keys

def GetRequestEmbedding(customer_query):
    response = openai.Embedding.create(
        input=customer_query,
        model="text-embedding-3-small"
    )
    embeddings = response['data'][0]['embedding']
    return embeddings

"""
    News Info needed:
    - Title
    - Content(Description, depends on the api)
    - URL
    - Source
    - Date
"""

# Read news from News IO source
# TODO: Needs to refactor this function
def read_news_io_api(api_key, api_endpoint, categories, domain, date_from):
    url = api_endpoint + "?apikey=" + api_key + "&category=" + categories + "&domain=" + domain + "&full_content=1"
    response = requests.get(url).json()
    if response["status"] != "success":
        print("Failed to read news from News IO API")
        print(response)
        return None
    
    nextPage = response["nextPage"]
    articles = []
    for article in response["results"]:
        if datetime.datetime.strptime(article["pubDate"], "%Y-%m-%d %H:%M:%S") > date_from:
            articles.append(article)
    
    has_next_page = len(articles) > 0 and len(response["results"]) < response["totalResults"]
    read_results = len(articles)
    while has_next_page:
        response = requests.get(url + "&page=" + str(nextPage)).json()
        if response["status"] != "success":
            print("Failed to read news from News IO API")
            print(response)
            break
        nextPage = response["nextPage"]
        
        page_articles = []
        for article in response["results"]:
            if datetime.datetime.strptime(article["pubDate"], "%Y-%m-%d %H:%M:%S") > date_from:
                page_articles.append(article)
            else:
                print(datetime.datetime.strptime(article["pubDate"], "%Y-%m-%d %H:%M:%S"))

        read_results += len(response["results"])
        has_next_page = len(page_articles) > 0 and read_results < response["totalResults"]
        articles.extend(page_articles)

    # Transform the articles to required format
    formalized_articles = []
    for article in articles:\
        formalized_articles.append({
            "title": article["title"],
            "content": article["description"],
            "url": article["link"],
            "source": article["source_id"],
            "date": datetime.datetime.strptime(article["pubDate"], "%Y-%m-%d %H:%M:%S")
        })
    return formalized_articles, response

In [29]:
api_keys = read_api_keys(os.environ["HOME"] + "/.api_keys.json")
news_data_api_key = api_keys["news_data_api"]
date = datetime.datetime.now() - datetime.timedelta(days=30)

In [32]:
news_articles, response = read_news_io_api(news_data_api_key, newsdata_archive_api, categories, "usa today", date)

In [33]:
len(news_articles)

0

In [62]:
news_articles[0]

{'title': "Ondo gets NDPC's nod on data protection training",
 'content': "In line with his administration’s agenda to tackle youth unemployment by creating more jobs, the Ondo State Governor, Lucky Aiyedatiwa, has initiated a partnership with the Nigeria Data Protection Commission (NDPC) to train youths of the state as data protection officers. The governor, who was represented by his deputy, Olayide Adelami, disclosed this after a […] The post Ondo gets NDPC's nod on data protection training appeared first on The Guardian Nigeria News - Nigeria and World News.",
 'url': 'https://guardian.ng/technology/ondo-gets-ndpcs-nod-on-data-protection-training/',
 'source': 'guardian',
 'date': datetime.datetime(2024, 3, 23, 14, 33, 8)}

In [54]:
response

{'status': 'success',
 'totalResults': 3123,
 'results': [{'article_id': '0ed7c0275f8bd78057017288c446f861',
   'title': 'Cartoon',
   'link': 'https://guardian.ng/opinion/cartoon-67/',
   'keywords': ['cartoons', 'opinion'],
   'creator': ['Guardian Nigeria'],
   'video_url': None,
   'description': 'The post Cartoon appeared first on The Guardian Nigeria News - Nigeria and World News.',
   'content': "You must be logged in to post a comment. Why are you flagging this comment? I disagree with this user Targeted harassment - posted harassing comments or discussions targeting me, or encouraged others to do so Spam - posted spam comments or discussions Inappropriate profile - profile contains inappropriate images or text Threatening content - posted directly threatening content Private information - posted someone else's personally identifiable information Before flagging, please keep in mind that Disqus does not moderate communities. Your username will be shown to the moderator, so you 

# Cluster

In [64]:
openai.api_key = api_keys["openai"]

In [69]:
embeddings = []
for article in news_articles:
    embeddings.append(GetRequestEmbedding(article["title"] + " " + article["content"]))

In [71]:
news_articles[0]

{'title': "Ondo gets NDPC's nod on data protection training",
 'content': "In line with his administration’s agenda to tackle youth unemployment by creating more jobs, the Ondo State Governor, Lucky Aiyedatiwa, has initiated a partnership with the Nigeria Data Protection Commission (NDPC) to train youths of the state as data protection officers. The governor, who was represented by his deputy, Olayide Adelami, disclosed this after a […] The post Ondo gets NDPC's nod on data protection training appeared first on The Guardian Nigeria News - Nigeria and World News.",
 'url': 'https://guardian.ng/technology/ondo-gets-ndpcs-nod-on-data-protection-training/',
 'source': 'guardian',
 'date': datetime.datetime(2024, 3, 23, 14, 33, 8)}

In [76]:
for i in range(0, len(news_articles)):
    news_articles[i]["date"] = news_articles[i]["date"].strftime("%Y-%m-%d %H:%M:%S")

In [77]:
np_embeddings = np.array(embeddings)
np.save("news_embeddings.npy", np_embeddings)
json.dump(news_articles, open("news_articles.json", "w"))

In [3]:
np_embeddings = np.load("news_embeddings.npy")
news_articles = json.load(open("news_articles.json"))

In [6]:
# import clustering for news articles
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Find the optimal number of clusters
scores = []
for i in range(2, 100):
    kmeans = KMeans(n_clusters=i, random_state=0).fit(np_embeddings)
    scores.append(silhouette_score(np_embeddings, kmeans.labels_))



In [2]:
import nest_asyncio
nest_asyncio.apply()

In [5]:
import nest_asyncio
nest_asyncio.apply()
import os
import json
from pyvirtualdisplay import Display
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from scrapegraphai.graphs import SmartScraperGraph
import argparse

os.environ['PYVIRTUALDISPLAY_DISPLAYFD'] = '0'
grab_news_link_prompt = "Grab all the news link and its title"
grab_news_content_prompt = "Grab the full news content and its title, author"

def smart_scraper_graph(source, prompt, graph_config):
    smart_scraper_graph = SmartScraperGraph(
        prompt=prompt,
        source=source,
        config=graph_config
    )

    result = smart_scraper_graph.run()
    return result

def get_news_link(source):
    return smart_scraper_graph(source, grab_news_link_prompt, graph_config)

def get_news_content(links):
    scraped_news = {}
    for link in links:
        result = smart_scraper_graph(link, grab_news_content_prompt, graph_config)
        print(result)
        scraped_news[result['title']] = {
            'link': link,
            'content': result['content'],
            'author': result['author']   
        }
        
    return scraped_news

if __name__ == "__main__":
    display = Display(visible=0, size=(1400, 900))
    display.start()

    api_keys_path = os.path.expanduser("~/.api_keys.json")
    try:
        with open(api_keys_path, "r") as api_keys_file:
            api_keys = json.load(api_keys_file)
            OPENAI_API_KEY = api_keys.get("openai")
        
            if not OPENAI_API_KEY:
                print("警告：在 ~/.api_keys.json 文件中未找到 'openai' API 密钥。")
    except FileNotFoundError:
        print(f"错误：未找到文件 {api_keys_path}")
        os.exit(1)
    except json.JSONDecodeError:
        print(f"错误：无法解析 {api_keys_path} 文件中的 JSON 数据")
        os.exit(1)
        

    graph_config = {
        "llm": {
            "api_key": OPENAI_API_KEY,
            "model": "openai/gpt-4o-mini",
        },
        "verbose": True,
        "headless": False,
    }

    news_source = "https://www.reuters.com/"
    news_links = get_news_link(news_source)
    news_content = get_news_content(news_links)
    print(news_content)
    display.stop()

--- Executing Fetch Node ---
--- (Fetching HTML from: https://www.reuters.com/) ---
--- Executing Parse Node ---
--- Executing GenerateAnswer Node ---
--- Executing Fetch Node ---
--- (Fetching HTML from: news) ---
--- Executing Parse Node ---
--- Executing GenerateAnswer Node ---


{'title': 'NA', 'author': 'NA', 'content': 'NA'}
{'NA': {'link': 'news', 'content': 'NA', 'author': 'NA'}}


In [5]:
import pandas as pd
data = pd.read_csv("下载原始数据_规则824584_2024-04-11_16-34-01-792106-20240411163530.csv")

In [9]:
data_list = list(data["Unnamed: 9"])

In [22]:
from bs4 import BeautifulSoup as bs 
import requests
import re

url = 'https://www.reuters.com/markets/companies/TSLA.OQ/key-metrics/price-and-volume'
page = requests.get(url)
soup = bs(page.text, 'html.parser')

# Locate the Table you wish to scrape
table = soup.select_one('table.table__table__2px_A')

# Locate the Keys and Value for each of the rows
keys = [i.text for i in table.select('tr th') if i]
values = [i.text for i in table.select('tr td') if i]

# Convert the two lists into a dictionary for a neater output
data = dict(zip(keys,values))

AttributeError: 'NoneType' object has no attribute 'select'

In [23]:
page

<Response [401]>

In [9]:
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import requests
import time
import random

import re
from datetime import datetime
from bs4 import BeautifulSoup

def scrape_guardian_news_content(url, driver):
    # A: 抓取新闻内容
    # b: 获取标题
    title = extract_title_from_url(url)
    print(f"文章标题: {title}")
    
    # c: 获取内容、时间和作者
    content, time, author = extract_content_time_and_author(driver, url)
    
    return title, (time, content, author)

def extract_title_from_url(url):
    return url.split('/')[-1].replace('-', ' ').title()

def extract_content_time_and_author(driver, url):
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    main_section = soup.find('main')
    
    if not main_section:
        print("未找到 'main' 标签")
        return None, None, None
    
    maincontent_div = main_section.find('div', id='maincontent')
    
    if not maincontent_div:
        print("未找到 id 为 'maincontent' 的 div")
        return None, None, None
    
    # 提取时间
    time_pattern = r"Last modified on (\w{3} \d{1,2} \w{3} \d{4} \d{2}\.\d{2} \w{3})"
    time_match = re.search(time_pattern, soup.text)
    if time_match:
        time = time_match.group(1)
    else:
        time = datetime.now().strftime("%a %d %b %Y %H.%M %Z")
    
    # 提取作者
    author_tag = soup.find('a', rel='author')
    author = author_tag.text if author_tag else "unknow author"
    
    paragraphs = maincontent_div.find_all('p')
    content = []
    
    for p in paragraphs:
        text = ''.join(child.strip() for child in p.contents if isinstance(child, str))
        if text:
            content.append(text + "\n")
    
    return ' '.join(content), time, author

def scrape_news(url):
    try:
        return scrape_guardian_news_content(url, driver)
    except Exception as e:
        print(f"抓取 {url} 时出错: {str(e)}")
        return None

def find_sections(soup, exclude_sections):
    sections = []
    for section in soup.find_all('section'):
        section_id = section.get('id')
        if section_id == 'in-pictures':
            break
        if section_id and section_id not in exclude_sections:
            print(section_id)
            sections.append(section)
    return sections

def extract_links_from_container(soup, section_id):
    container_id = f"container-{section_id}"
    container = soup.find('div', id=container_id)
    
    if not container:
        print(f"\nsection id: {section_id}")
        print(f"未找到对应的container (id: {container_id})")
        return []
    
    links = container.find_all('a', href=True)
    valid_links = [link['href'] for link in links if link['href'].startswith('/') and not link['href'].endswith('#comments')]
    
    print(f"\nsection id: {section_id}")
    print(f"找到 {len(valid_links)} 个有效链接:")
    for link in valid_links:
        print(f"  - {link}")
    
    return valid_links

def scrape_news_articles(links, base_url, scrape_function, driver):
    scraped_news = {}
    for link in links:
        full_url = f"{base_url}{link}"
        news_content = scrape_function(full_url, driver)
        if news_content:
            scraped_news[news_content[0]] = news_content[1]
            print(f"    成功抓取: {news_content[0]}")
        else:
            print(f"    抓取失败: {full_url}")
        
        wait_time = random.uniform(1, 3)
        time.sleep(wait_time)
    return scraped_news

def print_scraped_news(scraped_news):
    print(f"\n抓取完成。总共抓取了 {len(scraped_news)} 条新闻。")
    print("\n抓取的新闻标题:")
    for title, content in scraped_news.items():
        print(f"- {title}")
        print(content)
        print()

def extract_headline_links(soup):
    headlines_container = soup.find('div', id='container-headlines')
    if not headlines_container:
        print("未找到 id 为 container-headlines 的 div")
        return {}

    headlines_uls = headlines_container.find_all('ul', limit=2)
    if not headlines_uls:
        print("在 container-headlines 下未找到 ul 元素")
        return {}

    first_ul_hrefs = [link.get('href') for link in headlines_uls[0].find_all('a')]
    second_ul_hrefs = [link.get('href') for link in headlines_uls[1].find_all('a')]

    print("第一个 ul 的链接 (头条新闻):")
    for href in first_ul_hrefs:
        print(f"  - {href}")

    print("\n第二个 ul 的链接:")
    for href in second_ul_hrefs:
        print(f"  - {href}")

    return {
        "头条新闻": first_ul_hrefs,
        "其他头条新闻": second_ul_hrefs
    }

def scrape_guardian_news():
    # A: 初始化设置
    # b: 设置常量和配置
    guardian_exclude_section = ["headlines", "wellness", "soccer", "sports", "podcasts", "lifestyle", "take-part", "from-our-global-editions", "video"]
    url = 'https://www.theguardian.com/us'
    
    # c: 初始化WebDriver
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    
    # A: 获取页面内容
    # b: 获取页面源代码
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    
    # c: 提取头条新闻链接
    headline_links = extract_headline_links(soup)
    
    # d: 查找其他section
    sections = find_sections(soup, guardian_exclude_section)
    
    # A: 提取链接
    print("\n为每个section查找对应的container并提取链接:")
    all_valid_links = []
    for section in sections:
        section_id = section.get('id')
        valid_links = extract_links_from_container(soup, section_id)
        all_valid_links.extend(valid_links)

    # 将头条新闻链接添加到all_valid_links
    for category, urls in headline_links.items():
        all_valid_links.extend(urls)

    # A: 抓取新闻
    print("\n开始抓取新闻:")
    scraped_news = scrape_news_articles(all_valid_links, "https://www.theguardian.com", scrape_guardian_news_content, driver)
    return scraped_news

# 执行主函数
guardian_news = scrape_guardian_news()

第一个 ul 的链接 (头条新闻):
  - /us-news/article/2024/sep/05/georgia-school-shooting-father-arrested
  - /us-news/article/2024/sep/05/georgia-school-shooting-victims-remembered
  - /us-news/article/2024/sep/05/georgia-shooting-apalachee-boy-interviewed
  - /us-news/video/2024/sep/04/georgia-high-school-shooting-student-charged-with-after-students-teachers-killed-video
  - /us-news/article/2024/sep/05/hunter-biden-guilty-plea-tax-avoidance-case

第二个 ul 的链接:
  - /us-news/article/2024/sep/05/georgia-school-shooting-victims-remembered
  - /us-news/article/2024/sep/05/georgia-shooting-apalachee-boy-interviewed
  - /us-news/video/2024/sep/04/georgia-high-school-shooting-student-charged-with-after-students-teachers-killed-video
in-focus
spotlight
west-coast
opinion
paris-paralympic-games-2024
paralympic-medal-table
wordiply-thrasher
climate-crisis
across-the-country
around-the-world
trump-on-trial-(email-newsletter)
culture
documentaries
in-case-you-missed-it
newsletters

为每个section查找对应的container并提取链接

In [11]:
guardian_news["Arizona Maricopa Election Bill Gates Interview"]

('Thu 5 Sep 2024 10.07 EDT',
 'Bill Gates, an election official in,that he was diagnosed with post-traumatic stress from the threats that came along with his job.\n But he doesn’t want to focus on that. The soon-to-be retired Maricopa county supervisor wants to talk about his new role, training the next generation of elections officials.\n “I don’t love how every article, and I know this one will too, says ‘Bill Gates has been diagnosed with PTSD.’ I don’t want to be known by this experience that I’ve had, but I think it’s too important, and I don’t regret it for a second that I did tell this story,” he said.\n Gates oversees elections as part of a five-member board in Maricopa county,– the state’s largest county, which includes Phoenix and is known for close elections and election denialism.\n Gates, a Republican, decided not to run for the board again, and his term ends early next year. But he won’t be leaving elections entirely. He will help start a new lab at Arizona State Universi

In [None]:
import requests
import os
import datetime
import time
import json
import openai

def LLM_Query(system_prompt, user_query, temperature=0.9, top_p=1):
    response  = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_query},
            ],
        temperature=temperature,
        top_p=top_p
    )
    print(response)
    response_content = response.choices[0]["message"]["content"]
    query_tokens = response["usage"]["total_tokens"]
    return response_content, query_tokens

from newsapi import NewsApiClient

bing_endpoint = "https://api.bing.microsoft.com/v7.0/news/trendingtopics"
newsdata_api = "https://newsdata.io/api/1/news"

newsdata_source = "wsj,usatoday,businessinsider_us,bloomberg"

def read_api_keys(key_file):
    keys = None
    with open(key_file) as f:
        keys = json.load(f)
    return keys

def GetRequestEmbedding(customer_query):
    response = openai.Embedding.create(
        input=customer_query,
        model="text-embedding-3-small"
    )
    embeddings = response['data'][0]['embedding']
    return embeddings

"""
    News Info needed:
    - Title
    - Content(Description, depends on the api)
    - URL
    - Source
    - Date
"""

# Read news from NewsAPI source
def read_newsapi_single_page(api_client, source, page_size=100):
    top_headlines = api_client.get_top_headlines(sources=source, language='en', page_size=page_size)
    if top_headlines["status"] != "ok":
        print("Error reading news from NewsAPI")
        return None

    articles = []
    for article in top_headlines["articles"]:
        # Date format too varied, we can only use the first 19 characters 
        # %Y-%m-%dT%H:%M:%S would be guaranteed.
        publish_time = article["publishedAt"][:19] + "Z"
        new_article = {
            "title": article["title"],
            "content": article["description"],
            "url": article["url"],
            "source": article["source"]["id"],
            "date": datetime.datetime.strptime(publish_time, "%Y-%m-%dT%H:%M:%SZ")
        }
        articles.append(new_article)
    top_headlines["articles"] = articles
    return top_headlines

def read_newsapi(api_client, source, page_size=100):
    articles = read_newsapi_single_page(api_client, source, page_size)
    total_results = articles["totalResults"]
    read_results = len(articles["articles"])
    page = 1

    # Keep reading until we have all the results
    while read_results < total_results:
        page += 1
        next_page = read_newsapi_single_page(api_client, source, page_size)
        if next_page is None:
            break
        read_results += len(next_page["articles"])
        articles["articles"].extend(next_page["articles"])

    return articles["articles"]

# Read news from News IO source
# TODO: Needs to refactor this function
def read_news_io_api(api_key, categories, domain, date_from):
    url = newsdata_api + "?apikey=" + api_key \
        + "&category=" + categories + "&domain=" + domain
    response = requests.get(url).json()
    if response["status"] != "success":
        print("Failed to read news from News IO API")
        return None
    
    nextPage = response["nextPage"]
    articles = []
    for article in response["results"]:
        if datetime.datetime.strptime(article["pubDate"], "%Y-%m-%d %H:%M:%S") > date_from:
            articles.append(article)
    url += "&page=" + str(nextPage)
    
    has_next_page = len(articles) > 0 and len(response["results"]) < response["totalResults"]
    read_results = len(articles)
    while has_next_page:
        response = requests.get(url + "&page=" + str(nextPage)).json()
        if response["status"] != 200:
            break
        nextPage = response["next_page"]
        
        page_articles = []
        for article in response.json()["results"]:
            if datetime.datetime.strptime(article["pubDate"], "%Y-%m-%d %H:%M:%S") > date_from:
                page_articles.append(article)

        read_results += len(response["results"])
        has_next_page = len(page_articles) > 0 and len(page_articles) < response["totalResults"]
        articles.extend(page_articles)

    # Transform the articles to required format
    formalized_articles = []
    for article in articles:\
        formalized_articles.append({
            "title": article["title"],
            "content": article["description"],
            "url": article["link"],
            "source": article["source_id"],
            "date": datetime.datetime.strptime(article["pubDate"], "%Y-%m-%d %H:%M:%S")
        })
    return formalized_articles

# Read news from Bing News API
def read_bing_news_api(api_key, query, count, date_from):
    headers = {"Ocp-Apim-Subscription-Key": api_key}
    params = {
        "mkt": "en-us",
        "since": time.mktime(date_from.timetuple())
    }
    response = requests.get(bing_endpoint, headers=headers, params=params)
    response.raise_for_status()
    search_results = response.json()

    news_result = search_results["value"]
    return news_result

## Read news from all sources
# This is not correct yet
# Also need to reformat the key file
def read_news(api_keys, date_from):
    news = []
    sources='bloomberg, financial-times, the-economist, business-insider, reuters, the-wall-street-journal, the-washington-post, time, usa-today'
    for key in api_keys:
        if key == "news_api":
            api_client = NewsApiClient(api_key=api_keys[key])
            news.extend(read_newsapi(api_client, sources, 100))
        elif key == "news_data_api":
            news.extend(read_news_io_api(api_keys[key], "business", "wsj,usatoday,businessinsider_us,bloomberg", date_from))
    return news

def main():
    while True:
        api_keys = read_api_keys(os.environ["HOME"] + "/.api_keys.json")
    
        date = datetime.datetime.now() - datetime.timedelta(days=1)
        news = read_news(api_keys, date)
        for p in news:
            p["date"] = p["date"].strftime("%Y-%m-%d %H:%M:%S")
        news_file_name = "news_" + date.strftime("%Y-%m-%d") + ".json"
        json.dump(news, open(os.environ["HOME"] + "/personal_assistant/newsDB/" + news_file_name, "w"), indent=4)

        time.sleep(86400)
if __name__ == "__main__":
    main()