In [28]:
import requests
import os
import datetime
import time
import json
import openai
import numpy as np

from newsapi import NewsApiClient

def LLM_Query(system_prompt, user_query, temperature=0.9, top_p=1):
    """Send one system + user exchange to the ``gpt-4`` chat model.

    Args:
        system_prompt: Text sent as the ``system`` message.
        user_query: Text sent as the ``user`` message.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.

    Returns:
        A ``(content, total_tokens)`` tuple: the message content of the first
        choice and the ``total_tokens`` figure from the response's ``usage``.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_query},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=conversation,
        temperature=temperature,
        top_p=top_p,
    )
    # The raw completion is echoed to stdout before it is unpacked.
    print(completion)
    first_choice = completion.choices[0]
    return first_choice["message"]["content"], completion["usage"]["total_tokens"]

# Base URLs of the newsdata.io `news` and `archive` endpoints.
newsdata_api = "https://newsdata.io/api/1/news"
newsdata_archive_api = "https://newsdata.io/api/1/archive"

# Comma-separated request values: news categories and source identifiers.
categories = "business,politics,technology"
newsdata_source = "wsj,bloomberg"
full_content_source = "usatoday"#"businessinsider_us,guardian,cnn,bbc"

def read_api_keys(key_file):
    """Parse the JSON file at ``key_file`` and return its decoded content.

    Args:
        key_file: Path of the JSON file holding the API keys.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    with open(key_file) as handle:
        return json.load(handle)

def GetRequestEmbedding(customer_query):
    """Embed ``customer_query`` with the ``text-embedding-3-small`` model.

    Args:
        customer_query: Input passed to the embedding endpoint.

    Returns:
        The ``embedding`` value of the first entry in the response's ``data``.
    """
    reply = openai.Embedding.create(
        model="text-embedding-3-small",
        input=customer_query,
    )
    first_item = reply['data'][0]
    return first_item['embedding']

"""
    News Info needed:
    - Title
    - Content (or the description, depending on the API)
    - URL
    - Source
    - Date
"""

# Read news from News IO source
def read_news_io_api(api_key, api_endpoint, categories, domain, date_from, timeout=30):
    """Fetch articles newer than ``date_from`` from a newsdata.io endpoint.

    Pages are requested one after another, following the ``nextPage`` token of
    each response. Paging stops as soon as a page yields no article newer than
    ``date_from``, the API reports no further page, all ``totalResults`` have
    been seen, or a request fails.

    Args:
        api_key: Value sent as the ``apikey`` query parameter.
        api_endpoint: Base URL of the endpoint to query.
        categories: Value sent as the ``category`` query parameter.
        domain: Value sent as the ``domain`` query parameter.
        date_from: ``datetime.datetime`` cut-off; only articles whose
            ``pubDate`` is strictly later are kept.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        ``(articles, response)``. ``articles`` is a list of dicts with the keys
        ``title``, ``content``, ``url``, ``source`` and ``date`` (a
        ``datetime.datetime``); it is empty when the first request fails.
        ``response`` is the decoded JSON of the last request made, so a failed
        request can be inspected by the caller.
    """
    date_format = "%Y-%m-%d %H:%M:%S"
    # Passing the query as ``params`` lets requests URL-encode every value.
    query = {
        "apikey": api_key,
        "category": categories,
        "domain": domain,
        "full_content": 1,
    }

    articles = []
    results_seen = 0
    page_token = None
    while True:
        page_query = dict(query, page=page_token) if page_token else query
        response = requests.get(api_endpoint, params=page_query, timeout=timeout).json()
        if response.get("status") != "success":
            print("Failed to read news from News IO API")
            print(response)
            break

        results = response.get("results") or []
        # Count every result returned (not only the kept ones) so the
        # comparison against ``totalResults`` is consistent across pages.
        results_seen += len(results)

        fresh_articles = []
        for article in results:
            published = datetime.datetime.strptime(article["pubDate"], date_format)
            if published > date_from:
                fresh_articles.append({
                    "title": article["title"],
                    "content": article["description"],
                    "url": article["link"],
                    "source": article["source_id"],
                    "date": published,
                })
        articles.extend(fresh_articles)

        page_token = response.get("nextPage")
        exhausted = results_seen >= response.get("totalResults", 0)
        # A page without any fresh article ends the scan, as in the original logic.
        if not fresh_articles or not page_token or exhausted:
            break

    return articles, response

In [29]:
# Load the API keys from the user's home directory. expanduser() resolves the
# home directory even where the HOME environment variable is not set.
api_keys = read_api_keys(os.path.expanduser("~/.api_keys.json"))
news_data_api_key = api_keys["news_data_api"]
# Cut-off for the article filter: 30 days before now.
date = datetime.datetime.now() - datetime.timedelta(days=30)

In [32]:
# NOTE(review): the domain is passed as "usa today" while the module-level
# `full_content_source` constant is "usatoday" — confirm which value the API expects.
fetch_result = read_news_io_api(news_data_api_key, newsdata_archive_api, categories, "usa today", date)
# Fail with a clear message instead of an unpacking TypeError when the fetch
# reports failure by returning None.
if fetch_result is None:
    raise RuntimeError("Failed to read news from News IO API")
news_articles, response = fetch_result

In [None]:
# Show how many articles are currently held in `news_articles`.
len(news_articles)

In [None]:
# Inspect the first article as a spot check.
news_articles[0]

In [None]:
# Inspect the decoded JSON of the last API request.
response

# Cluster

In [64]:
# Configure the openai module with the key read from the API keys file.
openai.api_key = api_keys["openai"]

In [69]:
# Build one embedding per article from its title and content.
embeddings = []
for article in news_articles:
    # `content` is copied from the API's `description` field.
    # NOTE(review): that field appears to be nullable; empty parts are skipped
    # so a missing description no longer raises a TypeError on concatenation.
    text_parts = [article["title"], article["content"]]
    embeddings.append(GetRequestEmbedding(" ".join(part for part in text_parts if part)))

In [None]:
# Inspect the first article again.
news_articles[0]

In [76]:
# Convert each article's publication date to a string so the list can be
# written as JSON. Dates that are already strings are left alone, which keeps
# the cell safe to run more than once.
for article in news_articles:
    if isinstance(article["date"], datetime.datetime):
        article["date"] = article["date"].strftime("%Y-%m-%d %H:%M:%S")

In [77]:
# Persist the embeddings and the articles so later cells can reload them
# without calling the APIs again.
np_embeddings = np.array(embeddings)
np.save("news_embeddings.npy", np_embeddings)
# The context manager guarantees the JSON file is flushed and closed.
with open("news_articles.json", "w") as articles_file:
    json.dump(news_articles, articles_file)

In [3]:
# Reload the embeddings and articles written by the save cell.
np_embeddings = np.load("news_embeddings.npy")
# The context manager closes the file handle once the JSON is parsed.
with open("news_articles.json") as articles_file:
    news_articles = json.load(articles_file)

In [None]:
# import clustering for news articles
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Find the optimal number of clusters
# silhouette_score only accepts between 2 and n_samples - 1 clusters, so the
# sweep is capped by the number of embeddings instead of always running to 99.
max_clusters = min(100, len(np_embeddings))
# scores[j] is the silhouette score obtained with j + 2 clusters.
scores = []
for i in range(2, max_clusters):
    kmeans = KMeans(n_clusters=i, random_state=0).fit(np_embeddings)
    scores.append(silhouette_score(np_embeddings, kmeans.labels_))

In [2]:
# NOTE(review): presumably applied so async code can run inside the notebook's
# already-running event loop — confirm it is still needed.
import nest_asyncio
nest_asyncio.apply()

In [None]:
import nest_asyncio
nest_asyncio.apply()
import os
import json
from pyvirtualdisplay import Display
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from scrapegraphai.graphs import SmartScraperGraph
import argparse

# NOTE(review): presumably tells pyvirtualdisplay not to use the X server's
# -displayfd option — confirm against the pyvirtualdisplay documentation.
os.environ['PYVIRTUALDISPLAY_DISPLAYFD'] = '0'
# Prompts passed to the scraper by get_news_link and get_news_content.
grab_news_link_prompt = "Grab all the news link and its title"
grab_news_content_prompt = "Grab the full news content and its title, author"

def smart_scraper_graph(source, prompt, graph_config):
    """Run a ``SmartScraperGraph`` for ``source`` and return what it produces.

    Args:
        source: Passed to the graph as ``source``.
        prompt: Passed to the graph as ``prompt``.
        graph_config: Passed to the graph as ``config``.

    Returns:
        The value returned by the graph's ``run()`` method.
    """
    scraper = SmartScraperGraph(prompt=prompt, source=source, config=graph_config)
    return scraper.run()

def get_news_link(source):
    """Scrape ``source`` with the link-gathering prompt.

    Uses the module-level ``grab_news_link_prompt`` and ``graph_config``.

    Returns:
        Whatever ``smart_scraper_graph`` returns for that prompt.
    """
    return smart_scraper_graph(
        source=source,
        prompt=grab_news_link_prompt,
        graph_config=graph_config,
    )

def get_news_content(links):
    """Scrape every entry of ``links`` with the content-gathering prompt.

    Uses the module-level ``grab_news_content_prompt`` and ``graph_config``.
    Each scrape result is printed as it arrives.

    Args:
        links: Iterable of sources handed to ``smart_scraper_graph`` one by one.

    Returns:
        A dict keyed by each result's ``title``; every value holds the
        ``link`` that was scraped plus the result's ``content`` and ``author``.
    """
    scraped_news = {}
    for link in links:
        page = smart_scraper_graph(link, grab_news_content_prompt, graph_config)
        print(page)
        entry = {
            'link': link,
            'content': page['content'],
            'author': page['author'],
        }
        scraped_news[page['title']] = entry

    return scraped_news

if __name__ == "__main__":
    # Start a virtual display for the (non-headless) browser used by the scraper.
    display = Display(visible=0, size=(1400, 900))
    display.start()

    # try/finally guarantees the virtual display is stopped even when loading
    # the keys or scraping fails.
    try:
        api_keys_path = os.path.expanduser("~/.api_keys.json")
        try:
            with open(api_keys_path, "r") as api_keys_file:
                api_keys = json.load(api_keys_file)
        except FileNotFoundError:
            print(f"错误：未找到文件 {api_keys_path}")
            # `os.exit` does not exist; SystemExit is what sys.exit() raises.
            raise SystemExit(1)
        except json.JSONDecodeError:
            print(f"错误：无法解析 {api_keys_path} 文件中的 JSON 数据")
            raise SystemExit(1)

        OPENAI_API_KEY = api_keys.get("openai")
        if not OPENAI_API_KEY:
            # Only a warning: the run continues with an empty key, as before.
            print("警告：在 ~/.api_keys.json 文件中未找到 'openai' API 密钥。")

        # Configuration read by get_news_link / get_news_content via the
        # module-level name `graph_config`.
        graph_config = {
            "llm": {
                "api_key": OPENAI_API_KEY,
                "model": "openai/gpt-4o-mini",
            },
            "verbose": True,
            "headless": False,
        }

        news_source = "https://www.reuters.com/"
        news_links = get_news_link(news_source)
        news_content = get_news_content(news_links)
        print(news_content)
    finally:
        display.stop()

In [5]:
# Load the exported raw-data CSV into a DataFrame.
import pandas as pd
data = pd.read_csv("下载原始数据_规则824584_2024-04-11_16-34-01-792106-20240411163530.csv")

In [9]:
# Pull the "Unnamed: 9" column out as a plain Python list.
data_list = list(data["Unnamed: 9"])

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import re

url = 'https://www.reuters.com/markets/companies/TSLA.OQ/key-metrics/price-and-volume'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(url, timeout=30)
# Fail fast on an HTTP error instead of parsing an error page; `page` is
# already bound at this point, so it can still be inspected afterwards.
page.raise_for_status()
soup = bs(page.text, 'html.parser')

# Locate the Table you wish to scrape
table = soup.select_one('table.table__table__2px_A')
if table is None:
    # select_one returns None when nothing matches the selector.
    raise ValueError("Table 'table.table__table__2px_A' not found at " + url)

# Locate the Keys and Value for each of the rows
keys = [i.text for i in table.select('tr th')]
values = [i.text for i in table.select('tr td')]

# Convert the two lists into a dictionary for a neater output
# Note: this rebinds `data`, which the CSV cell uses for a DataFrame.
data = dict(zip(keys,values))

In [None]:
# Inspect the HTTP response object returned for the Reuters request.
page

In [None]:
# NOTE(review): presumably loads variables from a .env file into the
# environment (e.g. the key ChatOpenAI reads) — confirm which file is expected.
from dotenv import load_dotenv

load_dotenv()

In [None]:
# `langchain` itself is not referenced directly in this cell.
import langchain

# Read one saved article into `content` (used by the prompt cells) and echo it.
with open(os.path.expanduser("~/Google-Drive/AI-Brain/database/content/Wed 1 Jan 2025 16.47 EST_Justin Caporale Donald Trump.txt"), 'r') as f:
    content = f.read()
    print(content)

In [None]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, Annotated, List
import operator
from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, ToolMessage
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

"""
class AgentState(TypedDict):
    messages: Annotated[List[AnyMessage], operator.add]

class Agent:
    def __init__(self, model, tools, checkpointer, system=""):
        self.system = system
        graph = StateGraph(AgentState)
        graph.add_node("llm", self.call_openai)
        graph.add_node("action", self.take_action)
        graph.add_conditional_edges("llm", self.exists_action, {True: "action", False: END})
        graph.add_edge("action", "llm")
        graph.set_entry_point("llm")
        self.graph = graph.compile(checkpointer=checkpointer)
        self.tools = {t.name: t for t in tools}
        self.model = model.bind_tools(tools)

    def call_openai(self, state: AgentState):
        messages = state['messages']
        if self.system:
            messages = [SystemMessage(content=self.system)] + messages
        message = self.model.invoke(messages)
        return {'messages': [message]}

    def exists_action(self, state: AgentState):
        result = state['messages'][-1]
        return len(result.tool_calls) > 0

    def take_action(self, state: AgentState):
        tool_calls = state['messages'][-1].tool_calls
        results = []
        for t in tool_calls:
            print(f"Calling: {t}")
            result = self.tools[t['name']].invoke(t['args'])
            results.append(ToolMessage(tool_call_id=t['id'], name=t['name'], content=str(result)))
        print("Back to the model!")
        return {'messages': results}
"""

In [None]:
# Prompt that asks the model to label `content` as "news" or "opinion".
news_or_opinion_classifier_prompt = ChatPromptTemplate.from_messages([
    ("system",
    """
        You are a professional news editor. A news can be reporting news or publishing an opinion.
        Verify the following content is news or opinion.
        If it is reporting news, you need to return "news". 
        If it is publishing an opinion, you need to return "opinion".
    """),
    ("user",
    """
        The content is:
        {content}
    """),
])

news_or_opinion_classifier = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# format_messages() keeps the system and user messages as separate chat
# messages; format() would flatten the whole prompt into a single string.
news_or_opinion_classifier.invoke(
    news_or_opinion_classifier_prompt.format_messages(content=content)
)

In [None]:
# Prompt that asks the model for a JSON summary of `content`: key message,
# entities involved, impact level and sentiment.
news_analysis_prompt = ChatPromptTemplate.from_messages([
    ("system",
    """You are a professional news editor. You are given an article content.
From the news, extract the following information:
What is the news about (key message)?
Organization or People involved in the news and how are they involved or what happened to them
What's the impact level of the news?
What's the sentiment of the news?
The return format should be a json with the following keys:
- key_message: the key message of the news
- entities: [
        "organization or people": organization or people involved in the news,
        "how involved": how are they involved or what happened to them
] (if there is no organization or people involved, the list should be empty)
- impact_level: the impact level of the news (low, medium, high)
- sentiment: the sentiment of the news (positive, negative, neutral)"""),
    ("user", "The content is:\n{content}")
])

news_analyzer = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# format_messages() keeps the system and user messages as separate chat
# messages; format() would flatten the whole prompt into a single string.
result = news_analyzer.invoke(news_analysis_prompt.format_messages(content=content))
result.content


In [None]:
# Read result content as json
# The reply may or may not be wrapped in a Markdown code fence (```json ... ```),
# so the fence is stripped only when present instead of slicing fixed offsets.
reply_text = result.content.strip()
if reply_text.startswith("```"):
    # Drop the opening fence line, then everything from the closing fence on.
    reply_text = reply_text.split("\n", 1)[-1].rsplit("```", 1)[0]
json.loads(reply_text)

