In [28]:
import requests
import os
import datetime
import time
import json
import openai
import numpy as np

from newsapi import NewsApiClient

def LLM_Query(system_prompt, user_query, temperature=0.9, top_p=1):
    """Send a single system + user exchange to the ``gpt-4`` chat model.

    Args:
        system_prompt: Text used for the ``system`` message.
        user_query: Text used for the ``user`` message.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.

    Returns:
        A ``(content, total_tokens)`` tuple: the message content of the first
        choice and the ``total_tokens`` figure from the response usage.
    """
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_query},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=chat_messages,
        temperature=temperature,
        top_p=top_p,
    )
    # The raw API response is echoed to stdout before it is unpacked.
    print(completion)
    first_choice = completion.choices[0]
    return first_choice["message"]["content"], completion["usage"]["total_tokens"]

# newsdata.io endpoint URLs (latest-news and archive variants).
newsdata_api = "https://newsdata.io/api/1/news"
newsdata_archive_api = "https://newsdata.io/api/1/archive"

# Comma-separated category list; passed to read_news_io_api as the `category` query value.
categories = "business,politics,technology"
# NOTE(review): the two source lists below are not referenced by any visible cell.
newsdata_source = "wsj,bloomberg"
full_content_source = "usatoday"#"businessinsider_us,guardian,cnn,bbc"

def read_api_keys(key_file):
    """Parse the JSON document stored at ``key_file`` and return the result."""
    with open(key_file) as key_handle:
        return json.load(key_handle)

def GetRequestEmbedding(customer_query):
    """Return the embedding of ``customer_query`` from ``text-embedding-3-small``.

    Only the ``embedding`` field of the first entry in the response's ``data``
    list is returned.
    """
    api_result = openai.Embedding.create(
        model="text-embedding-3-small",
        input=customer_query,
    )
    first_item = api_result['data'][0]
    return first_item['embedding']

"""
    News Info needed:
    - Title
    - Content (or description, depending on the API)
    - URL
    - Source
    - Date
"""

# Read news from News IO source
# TODO: Needs to refactor this function
def read_news_io_api(api_key, api_endpoint, categories, domain, date_from):
    """Fetch articles from a newsdata.io endpoint, following pagination.

    Pages are requested until one of these happens: a page contains no article
    newer than ``date_from``, the number of results read reaches the
    ``totalResults`` reported by the API, the API returns no ``nextPage``
    token, or a request does not report ``status == "success"``.

    Args:
        api_key: Value sent as the ``apikey`` query parameter.
        api_endpoint: Endpoint URL the query parameters are attached to.
        categories: Value sent as the ``category`` query parameter.
        domain: Value sent as the ``domain`` query parameter.
        date_from: ``datetime``; only articles whose ``pubDate`` is strictly
            later than this are kept.

    Returns:
        ``None`` when the very first request fails. Otherwise a tuple
        ``(articles, last_response)`` where ``articles`` is a list of dicts
        with keys ``title``, ``content``, ``url``, ``source`` and ``date``
        (a ``datetime``), and ``last_response`` is the decoded JSON of the
        last request made (a failed response if a later page failed).
    """
    # Imported locally so the function keeps working even if another cell
    # rebinds the global name `datetime` (e.g. `from datetime import datetime`).
    from datetime import datetime as _datetime

    date_format = "%Y-%m-%d %H:%M:%S"
    # Passing a params dict lets requests URL-encode every value
    # (a domain such as "usa today" contains a space).
    base_params = {
        "apikey": api_key,
        "category": categories,
        "domain": domain,
        "full_content": 1,
    }

    kept = []  # (raw article, parsed publication date) pairs
    read_results = 0  # raw results consumed so far, filtered or not
    next_page = None
    first_request = True
    while True:
        params = dict(base_params)
        if next_page:
            params["page"] = next_page
        response = requests.get(api_endpoint, params=params, timeout=30).json()
        if response.get("status") != "success":
            print("Failed to read news from News IO API")
            print(response)
            if first_request:
                return None
            break
        first_request = False

        results = response["results"]
        page_kept = []
        for article in results:
            published = _datetime.strptime(article["pubDate"], date_format)
            if published > date_from:
                page_kept.append((article, published))
        kept.extend(page_kept)

        # Count raw results on every page (the first page included) so the
        # comparison with totalResults is consistent.
        read_results += len(results)
        next_page = response.get("nextPage")
        if not page_kept or read_results >= response["totalResults"] or not next_page:
            break

    # Transform the articles to the required format
    formalized_articles = [
        {
            "title": article["title"],
            "content": article["description"],
            "url": article["link"],
            "source": article["source_id"],
            "date": published,
        }
        for article, published in kept
    ]
    return formalized_articles, response

In [29]:
# expanduser resolves the home directory even where the HOME environment
# variable is not set, unlike os.environ["HOME"].
api_keys = read_api_keys(os.path.expanduser("~/.api_keys.json"))
news_data_api_key = api_keys["news_data_api"]
# Cut-off timestamp: only articles published in the last 30 days are kept.
date = datetime.datetime.now() - datetime.timedelta(days=30)

In [32]:
# read_news_io_api returns None (not a tuple) when the first request fails,
# so check before unpacking instead of failing with an opaque TypeError.
# NOTE(review): the domain is passed as "usa today" while full_content_source
# is "usatoday" — confirm which form the API expects.
news_fetch_result = read_news_io_api(news_data_api_key, newsdata_archive_api, categories, "usa today", date)
if news_fetch_result is None:
    raise RuntimeError("read_news_io_api failed; see the API response printed above")
news_articles, response = news_fetch_result

In [None]:
# Number of articles currently held in news_articles
len(news_articles)

In [None]:
# Show the first article record
news_articles[0]

In [None]:
# Show the last API response returned by read_news_io_api
response

# Cluster

In [64]:
# Configure the openai module with the key stored under "openai" in the key file
openai.api_key = api_keys["openai"]

In [69]:
# One embedding per article, computed from "<title> <content>".
embeddings = [
    GetRequestEmbedding(article["title"] + " " + article["content"])
    for article in news_articles
]

In [None]:
# Show the first article record
news_articles[0]

In [76]:
# Turn each article's datetime into a string so the records can be written as JSON.
# Values that are already strings are skipped, which keeps the cell safe to re-run.
for article in news_articles:
    if not isinstance(article["date"], str):
        article["date"] = article["date"].strftime("%Y-%m-%d %H:%M:%S")

In [77]:
# Persist the embeddings and the article records next to the notebook.
np_embeddings = np.array(embeddings)
np.save("news_embeddings.npy", np_embeddings)
# A context manager guarantees the file is flushed and closed.
with open("news_articles.json", "w") as articles_file:
    json.dump(news_articles, articles_file)

In [3]:
# Reload the cached embeddings and article records written by the save cell.
np_embeddings = np.load("news_embeddings.npy")
# A context manager closes the file handle once the JSON has been parsed.
with open("news_articles.json") as articles_file:
    news_articles = json.load(articles_file)

In [None]:
# import clustering for news articles
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Find the optimal number of clusters
# silhouette_score requires 2 <= n_clusters <= n_samples - 1, so the scan is
# capped by the number of embeddings (and by the original upper bound of 100).
max_clusters = min(100, len(np_embeddings))
scores = []  # scores[k - 2] is the silhouette score for k clusters
for i in range(2, max_clusters):
    kmeans = KMeans(n_clusters=i, random_state=0).fit(np_embeddings)
    scores.append(silhouette_score(np_embeddings, kmeans.labels_))

In [2]:
import nest_asyncio
# NOTE(review): presumably needed so asyncio-based code can run inside the
# notebook's already-running event loop — confirm against the nest_asyncio docs.
nest_asyncio.apply()

In [None]:
import nest_asyncio
nest_asyncio.apply()
import os
import json
from pyvirtualdisplay import Display
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from scrapegraphai.graphs import SmartScraperGraph
import argparse

# Sets PYVIRTUALDISPLAY_DISPLAYFD to '0' in this process's environment.
os.environ['PYVIRTUALDISPLAY_DISPLAYFD'] = '0'
# Prompts used by get_news_link and get_news_content respectively.
grab_news_link_prompt = "Grab all the news link and its title"
grab_news_content_prompt = "Grab the full news content and its title, author"

def smart_scraper_graph(source, prompt, graph_config):
    """Build a ``SmartScraperGraph`` for the given source and prompt, run it, and return its result.

    Args:
        source: Passed as the ``source`` argument of ``SmartScraperGraph``.
        prompt: Passed as the ``prompt`` argument.
        graph_config: Passed as the ``config`` argument.
    """
    scraper = SmartScraperGraph(prompt=prompt, source=source, config=graph_config)
    return scraper.run()

def get_news_link(source):
    """Run the link-grabbing prompt against ``source`` using the module-level ``graph_config``."""
    link_result = smart_scraper_graph(source, grab_news_link_prompt, graph_config)
    return link_result

def get_news_content(links):
    """Scrape every link and index the results by article title.

    Each scrape result is printed. It must provide ``title``, ``content`` and
    ``author`` keys; the returned dict maps title ->
    ``{'link', 'content', 'author'}``. Uses the module-level ``graph_config``.
    """
    scraped_news = {}
    for link in links:
        page = smart_scraper_graph(link, grab_news_content_prompt, graph_config)
        print(page)
        entry = {
            'link': link,
            'content': page['content'],
            'author': page['author'],
        }
        scraped_news[page['title']] = entry
    return scraped_news

if __name__ == "__main__":
    # Local import: the exit calls below need sys (the os module has no exit()).
    import sys

    display = Display(visible=0, size=(1400, 900))
    display.start()
    try:
        # Load the OpenAI key from the per-user key file.
        api_keys_path = os.path.expanduser("~/.api_keys.json")
        try:
            with open(api_keys_path, "r") as api_keys_file:
                api_keys = json.load(api_keys_file)
        except FileNotFoundError:
            print(f"错误：未找到文件 {api_keys_path}")
            sys.exit(1)
        except json.JSONDecodeError:
            print(f"错误：无法解析 {api_keys_path} 文件中的 JSON 数据")
            sys.exit(1)

        OPENAI_API_KEY = api_keys.get("openai")
        if not OPENAI_API_KEY:
            # Only a warning, as before: the run continues without a key.
            print("警告：在 ~/.api_keys.json 文件中未找到 'openai' API 密钥。")

        # Module-level name read by get_news_link / get_news_content.
        graph_config = {
            "llm": {
                "api_key": OPENAI_API_KEY,
                "model": "openai/gpt-4o-mini",
            },
            "verbose": True,
            "headless": False,
        }

        news_source = "https://www.reuters.com/"
        news_links = get_news_link(news_source)
        news_content = get_news_content(news_links)
        print(news_content)
    finally:
        # Stop the virtual display on early exit and on errors too.
        display.stop()

In [5]:
import pandas as pd
# Load the exported CSV from the notebook's working directory into a DataFrame
data = pd.read_csv("下载原始数据_规则824584_2024-04-11_16-34-01-792106-20240411163530.csv")

In [9]:
# Values of the "Unnamed: 9" column as a plain list
# NOTE(review): `data` is rebound to a dict by the Reuters scraping cell below, so run order matters.
data_list = list(data["Unnamed: 9"])

In [None]:
from bs4 import BeautifulSoup as bs 
import requests
import re

url = 'https://www.reuters.com/markets/companies/TSLA.OQ/key-metrics/price-and-volume'
# A timeout keeps the cell from hanging forever on an unresponsive server.
page = requests.get(url, timeout=30)
soup = bs(page.text, 'html.parser')

# Locate the Table you wish to scrape
table = soup.select_one('table.table__table__2px_A')

if table is None:
    # select_one returns None when nothing matches the selector; report it
    # instead of failing with AttributeError on table.select below.
    print(f"No table matched the selector (HTTP status {page.status_code})")
    keys, values = [], []
else:
    # Locate the Keys and Value for each of the rows
    keys = [i.text for i in table.select('tr th') if i]
    values = [i.text for i in table.select('tr td') if i]

# Convert the two lists into a dictionary for a neater output
# NOTE(review): this rebinds `data`, which an earlier cell uses for a DataFrame.
data = dict(zip(keys,values))

In [None]:
# Show the HTTP response object from the Reuters request
page

In [None]:
from dotenv import load_dotenv

# Load environment variables from a .env file
load_dotenv()

In [None]:
import langchain
import os
# Read one stored article into `content`; the analysis cells below use it as sample input.
# The encoding is given explicitly to match load_news_content rather than relying on the platform default.
with open(os.path.expanduser("~/Google-Drive/AI-Brain/database/content/Wed 1 Jan 2025 16.47 EST_Justin Caporale Donald Trump.txt"), 'r', encoding='utf-8') as f:
    content = f.read()
    print(content)

In [7]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, Annotated, List
import operator
from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, ToolMessage
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

In [None]:
# Prompt that asks the model to label an article as "news" or "opinion".
news_or_opinion_classifier_prompt = ChatPromptTemplate.from_messages([
    ("system",
    """
        You are a professional news editor. A news can be reporting news or publishing an opinion.
        Verify the following content is news or opinion.
        If it is reporting news, you need to return "news". 
        If it is publishing an opinion, you need to return "opinion".
    """),
    ("user",
    """
        The content is:
        {content}
    """),
])

news_or_opinion_classifier = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# format_messages keeps the system/user roles; .format() would flatten the
# whole prompt into a single string that is sent as one human message.
news_or_opinion_classifier.invoke(news_or_opinion_classifier_prompt.format_messages(content=content))

In [None]:
# Prompt that asks the model for a JSON summary of a news article:
# key message, entities, entity aliases, impact level and sentiment.
news_analysis_prompt = ChatPromptTemplate.from_messages([
    ("system",
    """ You are a professional news editor. You are given an article content.
        From the news, extract the following information:
        What is the news about (key message)?
        Organization or People involved in the news and how are they involved or what happened to them
        What's the impact level of the news?
        What's the sentiment of the news?
        The return format should be json format without extra type indicator with the following keys:
        - key_message: the key message of the news
        - what happened: what happened at what time
        - entities: [
                "organization or people": organization or people involved in the news, if the entity has aliases here, don't put it here, use the direct entity name from the article, 
                                          like IMF or International Monetary Fund, you can use either of them but not International Monetary Fund(IMF), the aliases should be shown in entity_aliases
                "how involved": how are they involved or what happened to them
        ] (if there is no organization or people involved, the list should be empty)
        - entity_aliases: [
                "entity_name": entity name,
                "aliases": list of entity aliases that appear in the news, For example, if the entity is "Apple", the aliases could be "AAPL", "Apple Inc.", "Apple Computer, Inc.", etc.
        ]
        - impact_level: the impact level of the news (low, medium, high)
        - sentiment: the sentiment of the news (positive, negative, neutral)
    """),
    ("user", "The content is:\n{content}")
])

news_analyzer = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# format_messages keeps the system/user roles; .format() would flatten the
# whole prompt into a single string that is sent as one human message.
result = news_analyzer.invoke(news_analysis_prompt.format_messages(content=content))
result.content


In [None]:
# Parse the `content` attribute of `result` as JSON
# NOTE(review): several cells rebind `result`; run this right after the cell that produced the reply.
json.loads(result.content)

In [None]:
# Prompt that asks the model for a JSON summary of an opinion piece:
# subject, who it is for/against, a short summary and the supporting reasoning.
opinion_analysis_prompt = ChatPromptTemplate.from_messages([
    ("system",
    """ You are a professional news editor. You are given an article content. The article is an opinion to an event or an entity or some current situations over some place or the entire world.
        From the opinion, extract the following information:
        - What is the opinion about?
        - Who is the opinion for?
        - Who is the opinion against?
        - A short summary of the opinion
        - The logic / analysis that support the opinion
        The return format should be json format without extra type indicator with the following keys:
        - opinion_about: what is the opinion about
        - opinion_for: who is the opinion for
        - opinion_against: who is the opinion against
        - opinion_summary: a short summary of the opinion
        - supporting_reason: the evidence, analysis or logic that support the opinion
    """),
    ("user", "The content is:\n{content}")
])

opinion_analyzer = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# format_messages keeps the system/user roles; .format() would flatten the
# whole prompt into a single string that is sent as one human message.
opinion_result = opinion_analyzer.invoke(opinion_analysis_prompt.format_messages(content=content))
opinion_result.content


In [11]:
from datetime import datetime

def normalize_datetime(date_str):
    """Convert a stored date value into a ``datetime``.

    Supported inputs:
      * ``'YYYY-MM-DD'``
      * ``'Day DD Mon YYYY HH.MM'``, optionally followed by a timezone
        abbreviation such as ``EST`` or ``EDT`` (the abbreviation is dropped;
        the result is a naive datetime)
      * a value that already is a ``datetime`` (returned unchanged)

    Args:
        date_str: The value to convert; missing values (``pd.isna``) are accepted.

    Returns:
        A ``datetime``, or ``None`` for missing values, unrecognised formats
        and parse errors (the latter two also print a message).
    """
    # Imported locally so the function does not depend on whether the global
    # name `datetime` currently refers to the module or to the class.
    import re
    from datetime import datetime as _datetime

    if pd.isna(date_str):
        return None

    try:
        if isinstance(date_str, _datetime):
            return date_str

        date_str = date_str.strip()

        # Handle 'YYYY-MM-DD' format
        if len(date_str) == 10 and date_str.count('-') == 2:
            return _datetime.strptime(date_str, '%Y-%m-%d')

        # Handle 'Day DD Mon YYYY HH.MM' with or without a trailing timezone abbreviation
        elif any(day in date_str for day in ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']):
            # Strip a trailing upper-case abbreviation (EST, EDT, GMT, ...)
            date_str = re.sub(r'\s+[A-Z]{2,5}$', '', date_str)
            return _datetime.strptime(date_str, '%a %d %b %Y %H.%M')

        else:
            print(f"Unhandled date format: {date_str}")
            return None

    except Exception as e:
        print(f"Error parsing date '{date_str}': {e}")
        return None

In [None]:
# Read result content as json
import json
# NOTE(review): several cells rebind `result`; run this right after the cell that produced the reply.
json.loads(result.content)



In [None]:
# Load the SQLite database
import sqlite3
import pandas as pd
conn = sqlite3.connect(os.path.expanduser("~/Google-Drive/AI-Brain/database/news_data.db"))  # Using the same db file as NewsCollector.py
try:
    # Read the table once, straight into a DataFrame. The separate cursor
    # fetch that printed every row (and rebound `result`) is no longer needed.
    df = pd.read_sql_query("SELECT * FROM news_articles", conn)  # Using the table name from NewsCollector.py
finally:
    # Always release the database handle, even if the query fails.
    conn.close()

print(f"Loaded {len(df)} rows from news_articles")
df["created_time"] = df["created_time"].apply(normalize_datetime)
df.head()



In [15]:
# Load all news content from files
import os

def load_news_content(file_path):
    """Read a stored article as UTF-8 text.

    ``file_path`` is interpreted relative to ``~/Google-Drive/AI-Brain/database/``.
    Returns the file's text, or ``None`` (after printing the error) when the
    file cannot be read.
    """
    full_path = os.path.expanduser("~/Google-Drive/AI-Brain/database/{}".format(file_path))
    try:
        with open(full_path, 'r', encoding='utf-8') as news_file:
            text = news_file.read()
    except Exception as err:
        print(f"Error reading file {full_path}: {err}")
        return None
    return text

# Attach the article text loaded from each row's file_path
df['content'] = df['file_path'].map(load_news_content)

# Keep only the rows whose content could be loaded
df = df[df['content'].notna()]


In [86]:
# Prompt for deciding whether a new entity name refers to one of the already-known entities.
# The reply is expected to be either the known entity name or the single word "None".
entity_coref_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a professional news editor. You are given a list of known entities and a new entity. Please identify if the new entity is the same as one of the known entities. If it is, return the known entity name. If it is not, return a single word None. No extra output allowed"""),
    ("user", "The known entities are:\n{known_entities}\nThe new entity is:\n{new_entity}")
])
entity_coref_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [None]:
# Parse the `content` attribute of `result` as JSON
# NOTE(review): several cells rebind `result`; run this right after the cell that produced the reply.
json.loads(result.content)

In [125]:
from difflib import SequenceMatcher, get_close_matches
from fuzzywuzzy import fuzz  # pip install fuzzywuzzy
from Levenshtein import ratio

def smart_entity_matcher(str1, str2, threshold=90):
    """Score how likely two names denote the same entity.

    Three similarity measures are blended into one weighted score:
    ``fuzz.token_set_ratio`` (weight 0.5), ``fuzz.partial_ratio`` (weight 0.3)
    and ``ratio`` scaled by 100 (weight 0.2).

    Returns:
        ``(is_match, weighted_score)`` where ``is_match`` is
        ``weighted_score >= threshold``.
    """
    token_component = fuzz.token_set_ratio(str1, str2) * 0.5
    partial_component = fuzz.partial_ratio(str1, str2) * 0.3
    # `ratio` is multiplied by 100 so all three terms share one scale
    typo_component = ratio(str1, str2) * 100 * 0.2

    weighted_score = token_component + partial_component + typo_component
    is_match = weighted_score >= threshold
    return is_match, weighted_score
    
def check_entity_match(entity, known_entities_mapping, threshold=90):
    """Find the known entity whose aliases best match ``entity``.

    ``known_entities_mapping`` maps an entity name to a list of its aliases.
    Every alias is scored with ``smart_entity_matcher``. The entity owning the
    highest-scoring alias that clears ``threshold`` is returned; the first one
    wins a tie. Returns ``None`` when no alias clears the threshold.
    """
    best_entity, best_score = None, 0
    candidates = (
        (canonical, alias)
        for canonical, aliases in known_entities_mapping.items()
        for alias in aliases
    )
    for canonical, alias in candidates:
        is_match, score = smart_entity_matcher(entity, alias, threshold)
        if not is_match or score <= best_score:
            continue
        best_entity, best_score = canonical, score
    return best_entity



In [None]:
# Analyze the news
MAX_ANALYSIS_ATTEMPTS = 3


def _analyze_article(analyzer, prompt, content, label):
    """Run one analyzer with retries.

    Returns ``(parsed, last_error)``: ``parsed`` is the decoded JSON reply with
    ``"type"`` set to ``label``, or ``None`` when every attempt failed;
    ``last_error`` is the text of the most recent exception ("" if none).
    """
    last_error = ""
    for _ in range(MAX_ANALYSIS_ATTEMPTS):
        try:
            # format_messages keeps the system/user roles of the prompt
            reply = analyzer.invoke(prompt.format_messages(content=content))
            parsed = json.loads(reply.content)
            parsed["type"] = label
            return parsed, last_error
        except Exception as e:
            # Keep the text: the name bound by `except ... as` is deleted
            # when the handler exits, so it cannot be read afterwards.
            last_error = str(e)
            print(f"Error analyzing {label}: {e}")
    return None, last_error


article_analysis_result = []
entity_aliases_mapping = {} # entity_name -> list of entity aliases
all_entities = set()  # every name or alias that has already been filed
for i, news in enumerate(df.iterrows()):
    if i > 100:
        break
    title = news[1]["title"]
    content = news[1]["content"]
    print(f"Processing news {i} of {len(df)}, title: {title}")
    news_or_opinion = news_or_opinion_classifier.invoke(
        news_or_opinion_classifier_prompt.format_messages(content=content)
    )
    # Normalise the label so 'Opinion', ' opinion\n' or '"opinion"' are recognised too.
    article_kind = news_or_opinion.content.strip().strip('"').lower()

    if article_kind == "opinion":
        opinion_result_json, error_text = _analyze_article(
            opinion_analyzer, opinion_analysis_prompt, content, "opinion"
        )
        if opinion_result_json is None:
            print(f"Failed to analyze opinion: {content}")
            article_analysis_result.append({"type": "opinion", "error": error_text})
        else:
            article_analysis_result.append(opinion_result_json)
        continue

    news_result_json, error_text = _analyze_article(
        news_analyzer, news_analysis_prompt, content, "news"
    )
    if news_result_json is None:
        print(f"Failed to analyze news: {content}")
        article_analysis_result.append({"type": "news", "error": error_text})
        continue
    article_analysis_result.append(news_result_json)

    # File the names from entity_aliases (each brings its own alias list).
    for entity in news_result_json.get("entity_aliases", []):
        entity_name = entity["entity_name"]
        if entity_name in all_entities:
            continue
        match_entity = check_entity_match(entity_name, entity_aliases_mapping)
        if match_entity is not None:
            entity_aliases_mapping[match_entity].append(entity_name)
        else:
            match_entity = entity_name
            entity_aliases_mapping[match_entity] = [entity_name]
        # Record the name in both branches so later articles do not append it again.
        all_entities.add(entity_name)
        for alias in entity.get("aliases", []):
            if alias not in all_entities:
                all_entities.add(alias)
                entity_aliases_mapping[match_entity].append(alias)

    # File the names from the entities list (no alias list attached).
    for entity in news_result_json.get("entities", []):
        entity_name = entity["organization or people"]
        if entity_name in all_entities:
            continue
        match_entity = check_entity_match(entity_name, entity_aliases_mapping)
        if match_entity is not None:
            entity_aliases_mapping[match_entity].append(entity_name)
        else:
            entity_aliases_mapping[entity_name] = [entity_name]
        all_entities.add(entity_name)


In [None]:
# Show the full entity -> aliases mapping
entity_aliases_mapping


In [None]:
# Only the entities that have more than one entry in their alias list
[(entity, entity_aliases_mapping[entity]) for entity in entity_aliases_mapping if len(entity_aliases_mapping[entity]) > 1]

In [None]:
# Spot-check the matcher on a title-prefixed name vs. the bare name.
# Note: this rebinds `match_entity` to the (is_match, score) tuple.
match_entity = smart_entity_matcher('New York City Mayor Eric Adams',
   'Eric Adams')
match_entity

In [18]:
# Group the entities and identify the same entity with different name
entities_mapping = {} # key: entity name, value: common entity name in full

# Unique entity names in first-seen order. dict.fromkeys de-duplicates without
# the quadratic `x not in list` scan, and .get() skips the error records
# ({"type": "news", "error": ...}) that have no "entities" key.
entities = list(dict.fromkeys(
    entity["organization or people"]
    for article in article_analysis_result
    if article["type"] == "news"
    for entity in article.get("entities", [])
))


In [19]:
# De-duplicate the collected names (set() does not preserve order).
# Note: this rebinds `all_entities`, which the analysis loop uses as a set.
all_entities = list(set(entities))

In [None]:
# Print every collected entity name
print(all_entities)

In [None]:
# NOTE(review): `group_resolution` is not defined in any visible cell — this fails on a fresh kernel.
group_resolution

In [None]:
# Strip Markdown code-fence markers from the reply text.
# NOTE(review): `group_result` is not defined in any visible cell — this fails on a fresh kernel.
group_result.content.replace("```json", "").replace("```", "")

In [70]:
def retry_LLM_request_json_format(ChatOpenAI_model, prompt, retry_count=3):
    """Invoke a chat model and parse its reply as JSON, retrying on failure.

    Args:
        ChatOpenAI_model: Object with an ``invoke(prompt)`` method whose result
            exposes the reply text as ``.content``.
        prompt: Passed unchanged to ``invoke``.
        retry_count: Maximum number of attempts.

    Returns:
        The decoded JSON value, or ``None`` when every attempt raised, produced
        invalid JSON or produced no content.
    """
    for _ in range(retry_count):
        # Reset per attempt so the error message never reads an unbound or
        # stale reply when invoke() itself raises.
        raw_content = None
        try:
            result = ChatOpenAI_model.invoke(prompt)
            raw_content = result.content
            if raw_content is None:
                continue
            # Clean up the content and remove any markdown code block markers
            cleaned = raw_content.strip().replace("```json", "").replace("```", "")
            return json.loads(cleaned)
        except Exception as e:
            print(f"Error in retry_LLM_request_json_format: {e}\nContent:\n{raw_content}")
    return None

In [17]:
# Prompt asking for a 1-10 relatedness score between two pieces of content.
# Template variables: {Event} fills <article1>, {news} fills <article2>.
# The reply is expected to be a bare number.
news_compare_prompt = ChatPromptTemplate.from_messages([
    ("system", 
    """
        You are an information editor tasked with comparing two sets of articles to determine if they are discussing the same event or story. 
        To define the same event or story, imagine you are a news editor. And you have a special section for each event, and the event could evolve among time, you need to find if the news is related to the event
        Your job is to analyze the content and provide a relatedness score on a scale of 1-10, where 1 means completely unrelated and 10 means they are definitely about the same event or story.
    """),
    ("user", 
    """
        Here is the first article:
        <article1>
        {Event}
        </article1>

        Here is the second article or set of information:
        <article2>
        {news}
        </article2>

        Please carefully read and analyze both pieces of content. Look for key elements such as:
        - Similar events, incidents, or topics
        - Matching names of people, places, or organizations
        - Corresponding dates or time frames
        - Overlapping details or facts

        After your analysis, determine how related the two pieces of content are. Consider the following guidelines for scoring:
        - 1-3: Mostly or completely unrelated topics
        - 4-6: Some overlapping themes or tangentially related topics
        - 7-10: Highly likely or certainly about the same event or story

        Format your response as a single number from 1-10, no other text.

    """),
])


In [75]:
import random
# Upper bound on how many stored articles are packed into one comparison prompt.
maximum_news_count = 10


class Event:
    """A cluster of news articles that are treated as one evolving event.

    Attributes:
        event_id: Identifier given by the caller.
        event_name / event_description: Free-text fields, empty by default.
        raw_news_array: Article texts in insertion order.
        related_scores: Score recorded for each stored article, same order.
        event_news_per_date: key -> list of indices into ``raw_news_array``.
            The key is the ``date`` given to ``add_news_to_store``, or the
            article's 1-based insertion number when no date is given.
        total_news_count: Number of stored articles.
    """

    # Attempts made to get a parseable score from the model.
    _SCORE_ATTEMPTS = 3

    def __init__(self, event_id=0):
        self.event_id = event_id
        self.event_name = ""
        self.event_description = ""
        self.raw_news_array = []
        self.related_scores = []
        self.event_news_per_date = {} # date: [index of news in raw_news_array]
        # self.event_news_daily_summary = {} # date: summary of news in the day
        self.total_news_count = 0

    def _compare_between_events_and_news(self, news_to_compare, event_to_compare):
        """Ask the model how related the two texts are; returns the raw reply text."""
        model = ChatOpenAI(model="gpt-4o-mini", temperature=0)
        # format_messages keeps the system/user roles of the prompt.
        result = model.invoke(news_compare_prompt.format_messages(Event=event_to_compare, news=news_to_compare))
        return result.content

    def _score_with_retry(self, news_to_compare, event_to_compare):
        """Return the model's score as an int, or 0 when no attempt yields a number."""
        raw_score = None
        for _ in range(self._SCORE_ATTEMPTS):
            try:
                raw_score = self._compare_between_events_and_news(news_to_compare, event_to_compare)
                return int(str(raw_score).strip())
            except Exception as e:
                print(f"Error comparing news: {e}, error response: {raw_score}")
        return 0

    def _compare_by_random_news(self, news_to_compare, news_count=3):
        """Score ``news_to_compare`` against a random sample of the stored articles."""
        if not self.raw_news_array:
            # Nothing to compare against (random sampling would fail on an empty list).
            return 0
        sample_size = min(news_count, maximum_news_count, len(self.raw_news_array))
        # sample() draws without replacement, so the prompt never repeats an article.
        random_news = random.sample(self.raw_news_array, k=sample_size)
        random_news_content = "\n\n".join(random_news)
        return self._score_with_retry(news_to_compare, random_news_content)

    def _compare_by_latest_date(self, news_to_compare, date_count=3):
        """Score ``news_to_compare`` against the articles filed under the latest keys.

        The keys of ``event_news_per_date`` are sorted, so they must be
        mutually comparable (do not mix dated and undated articles).
        """
        chosen_dates = sorted(self.event_news_per_date)[-date_count:]
        chosen_indices = [
            index
            for date in chosen_dates
            for index in self.event_news_per_date[date]
        ][-maximum_news_count:]
        if not chosen_indices:
            return 0
        chosen_news_content = "\n\n".join(self.raw_news_array[index] for index in chosen_indices)
        return self._score_with_retry(news_to_compare, chosen_news_content)

    def _compare_by_event_description(self, event_description):
        """Score the given text against this event's stored description."""
        return self._score_with_retry(event_description, self.event_description)

    def add_news_to_store(self, news, score=0, date=None):
        """Store an article together with its relatedness score.

        Args:
            news: Article text.
            score: Score recorded for the article.
            date: Optional key to file the article under; defaults to the
                article's 1-based insertion number.
        """
        self.raw_news_array.append(news)
        self.total_news_count += 1
        key = date if date is not None else self.total_news_count
        self.event_news_per_date.setdefault(key, []).append(len(self.raw_news_array) - 1)
        self.related_scores.append(score)

    def check_if_news_is_related(self, news, compare_strategy="random"):
        """Return a relatedness score for ``news`` using the chosen strategy.

        ``compare_strategy`` is one of ``"random"``, ``"latest_date"`` or
        ``"event_description"``.

        Raises:
            ValueError: For an unknown strategy.
        """
        if compare_strategy == "random":
            score = self._compare_by_random_news(news)
        elif compare_strategy == "latest_date":
            score = self._compare_by_latest_date(news)
        elif compare_strategy == "event_description":
            score = self._compare_by_event_description(news)
        else:
            raise ValueError(f"Unknown compare_strategy: {compare_strategy!r}")

        return score


In [76]:
# cluster id -> Event
News_clusters = {}

# Sort in place so articles are processed oldest first.
# Note: this mutates `df` itself instead of creating a new frame.
df.sort_values(by="created_time", ascending=True, inplace=True)

In [None]:
# Walk through the news oldest-first and assign each article to a cluster:
# - score the article against every existing cluster;
# - if the best score is at least 7, the article joins that cluster;
# - otherwise (including when there are no clusters yet) it starts a new
#   cluster and is stored with a score of 10.

# Note / TODO: The news is too widely distributed, we need two things:
# 1. Need some fast recall mechanism to find the filter all the news clusters: name entities for example 
# 2. Hierarchical clustering. 
# 3. Garbage cluster that collect those single meaning less news which has no way to find other related news
News_clusters = {}

highest_cluster_id = 0
for i, news in enumerate(df.iterrows()):
    if i > 100:
        break

    row = news[1]
    title = row["title"]
    content = row["content"]
    created_time = row["created_time"]
    print(f"Processing news {i} of {len(df)}, title: {title}")

    # With no clusters yet the inner loop does not run and max_score stays 0,
    # which leads to the "start a new cluster" branch below.
    max_score = 0
    for cluster_id, cluster in News_clusters.items():
        score = cluster.check_if_news_is_related(content)
        print(f"score: {score}, cluster_id: {cluster_id}")
        if score > max_score:
            max_score, max_score_cluster_id = score, cluster_id

    if max_score >= 7:
        picked_cluster = News_clusters[max_score_cluster_id]
    else:
        picked_cluster = Event(highest_cluster_id)
        highest_cluster_id += 1
        News_clusters[picked_cluster.event_id] = picked_cluster
        max_score = 10

    picked_cluster.add_news_to_store(content, max_score)
    print("total cluster count: ", len(News_clusters))


In [None]:
# Combine title and content for embedding (title, one space, then content)
df['text_for_embedding'] = df['title'] + " " + df['content']

# Get embeddings using LangChain with OpenAI embeddings
from langchain_openai import OpenAIEmbeddings
from sklearn.cluster import KMeans
import numpy as np
from dotenv import load_dotenv

# Initialize OpenAI embeddings
load_dotenv()  # Load environment variables from .env file
# Note: this rebinds `embeddings`, which an earlier cell uses for a list of vectors.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large"
)

# Get embeddings for all documents
print("Generating embeddings...")
# embed_documents takes the whole list in one call and returns one vector per
# text, instead of issuing one embed_query request per article.
embedded_docs = embeddings.embed_documents(df['text_for_embedding'].tolist())

# Convert to numpy array
embedded_docs = np.array(embedded_docs)

In [None]:
# Perform K-means clustering
# KMeans cannot form more clusters than there are samples, so cap the target.
n_clusters = min(15, len(embedded_docs))  # Adjust number of clusters as needed
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(embedded_docs)

# Display results with cluster assignments
print(f"\nDocuments clustered into {n_clusters} groups:")
df[['title', 'cluster', 'content']]

In [None]:
# Titles of the articles assigned to cluster 1
df[df['cluster'] == 1]["title"]