In [13]:
import os
import json
import requests
import chromadb
from sentence_transformers import SentenceTransformer

class GroqClient:
    """Thin client for Groq's OpenAI-compatible chat-completions endpoint.

    Network errors, HTTP error statuses and malformed response bodies are
    reported on stdout and collapsed into ``ERROR_MESSAGE`` so the caller
    gets a string back instead of an exception.
    """

    # Sentinel text returned whenever no summary could be produced.
    ERROR_MESSAGE = "Error generating summary."

    def __init__(self, api_key):
        """Remember the API key (sent as a bearer token) and the endpoint URL."""
        self.api_key = api_key
        self.api_url = "https://api.groq.com/openai/v1/chat/completions"

    @staticmethod
    def _extract_content(response_json):
        """Return the first choice's message content, or ``ERROR_MESSAGE``.

        Every level of the expected ``choices[0].message.content`` path is
        type-checked, so an error payload, an empty ``choices`` list or a
        non-dict body yields the error message rather than raising.
        """
        if not isinstance(response_json, dict):
            return GroqClient.ERROR_MESSAGE
        choices = response_json.get("choices")
        if not isinstance(choices, list) or not choices:
            return GroqClient.ERROR_MESSAGE
        first = choices[0]
        if not isinstance(first, dict):
            return GroqClient.ERROR_MESSAGE
        message = first.get("message")
        if not isinstance(message, dict):
            return GroqClient.ERROR_MESSAGE
        content = message.get("content")
        return content if isinstance(content, str) else GroqClient.ERROR_MESSAGE

    def generate_summary(self, text, model="llama3-8b-8192", summary_type="brief", timeout=30):
        """Ask the model to summarize ``text`` and return the summary string.

        Args:
            text: Article text interpolated into the user prompt.
            model: Model identifier placed in the request payload.
            summary_type: ``"brief"`` caps the reply at 150 tokens; any other
                value caps it at 300.
            timeout: Seconds to wait for the HTTP request before giving up.
                Without it a stalled connection would block indefinitely.

        Returns:
            The generated summary, or ``ERROR_MESSAGE`` on failure.
        """
        payload = {
            "model": model,
            "messages": [{"role": "user", "content": f"Summarize this article: {text}"}],
            "temperature": 0.3,
            "max_tokens": 150 if summary_type == "brief" else 300,
        }
        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}

        try:
            response = requests.post(self.api_url, json=payload, headers=headers, timeout=timeout)
            # Surface 4xx/5xx explicitly so the printed reason names the status.
            response.raise_for_status()
            response_json = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            print(f"Error generating summary: {e}")
            return self.ERROR_MESSAGE
        return self._extract_content(response_json)


class EmbeddingEngine:
    """Embeds article text and keeps the vectors in a persistent Chroma collection."""

    def __init__(self, db_path="./vector_db"):
        """Load the sentence-transformer model and open the collection.

        Args:
            db_path: Directory handed to ``chromadb.PersistentClient``.
        """
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_or_create_collection(name="news_embeddings")

    def create_embedding(self, text):
        """Encode ``text`` with the model and return the result of ``.tolist()``."""
        return self.model.encode(text).tolist()

    def store_embedding(self, article):
        """Add ``article`` to the collection unless its URL is already stored.

        Args:
            article: Mapping with ``"url"`` (used as the record id) and
                ``"content"`` (the text that is embedded); ``"title"`` and
                ``"description"`` are copied into the metadata when present.
        """
        article_id = article["url"]

        # Look for a duplicate *before* encoding: running the model is the
        # expensive step and is pointless for an article we already hold.
        existing = self.collection.get(ids=[article_id])
        if existing["ids"]:
            return

        self.collection.add(
            ids=[article_id],
            embeddings=[self.create_embedding(article["content"])],
            metadatas=[{
                # NOTE(review): upstream title/description may be null, and
                # Chroma releases that only accept str/int/float/bool metadata
                # values would reject None -- store "" instead. Confirm against
                # the installed Chroma version.
                "title": article.get("title") or "",
                "description": article.get("description") or "",
                "url": article_id,
            }],
        )

    def search_similar(self, query, top_k=3):
        """Return the metadata dicts of the ``top_k`` nearest stored articles.

        Returns an empty list when the query result carries no metadata.
        """
        query_embedding = self.create_embedding(query)
        results = self.collection.query(query_embeddings=[query_embedding], n_results=top_k)
        # `or [[]]` also covers a present-but-empty/None "metadatas" value,
        # which a plain .get() default would not.
        return (results.get("metadatas") or [[]])[0]


class NewsRetriever:
    """Fetches English-language articles from the NewsAPI ``/v2/everything`` endpoint."""

    def __init__(self, api_key):
        """Remember the API key and the endpoint URL."""
        self.api_key = api_key
        self.base_url = "https://newsapi.org/v2/everything"

    @staticmethod
    def _parse_articles(payload):
        """Reduce a decoded API response to a list of article dicts.

        Only entries with non-empty ``content`` and ``url`` are kept: the
        content is what gets summarized and embedded, and the URL is used
        downstream as the record id. Missing ``title``/``description`` fields
        become ``None`` instead of raising ``KeyError``.
        """
        if not isinstance(payload, dict):
            return []
        articles = payload.get("articles")
        if not isinstance(articles, list):
            return []
        return [
            {
                "title": art.get("title"),
                "description": art.get("description"),
                "content": art["content"],
                "url": art["url"],
            }
            for art in articles
            if isinstance(art, dict) and art.get("content") and art.get("url")
        ]

    def fetch_news(self, query, max_results=5, timeout=10):
        """Search for ``query`` and return up to ``max_results`` articles.

        Args:
            query: Search phrase sent as the ``q`` parameter.
            max_results: Value sent as ``pageSize``.
            timeout: Seconds to wait for the HTTP request.

        Returns:
            A list of dicts with ``title``, ``description``, ``content`` and
            ``url`` keys; empty on any network, HTTP or decoding failure
            (the reason is printed).
        """
        params = {"q": query, "apiKey": self.api_key, "language": "en", "pageSize": max_results}
        try:
            response = requests.get(self.base_url, params=params, timeout=timeout)
        except requests.RequestException as e:
            # Connection failures and timeouts should not kill the caller's loop.
            print(f"Error fetching news: {e}")
            return []

        if response.status_code != 200:
            print(f"Error fetching news: {response.status_code}")
            return []

        try:
            payload = response.json()
        except ValueError as e:
            print(f"Error fetching news: {e}")
            return []
        return self._parse_articles(payload)


class UserPreferences:
    """JSON-file-backed store of favorite topics and recent search history.

    State lives in ``self.data`` under the keys ``"favorite_topics"`` and
    ``"search_history"``; every mutating method rewrites the file.
    """

    def __init__(self, file_path="user_prefs.json", max_history=10):
        """Start from empty defaults, then overlay whatever the file holds.

        Args:
            file_path: Location of the JSON preferences file.
            max_history: Maximum number of search-history entries kept.
        """
        self.file_path = file_path
        self.max_history = max_history
        self.data = {"favorite_topics": [], "search_history": []}
        self._load_preferences()

    def _load_preferences(self):
        """Load preferences from disk, tolerating a missing or damaged file.

        An unreadable, non-JSON or non-object file leaves the defaults in
        place (a message is printed). Any expected key that is absent or not
        a list is reset to its default so later lookups cannot fail.
        """
        if not os.path.exists(self.file_path):
            return

        try:
            with open(self.file_path, "r", encoding="utf-8") as file:
                loaded = json.load(file)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            print(f"Could not load preferences from {self.file_path}: {e}")
            return

        if not isinstance(loaded, dict):
            print(f"Could not load preferences from {self.file_path}: expected a JSON object")
            return

        for key, default in self.data.items():
            if not isinstance(loaded.get(key), list):
                loaded[key] = default
        self.data = loaded

    def save_preferences(self):
        """Write the current state to ``file_path`` as indented JSON."""
        with open(self.file_path, "w", encoding="utf-8") as file:
            json.dump(self.data, file, indent=4)

    def add_favorite_topic(self, topic):
        """Append ``topic`` to the favorites and save, unless already present."""
        if topic not in self.data["favorite_topics"]:
            self.data["favorite_topics"].append(topic)
            self.save_preferences()

    def add_search_history(self, topic, articles):
        """Record a search as the most recent history entry and save.

        An earlier entry for the same topic is dropped first, so each topic
        appears once; the list is then trimmed to ``max_history`` entries.
        """
        others = [h for h in self.data["search_history"] if h["topic"] != topic]
        newest_first = [{"topic": topic, "articles": articles}] + others
        self.data["search_history"] = newest_first[:self.max_history]
        self.save_preferences()

    def get_favorites(self):
        """Return the list of favorite topics."""
        return self.data["favorite_topics"]

    def get_history(self):
        """Return the search history, most recent entry first."""
        return self.data["search_history"]

    def clear_history(self):
        """Empty the search history and save."""
        self.data["search_history"] = []
        self.save_preferences()


if __name__ == "__main__":
    # Interactive console front end: search news, summarize and index each
    # article, and keep per-user history/favorites on disk.
    #
    # Credentials are read from the environment, falling back to a
    # non-echoing prompt, instead of living in the source as string literals.
    # Any key that was previously committed in this file should be treated
    # as leaked and rotated with the provider.
    from getpass import getpass

    API_KEY = os.environ.get("NEWS_API_KEY") or getpass("NewsAPI key: ")
    GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")

    news_retriever = NewsRetriever(API_KEY)
    summarizer = GroqClient(GROQ_API_KEY)
    embedding_engine = EmbeddingEngine()
    user_prefs = UserPreferences()

    while True:
        choice = input("\nMENU\n1 - Search News\n2 - View History\n3 - View Favorites\n4 - Clear History\n5 - Exit\nSelect: ").strip()

        if choice == "1":
            topic = input("Enter topic: ").strip()
            if not topic:
                # An empty query would only produce an API error; skip the round trip.
                print("Topic cannot be empty.")
                continue

            articles = news_retriever.fetch_news(topic)
            if not articles:
                print("No articles found.")
                continue

            stored_articles = []
            for article in articles:
                embedding_engine.store_embedding(article)
                brief_summary = summarizer.generate_summary(article["content"], model="llama3-8b-8192", summary_type="brief")
                print(f"\nTitle: {article['title']}\nBrief Summary: {brief_summary}\n")
                stored_articles.append({"title": article["title"], "brief_summary": brief_summary})

            user_prefs.add_search_history(topic, stored_articles)
            # Accept the short form as well as the full word.
            if input("Add topic to favorites? (yes/no): ").strip().lower() in {"yes", "y"}:
                user_prefs.add_favorite_topic(topic)

        elif choice == "2":
            for entry in user_prefs.get_history():
                print(f"\n{entry['topic']} - {len(entry['articles'])} articles")

        elif choice == "3":
            print("\nFavorite Topics:", ', '.join(user_prefs.get_favorites()) or "None")

        elif choice == "4":
            user_prefs.clear_history()
            print("Search history cleared.")

        elif choice == "5":
            break

        else:
            print("Invalid choice. Try again.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


MENU
1 - Search News
2 - View History
3 - View Favorites
4 - Clear History
5 - Exit
Select: 1
Enter topic: computer science

Title: This New Algorithm for Sorting Books or Files Is Close to Perfection
Brief Summary: The article discusses a new algorithm in computer science that has significant implications for anyone who uses the internet. The algorithm, known as the "Distributed Gradient Descent" (DGD), is a method for training machine learning models that is faster and more efficient than previous methods.

The article explains that DGD is particularly important because it allows for the training of large machine learning models on distributed computing systems, such as clusters of computers or cloud computing services. This is significant because it enables the training of models that are too large to be trained on a single computer, which is a major limitation of current machine learning technology.

The article also notes that DGD has the potential to revolutionize many fields, i