In [96]:
import json
from groq import Groq

# Initialize Groq client with API key (replace with your key)
client = Groq(api_key="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")

def extract_topics(user_query):
    """
    Uses a small LLM to extract the main topic and geographic sub-topics from the user's query.
    
    Args:
        user_query (str): The user's input question.

    Returns:
        dict: JSON-formatted response with extracted topics.
    """
    prompt = f"""
    You are an AI assistant that extracts the main topic from a news-related query.
    
    **Rules for Extraction:**
    1. Identify the **main topic**.
    **Examples:**
    **Input:** "Tell me about elections in India"
    **Output:**
    {{
        "main_topic": "India Elections",
    }}

    **Input:** "What is happening in Uttar Pradesh?"
    **Output:**
    {{
        "main_topic": "Uttar Pradesh News",
    }}

    **Input:** "Give me the latest updates on the stock market"
    **Output:**
    {{
        "main_topic": "Stock Market Updates",
    }}

    Now, process the following query and return the result in **valid JSON format**:
    "{user_query}"
    """

    response = client.chat.completions.create(
        model="mixtral-8x7b-32768",  # Using Mistral on Groq
        messages=[{"role": "system", "content": "Extract key topics in JSON format."},
                  {"role": "user", "content": prompt}],
        temperature=0.3
    )

    result = response.choices[0].message.content.strip()
    
    try:
        return json.loads(result)  # Ensure JSON format
    except json.JSONDecodeError:
        return {"error": "Failed to parse response"}

# Example usage
query = "Tell me about Rajasthan Elections"
topics = extract_topics(query)
# print(json.dumps(topics, indent=2))


In [98]:
from pygooglenews import GoogleNews
import json

def fetch_news_links(query, max_articles=5):
    """
    Fetches news article links using PyGoogleNews.

    Args:
        query (str): The search query (e.g., 'India Elections').
        max_articles (int): Maximum number of articles to retrieve.

    Returns:
        dict: JSON-formatted response containing news articles.
    """
    gn = GoogleNews()
    search_results = gn.search(query)

    news_links = []
    count = 0

    for entry in search_results["entries"]:
        if count >= max_articles:
            break

        news_links.append({
            "title": entry.title,
            "url": entry.link,
            "published_at": entry.published
        })
        count += 1

    return {"query": query, "articles": news_links}

# Example Usage:
query_data = topics

news_results = fetch_news_links(query_data["main_topic"], max_articles=3)
#print(json.dumps(news_results, indent=2))


{
  "query": "Rajasthan Elections",
  "articles": [
    {
      "title": "Madan Rathore elected unopposed to post of Rajasthan BJP chief - Moneycontrol",
      "url": "https://news.google.com/rss/articles/CBMitwFBVV95cUxQQnBYWHM0ckZyWHJ0ZkwyQ0ozT0c4TWduekIyM2dxbERmMHpFZU9PaXZpcWpXUlZtQzlHT0RQMVl0TjRObTU2bzVfSllnQkZhYjhsVUZleUt2VTNjZ0VTZ0FjalpKbHNpaE13bGxXaHpDT3ItZ1U0X2lxblNKZEJWQldRYk04bEU3eUFvczRxNDVPN3BlMGMwN1JCbVhmbWxNVEM5dDBRUl9xSHlNMXZNaGlTbC0xNGvSAbwBQVVfeXFMTkxpSHhHWWRRRDBDcDhvUi12SHdPOXlFS2VsaUhOQ2RoallTZF81VVZGblNORHVySER6OTJqcDdwS3pnVDhsRHFSa0k2ZzhVU0pVN0dLbFZYVi04WVJ4VkUxMktnaVZGOURPa2RkanV3SEcxWXhhMDlRSDdrYW5xVVo2dlVWQWRoaVBCZDZCb0VabEY2c1ZQSnZ4eFFIQUVORkNjMDlxVGpYajZ4dXBNMEI1d1lDaUVvZ01vaGs?oc=5",
      "published_at": "Sat, 22 Feb 2025 10:15:09 GMT"
    },
    {
      "title": "Madan Rathore re-elected as Rajasthan BJP President - First India",
      "url": "https://news.google.com/rss/articles/CBMikAFBVV95cUxQR1FRU2RFQzdXTTNCcm50X1FPaFZsbk44b01tMjFqYUJqNGdRR1JtM25xdlRYWW

In [143]:
from firecrawl import FirecrawlApp
from pydantic import BaseModel, Field
import json
import time
import os 
from dotenv import load_dotenv  # ✅ Load environment variables
load_dotenv()
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")

# Initialize FireCrawl
app = FirecrawlApp(api_key = firecrawl_api_key)  # Replace with actual API key

# Define Schema for Extracting News Content
class NewsArticle(BaseModel):
    title: str = Field(description="The title of the news article")
    author: str = Field(description="The author of the news article")
    publication_date: str = Field(description="The publication date of the news article")
    content: str = Field(description="The full content of the news article")

def extract_full_news_content(articles):
    """
    Extracts the full content of news articles using FireCrawl.

    Args:
        articles (list): List of article dictionaries with URLs.

    Returns:
        dict: JSON-formatted response containing the extracted content.
    """
    extracted_news = []

    for article in articles:
        url = article["url"]
        try:
            # Scrape the URL with the defined schema
            data = app.scrape_url(
                url,
                params={
                    "formats": ["extract"],
                    "extract": {
                        "schema": NewsArticle.model_json_schema()
                    },
                    "actions": [
                        {"type": "wait", "milliseconds": 2000},  # Wait for content to load
                        {"type": "scroll", "behavior": "smooth"}  # Scroll to load full content
                    ]
                }
            )

            # Extract Data
            extracted_data = data.get("extract", {})
            extracted_news.append({
                "title": extracted_data.get("title", article["title"]),  # Fallback to PyGoogleNews title
                "url": url,
                "published_at": extracted_data.get("publication_date", article["published_at"]),
                "author": extracted_data.get("author", "Unknown"),
                "content": extracted_data.get("content", "Content not available.")
            })

        except Exception as e:
            print(f"Error extracting {url}: {e}")

        # Adding a small delay between requests to avoid being blocked
        time.sleep(1.5)

    return {"articles": extracted_news}

# Example Usage:
news_links = news_results["articles"]

news_content = extract_full_news_content(news_links)
# print(json.dumps(news_content, indent=2))


In [107]:
import chromadb
from sentence_transformers import SentenceTransformer
import json

# Initialize ChromaDB (Persistent Storage)
chroma_client = chromadb.PersistentClient(path="./vector_store")
collection = chroma_client.get_or_create_collection("news_articles")

# Initialize Mistral-compatible embeddings model
embedding_model = SentenceTransformer("thenlper/gte-small")  # Use Mistral-compatible embeddings

def store_in_vector_db(articles):
    """
    Stores news articles in a vector database using Mistral embeddings.

    Args:
        articles (list): List of dictionaries containing news content.

    Returns:
        str: Confirmation message.
    """
    for article in articles:
        content = article["content"]
        embedding = embedding_model.encode(content).tolist()  # Convert to list for ChromaDB storage

        # Store in vector database
        collection.add(
            documents=[content],
            metadatas=[{"title": article["title"], "url": article["url"], "published_at": article["published_at"], "author": article["author"]}],
            embeddings=[embedding],
            ids=[article["url"]]
        )

    return "✅ News articles stored in vector database!"


# Store in Vector Database
result = store_in_vector_db(news_content['articles'])
print(result)


✅ News articles stored in vector database!


In [144]:
import chromadb
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq
import json

# Initialize ChromaDB (Persistent Storage)
chroma_client = chromadb.PersistentClient(path="./vector_store")
collection = chroma_client.get_or_create_collection("news_articles")

# Initialize Mistral-compatible embedding model
embedding_model = SentenceTransformer("thenlper/gte-small")  # Use Mistral-compatible embeddings
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY")

# Initialize Groq LLM (Mistral-7B) for answer generation
llm = ChatGroq(model_name="mixtral-8x7b-32768", api_key= groq_api_key)

def generate_seo_friendly_content(user_query):
    """
    Generates an SEO-optimized answer using all retrieved articles (without grading).

    Args:
        user_query (str): The user's question.

    Returns:
        dict: JSON-formatted response with SEO-friendly content.
    """
    retrieved_articles = retrieve_relevant_articles(user_query)

    if retrieved_articles:
        # SEO-Optimized Prompt
        prompt = f"""
        You are a fact-checking journalist and content writer specializing in SEO optimization. Your task is to generate a fact-based response and news summary 
        that follows SEO best practices.

        **SEO Rules:**
        - Use the primary keyword naturally in the title and first 100 words.
        - Ensure a compelling, click-worthy title (max 60 characters).
        - Add a meta description (150-160 characters).
        - Use relevant H2 & H3 subheadings for structure.
        - Include at least one internal and external link.
        - Use short, engaging paragraphs for better readability.
        - Only use information directly from the retrieved articles.

        **News Articles:**
        {json.dumps(retrieved_articles, indent=2)}

        **Generate an SEO-optimized news article including:**
        1. Title (H1) with main keyword
        2. SEO-friendly meta description
        3. Structured article with subheadings (H2, H3)
        4. Internal & external links
        5. Keyword-rich content
        6. Only use information directly from the retrieved articles.
        7. Do not speculate or add missing details.

        **Topic:** {user_query}
        """

        response = llm.invoke(prompt)

        return {
            "query": user_query,
            "seo_optimized_article": response.content,
            "source": "vector_database",
            "retrieved_articles": retrieved_articles
        }
    
    else:
        return {
            "query": user_query,
            "seo_optimized_article": "No relevant news articles found in stored data.",
            "source": "vector_database"
        }

# Example Usage:
user_question = query
seo_article = generate_seo_friendly_content(user_question)
print(json.dumps(seo_article, indent=2))


{
  "query": "Tell me about Rajasthan Elections",
  "seo_optimized_article": "Title (H1): Madan Rathore Re-elected as Rajasthan BJP President Amid By-Election Candidates Finalization\n\nMeta Description: Madan Rathore secures another term as Rajasthan BJP President, while the party finalizes by-election candidates and addresses rumors of internal discord.\n\nIntroduction:\nIn a significant development for the Rajasthan Bharatiya Janata Party (BJP), Madan Rathore has been re-elected as the state president amidst the finalization of candidates for upcoming by-elections and ongoing discussions about the party's national leadership. However, the absence of former Chief Minister Vasundhara Raje Scindia from a key meeting has sparked rumors of internal discord within the party.\n\n(H2) Madan Rathore Re-elected as Rajasthan BJP President\n\nMadan Rathore, a senior BJP leader, was re-elected as the president of the Rajasthan unit of the party on Saturday. Rathore, who was also elected unoppose

In [115]:
import chromadb
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq
import json

# Initialize ChromaDB (Persistent Storage)
chroma_client = chromadb.PersistentClient(path="./vector_store")
collection = chroma_client.get_or_create_collection("news_articles")

# Initialize Mistral-compatible embedding model
embedding_model = SentenceTransformer("thenlper/gte-small")  # Use Mistral-compatible embeddings

# Initialize Groq LLM (Mistral-7B) for answer generation
llm = ChatGroq(model_name="mixtral-8x7b-32768", api_key = groq_api_key)

def extract_location_from_content(news_content):
    """
    Extracts geographic locations (cities, states, countries) from news content.

    Args:
        news_content (str): The text of the news article.

    Returns:
        str: Extracted location or "Unknown".
    """
    location_prompt = f"""
    Identify the geographic location from the following news article content. If no specific location is mentioned, return "Unknown".

    Content: {news_content}

    Provide output in a single JSON format:
    {{"location": "City/State/Country"}}
    """

    response = llm.invoke(location_prompt)

    try:
        location_data = json.loads(response.content)
        return location_data.get("location", "Unknown")
    except json.JSONDecodeError:
        return "Unknown"

def generate_seo_friendly_content(user_query):
    """
    Generates an SEO-optimized answer using all retrieved articles (without grading).
    Also extracts location-based subtopics.

    Args:
        user_query (str): The user's question.

    Returns:
        dict: JSON-formatted response with SEO-friendly content.
    """
    retrieved_articles = retrieve_relevant_articles(user_query)

    if retrieved_articles:
        # Extract locations for each article
        for article in retrieved_articles:
            article["location"] = extract_location_from_content(article["content"])

        # SEO-Optimized Prompt
        prompt = f"""
        You are a content writer specializing in SEO optimization. Your task is to generate a news summary 
        that follows SEO best practices.

        **SEO Rules:**
        - Use the primary keyword naturally in the title and first 100 words.
        - Ensure a compelling, click-worthy title (max 60 characters).
        - Add a meta description (150-160 characters).
        - Use relevant H2 & H3 subheadings for structure.
        - Include at least one internal and external link.
        - Use short, engaging paragraphs for better readability.

        **News Articles:**
        {json.dumps(retrieved_articles, indent=2)}

        **Generate an SEO-optimized news article including:**
        1. Title (H1) with main keyword
        2. SEO-friendly meta description
        3. Structured article with subheadings (H2, H3)
        4. Internal & external links
        5. Keyword-rich content
        6. Assign subtopics based on locations extracted from the articles

        **Topic:** {user_query}
        """

        response = llm.invoke(prompt)

        return {
            "query": user_query,
            "seo_optimized_article": response.content,
            "source": "vector_database",
            "retrieved_articles": retrieved_articles
        }
    
    else:
        return {
            "query": user_query,
            "seo_optimized_article": "No relevant news articles found in stored data.",
            "source": "vector_database"
        }

# Example Usage:
user_question = query
seo_article = generate_seo_friendly_content(user_question)
print(json.dumps(seo_article, indent=2))


{
  "query": "Tell me about Rajasthan Elections",
  "seo_optimized_article": "Title (H1): **Rajasthan BJP Elections: Madan Rathore Re-elected as President**\n\nMeta Description: Madan Rathore has been re-elected as the President of Rajasthan BJP. Learn about the election process, key participants, and the impact on the upcoming by-elections.\n\nSubtopics based on locations: Jaipur, Rajasthan, India\n\n---\n\n**Rajasthan BJP Elections: Madan Rathore Re-elected as President**\n\nMadan Rathore, a seasoned Bharatiya Janata Party leader, has been re-elected as the President of the Rajasthan unit of the party [Internal Link: (Rank 3)](https://news.google.com/rss/articles/CBMitwFBVV95cUxQQnBYWHM0ckZyWHJ0ZkwyQ0ozT0c4TWduekIyM2dxbERmMHpFZU9PaXZpcWpXUlZtQzlHT0RQMVl0TjRObTU2bzVfSllnQkZhYjhsVUZleUt2VTNjZ0VTZ0FjalpKbHNpaE13bGxXaHpDT3ItZ1U0X2lxblNKZEJWQldRYk04bEU3eUFvczRxNDVPN3BlMGMwN1JCbVhmbWxNVEM5dDBRUl9xSHlNMXZNaGlTbC0xNGvSAbwBQVVfeXFMTkxpSHhHWWRRRDBDcDhvUi12SHdPOXlFS2VsaUhOQ2RoallTZF81VVZGblNORH

In [141]:
import chromadb
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field
import json
import time  # Used for adding slight delay between retries

# Initialize ChromaDB (Persistent Storage)
chroma_client = chromadb.PersistentClient(path="./vector_store")
collection = chroma_client.get_or_create_collection("news_articles")

# Initialize Mistral-compatible embedding model
embedding_model = SentenceTransformer("thenlper/gte-small")  # Use Mistral-compatible embeddings

# Initialize Groq LLM (Mistral-7B) for fact-based response generation
llm = ChatGroq(model_name="llama3-70b-8192", api_key="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")

# Hallucination Grader Data Model
class GradeHallucinations(BaseModel):
    """Binary score for hallucination presence in the generated answer."""
    binary_score: str = Field(description="Answer is grounded in the facts, '1' (yes) or '0' (no)")
    explanation: str = Field(description="Explain the reasoning for the score")

def handle_rate_limit(func, *args, **kwargs):
    """
    Handles rate limit errors by waiting before retrying.
    """
    retries = 3
    for attempt in range(retries):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            if "rate_limit_exceeded" in str(e):
                wait_time = 65  # Wait 65 seconds
                print(f"⚠️ Rate limit exceeded. Waiting {wait_time}s before retrying...")
                time.sleep(wait_time)
            else:
                raise e
    raise Exception("🚨 Max retries exceeded. Could not complete the request.")


# Function to retrieve relevant articles
def retrieve_relevant_articles(user_query, top_k=5):
    query_embedding = embedding_model.encode(user_query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=top_k)
    
    retrieved_articles = []
    for i, doc in enumerate(results["documents"][0]):
        retrieved_articles.append({
            "rank": i + 1,
            "title": results["metadatas"][0][i]["title"],
            "url": results["metadatas"][0][i]["url"],
            "published_at": results["metadatas"][0][i]["published_at"],
            "author": results["metadatas"][0][i]["author"],
            "content": doc
        })

    return retrieved_articles

# Function to extract location from content
def extract_location_from_content(news_content):
    """
    Extracts geographic locations (cities, states, countries) from news content.

    Args:
        news_content (str): The text of the news article.

    Returns:
        str: Extracted location or "Unknown".
    """
    location_prompt = f"""
    Identify the geographic location from the following news article content. If no specific location is mentioned, return "Unknown".

    Content: {news_content}

    Provide output in a single JSON format:
    {{"location": "City/State/Country"}}
    """

    response = llm.invoke(location_prompt)

    try:
        location_data = json.loads(response.content)
        return location_data.get("location", "Unknown")
    except json.JSONDecodeError:
        return "Unknown"

# Function to generate SEO-optimized content with auto-retry for hallucination
def generate_fact_based_seo_content(user_query, max_retries=3):
    retrieved_articles = retrieve_relevant_articles(user_query)
    
    if not retrieved_articles:
        return {
            "query": user_query,
            "seo_optimized_article": "No relevant news articles found in stored data.",
            "hallucination_score": 0,
            "explanation": "No articles available for this topic.",
            "source": "vector_database"
        }

    for article in retrieved_articles:
        article["location"] = extract_location_from_content(article["content"])

    retry_count = 0
    while retry_count < max_retries:
        # SEO-Optimized Prompt
        prompt = f"""
        You are an expert SEO content writer. Generate an SEO-optimized news summary for the given topic.

        **SEO Rules:**
        - Use the main keyword in the title and first 100 words.
        - Create an engaging **H1** title (max 60 characters).
        - Write a **meta description** (150-160 characters).
        - Use **H2 & H3 subheadings** for structure.
        - Include **internal & external links**.
        - Readable paragraphs

        **News Articles (with locations extracted):**
        {json.dumps(retrieved_articles, indent=2)}

        **Generate the article with:**
        1. Title (H1)  
        2. SEO-optimized meta description  
        3. Structured article (H2, H3)  
        4. Keyword-rich content  
        6. **Assign location-based subtopics**  

        **Topic:** {user_query}
        """

        response = llm.invoke(prompt)
        generated_content = response.content

        # Hallucination Grading Prompt
        hallucination_grader_prompt = f"""
        FACTS: {json.dumps(retrieved_articles, indent=2)}

        STUDENT ANSWER: {generated_content}

        Grade this answer:
        - Score 1: Answer is fully grounded in retrieved facts.
        - Score 0: Answer contains hallucinated information.

        Output in JSON: {{"binary_score": "1" or "0", "explanation": "Reasoning for the score"}}
        """

        hallucination_response = llm.invoke(hallucination_grader_prompt)

        try:
            grading_result = json.loads(hallucination_response.content)
            hallucination_score = int(grading_result.get("binary_score", 0))
            explanation = grading_result.get("explanation", "No explanation provided.")

            if hallucination_score == 1:
                return {
                    "query": user_query,
                    "seo_optimized_article": generated_content,
                    "hallucination_score": hallucination_score,
                    "explanation": explanation,
                    "source": "vector_database",
                    "retrieved_articles": retrieved_articles
                }
            else:
                retry_count += 1
                print(f"⚠️ Hallucination detected! Retrying attempt {retry_count}/{max_retries}...")
                time.sleep(2)  # Small delay before retrying

        except json.JSONDecodeError:
            return {
                "query": user_query,
                "seo_optimized_article": "Error in hallucination grading.",
                "hallucination_score": 0,
                "explanation": "Could not parse hallucination response.",
                "source": "vector_database"
            }

    # If max retries exceeded, return last attempt response
    return {
        "query": user_query,
        "seo_optimized_article": "Failed to generate a fact-based response after multiple retries.",
        "hallucination_score": 0,
        "explanation": "Maximum retries exceeded. Unable to ensure factual correctness.",
        "source": "vector_database"
    }

# Example Usage:
user_question = query
seo_article = generate_fact_based_seo_content(user_question)
print(json.dumps(seo_article, indent=2))


{
  "query": "Tell me about Rajasthan Elections",
  "seo_optimized_article": "Error in hallucination grading.",
  "hallucination_score": 0,
  "explanation": "Could not parse hallucination response.",
  "source": "vector_database"
}


In [133]:
import time
import chromadb
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field
import json

# Initialize ChromaDB (Persistent Storage)
chroma_client = chromadb.PersistentClient(path="./vector_store")
collection = chroma_client.get_or_create_collection("news_articles")

# Initialize Mistral-compatible embedding model
embedding_model = SentenceTransformer("thenlper/gte-small")

# ✅ Use Different LLMs for Different Tasks
llm_location = ChatGroq(model_name="llama3-8b-8192", api_key="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")  # Small LLM
llm_summary = ChatGroq(model_name="mixtral-8x7b-32768", api_key="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")  # Mistral 7x8B for Summarization
llm_seo = ChatGroq(model_name="llama3-70b-8192", api_key="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")  # Mixtral for SEO

# Hallucination Grader Data Model
class GradeHallucinations(BaseModel):
    binary_score: str = Field(description="Answer is grounded in the facts, '1' (yes) or '0' (no')")
    explanation: str = Field(description="Explain the reasoning for the score")

def handle_rate_limit(func, *args, **kwargs):
    """
    Handles rate limit errors by waiting before retrying.
    """
    retries = 3
    for attempt in range(retries):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            if "rate_limit_exceeded" in str(e):
                wait_time = 65
                print(f"⚠️ Rate limit exceeded. Waiting {wait_time}s before retrying...")
                time.sleep(wait_time)
            else:
                raise e
    raise Exception("🚨 Max retries exceeded. Could not complete the request.")

# ✅ Step 1: Retrieve Relevant Articles (Limit to 3 articles)
def retrieve_relevant_articles(user_query, top_k=3):
    query_embedding = embedding_model.encode(user_query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=top_k)

    retrieved_articles = []
    for i, doc in enumerate(results["documents"][0]):
        retrieved_articles.append({
            "rank": i + 1,
            "title": results["metadatas"][0][i]["title"],
            "url": results["metadatas"][0][i]["url"],
            "published_at": results["metadatas"][0][i]["published_at"],
            "author": results["metadatas"][0][i]["author"],
            "content": doc[:1000]  # ✅ Limit article content to 1000 characters
        })

    return retrieved_articles

# ✅ Step 2: Extract Location Using Mistral-7B
def extract_location_from_content(news_content):
    location_prompt = f"""
    Extract the geographic location (city, state, country) from the following news article content.
    If no specific location is mentioned, return "Unknown".

    Content: {news_content[:300]}  # ✅ Limit input size

    Provide output in JSON format: {{"location": "City/State/Country"}}
    """

    response = handle_rate_limit(llm_location.invoke, location_prompt)

    try:
        location_data = json.loads(response.content)
        return location_data.get("location", "Unknown")
    except json.JSONDecodeError:
        return "Unknown"

# ✅ Step 3: Summarize Articles Using Mistral-7x8B
def summarize_article(news_content):
    summary_prompt = f"""
    Summarize the following news article in **50 words or less**, keeping only key details.

    Article: {news_content[:1500]}  # ✅ Limit input size

    Provide the summary as JSON: {{"summary": "Short article summary"}}
    """

    response = handle_rate_limit(llm_summary.invoke, summary_prompt)

    try:
        summary_data = json.loads(response.content)
        return summary_data.get("summary", news_content[:500])  # Fallback if LLM fails
    except json.JSONDecodeError:
        return news_content[:500]  # Fallback

# ✅ Step 4: Generate SEO-Optimized Content Using Mixtral-8x7B
def generate_fact_based_seo_content(user_query, max_retries=3):
    retrieved_articles = retrieve_relevant_articles(user_query)
    
    if not retrieved_articles:
        return {
            "query": user_query,
            "seo_optimized_article": "No relevant news articles found in stored data.",
            "hallucination_score": 0,
            "explanation": "No articles available for this topic.",
            "source": "vector_database"
        }

    for article in retrieved_articles:
        article["location"] = extract_location_from_content(article["content"])
        article["summary"] = summarize_article(article["content"])

    retry_count = 0
    while retry_count < max_retries:
        prompt = f"""
        You are an expert SEO content writer. Generate an SEO-optimized news summary.

        **SEO Rules:**
        - Use the main keyword in the title and first 100 words.
        - Create an engaging **H1** title (max 60 characters).
        - Write a **meta description** (150-160 characters).
        - Use **H2 & H3 subheadings** for structure.
        - Include **internal & external links**.
        - Readable paragraphs.

        **Summarized News Articles (with locations extracted):**
        {json.dumps(retrieved_articles, indent=2)}

        **Generate:**
        1. Title (H1)  
        2. SEO-optimized meta description  
        3. Structured article (H2, H3)  
        4. Keyword-rich content  
        5. **Assign location-based subtopics**  

        **Topic:** {user_query}
        """

        response = handle_rate_limit(llm_seo.invoke, prompt)
        generated_content = response.content

        # ✅ Step 5: Hallucination Grading Using Mistral-7B
        hallucination_grader_prompt = f"""
        FACTS: {json.dumps(retrieved_articles, indent=2)}

        STUDENT ANSWER: {generated_content[:700]}  # ✅ Limit input size

        Grade this answer:
        - Score 1: Answer is fully grounded in retrieved facts.
        - Score 0: Answer contains hallucinated information.

        Provide output in JSON: {{"binary_score": "1" or "0", "explanation": "Reasoning for the score"}}
        """

        hallucination_response = handle_rate_limit(llm_location.invoke, hallucination_grader_prompt)

        try:
            grading_result = json.loads(hallucination_response.content)
            hallucination_score = int(grading_result.get("binary_score", 0))
            explanation = grading_result.get("explanation", "No explanation provided.")

            if hallucination_score == 1:
                return {
                    "query": user_query,
                    "seo_optimized_article": generated_content,
                    "hallucination_score": hallucination_score,
                    "explanation": explanation,
                    "source": "vector_database",
                    "retrieved_articles": retrieved_articles
                }
            else:
                retry_count += 1
                print(f"⚠️ Hallucination detected! Retrying attempt {retry_count}/{max_retries}...")
                time.sleep(2)

        except json.JSONDecodeError:
            return {"error": "Hallucination grading failed."}

    return {"error": "Failed after retries."}


In [134]:
user_question = "What is the latest update on India Elections?"
seo_article = generate_fact_based_seo_content(user_question)

In [135]:
print(json.dumps(seo_article, indent=2))


{
  "error": "Hallucination grading failed."
}


In [137]:
import time
import chromadb
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field
import json

# Initialize ChromaDB (Persistent Storage)
chroma_client = chromadb.PersistentClient(path="./vector_store")
collection = chroma_client.get_or_create_collection("news_articles")

# Initialize Mistral-compatible embedding model
embedding_model = SentenceTransformer("thenlper/gte-small")

# Initialize Groq LLMs (Mixtral for generation, Mistral for preprocessing)
llm_main = ChatGroq(model_name="mixtral-8x7b-32768", api_key="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")
llm_preprocess = ChatGroq(model_name="mixtral-8x7b-32768", api_key="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")

# Hallucination Grader Data Model
class GradeHallucinations(BaseModel):
    binary_score: str = Field(description="Answer is grounded in the facts, '1' (yes) or '0' (no')")
    explanation: str = Field(description="Explain the reasoning for the score")

def handle_rate_limit(func, *args, **kwargs):
    """
    Handles rate limit errors by waiting before retrying.
    """
    retries = 3
    for attempt in range(retries):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            if "rate_limit_exceeded" in str(e):
                wait_time = 65  # Wait 65 seconds
                print(f"⚠️ Rate limit exceeded. Waiting {wait_time}s before retrying...")
                time.sleep(wait_time)
            else:
                raise e
    raise Exception("🚨 Max retries exceeded. Could not complete the request.")

# Function to retrieve relevant articles (limit to 3 articles)
def retrieve_relevant_articles(user_query, top_k=3):
    query_embedding = embedding_model.encode(user_query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=top_k)

    retrieved_articles = []
    for i, doc in enumerate(results["documents"][0]):
        retrieved_articles.append({
            "rank": i + 1,
            "title": results["metadatas"][0][i]["title"],
            "url": results["metadatas"][0][i]["url"],
            "published_at": results["metadatas"][0][i]["published_at"],
            "author": results["metadatas"][0][i]["author"],
            "content": doc[:500]  # ✅ Limit article content to 500 characters
        })

    return retrieved_articles

# Function to extract location from content using Mistral (to reduce token usage)
def extract_location_from_content(news_content):
    location_prompt = f"""
    Extract the geographic location (city, state, country) from the following news article content.
    If no specific location is mentioned, return "Unknown".

    Content: {news_content[:300]}  # ✅ Limit location extraction input to 500 characters

    Provide output in JSON format: {{"location": "City/State/Country"}}
    """

    response = handle_rate_limit(llm_preprocess.invoke, location_prompt)

    try:
        location_data = json.loads(response.content)
        return location_data.get("location", "Unknown")
    except json.JSONDecodeError:
        return "Unknown"

# Function to generate SEO-optimized content with auto-retry for hallucination
def generate_fact_based_seo_content(user_query, max_retries=3):
    retrieved_articles = retrieve_relevant_articles(user_query)
    
    if not retrieved_articles:
        return {
            "query": user_query,
            "seo_optimized_article": "No relevant news articles found in stored data.",
            "hallucination_score": 0,
            "explanation": "No articles available for this topic.",
            "source": "vector_database"
        }

    for article in retrieved_articles:
        article["location"] = extract_location_from_content(article["content"])

    retry_count = 0
    while retry_count < max_retries:
        # ✅ Reduce prompt size by limiting articles & only essential metadata
        prompt = f"""
        You are an expert SEO content writer. Generate an SEO-optimized news summary.

        **SEO Rules:**
        - Use the main keyword in the title and first 600 words.
        - Create an engaging **H1** title (max 60 characters).
        - Write a **meta description** (150-160 characters).
        - Use **H2 & H3 subheadings** for structure.
        - Include **internal & external links**.
        - Readable paragraphs.

        **News Articles (with locations extracted) [Limited to 3]:**
        {json.dumps(retrieved_articles, indent=2)}

        **Generate:**
        1. Title (H1)  
        2. SEO-optimized meta description  
        3. Structured article (H2, H3)  
        4. Keyword-rich content  
        5. **Assign location-based subtopics**  

        **Topic:** {user_query}
        """

        response = handle_rate_limit(llm_main.invoke, prompt)
        generated_content = response.content

        # ✅ Reduce hallucination grading input size
        hallucination_grader_prompt = f"""
        FACTS (Summarized): {json.dumps(retrieved_articles, indent=2)}

        STUDENT ANSWER: {generated_content[:700]}  # ✅ Limit LLM response input to 1000 characters

        Grade this answer:
        - Score 1: Answer is fully grounded in retrieved facts.
        - Score 0: Answer contains hallucinated information.

        Provide output in JSON: {{"binary_score": "1" or "0", "explanation": "Reasoning for the score"}}
        """

        hallucination_response = handle_rate_limit(llm_main.invoke, hallucination_grader_prompt)
        source = []
        for i in range(0,len(retrieved_articles)):
            source = retrieved_articles[i]['url']

        try:
            grading_result = json.loads(hallucination_response.content)
            hallucination_score = int(grading_result.get("binary_score", 0))
            explanation = grading_result.get("explanation", "No explanation provided.")

            if hallucination_score == 1:
                return {
                    "query": user_query,
                    "seo_optimized_article": generated_content,
                    "explanation": explanation,
                    "source": source,
                    "retrieved_articles": retrieved_articles
                }
            else:
                retry_count += 1
                print(f"⚠️ Hallucination detected! Retrying attempt {retry_count}/{max_retries}...")
                time.sleep(2)  # Small delay before retrying

        except json.JSONDecodeError:
            return {
                "query": user_query,
                "seo_optimized_article": "Error in hallucination grading.",
                "explanation": "Could not parse hallucination response.",
                "source": "vector_database"
            }

    return {
        "query": user_query,
        "seo_optimized_article": "Failed to generate a fact-based response after multiple retries.",
        "explanation": "Maximum retries exceeded. Unable to ensure factual correctness.",
        "source": "vector_database"
    }

# Example Usage:
user_question = query
seo_article = generate_fact_based_seo_content(user_question)
print(json.dumps(seo_article, indent=2))


{
  "query": "Tell me about Rajasthan Elections",
  "seo_optimized_article": "**Title (H1):** Rajasthan Elections: Madan Rathore Re-Elected as BJP President Amid By-Election Finalizations\n\n**SEO-optimized meta description:** Uncover the latest updates on Rajasthan elections as Madan Rathore is re-elected as BJP President, and by-election candidates are finalized. Dive into the political scenario in Jaipur, Rajasthan, and India.\n\n**Structured article (H2, H3):**\n\n**Madan Rathore Re-Elected as Rajasthan BJP President** (H2)\n\n* Madan Rathore's re-election (H3)\n* Rathore's values and unity (H3)\n\n**Rajasthan By-Elections: Candidates Finalized Amidst Rumors of Rift** (H2)\n\n* Finalized panel of candidates (H3)\n* Absence of Vasundhara Raje Scindia (H3)\n* Raje's distance from party activities (H3)\n\n**Location-based subtopics:** (H3)\n\n* Political scenario in Jaipur (H4)\n* Rajasthan's electoral dynamics (H4)\n* National implications (H4)\n\n**Keyword-rich content:**\n\nMadan R

In [142]:
import time
import chromadb
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field
import json

# ✅ Initialize ChromaDB (Persistent Storage)
chroma_client = chromadb.PersistentClient(path="./vector_store")
collection = chroma_client.get_or_create_collection("news_articles")

# ✅ Use Mistral-compatible embeddings (Ensure consistency)
embedding_model = SentenceTransformer("thenlper/gte-small")

# ✅ Use Llama3-70B-8192 for Content Generation
llm = ChatGroq(model_name="llama3-70b-8192", api_key="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")

# ✅ Hallucination Grader Data Model
class GradeHallucinations(BaseModel):
    binary_score: str = Field(description="Answer is grounded in the facts, '1' (yes) or '0' (no')")
    explanation: str = Field(description="Explain the reasoning for the score")

def handle_rate_limit(func, *args, **kwargs):
    """Handles rate limit errors by waiting before retrying."""
    retries = 3
    for attempt in range(retries):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            if "rate_limit_exceeded" in str(e):
                wait_time = 65
                print(f"⚠️ Rate limit exceeded. Waiting {wait_time}s before retrying...")
                time.sleep(wait_time)
            else:
                raise e
    raise Exception("🚨 Max retries exceeded. Could not complete the request.")

# ✅ Step 1: Retrieve Relevant Articles from ChromaDB
def retrieve_relevant_articles(user_query, top_k=5):
    query_embedding = embedding_model.encode(user_query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=top_k)

    retrieved_articles = []
    if results["documents"] and results["documents"][0]:
        for i, doc in enumerate(results["documents"][0]):
            retrieved_articles.append({
                "rank": i + 1,
                "title": results["metadatas"][0][i]["title"],
                "url": results["metadatas"][0][i]["url"],
                "published_at": results["metadatas"][0][i].get("published_at", "Unknown"),
                "author": results["metadatas"][0][i].get("author", "Unknown"),
                "content": doc[:1000]  # ✅ Limit article content to 1000 characters
            })
    else:
        print("⚠️ No relevant articles found!")

    return retrieved_articles

# ✅ Step 2: Extract Location Using Llama3
def extract_location_from_content(news_content):
    location_prompt = f"""
    Identify the geographic location (city, state, country) from the following news article content.
    If no specific location is mentioned, return "Unknown".

    Content: {news_content[:300]}  # ✅ Limit input size

    Provide output in JSON: {{"location": "City/State/Country"}}
    """

    response = handle_rate_limit(llm.invoke, location_prompt)

    try:
        location_data = json.loads(response.content)
        return location_data.get("location", "Unknown")
    except json.JSONDecodeError:
        return "Unknown"

# ✅ Step 3: Generate SEO-Optimized Content with Auto-Retry for Hallucinations
def generate_fact_based_seo_content(user_query, max_retries=3):
    retrieved_articles = retrieve_relevant_articles(user_query)

    if not retrieved_articles:
        return {
            "query": user_query,
            "seo_optimized_article": "No relevant news articles found in stored data.",
            "hallucination_score": 0,
            "explanation": "No articles available for this topic.",
            "source": "vector_database"
        }

    for article in retrieved_articles:
        article["location"] = extract_location_from_content(article["content"])

    retry_count = 0
    while retry_count < max_retries:
        prompt = f"""
        You are an expert SEO content writer. Generate an SEO-optimized news summary.

        **SEO Rules:**
        - Use the main keyword in the title and first 100 words.
        - Create an engaging **H1** title (max 60 characters).
        - Write a **meta description** (150-160 characters).
        - Use **H2 & H3 subheadings** for structure.
        - Include **internal & external links**.
        - Readable paragraphs.

        **Summarized News Articles (with locations extracted):**
        {json.dumps(retrieved_articles, indent=2)}

        **Generate:**
        1. Title (H1)  
        2. SEO-optimized meta description  
        3. Structured article (H2, H3)  
        4. Keyword-rich content  
        5. **Assign location-based subtopics**  

        **Topic:** {user_query}
        """

        response = handle_rate_limit(llm.invoke, prompt)
        generated_content = response.content

        # ✅ Step 4: Hallucination Grading with Auto-Retry
        grading_result = grade_hallucination_with_retry(generated_content, retrieved_articles)

        if grading_result["binary_score"] == "1":
            return {
                "query": user_query,
                "seo_optimized_article": generated_content,
                "hallucination_score": 1,
                "explanation": grading_result["explanation"],
                "source": "vector_database",
                "retrieved_articles": retrieved_articles
            }
        else:
            retry_count += 1
            print(f"⚠️ Hallucination detected! Retrying attempt {retry_count}/{max_retries}...")
            time.sleep(2)

    return {"error": "Failed after retries."}

# ✅ Step 5: Hallucination Grading Function with Retry Mechanism
def grade_hallucination_with_retry(generated_content, retrieved_articles, max_retries=3):
    retry_count = 0
    while retry_count < max_retries:
        result = grade_hallucination(generated_content, retrieved_articles)

        if result["binary_score"] in ["0", "1"]:  # ✅ Valid response
            return result

        retry_count += 1
        print(f"⚠️ Hallucination grading failed. Retrying... {retry_count}/{max_retries}")
        time.sleep(2)

    return {"binary_score": "0", "explanation": "Max retries exceeded. Could not ensure factual correctness."}

# ✅ Step 6: Hallucination Grading LLM Call
def grade_hallucination(generated_content, retrieved_articles):
    hallucination_grader_prompt = f"""
    FACTS: {json.dumps(retrieved_articles, indent=2)[:2000]}  # ✅ Limit input size

    STUDENT ANSWER: {generated_content[:1000]}  # ✅ Limit LLM input size

    Grade this answer:
    - Score 1: Answer is fully grounded in retrieved facts.
    - Score 0: Answer contains hallucinated information.

    Provide output strictly in JSON:
    {{"binary_score": "1" or "0", "explanation": "Step-by-step reasoning"}}
    """

    response = handle_rate_limit(llm.invoke, hallucination_grader_prompt)

    try:
        return json.loads(response.content)
    except json.JSONDecodeError:
        print("⚠️ JSON Parsing Failed. Raw Response:", response.content)
        return {"binary_score": "0", "explanation": "Failed to parse JSON response"}

# ✅ Example Usage:
user_question = query
seo_article = generate_fact_based_seo_content(user_question)
print(json.dumps(seo_article, indent=2))


{
  "query": "Tell me about Rajasthan Elections",
  "seo_optimized_article": "**Title (H1):** Rajasthan Elections: Madan Rathore Re-Elected as BJP State President\n\n**Meta Description (150-160 characters):** Get the latest updates on Rajasthan elections, including Madan Rathore's re-election as BJP state president and the party's plans for the upcoming by-elections.\n\n**Structured Article:**\n\n### Rajasthan Elections: The Latest Developments\n\n#### Madan Rathore Re-Elected as BJP State President\n\nMadan Rathore, a senior Bharatiya Janata Party (BJP) leader, has been re-elected as the president of the Rajasthan unit of the party. This decision was made on Saturday, with Rathore being elected unopposed in the presence of state Chief Minister Bhajanlal Sharma at the party office. [Read more](https://news.google.com/rss/articles/CBMikAFBVV95cUxQR1FRU2RFQzdXTTNCcm50X1FPaFZsbk44b01tMjFqYUJqNGdRR1JtM25xdlRYWWVIazFDT0RTMktJMnp0VnZ0TFdXZkZXbnhTZ1VXYnA4N2RlM0xxYVpoTzdpQjJ2WHJqNTBxZmlLNWswY2