<a href="https://colab.research.google.com/github/Divya06-QE/AI-LearningInsights/blob/main/Simple_News_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This is a simple, all-in-one script for a news chatbot.
# It handles everything from scraping news to answering your questions with AI.
# The code is designed to be easy to follow for non-coders.

# --- 1. Install necessary libraries if they are not already installed ---
# Attempt every import the script relies on. If any of them fails, install
# the full dependency list with pip (one package at a time) and then stop,
# asking the user to run the script again so the fresh packages are picked up.
try:
    import requests
    from bs4 import BeautifulSoup
    import nltk
    from sklearn.feature_extraction.text import TfidfVectorizer
    from pymongo import MongoClient
    from openai import OpenAI
    import dotenv
    import time
    import json
    import os
except ImportError:
    import subprocess
    import sys

    print("One or more required modules not found. Installing now...")

    # pip distribution names (these differ from the import names for some packages).
    pip_names = (
        "requests",
        "beautifulsoup4",
        "nltk",
        "scikit-learn",
        "pymongo",
        "openai",
        "python-dotenv",
    )
    # Run pip through the current interpreter so packages land in the
    # environment this script is actually running in.
    pip_install = [sys.executable, "-m", "pip", "install"]

    for pip_name in pip_names:
        print(f"Installing {pip_name}...")
        try:
            subprocess.check_call(pip_install + [pip_name])
        except subprocess.CalledProcessError as install_error:
            # Abort on the first failed install with a non-zero exit status.
            print(f"Failed to install {pip_name}. Error: {install_error}")
            sys.exit(1)

    print("\n--- Installation Complete ---")
    print("All required packages installed successfully.")
    print("Please run this script again to continue.")
    sys.exit(0)

# --- 2. Import all the necessary tools (Dependencies) ---
# These are like the building blocks we need for our project.
# `requests` helps us download web pages.
import requests
# `BeautifulSoup` helps us read and find information in web pages.
from bs4 import BeautifulSoup
# `nltk` is a library for working with human language.
import nltk
# These specific parts of `nltk` help us find common words and break sentences into words.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# `time` lets us pause the program for a moment.
import time
# `json` helps us work with data in a format a lot of programs understand.
import json
# `os` helps us get information from our computer, like API keys.
import os
# `dotenv` helps us read our secret keys from a special file named ".env".
from dotenv import load_dotenv

# For data processing and finding similar articles
# `TfidfVectorizer` is a tool that turns words into numbers.
# It helps the computer understand how important a word is in a document.
from sklearn.feature_extraction.text import TfidfVectorizer
# `cosine_similarity` is a math tool that measures how similar two sets of numbers are.
# We use it to find out if two articles are about the same topic.
from sklearn.metrics.pairwise import cosine_similarity

# For connecting to our database
# `MongoClient` is the main tool we use to talk to our MongoDB database.
from pymongo import MongoClient

# For our AI chatbot
# `OpenAI` is the library we use to talk to the powerful AI models from OpenAI.
from openai import OpenAI

# --- 3. Load API Keys from a secure file ---
# This loads your private keys from a file named ".env". This is safer than putting them directly in the code.
load_dotenv()
# We get the keys from the environment variables.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
MONGO_URI = os.getenv("MONGO_URI")

# --- 4. Set up our AI and Database connections ---
# Every handle starts out as None so the rest of the script can always refer
# to these names, even when a key is missing or a connection fails.
openai_client = None
mongo_client = None
db = None
highlights_collection = None

# We create a client to talk to the OpenAI service. If the key is missing, we'll see an error message.
if not OPENAI_API_KEY:
    print("Error: Missing OpenAI API key. Please add it to your .env file.")
else:
    openai_client = OpenAI(api_key=OPENAI_API_KEY)

# We create a client to talk to our MongoDB database.
if not MONGO_URI:
    print("Error: Missing MongoDB URI. Please add it to your .env file.")
else:
    try:
        mongo_client = MongoClient(MONGO_URI)
        # Creating a MongoClient does not contact the server by itself, so we
        # send a cheap "ping" command to make sure the connection really works
        # before reporting success.
        mongo_client.admin.command("ping")
        db = mongo_client.news_db
        highlights_collection = db.highlights
        print("Connected to MongoDB successfully!")
    except Exception as e:
        print(f"Error connecting to MongoDB: {e}")
        mongo_client = None
        db = None
        highlights_collection = None

# --- 5. Prepare the AI and Text tools ---
# Look up the two NLTK resources this script expects ('punkt' and 'stopwords').
# If either lookup fails, both resources are downloaded.
try:
    for resource_path in ('tokenizers/punkt', 'corpora/stopwords'):
        nltk.data.find(resource_path)
except LookupError:
    print("Required NLTK packages not found. Downloading now...")
    for resource_name in ('punkt', 'stopwords'):
        nltk.download(resource_name)
    print("NLTK packages downloaded successfully.")

# --- 6. Define where we get our news from ---
# Maps each news category to the web page that is scraped for that category.
NEWS_SOURCES = {
    'sports': 'https://www.theguardian.com/au/sport',
    'lifestyle': 'https://7news.com.au/lifestyle',
    'music': 'https://7news.com.au/entertainment/music',
    # The previous address (.../au/business/finance) answered "404 Not Found"
    # in this notebook's recorded run, so the business section front is used.
    # NOTE(review): confirm this page is reachable and suits the category.
    'finance': 'https://www.theguardian.com/au/business',
}

# --- 7. Helper Functions (The magic happens here) ---

def _extract_link(title_tag, base_url):
    """Return the absolute link for a headline tag, or "No link." if it has none."""
    # Local import keeps this helper self-contained.
    from urllib.parse import urljoin

    # The enclosing <a> is tried first, then the headline itself when it is an
    # <a>, then an <a> nested inside the headline.
    candidates = (
        title_tag.find_parent('a'),
        title_tag if title_tag.name == 'a' else None,
        title_tag.find('a'),
    )
    for anchor in candidates:
        href = anchor.get('href') if anchor is not None else None
        if href:
            # Relative links are resolved against the page they were found on.
            return urljoin(base_url, href)
    return "No link."


def get_articles_from_source(category, url, max_articles=10, timeout=10):
    """
    Download one news page and collect the articles found on it.

    Args:
        category: Label stored with every article (for example 'sports').
        url: Address of the page to scrape; also stored as the article 'source'.
        max_articles: Only this many matching page elements are examined.
        timeout: Seconds to wait for the website before giving up.

    Returns:
        A list of dictionaries with the keys 'category', 'source', 'title',
        'summary', 'author' and 'link'. The list is empty when the page could
        not be downloaded or nothing on it matched.
    """
    articles = []
    try:
        # Without a timeout a website that never answers would freeze the program.
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return articles

    soup = BeautifulSoup(response.content, 'html.parser')
    # NOTE(review): the run recorded in this notebook downloaded three pages
    # without error yet produced no articles, which suggests these class names
    # do not match the sites' current markup - verify against the live pages.
    article_elements = soup.find_all(['article', 'div'], class_=['media-body', 'c-card', 'c-card__content'])

    for elem in article_elements[:max_articles]:
        title_tag = elem.find(['h2', 'h3', 'h4', 'a'], class_=['media-heading', 'c-card__headline', 'headline'])
        if title_tag is None:
            # An element without a headline is not treated as an article.
            continue
        summary_tag = elem.find(['p', 'div'], class_=['media-description', 'c-card__summary', 'description'])
        author_tag = elem.find(class_='byline')
        articles.append({
            'category': category,
            'source': url,
            'title': title_tag.get_text(strip=True),
            'summary': summary_tag.get_text(strip=True) if summary_tag is not None else "No summary available.",
            'author': author_tag.get_text(strip=True) if author_tag is not None else "Unknown Author",
            'link': _extract_link(title_tag, url),
        })
    return articles

def find_duplicates_and_cluster(articles, similarity_threshold=0.5):
    """
    Group articles that appear to be about the same story.

    Each article's title and summary are joined into one text, the texts are
    turned into TF-IDF vectors, and two articles land in the same group when
    the cosine similarity of their vectors is above `similarity_threshold`.

    Args:
        articles: List of dictionaries that each have 'title' and 'summary'.
        similarity_threshold: Similarity (0 = unrelated, 1 = identical) that
            must be exceeded for two articles to be grouped together.

    Returns:
        A list of clusters, where every cluster is a list of articles. Every
        input article appears in exactly one cluster. An article with no text
        always forms a cluster of its own.
    """
    if not articles:
        return []

    corpus = [f"{a['title']} {a['summary']}" for a in articles]
    # Remember WHERE the non-empty texts sit, so rows of the similarity matrix
    # can be mapped back to the right article.
    text_positions = [i for i, doc in enumerate(corpus) if doc.strip()]
    if not text_positions:
        # Nothing to compare: every article is its own cluster.
        return [[a] for a in articles]

    try:
        # Converts text into numbers the computer can understand
        tfidf_matrix = TfidfVectorizer().fit_transform([corpus[i] for i in text_positions])
    except ValueError:
        # Raised when no usable words are left (empty vocabulary).
        return [[a] for a in articles]
    # Measures how similar the articles are (0 = not similar, 1 = identical)
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Article index -> row/column of that article in the similarity matrix.
    row_of = {position: row for row, position in enumerate(text_positions)}

    clusters = []
    processed_indices = set()
    for i, article in enumerate(articles):
        if i in processed_indices:
            continue
        processed_indices.add(i)
        cluster = [article]
        row_i = row_of.get(i)
        if row_i is not None:
            for j in text_positions:
                if j <= i or j in processed_indices:
                    continue
                if cosine_sim[row_i][row_of[j]] > similarity_threshold:
                    cluster.append(articles[j])
                    processed_indices.add(j)
        clusters.append(cluster)
    return clusters

def get_highlights_from_clusters(clusters):
    """
    Turn groups of similar articles into a ranked list of highlights.

    The first article of each cluster supplies the title and summary. The
    sources and authors of all articles in the cluster are collected without
    repeats, in the order they first appear.

    Args:
        clusters: List of clusters; each cluster is a list of article
            dictionaries with 'title', 'summary', 'source' and 'author'.

    Returns:
        A list of highlight dictionaries with the keys 'title', 'summary',
        'sources', 'authors' and 'frequency' (the number of articles in the
        cluster), sorted so the largest clusters come first. Empty clusters
        are skipped.
    """
    highlights = []
    for cluster in clusters:
        if not cluster:
            continue
        main_article = cluster[0]
        highlights.append({
            'title': main_article['title'],
            'summary': main_article['summary'],
            # dict.fromkeys removes repeats while keeping first-seen order,
            # so the result is the same on every run.
            'sources': list(dict.fromkeys(a['source'] for a in cluster)),
            'authors': list(dict.fromkeys(a['author'] for a in cluster)),
            'frequency': len(cluster),
        })
    # Sorts the highlights so the most frequent stories are at the top.
    # The sort is stable, so equally frequent stories keep their input order.
    highlights.sort(key=lambda h: h['frequency'], reverse=True)
    return highlights

def save_highlights(highlights_data):
    """
    Replace the highlights stored in MongoDB with a new set.

    All existing documents in the highlights collection are deleted, then one
    document per highlight is inserted, each tagged with its category.

    Args:
        highlights_data: Dictionary mapping a category name to a list of
            highlight dictionaries. The dictionaries passed in are not
            modified; copies are stored.

    Returns:
        None. Progress and problems are reported with print().
    """
    # The collection may be unset when the database was never configured, so
    # it is looked up by name instead of assuming it exists.
    collection = globals().get("highlights_collection")
    # PyMongo collections cannot be used in a plain `if collection:` test
    # (that raises NotImplementedError), so compare with None explicitly.
    if collection is None:
        print("Could not save to MongoDB. Is your connection working?")
        return

    # Copy each highlight so the database's generated "_id" field does not
    # leak back into the caller's dictionaries.
    documents = [
        {**highlight, 'category': category}
        for category, highlights_list in highlights_data.items()
        for highlight in highlights_list
    ]

    try:
        collection.delete_many({})
        # insert_many rejects an empty list, so only call it when there is data.
        if documents:
            collection.insert_many(documents)
    except Exception as e:
        print(f"Error saving highlights to MongoDB: {e}")
        return
    print("Highlights saved to MongoDB!")

def find_relevant_highlights(query):
    """
    Look up stored highlights whose title or summary contains the query text.

    The match ignores upper/lower case. The query is treated as plain text,
    so characters such as "?", "(" or "+" are matched literally.

    Args:
        query: The text to search for.

    Returns:
        A list of at most five matching documents from the highlights
        collection. The list is empty when the database is not available or
        the query is blank.
    """
    import re

    # The collection may be unset when the database was never configured, so
    # it is looked up by name instead of assuming it exists.
    collection = globals().get("highlights_collection")
    # PyMongo collections cannot be used in a plain `if not collection:` test
    # (that raises NotImplementedError), so compare with None explicitly.
    if collection is None:
        return []

    needle = (query or "").strip()
    if not needle:
        # An empty pattern would match every stored document.
        return []

    # Escape the text so punctuation in a question is not read as regex syntax.
    pattern = re.escape(needle)
    # The `"$regex"` part allows us to find partial matches.
    cursor = collection.find({
        "$or": [
            {"title": {"$regex": pattern, "$options": "i"}},
            {"summary": {"$regex": pattern, "$options": "i"}}
        ]
    }).limit(5)  # let the database stop after five results
    return list(cursor)

def answer_query(query):
    """
    Answer a question using stored news highlights and the AI model.

    1. Retrieval: matching highlights are fetched with find_relevant_highlights.
    2. Augmentation: those highlights are turned into JSON text and placed in
       a prompt together with the question.
    3. Generation: the prompt is sent to the OpenAI chat model and its reply
       is returned.

    Args:
        query: The user's question.

    Returns:
        The model's answer as text, or a short explanatory message when the
        OpenAI client is not set up, no matching news is found, or the model
        call fails.
    """
    # The client may be unset when the API key was missing, so it is looked
    # up by name instead of assuming it exists.
    client = globals().get("openai_client")
    if client is None:
        return "I can't answer your question because the OpenAI client is not set up."

    # Step 1: Retrieval (getting the info)
    relevant_highlights = find_relevant_highlights(query)

    if not relevant_highlights:
        return "I'm sorry, I can't find any news about that topic."

    # Step 2: Augmentation (preparing the info for the AI)
    # MongoDB adds an "_id" field that json cannot serialize and that is of
    # no use to the model, so it is left out. `default=str` is a safety net
    # for any other value json does not know how to write.
    context_docs = [
        {key: value for key, value in doc.items() if key != "_id"}
        for doc in relevant_highlights
    ]
    context = json.dumps(context_docs, indent=2, default=str)
    prompt = f"""
    Answer the following question based *only* on the provided news highlights. If the information is not present, state that you don't know.

    Context:
    {context}

    Question: {query}
    """

    # Step 3: Generation (getting the answer)
    try:
        # This is where we talk to the OpenAI model. We send our prompt as a 'user' message.
        response = client.chat.completions.create(
            model="gpt-4o",  # The specific AI model we are using
            messages=[
                {"role": "system", "content": "You are a helpful assistant who answers questions based on a provided context."},
                {"role": "user", "content": prompt}
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error during AI model call: {e}")
        return "I'm having trouble connecting to the AI model right now."

# --- 8. The main part of the program that runs everything ---
# Steps: check the NLTK data, scrape every source, group similar stories per
# category, save the highlights, then start the question-and-answer loop.
if __name__ == "__main__":
    print("Welcome to the Simple News Chatbot!")
    print("\n--- Let's get things ready ---")
    print("Checking for required libraries and NLTK data...")

    # We download the text processing models from NLTK.
    try:
        # This checks if the 'punkt' and 'stopwords' models are already on your computer.
        nltk.data.find('tokenizers/punkt')
        nltk.data.find('corpora/stopwords')
        print("All NLTK packages are already installed!")
    except LookupError:
        print("Required NLTK packages not found. Downloading now...")
        nltk.download('punkt')
        nltk.download('stopwords')
        print("NLTK packages downloaded successfully.")

    print("\n--- Starting the news scraping process ---")

    # Get all the news articles from our sources
    all_articles = []
    for category, url in NEWS_SOURCES.items():
        print(f"Scraping {category} from {url}...")
        all_articles.extend(get_articles_from_source(category, url))
        time.sleep(1)  # Wait a moment to be polite to the websites

    if all_articles:
        print("\nFinding and grouping similar news stories...")
        # Start with an empty list per category so categories without any
        # articles still appear (with no highlights) in the saved result.
        categorized_articles = {cat: [] for cat in NEWS_SOURCES}
        for article in all_articles:
            categorized_articles[article['category']].append(article)

        news_highlights = {
            category: get_highlights_from_clusters(find_duplicates_and_cluster(articles))
            for category, articles in categorized_articles.items()
        }

        # Save the processed news to our database
        save_highlights(news_highlights)

        print("\nNews highlights are ready!")
    else:
        print("\nNo articles were scraped. Please check your internet connection or news sources.")

    print("\n--- Start the Chatbot ---")
    print("You can ask questions about the news highlights. Type 'exit' to quit.")

    # The chatbot loop
    while True:
        try:
            user_query = input("\nAsk a question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-C, Ctrl-D or a closed input stream ends the chat cleanly
            # instead of showing a traceback.
            print("\nGoodbye!")
            break

        if not user_query:
            # Nothing was typed; ask again instead of querying with no text.
            continue
        if user_query.lower() == 'exit':
            print("Goodbye!")
            break

        # Get the AI to answer our question
        response = answer_query(user_query)
        print(f"\nAI: {response}")

Error: Missing OpenAI API key. Please add it to your .env file.
Error: Missing MongoDB URI. Please add it to your .env file.
Welcome to the Simple News Chatbot!

--- Let's get things ready ---
Checking for required libraries and NLTK data...
All NLTK packages are already installed!

--- Starting the news scraping process ---
Scraping sports from https://www.theguardian.com/au/sport...
Scraping lifestyle from https://7news.com.au/lifestyle...
Scraping music from https://7news.com.au/entertainment/music...
Scraping finance from https://www.theguardian.com/au/business/finance...
Error scraping https://www.theguardian.com/au/business/finance: 404 Client Error: Not Found for url: https://www.theguardian.com/au/business/finance

No articles were scraped. Please check your internet connection or news sources.

--- Start the Chatbot ---
You can ask questions about the news highlights. Type 'exit' to quit.
