<a href="https://colab.research.google.com/github/Amirthavarshini05/Rap_Rag/blob/main/SmartBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install streamlit transformers bs4 requests pyngrok


Collecting streamlit
  Downloading streamlit-1.47.1-py3-none-any.whl.metadata (9.0 kB)
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.47.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m451.5 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# SECURITY: never commit an ngrok authtoken to a notebook. The token that used
# to be hardcoded in this cell has been published and should be revoked in the
# ngrok dashboard. The token is now read from the NGROK_AUTHTOKEN environment
# variable, falling back to an interactive prompt that does not echo the input.
import os
from getpass import getpass

from pyngrok import ngrok

# Saves the token to ngrok's configuration file, as `ngrok config add-authtoken` did.
ngrok.set_auth_token(os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: "))

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
%%writefile app.py
import streamlit as st
import requests
from bs4 import BeautifulSoup
from transformers import T5ForConditionalGeneration, T5Tokenizer
import re
import concurrent.futures
import time
from difflib import SequenceMatcher


Overwriting app.py


In [None]:
%%writefile -a app.py
def is_relevant(name, text):
    """Return True when ``text`` plausibly refers to the company ``name``.

    The comparison is case-insensitive. A match is either ``name`` occurring
    verbatim inside ``text`` or the two strings being fuzzily similar
    (``difflib.SequenceMatcher`` ratio above 0.6).

    Args:
        name: Company name to look for.
        text: Headline or snippet to test.

    Returns:
        bool: ``False`` when either argument is empty or whitespace-only.
    """
    needle = (name or "").strip().lower()
    haystack = (text or "").lower()
    # An empty needle is a substring of every string, which would mark every
    # headline as relevant; treat missing input as "not relevant" instead.
    if not needle or not haystack.strip():
        return False
    # Cheap containment test first; the fuzzy ratio is only computed when the
    # exact substring is absent.
    if needle in haystack:
        return True
    return SequenceMatcher(None, needle, haystack).ratio() > 0.6

def fetch_article(item, headers):
    """Download one search hit and extract its paragraph text.

    Args:
        item: Dict with ``'title'`` and ``'link'`` keys and an optional
            ``'snippet'`` key, as built by ``search_and_scrape``.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A ``{'title', 'text', 'source_link'}`` dict, or ``None`` when the page
        could not be fetched or parsed, or when 50 characters or fewer of text
        are available.
    """
    try:
        resp = requests.get(item['link'], headers=headers, timeout=5)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        paragraphs = ' '.join(p.get_text() for p in soup.find_all('p'))
        # Collapse runs of whitespace so the length thresholds below measure
        # actual content rather than layout newlines and indentation.
        text = ' '.join(paragraphs.split())
        if len(text) < 200:
            # Too little body text was scraped; fall back to the search snippet.
            text = item.get('snippet') or ''
        if len(text) <= 50:
            return None
        return {'title': item['title'], 'text': text, 'source_link': item['link']}
    except Exception:
        # Best-effort scraping: any failure just drops this article. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return None


Appending to app.py


In [None]:
%%writefile -a app.py
def search_and_scrape(company):
    """Search Bing News for ``company`` and scrape the relevant hits.

    Args:
        company: Company name used as the search query.

    Returns:
        tuple: ``(articles, is_valid)``. ``([], False)`` is returned when the
        search request fails, the page has no result links, or fewer than two
        of the first five hits are relevant to ``company``. Otherwise the
        articles that ``fetch_article`` could retrieve (possibly an empty
        list) and ``True``.
    """
    url = "https://www.bing.com/news/search"
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        # Let requests build the query string so characters such as '&', '#'
        # or '+' in a company name are percent-encoded instead of corrupting
        # the URL (spaces are still sent as '+').
        response = requests.get(url, params={'q': company}, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
    except Exception:
        # Best-effort: a failed search is reported as "no valid result".
        return [], False

    results = soup.find_all('a', class_='title')[:5]
    # NOTE(review): snippets are paired with titles purely by position; if a
    # result card has no snippet the pairing shifts - confirm against the
    # actual page markup.
    snippets = soup.find_all('div', class_='snippet')

    if not results:
        return [], False

    articles = []
    for i, link in enumerate(results):
        href = link.get('href')
        if not href:
            # An anchor without a target cannot be fetched; skip it rather
            # than raising KeyError.
            continue
        title = link.get_text()
        snippet = snippets[i].get_text() if i < len(snippets) else ""
        if is_relevant(company, title) or is_relevant(company, snippet):
            articles.append({'title': title, 'link': href, 'snippet': snippet})

    # Require at least two relevant hits before treating the name as valid.
    if len(articles) < 2:
        return [], False

    with concurrent.futures.ThreadPoolExecutor() as executor:
        fetched = executor.map(lambda a: fetch_article(a, headers), articles)
        final_articles = [r for r in fetched if r]

    return final_articles, True


Appending to app.py


In [None]:
%%writefile -a app.py
@st.cache_resource
def load_model():
    """Load the pretrained ``t5-small`` model and its tokenizer.

    Wrapped in ``st.cache_resource`` so Streamlit can reuse the loaded objects
    instead of reloading them on every script rerun.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    checkpoint = "t5-small"
    t5_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    return t5_tokenizer, t5_model


# Module-level handles used by summarize().
tokenizer, model = load_model()

def summarize(articles, style, max_articles=3, max_chars=700):
    """Summarize the leading articles with the module-level T5 model.

    Args:
        articles: List of dicts with ``'title'``, ``'text'`` and
            ``'source_link'`` keys.
        style: Output style label. ``"Quick bullet points"`` renders one
            bullet per sentence, ``"Casual conversation"`` adds a friendly
            prefix, and any other value leaves the summary unchanged.
        max_articles: Number of articles to summarize (default 3).
        max_chars: Number of characters of each article fed to the model
            (default 700).

    Returns:
        list: One ``{'title', 'summary', 'source_link'}`` dict per summarized
        article, in input order.
    """
    summaries = []
    for art in articles[:max_articles]:
        text = f"summarize: {art['text'][:max_chars]}"
        # The prompt is truncated to at most 512 tokens.
        inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
        ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0,
                             num_beams=4, early_stopping=True)
        summary = tokenizer.decode(ids[0], skip_special_tokens=True)

        if style == "Quick bullet points":
            # Split after each sentence-ending period and keep the period, so
            # every bullet is punctuated the same way (a plain
            # replace(". ", ...) drops it from all bullets but the last).
            sentences = [s.strip() for s in re.split(r'(?<=\.)\s+', summary) if s.strip()]
            if sentences:
                summary = "\n".join(f"- {s}" for s in sentences)
        elif style == "Casual conversation":
            summary = "Here's what I found: " + summary

        summaries.append({"title": art['title'], "summary": summary, "source_link": art['source_link']})
    return summaries


Appending to app.py


In [None]:
%%writefile -a app.py
def extract_names(user_input):
    """Extract company names from a free-form user message.

    Filler words are removed case-insensitively, then the remainder is split
    on commas, newlines and the word "and". The user's own casing is kept
    (only the first character is upper-cased), so acronyms such as "IBM" are
    not turned into "Ibm".

    Args:
        user_input: Raw chat message.

    Returns:
        list: Names in order of first appearance, whitespace-normalized and
        de-duplicated case-insensitively. Empty when nothing is left.
    """
    # "me" is included so that "tell me about X" does not yield "Me X".
    keywords = ["news", "about", "latest", "updates", "tell", "me", "information", "info", "on", "for"]
    cleaned = re.sub(r'\b(?:' + '|'.join(keywords) + r')\b', '', user_input, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s+and\s+', ',', cleaned, flags=re.IGNORECASE)

    names = []
    seen = set()
    for part in re.split(r'[,\n]+', cleaned):
        # Collapse the gaps left behind by removed filler words.
        name = ' '.join(part.split())
        if not name:
            continue
        key = name.lower()
        if key in seen:
            # Avoid searching for the same company twice.
            continue
        seen.add(key)
        names.append(name[0].upper() + name[1:])
    return names

def conversation_reply(user_input):
    """Return a canned reply for small talk, or ``None`` for anything else.

    Phrases are matched as whole words. Plain substring matching would treat
    company names such as "Whirlpool" or "Hitachi" (which contain "hi") or
    "Hershey" (which contains "hey") as greetings and never fetch their news.

    Args:
        user_input: Raw chat message.

    Returns:
        str or None: Greeting, thanks or goodbye reply, checked in that
        order; ``None`` when no phrase matches.
    """
    greetings = ["hi", "hello", "how are you", "hey"]
    thanks = ["thank you", "thanks", "thx", "thank u"]
    goodbyes = ["bye", "goodbye", "see you"]

    text = user_input.lower()

    def mentions(phrases):
        # Whole-word, case-insensitive match of any phrase in the message.
        return any(re.search(r'\b' + re.escape(p) + r'\b', text) for p in phrases)

    if mentions(greetings):
        return "👋 Hi! I can fetch the latest news about any company. Type one or multiple names."
    elif mentions(thanks):
        return "😊 You're welcome! Happy to help."
    elif mentions(goodbyes):
        return "👋 Goodbye! Have a great day!"
    return None


Appending to app.py


In [None]:
%%writefile -a app.py
def main():
    """Render the chat UI and answer the message submitted in this run.

    Replays the stored chat history, then, if the user submitted a message,
    either answers small talk via ``conversation_reply`` or extracts company
    names, searches and summarizes news for each, and shows the combined
    result. Both the user message and the reply are appended to
    ``st.session_state.chat_history``.
    """
    st.title("📰 Smart Company News Chatbot")

    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []

    output_style = st.radio("Output style:",
                            ('Formal business summary', 'Quick bullet points', 'Casual conversation'))

    # Replay the conversation so far.
    for msg in st.session_state.chat_history:
        with st.chat_message("user" if msg["role"] == "user" else "assistant"):
            st.markdown(msg["text"])

    user_input = st.chat_input("Type your message...")
    if not user_input:
        return

    start = time.time()
    # Echo the new message right away: the history loop above already ran
    # before this message was recorded, so without this the user's own
    # message would only appear on the next rerun.
    with st.chat_message("user"):
        st.markdown(user_input)
    st.session_state.chat_history.append({"role": "user", "text": user_input})

    response = conversation_reply(user_input)
    if not response:
        names = extract_names(user_input)
        if not names:
            response = "⚠️ Invalid input. Please enter company names."
        else:
            response = ""
            # Search all requested companies concurrently.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                results = list(executor.map(search_and_scrape, names))

            for name, (data, is_valid) in zip(names, results):
                if not is_valid or not data:
                    response += f"\n⚠️ Invalid company name: '{name}'.\n"
                else:
                    summarized = summarize(data, output_style)
                    response += f"\n### 📰 News for {name}:\n"
                    for item in summarized:
                        # Indent continuation lines so a multi-line (bullet)
                        # summary stays nested inside this list item.
                        body = item['summary'].replace("\n", "\n  ")
                        response += f"- **{item['title']}**\n  {body}\n  [Source]({item['source_link']})\n\n"

            total_time = round(time.time() - start, 2)
            response += f"\n⏱️ Response time: {total_time}s"

    with st.chat_message("assistant"):
        st.markdown(response)
    st.session_state.chat_history.append({"role": "assistant", "text": response})

if __name__ == "__main__":
    main()


Appending to app.py


In [None]:
%%writefile app.py

import streamlit as st
import requests
from bs4 import BeautifulSoup
from transformers import T5ForConditionalGeneration, T5Tokenizer
import re
import concurrent.futures
import time
from difflib import SequenceMatcher

# ---------------------------
# Helper for relevance check
# ---------------------------
def is_relevant(name, text):
    """Return True when ``text`` plausibly refers to the company ``name``.

    The comparison is case-insensitive. A match is either ``name`` occurring
    verbatim inside ``text`` or the two strings being fuzzily similar
    (``difflib.SequenceMatcher`` ratio above 0.6).

    Args:
        name: Company name to look for.
        text: Headline or snippet to test.

    Returns:
        bool: ``False`` when either argument is empty or whitespace-only.
    """
    needle = (name or "").strip().lower()
    haystack = (text or "").lower()
    # An empty needle is a substring of every string, which would mark every
    # headline as relevant; treat missing input as "not relevant" instead.
    if not needle or not haystack.strip():
        return False
    # Cheap containment test first; the fuzzy ratio is only computed when the
    # exact substring is absent.
    if needle in haystack:
        return True
    return SequenceMatcher(None, needle, haystack).ratio() > 0.6

# ---------------------------
# Fetch article content
# ---------------------------
def fetch_article(item, headers):
    """Download one search hit and extract its paragraph text.

    Args:
        item: Dict with ``'title'`` and ``'link'`` keys and an optional
            ``'snippet'`` key, as built by ``search_and_scrape``.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        A ``{'title', 'text', 'source_link'}`` dict, or ``None`` when the page
        could not be fetched or parsed, or when 50 characters or fewer of text
        are available.
    """
    try:
        resp = requests.get(item['link'], headers=headers, timeout=5)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        paragraphs = ' '.join(p.get_text() for p in soup.find_all('p'))
        # Collapse runs of whitespace so the length thresholds below measure
        # actual content rather than layout newlines and indentation.
        text = ' '.join(paragraphs.split())
        if len(text) < 200:
            # Too little body text was scraped; fall back to the search snippet.
            text = item.get('snippet') or ''
        if len(text) <= 50:
            return None
        return {'title': item['title'], 'text': text, 'source_link': item['link']}
    except Exception:
        # Best-effort scraping: any failure just drops this article. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return None

# ---------------------------
# Search and scrape with strict invalid detection
# ---------------------------
def search_and_scrape(company):
    """Search Bing News for ``company`` and scrape the relevant hits.

    Args:
        company: Company name used as the search query.

    Returns:
        tuple: ``(articles, is_valid)``. ``([], False)`` is returned when the
        search request fails, the page has no result links, or fewer than two
        of the first five hits are relevant to ``company``. Otherwise the
        articles that ``fetch_article`` could retrieve (possibly an empty
        list) and ``True``.
    """
    url = "https://www.bing.com/news/search"
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        # Let requests build the query string so characters such as '&', '#'
        # or '+' in a company name are percent-encoded instead of corrupting
        # the URL (spaces are still sent as '+').
        response = requests.get(url, params={'q': company}, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
    except Exception:
        # Best-effort: a failed search is reported as "no valid result".
        return [], False

    results = soup.find_all('a', class_='title')[:5]
    # NOTE(review): snippets are paired with titles purely by position; if a
    # result card has no snippet the pairing shifts - confirm against the
    # actual page markup.
    snippets = soup.find_all('div', class_='snippet')

    if not results:
        return [], False

    articles = []
    for i, link in enumerate(results):
        href = link.get('href')
        if not href:
            # An anchor without a target cannot be fetched; skip it rather
            # than raising KeyError.
            continue
        title = link.get_text()
        snippet = snippets[i].get_text() if i < len(snippets) else ""
        if is_relevant(company, title) or is_relevant(company, snippet):
            articles.append({'title': title, 'link': href, 'snippet': snippet})

    # ✅ Strict invalid detection: Need at least 2 relevant matches
    if len(articles) < 2:
        return [], False

    with concurrent.futures.ThreadPoolExecutor() as executor:
        fetched = executor.map(lambda a: fetch_article(a, headers), articles)
        final_articles = [r for r in fetched if r]

    return final_articles, True

# ---------------------------
# Load summarizer
# ---------------------------
@st.cache_resource
def load_model():
    """Load the pretrained ``t5-small`` model and its tokenizer.

    Wrapped in ``st.cache_resource`` so Streamlit can reuse the loaded objects
    instead of reloading them on every script rerun.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    checkpoint = "t5-small"
    t5_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    return t5_tokenizer, t5_model


# Module-level handles used by summarize().
tokenizer, model = load_model()

# ---------------------------
# Summarize
# ---------------------------
def summarize(articles, style, max_articles=3, max_chars=700):
    """Summarize the leading articles with the module-level T5 model.

    Args:
        articles: List of dicts with ``'title'``, ``'text'`` and
            ``'source_link'`` keys.
        style: Output style label. ``"Quick bullet points"`` renders one
            bullet per sentence, ``"Casual conversation"`` adds a friendly
            prefix, and any other value leaves the summary unchanged.
        max_articles: Number of articles to summarize (default 3).
        max_chars: Number of characters of each article fed to the model
            (default 700).

    Returns:
        list: One ``{'title', 'summary', 'source_link'}`` dict per summarized
        article, in input order.
    """
    summaries = []
    for art in articles[:max_articles]:
        text = f"summarize: {art['text'][:max_chars]}"
        # The prompt is truncated to at most 512 tokens.
        inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
        ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0,
                             num_beams=4, early_stopping=True)
        summary = tokenizer.decode(ids[0], skip_special_tokens=True)

        if style == "Quick bullet points":
            # Split after each sentence-ending period and keep the period, so
            # every bullet is punctuated the same way (a plain
            # replace(". ", ...) drops it from all bullets but the last).
            sentences = [s.strip() for s in re.split(r'(?<=\.)\s+', summary) if s.strip()]
            if sentences:
                summary = "\n".join(f"- {s}" for s in sentences)
        elif style == "Casual conversation":
            summary = "Here's what I found: " + summary

        summaries.append({"title": art['title'], "summary": summary, "source_link": art['source_link']})
    return summaries

# ---------------------------
# Extract company names
# ---------------------------
def extract_names(user_input):
    """Extract company names from a free-form user message.

    Filler words are removed case-insensitively, then the remainder is split
    on commas, newlines and the word "and". The user's own casing is kept
    (only the first character is upper-cased), so acronyms such as "IBM" are
    not turned into "Ibm".

    Args:
        user_input: Raw chat message.

    Returns:
        list: Names in order of first appearance, whitespace-normalized and
        de-duplicated case-insensitively. Empty when nothing is left.
    """
    # "me" is included so that "tell me about X" does not yield "Me X".
    keywords = ["news", "about", "latest", "updates", "tell", "me", "information", "info", "on", "for"]
    cleaned = re.sub(r'\b(?:' + '|'.join(keywords) + r')\b', '', user_input, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s+and\s+', ',', cleaned, flags=re.IGNORECASE)

    names = []
    seen = set()
    for part in re.split(r'[,\n]+', cleaned):
        # Collapse the gaps left behind by removed filler words.
        name = ' '.join(part.split())
        if not name:
            continue
        key = name.lower()
        if key in seen:
            # Avoid searching for the same company twice.
            continue
        seen.add(key)
        names.append(name[0].upper() + name[1:])
    return names

# ---------------------------
# Handle conversation
# ---------------------------
def conversation_reply(user_input):
    """Return a canned reply for small talk, or ``None`` for anything else.

    Phrases are matched as whole words. Plain substring matching would treat
    company names such as "Whirlpool" or "Hitachi" (which contain "hi") or
    "Hershey" (which contains "hey") as greetings and never fetch their news.

    Args:
        user_input: Raw chat message.

    Returns:
        str or None: Greeting, thanks or goodbye reply, checked in that
        order; ``None`` when no phrase matches.
    """
    greetings = ["hi", "hello", "how are you", "hey"]
    thanks = ["thank you", "thanks", "thx", "thank u"]
    goodbyes = ["bye", "goodbye", "see you"]

    text = user_input.lower()

    def mentions(phrases):
        # Whole-word, case-insensitive match of any phrase in the message.
        return any(re.search(r'\b' + re.escape(p) + r'\b', text) for p in phrases)

    if mentions(greetings):
        return "👋 Hi! I can fetch the latest news about any company. Type one or multiple names."
    elif mentions(thanks):
        return "😊 You're welcome! Happy to help."
    elif mentions(goodbyes):
        return "👋 Goodbye! Have a great day!"
    return None

# ---------------------------
# Main App
# ---------------------------
def main():
    """Render the chat UI and answer the message submitted in this run.

    Replays the stored chat history, then, if the user submitted a message,
    either answers small talk via ``conversation_reply`` or extracts company
    names, searches and summarizes news for each, and shows the combined
    result. Both the user message and the reply are appended to
    ``st.session_state.chat_history``.
    """
    st.title("📰 Smart Company News Chatbot")

    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []

    output_style = st.radio("Output style:",
                            ('Formal business summary', 'Quick bullet points', 'Casual conversation'))

    # Display chat history
    for msg in st.session_state.chat_history:
        with st.chat_message("user" if msg["role"] == "user" else "assistant"):
            st.markdown(msg["text"])

    user_input = st.chat_input("Type your message...")
    if not user_input:
        return

    start = time.time()
    # Echo the new message right away: the history loop above already ran
    # before this message was recorded, so without this the user's own
    # message would only appear on the next rerun.
    with st.chat_message("user"):
        st.markdown(user_input)
    st.session_state.chat_history.append({"role": "user", "text": user_input})

    response = conversation_reply(user_input)
    if not response:
        names = extract_names(user_input)
        if not names:
            response = "⚠️ Invalid input. Please enter company names."
        else:
            response = ""
            # Search all requested companies concurrently.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                results = list(executor.map(search_and_scrape, names))

            for name, (data, is_valid) in zip(names, results):
                if not is_valid or not data:
                    response += f"\n⚠️ Invalid company name: '{name}'.\n"
                else:
                    summarized = summarize(data, output_style)
                    response += f"\n### 📰 News for {name}:\n"
                    for item in summarized:
                        # Indent continuation lines so a multi-line (bullet)
                        # summary stays nested inside this list item.
                        body = item['summary'].replace("\n", "\n  ")
                        response += f"- **{item['title']}**\n  {body}\n  [Source]({item['source_link']})\n\n"

            total_time = round(time.time() - start, 2)
            response += f"\n⏱️ Response time: {total_time}s"

    with st.chat_message("assistant"):
        st.markdown(response)
    st.session_state.chat_history.append({"role": "assistant", "text": response})

if __name__ == "__main__":
    main()


Overwriting app.py


In [None]:
from pyngrok import ngrok

# Expose the port 8501
public_url = ngrok.connect(8501)
print("Streamlit App URL:", public_url)

!streamlit run app.py --server.port 8501 --server.enableCORS false --server.enableXsrfProtection false


Streamlit App URL: NgrokTunnel: "https://3e50237ffdc6.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.87.16.231:8501[0m
[0m
2025-08-05 10:58:11.553936: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754391491.592201    2293 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754391491.604014    2293 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
20