In [1]:
import os
import streamlit as st
import pickle
import time
import langchain
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader, UnstructuredURLLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq

In [None]:
# Obtain the Groq API key through the project-local `utils` helper so the key
# itself is never written into the notebook. The value is passed to ChatGroq
# when the LLM is constructed further down.
# NOTE(review): `utils.get_groq_api_key` is not visible here — presumably it
# reads the key from the environment or a local config file; confirm.
from utils import get_groq_api_key
groq_api_key = get_groq_api_key()

In [32]:
# Loading Data
# NOTE: these URLs are placeholders. The recorded outputs show the BBC link
# resolving to a "404 Page cannot be found" page and the last link producing
# a document with empty page_content, so swap in real article URLs before
# trusting any answer built on this data.
article_urls = [
    'https://www.bbc.com/news/business-12345678',
    'https://tribune.com.pk/story/1234567/specific-story',
    'https://www.thenews.com.pk/print/123456-specific-news',
]
loader = UnstructuredURLLoader(urls=article_urls)
data = loader.load()
data[2]  # preview the third loaded document


Document(metadata={'source': 'https://www.thenews.com.pk/print/123456-specific-news'}, page_content='')

In [33]:
# Splitting the Data
# Break the loaded documents into overlapping pieces (size 2000, overlap 200)
# so each piece can be embedded and retrieved on its own.
splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
chunk = splitter.split_documents(data)
len(chunk)  # how many chunks the splitter produced

3

In [34]:
# Inspect the first chunk to sanity-check what text was actually scraped.
chunk[0]

Document(metadata={'source': 'https://www.bbc.com/news/business-12345678'}, page_content="404 Page cannot be found\n\nSorry, we're unable to bring you the page you're looking for. Please try:\n\nDouble checking the url\n\nHitting the refresh button in your browser\n\nSearching for this page using the BBC search bar\n\nAlternatively, please visit the BBC News homepage.")

In [35]:
# Applying the Embedding
# Embed every chunk with the MiniLM sentence-transformer model and index the
# resulting vectors in an in-memory FAISS store.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
vectorindex = FAISS.from_documents(documents=chunk, embedding=embedding)

In [36]:
# Store Vector database
# Serialise the whole FAISS vector store object to `vector_index.pkl` in the
# current working directory; the following cell reads the same `file_path` back.
# NOTE(review): LangChain's FAISS wrapper also provides save_local()/load_local(),
# which avoid pickling the full Python object — consider switching both the
# save and the load cell together; confirm against the installed version.
file_path="vector_index.pkl"
with open(file_path, "wb") as f:
    pickle.dump(vectorindex,f)

In [37]:
# Load Vector Database
# Rebinds `vectorindex` to the object stored in `file_path` by the previous cell.
# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# load a pickle that this notebook itself wrote; never one received from an
# untrusted source.
with open (file_path,"rb") as f:
    vectorindex=pickle.load(f)

In [None]:
# Load llm
# Groq-hosted chat model used to answer questions over the retrieved chunks.
llm = ChatGroq(
    model="llama-3.1-8b-instant",  # model identifier passed to the Groq client
    temperature=0.5,               # sampling temperature for generation
    max_tokens=512,                # cap on the length of each generated reply
    api_key=groq_api_key           # key obtained from utils.get_groq_api_key() in the earlier cell
)

In [39]:
## Retrieving the data
# Build a question-answering chain that pulls relevant chunks from the FAISS
# index through its retriever and has the LLM answer with cited sources.
# The recorded debug trace further down shows it running a
# MapReduceDocumentsChain internally.
chain=RetrievalQAWithSourcesChain.from_llm(
    llm=llm,
    retriever=vectorindex.as_retriever(),
    # Ask the chain to also hand back the retrieved documents
    # (presumably under a `source_documents` key — confirm).
    return_source_documents=True
)

In [40]:
# Display the chain's repr to inspect how it was configured.
chain



In [43]:
# Ask a question against the indexed articles and print the answer together
# with the sources the chain cites.
query = "Which companies are discussed in these articles?"

# Global verbose tracing: every chain/LLM step is printed, which produces a very
# long output. It stays enabled for the rest of the kernel session until it is
# set back to False.
langchain.debug = True

# Calling the chain object directly (`chain({...})`) goes through the
# deprecated `Chain.__call__`; `invoke` is the supported entry point.
# `return_only_outputs` already defaults to False, so the result still carries
# the input question alongside the "answer" and "sources" keys.
result = chain.invoke({'question': query})
print("Answer:", result["answer"])
print("Sources:", result["sources"])

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "Which companies are discussed in these articles?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "photo reuters\n\nRenault plans to cut 3,000 jobs\n\nrepairs of flood ravaged rail tracks to cost rs140m\n\nRepairs of flood-ravaged rail tracks to cost Rs140m\n\nsource reuters\n\nP&G exits Pakistan\n\nthe auto sector s profitability is expected to face further obstacles due to the imposition of a 10 super tax photo file\n\nAuto parts makers oppose IMF push for liberalisation\n\nWorld\n\nrescuers clear rubble photo afp\n\nIndonesia school collapse deaths top 17\n\ndrone scar

In [None]:
import gradio as gr
from langchain.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQAWithSourcesChain
import os

# Set your Groq API key
# Export GROQ_API_KEY in the environment before starting the kernel instead of
# pasting the key into the notebook. `setdefault` keeps a key that is already
# exported; the previous unconditional assignment overwrote it with this
# placeholder. The placeholder is used only when nothing is set and is not a
# usable key.
os.environ.setdefault("GROQ_API_KEY", "GROQ_API_KEY")

def process_urls_and_question(urls_text, question):
    """Answer a question using the text scraped from the given URLs.

    Args:
        urls_text: Newline-separated URLs. Blank lines and surrounding
            whitespace are ignored; ``None`` is treated as empty.
        question: The question to ask about the pages. ``None`` or a
            whitespace-only string is treated as missing.

    Returns:
        A string for display in the UI: ``"Answer: ...\\n\\nSources: ..."`` on
        success, otherwise a message starting with "❌" that describes the
        missing input or the failure. Exceptions are caught and reported
        rather than raised, so the Gradio callback always gets text back.
    """
    try:
        # Normalise both inputs up front so whitespace-only or None values are
        # rejected before any scraping, embedding or LLM work is started.
        urls = [url.strip() for url in (urls_text or "").split('\n') if url.strip()]
        question = (question or "").strip()

        if not urls or not question:
            return "❌ Please enter both URLs and a question"

        # Load documents
        documents = UnstructuredURLLoader(urls=urls).load()

        # Split into chunks
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = splitter.split_documents(documents)

        # Pages that fail to load or contain no text leave nothing to index.
        # Report that explicitly instead of passing an empty list to FAISS.
        if not chunks:
            return "❌ Error: No readable text could be extracted from the provided URLs"

        # Create embeddings and vector store
        embeddings = HuggingFaceEmbeddings()
        vectorstore = FAISS.from_documents(chunks, embeddings)

        # Create QA chain with sources
        llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0.5)
        qa_chain = RetrievalQAWithSourcesChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever()
        )

        # The chain expects its input under the "question" key (not "query").
        # `invoke` replaces the deprecated direct call `qa_chain({...})`.
        result = qa_chain.invoke({"question": question})
        return f"Answer: {result['answer']}\n\nSources: {result['sources']}"

    except Exception as e:
        # Deliberately broad: any failure is surfaced as text in the answer box.
        return f"❌ Error: {str(e)}"

# Create Gradio interface
# Layout: a title, then one row with two columns — inputs and the submit button
# on the left, the answer box on the right. Components are placed according to
# the `with` block they are created in, so statement order here is the layout.
with gr.Blocks() as demo:
    gr.Markdown("# 📰 URL Question Answering System")
    
    with gr.Row():
        with gr.Column():
            # Free-text list of URLs; process_urls_and_question splits it on newlines.
            urls_input = gr.Textbox(
                label="Enter URLs (one per line)",
                placeholder="https://example.com/news1\nhttps://example.com/news2",
                lines=5
            )
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="What are the main points from these articles?",
                lines=2
            )
            submit_btn = gr.Button("Get Answer")
        
        with gr.Column():
            # Shows the formatted answer/sources string or an error message.
            answer_output = gr.Textbox(
                label="Answer",
                lines=10
            )
    
    # Wire the button: the two textbox values are passed positionally as
    # (urls_text, question) and the returned string fills the answer box.
    submit_btn.click(
        fn=process_urls_and_question,
        inputs=[urls_input, question_input],
        outputs=answer_output
    )

# Launch the app
# The recorded output shows the app starting on a local URL; as the launch
# message notes, pass share=True to launch() to get a public link.
if __name__ == "__main__":
    demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.
