<a href="https://colab.research.google.com/github/AbhinavKumar0000/RAG_pipeline/blob/main/project_exibhition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faiss-cpu
!pip install -U langchain-community
!pip install unstructured
!pip install langchain_google_genai
!pip install gtts
!pip install -q google-cloud-texttospeech

In [None]:
import os
import gradio as gr
import time
import faiss
import pickle
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader, PyPDFLoader
from langchain.vectorstores import FAISS
from dotenv import load_dotenv
from google.generativeai import list_models, configure

In [None]:
import os
from getpass import getpass

# Never hard-code the key in the notebook. Assigning an empty string here would
# also leave an (empty) variable in the environment, which python-dotenv's
# default non-override mode would then refuse to replace with the .env value.
# So: only ask when no key is present, and only store a non-empty answer.
if not os.environ.get("GOOGLE_API_KEY"):
    try:
        _entered_key = getpass(
            "Enter your GOOGLE_API_KEY (leave blank to load it from a .env file): "
        ).strip()
    except (EOFError, OSError):
        # No interactive stdin available; fall back to the .env file.
        _entered_key = ""
    if _entered_key:
        os.environ["GOOGLE_API_KEY"] = _entered_key

In [None]:
# Read a local .env file (if one exists) into the process environment, then
# stop early when the Gemini API key is still missing or empty.
load_dotenv()
if os.getenv("GOOGLE_API_KEY") in (None, ""):
    raise EnvironmentError("GOOGLE_API_KEY not found in .env file")

In [None]:
# Hand the API key from the environment to google.generativeai
configure(api_key=os.environ.get("GOOGLE_API_KEY"))

In [None]:
# Check available models
def check_available_models():
    """Return the names of the models that support ``generateContent``.

    Returns:
        A list with the ``name`` of every model yielded by ``list_models()``
        whose ``supported_generation_methods`` contains ``"generateContent"``.
        If listing the models raises, the failure is reported as a single
        string of the form ``"Error checking models: <reason>"`` instead of a
        list, so callers must check the result type before using it as a list.
    """
    try:
        return [
            model.name
            for model in list_models()
            if "generateContent" in model.supported_generation_methods
        ]
    except Exception as e:
        return f"Error checking models: {str(e)}"

In [None]:
# Initialize Gemini LLM and embeddings
try:
    available_models = check_available_models()
    # check_available_models() reports a lookup failure as a plain string.
    # Treating that string as a list would turn the membership test into a
    # substring check and make ", ".join(...) emit one character at a time,
    # so surface the real error message instead.
    if isinstance(available_models, str):
        raise RuntimeError(available_models)
    if "models/gemini-1.5-flash" not in available_models:
        raise ValueError("Model gemini-1.5-flash not available. Available models: " + ", ".join(available_models))
    llm = GoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.9, max_output_tokens=1000)
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
except Exception as e:
    # Keep the original exception attached as the explicit cause.
    raise Exception(f"Failed to initialize Gemini: {str(e)}") from e

In [None]:
# Function to process URLs and create FAISS index
def process_inputs(url1, url2, pdf1, pdf2):
    """Build and persist a FAISS index from up to two URLs and two PDFs.

    Args:
        url1, url2: URL strings; ``None``, empty and whitespace-only values
            are ignored.
        pdf1, pdf2: Uploaded PDFs, either as an object exposing the file path
            in its ``name`` attribute or as a plain path string; ``None``
            values are ignored.

    Returns:
        A human-readable status string: a success message once the index and
        its metadata have been written, otherwise a description of what went
        wrong. Loader and processing errors are reported this way rather than
        raised.

    Side effects:
        Writes ``faiss_store_gemini.index`` and
        ``faiss_store_gemini_metadata.pkl`` to the current working directory.
    """
    file_path = "faiss_store_gemini.index"
    metadata_path = "faiss_store_gemini_metadata.pkl"

    loaded_documents = []

    # Load from URLs (surrounding whitespace is dropped before fetching)
    urls = [url.strip() for url in (url1, url2) if url and url.strip()]
    if urls:
        try:
            loaded_documents.extend(UnstructuredURLLoader(urls=urls).load())
        except Exception as e:
            return f"Error loading URLs: {str(e)}"

    # Load from PDFs
    for pdf_file in (pdf1, pdf2):
        if pdf_file is None:
            continue
        # PyPDFLoader needs a file path. Upload objects carry it in ``name``;
        # a plain string is already the path.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        try:
            loaded_documents.extend(PyPDFLoader(pdf_path).load())
        except Exception as e:
            return f"Error loading PDF {pdf_path}: {str(e)}"

    if not loaded_documents:
        return "Please provide at least one valid URL or PDF file."

    try:
        # Split the documents into chunks of at most 1000 characters
        text_splitter = RecursiveCharacterTextSplitter(
            separators=['\n\n', '\n', '.', ','],
            chunk_size=1000
        )
        docs = text_splitter.split_documents(loaded_documents)
        if not docs:
            return "No text could be extracted from the provided sources."

        # Embed the chunks and build the in-memory FAISS store
        vectorstore_gemini = FAISS.from_documents(docs, embeddings)

        # Persist the raw index plus the docstore mapping needed to rebuild
        # the vector store later
        faiss.write_index(vectorstore_gemini.index, file_path)
        with open(metadata_path, "wb") as f:
            pickle.dump({
                "docstore": vectorstore_gemini.docstore,
                "index_to_docstore_id": vectorstore_gemini.index_to_docstore_id
            }, f)

        return "Vector Store is Ready. You can now ask questions."
    except Exception as e:
        return f"An error occurred during processing: {str(e)}"


In [None]:
# Function to query the FAISS index
def query_faiss(question):
    """Answer a question from the FAISS index saved on disk.

    Args:
        question: The user's question. ``None``, empty and whitespace-only
            values are rejected.

    Returns:
        A ``(answer, sources)`` tuple of strings. On invalid input, a missing
        index, or any failure while loading or querying, the first element is
        an explanatory message and the second is an empty string.
    """
    file_path = "faiss_store_gemini.index"
    metadata_path = "faiss_store_gemini_metadata.pkl"
    if not question or not question.strip():
        return "Please enter a valid question.", ""

    if not (os.path.exists(file_path) and os.path.exists(metadata_path)):
        return "No FAISS index found. Please process URLs first.", ""

    try:
        # Load the FAISS index and metadata
        index = faiss.read_index(file_path)
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load a metadata file this application wrote itself; never
        # point this at a file obtained from an untrusted source.
        with open(metadata_path, "rb") as f:
            metadata = pickle.load(f)

        # Reconstruct the FAISS vectorstore
        vectorstore = FAISS(
            embedding_function=embeddings,
            index=index,
            docstore=metadata["docstore"],
            index_to_docstore_id=metadata["index_to_docstore_id"]
        )

        # Query the vectorstore
        chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
        result = chain({"question": question}, return_only_outputs=True)

        answer = result["answer"]
        # Treat a missing, empty or whitespace-only value as "no sources".
        sources = (result.get("sources") or "").strip()
        sources_output = f"Sources:\n{sources}" if sources else "No sources available."

        return answer, sources_output
    except Exception as e:
        return f"Error querying FAISS index: {str(e)}", ""


In [None]:
import os

# --- IMPORTANT ---
# This cell must be run in an environment like Google Colab or a Jupyter Notebook
# that supports file uploads.

try:
    from google.colab import files

    # Step 1: running this cell opens the Colab upload prompt for the
    # service-account credentials JSON file.
    print("Please upload the credentials JSON file you downloaded from Google Cloud.")
    uploaded = files.upload()

    # Step 2: point GOOGLE_APPLICATION_CREDENTIALS at the first uploaded file,
    # or report that nothing was uploaded.
    if not uploaded:
        print("\nUpload canceled. Authentication was not set.")
    else:
        file_name = next(iter(uploaded))
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = file_name
        print(f"\n✅ '{file_name}' uploaded successfully.")
        print("Authentication is now set. You can run the main application cell.")

except ImportError:
    # google.colab is not importable here, so explain the manual alternative.
    print("\n".join([
        "This upload script is designed for Google Colab.",
        "If you are running locally, you must set the 'GOOGLE_APPLICATION_CREDENTIALS' environment variable manually.",
        'Example: export GOOGLE_APPLICATION_CREDENTIALS="/path/to/your/key.json"',
    ]))

Please upload the credentials JSON file you downloaded from Google Cloud.


Saving project-exhibition-470717-8b8187723059.json to project-exhibition-470717-8b8187723059.json

✅ 'project-exhibition-470717-8b8187723059.json' uploaded successfully.
Authentication is now set. You can run the main application cell.


In [None]:
from google.cloud import texttospeech
import uuid

def speak_text(text):
    """Synthesize ``text`` to an MP3 file with Google Cloud Text-to-Speech.

    Args:
        text: The text to speak. ``None``, empty and whitespace-only values
            produce no audio.

    Returns:
        The path of the MP3 file that was written, or ``None`` when there is
        nothing to speak, when ``GOOGLE_APPLICATION_CREDENTIALS`` is not set,
        or when the synthesis request fails (the reason is printed).
    """
    # Imported locally so this function does not depend on the cell's imports.
    import tempfile

    if not text or not text.strip():
        return None

    # The client below is created without explicit credentials, so the
    # GOOGLE_APPLICATION_CREDENTIALS environment variable must already be set.
    if "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
        print("ERROR: Google Cloud authentication is not configured.")
        print("Please run the 'Authentication Setup' cell and upload your JSON credentials file first.")
        return None

    try:
        client = texttospeech.TextToSpeechClient()

        # Text to synthesize, the en-US WaveNet voice to use, and MP3 output
        synthesis_input = texttospeech.SynthesisInput(text=text)
        voice = texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Wavenet-F"
        )
        audio_config = texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3
        )

        # Perform the text-to-speech request
        response = client.synthesize_speech(
            input=synthesis_input, voice=voice, audio_config=audio_config
        )

        # Write the audio to the system temp directory rather than the working
        # directory, so repeated requests do not pile up MP3 files next to the
        # notebook. delete=False keeps the file after the handle is closed so
        # the caller can still read it.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as out:
            out.write(response.audio_content)

        return out.name

    except Exception as e:
        print(f"An error occurred in Google TTS function: {e}")
        print("Please ensure the 'Cloud Text-to-Speech API' is enabled for your project in the Google Cloud Console.")
        return None

In [None]:
with gr.Blocks() as demo:
    # Page header
    gr.Markdown("# WebInsight Querry: Analyze Articles & Documents")
    gr.Markdown("Provide up to two URLs or two PDF files to create a searchable knowledge base, then ask questions about the content.")

    with gr.Row():
        # Left column: content sources and the build button
        with gr.Column(scale=1):
            gr.Markdown("## Step 1: Provide Content Sources")
            url1 = gr.Textbox(label="URL 1", placeholder="Enter a valid news article URL")
            url2 = gr.Textbox(label="URL 2", placeholder="Enter another valid URL (optional)")
            pdf1 = gr.File(label="Upload PDF 1", file_types=['.pdf'])
            pdf2 = gr.File(label="Upload PDF 2 (optional)", file_types=['.pdf'])
            process_button = gr.Button("Create Knowledge Base", variant="primary")
            status_output = gr.Textbox(label="Status", lines=2, interactive=False)

        # Right column: question, answer, sources and spoken answer
        with gr.Column(scale=2):
            gr.Markdown("## Step 2: Ask a Question")
            question_input = gr.Textbox(label="Question", placeholder="Ask a question about the content...")
            answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
            sources_output = gr.Textbox(label="Sources", lines=3, interactive=False)

            with gr.Row():
                query_button = gr.Button("Submit Question")
                speak_button = gr.Button("Speak Answer")

            audio_output = gr.Audio(label="Spoken Answer", autoplay=False)

    # Event wiring; arguments are (fn, inputs, outputs).
    # Build the knowledge base from the four source widgets.
    process_button.click(process_inputs, [url1, url2, pdf1, pdf2], status_output)

    # Ask the question via the button.
    query_button.click(query_faiss, question_input, [answer_output, sources_output])

    # Read the current answer aloud.
    speak_button.click(speak_text, answer_output, audio_output)

    # Pressing Enter in the question box behaves like the submit button.
    question_input.submit(query_faiss, question_input, [answer_output, sources_output])

demo.launch(debug=True)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://07daa6a036f85d204e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
