<a href="https://colab.research.google.com/github/ARADHYA299/GenAI/blob/main/researchAssistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages used by the cells below (quiet mode):
# FAISS, LangChain (+ community integrations), sentence-transformers,
# transformers, pypdf and Gradio.
# NOTE(review): no versions are pinned, and later cells import from
# `langchain.text_splitter`, `langchain.embeddings`, `langchain.vectorstores`,
# `langchain.llms` and `langchain.chains` — consider pinning known-good
# versions so a future release cannot break those imports.
!pip install -q faiss-cpu langchain sentence-transformers transformers
!pip install -q pypdf
!pip install -U -q langchain-community
!pip install -q gradio

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.3/302.3 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from pypdf import PdfReader

def extract_text_from_pdf(file_path):
  """Concatenate the extracted text of every page in a PDF.

  Args:
    file_path: Value handed straight to ``PdfReader`` to open the document.

  Returns:
    One string made of each page's extracted text followed by a newline.
    A page that yields no text contributes only its newline.
  """
  reader = PdfReader(file_path)
  # `or ""` keeps a page whose extraction comes back as None from raising
  # TypeError during concatenation; join avoids repeated string rebuilding.
  return "".join((page.extract_text() or "") + "\n" for page in reader.pages)


In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter


def split_text(text):
  """Break `text` into overlapping chunks using a newline-based splitter.

  The splitter is configured with a "\n" separator, a chunk size of 500 and
  an overlap of 100, both measured with the built-in `len`.

  Returns whatever ``CharacterTextSplitter.split_text`` produces for `text`.
  """
  splitter_options = {
      "separator": "\n",
      "chunk_size": 500,
      "chunk_overlap": 100,
      "length_function": len,
  }
  return CharacterTextSplitter(**splitter_options).split_text(text)

In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS

def embed_text(chunks):
  """Embed text chunks and index them in a FAISS vector store.

  Args:
    chunks: Iterable of text strings to embed.

  Returns:
    The FAISS vector store built from `chunks`.

  Raises:
    ValueError: If `chunks` is empty, since there is nothing to index.
  """
  chunks = list(chunks)
  if not chunks:
    # Fail early with a readable message instead of an opaque error from
    # deep inside the vector-store construction.
    raise ValueError("No text chunks to embed; the document appears to contain no extractable text.")

  # Imported locally so this cell does not depend on edits to the import lines.
  from langchain.embeddings import HuggingFaceEmbeddings

  # all-MiniLM-L6-v2 is a plain sentence-transformers model, so use the generic
  # wrapper: the BGE wrapper prepends a BGE-specific instruction to queries.
  embedder = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")
  return FAISS.from_texts(chunks, embedding=embedder)


In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM , AutoTokenizer , pipeline

def load_local_llm():
  """Load google/flan-t5-base and wrap it as a LangChain LLM.

  Returns:
    A ``HuggingFacePipeline`` wrapping a "text2text-generation" pipeline
    (max_length 512) built from the model and its tokenizer.
  """
  # Imported locally so this cell does not depend on edits to the import lines.
  from transformers import AutoModelForSeq2SeqLM

  model_name = "google/flan-t5-base"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  # FLAN-T5 is an encoder-decoder model, so it must be loaded with the
  # seq2seq auto-class; the causal-LM auto-class rejects its config.
  # trust_remote_code is dropped: this checkpoint needs no custom code, and
  # enabling it would allow repository code to execute on load.
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name , device_map = "auto")
  pipe = pipeline("text2text-generation" , model = model, tokenizer = tokenizer , max_length = 512)
  return HuggingFacePipeline(pipeline = pipe)

In [None]:
from langchain.chains import retrieval_qa

def build_qa_chain(llm , faiss_index):
  """Create a retrieval question-answering chain over a vector store.

  Args:
    llm: Language model used to generate the answer.
    faiss_index: Vector store exposing ``as_retriever()``.

  Returns:
    A ``RetrievalQA`` chain configured to also return its source documents,
    so its output carries more than one key.
  """
  # `langchain.chains.retrieval_qa` is a module; the factory method lives on
  # the RetrievalQA class. Imported locally so this cell is self-sufficient.
  from langchain.chains import RetrievalQA

  retriever = faiss_index.as_retriever()
  return RetrievalQA.from_chain_type(
      llm = llm,
      retriever = retriever,
      return_source_documents = True
  )

In [None]:
import os

def process_pdf_and_create_qa(pdf_file):
    """Run the full pipeline: PDF -> text -> chunks -> vector store -> QA chain.

    Args:
        pdf_file: Either a filesystem path (str / path-like) or an upload
            object exposing the path as ``.name``.

    Returns:
        The QA chain produced by ``build_qa_chain`` for this document.
    """
    # The UI declares its file input with type="filepath", which supplies a
    # plain path string; still accept objects carrying the path in `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        file_path = pdf_file
    else:
        file_path = pdf_file.name

    text = extract_text_from_pdf(file_path)
    chunks = split_text(text)
    vectorstore = embed_text(chunks)
    llm = load_local_llm()

    # build_qa_chain takes (llm, faiss_index) in that order; pass the store
    # built above rather than an undefined name.
    return build_qa_chain(llm, vectorstore)

In [None]:
import gradio as gr

def handle_question(pdf_file, user_question):
    """Answer a question about an uploaded PDF for the Gradio UI.

    Args:
        pdf_file: The uploaded file as delivered by the file input, or None.
        user_question: The question text; may be None or blank.

    Returns:
        The answer text, a prompt asking for missing input, or an error
        message string if processing fails.
    """
    # `not user_question` also covers None, which would crash on .strip().
    if pdf_file is None or not user_question or not user_question.strip():
        return "Please upload a PDF and enter a question."

    try:
        # Always reprocess the uploaded PDF (safe in stateless Gradio mode)
        qa_chain = process_pdf_and_create_qa(pdf_file)
        # The chain is built to return source documents as well, so it has
        # more than one output key and `.run()` would refuse it; call it
        # with the "query" input and pick the answer out of the result dict.
        response = qa_chain.invoke({"query": user_question})
        return response["result"]

    except Exception as e:
        # Surface the failure in the answer box rather than crashing the UI.
        return f"❌ Error: {str(e)}"
# Wire handle_question into a two-input (PDF upload + question) Gradio UI
# with a single text answer box, then start serving it.
gr.Interface(
    title="🧠 Research Paper Explorer (RAG Assistant)",
    description="Upload a research paper and ask anything about it.",
    fn=handle_question,
    inputs=[
        gr.File(label="Upload Research Paper (PDF)", type="filepath"),
        gr.Textbox(label="Ask a question", lines=2),
    ],
    outputs=gr.Textbox(label="Answer"),
).launch(
    share=True,  # share=True requests a public link (handy on Colab)
)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2de8c538dbfa1f4c50.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


