<a href="https://colab.research.google.com/github/Aradhyakapil/RAG-Streamlit-app/blob/main/2_RAG_streamlit_app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import streamlit as st
from openai import OpenAI
from PyPDF2 import PdfReader
from sentence_splitter import SentenceSplitter
import numpy as np
import faiss
from io import BytesIO

# --- Sidebar: collect the OpenAI API key ------------------------------------
# Everything below needs `client`, so this script run is halted until a key
# has been entered.
st.sidebar.title("🔐 OpenAI API Key")
# Strip surrounding whitespace so that a whitespace-only entry is treated as
# missing and a key pasted with a stray space/newline is passed on clean.
api_key = st.sidebar.text_input("Enter your OpenAI API key", type="password").strip()
if not api_key:
    st.warning("Please provide the key in the sidebar.")
    st.stop()
client = OpenAI(api_key=api_key)

# --- Session state bootstrap -------------------------------------------------
# Make sure every key the app reads later exists in st.session_state. Keys that
# are already present are left untouched; missing ones start out as None.
needed_keys = ["pdf_bytes", "chunks", "embeddings", "index", "messages"]
for k in needed_keys:
    if k in st.session_state:
        continue
    st.session_state[k] = None

# The chat history is appended to later, so it must be a list rather than None.
if st.session_state["messages"] is None:
    st.session_state["messages"] = []

def extract_text_from_pdf(file_like):
    """Return the text of all pages of a PDF joined by newlines.

    Args:
        file_like: Whatever ``PdfReader`` accepts as its source; the app
            passes a ``BytesIO`` wrapping the uploaded bytes.

    Returns:
        str: The text of each page, in page order, separated by a newline.
        Pages whose extracted text is empty/falsy are skipped, so the result
        is an empty string when no page yields text.
    """
    reader = PdfReader(file_like)
    # Extract each page exactly once: the walrus binds the text so it can be
    # both tested and emitted without a second extract_text() call.
    return "\n".join(
        page_text
        for page in reader.pages
        if (page_text := page.extract_text())
    )

def chunk(text, size=500):
    """Group the sentences of *text* into chunks of roughly *size* characters.

    Sentences are accumulated in order; once the accumulated length (each
    sentence plus one separating space) reaches *size*, the group is emitted
    as one chunk and a new group is started. Sentences are never split, so a
    chunk can be longer than *size*.

    Whitespace-only sentences are skipped so that no empty chunk is ever
    produced (an empty chunk would otherwise be sent for embedding).

    Args:
        text: The full document text.
        size: Character threshold at which the current group is emitted.

    Returns:
        list[str]: Non-empty, stripped chunks in document order; an empty
        list when *text* contains no non-blank sentence.
    """
    splitter = SentenceSplitter(language="en")
    chunks = []
    current = []        # sentences collected for the chunk being built
    current_len = 0     # running length, counting one space per sentence
    for sent in splitter.split(text):
        if not sent.strip():
            continue
        current.append(sent)
        current_len += len(sent) + 1
        if current_len >= size:
            chunks.append(" ".join(current).strip())
            current, current_len = [], 0
    # Flush whatever is left after the last sentence.
    if current:
        chunks.append(" ".join(current).strip())
    return chunks

def embed(text_blocks, batch_size=512):
    """Embed text blocks with ``text-embedding-3-small``.

    Args:
        text_blocks: A sequence of strings (a single string is treated as a
            one-element sequence).
        batch_size: Maximum number of blocks sent per API request. The
            embeddings endpoint limits how many inputs one request may carry
            (NOTE(review): confirm the current limit in the API reference), so
            large documents are sent in several requests.

    Returns:
        numpy.ndarray: A float32 array with one row per input block, in input
        order. float32 is used because that is the dtype faiss operates on.

    Raises:
        ValueError: If *text_blocks* is empty or *batch_size* is < 1.
    """
    if isinstance(text_blocks, str):
        blocks = [text_blocks]
    else:
        blocks = list(text_blocks)
    if not blocks:
        raise ValueError("embed() requires at least one text block")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    vectors = []
    for start in range(0, len(blocks), batch_size):
        res = client.embeddings.create(
            model="text-embedding-3-small",
            input=blocks[start:start + batch_size],
        )
        vectors.extend(d.embedding for d in res.data)
    return np.array(vectors, dtype=np.float32)

def build_index(embeds):
    """Build a flat L2 faiss index over the given embedding matrix.

    Args:
        embeds: 2-D array-like with one embedding per row.

    Returns:
        A ``faiss.IndexFlatL2`` whose dimensionality is the number of columns
        of *embeds* and which contains every row of *embeds*.

    Raises:
        ValueError: If *embeds* is not two-dimensional.
    """
    # faiss works on C-contiguous float32 data; coerce here so callers may
    # pass float64 arrays or nested lists.
    vectors = np.ascontiguousarray(embeds, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError("embeds must be a 2-D array of shape (n_vectors, dim)")
    idx = faiss.IndexFlatL2(vectors.shape[1])
    idx.add(vectors)
    return idx

def top_k_chunks(q, k=3):
    """Return up to *k* stored chunks closest to the query *q*.

    The query is embedded with the same model used for the chunks and looked
    up in the index held in ``st.session_state.index``.

    Args:
        q: The query string.
        k: Maximum number of chunks to return.

    Returns:
        list[str]: Chunks from ``st.session_state.chunks`` in the order the
        index returns them. Fewer than *k* (possibly zero) are returned when
        fewer chunks are stored.
    """
    chunks = st.session_state.chunks or []
    # Never ask for more neighbours than there are chunks: faiss fills the
    # missing slots with -1, which would index the *last* chunk in Python.
    k = min(k, len(chunks))
    if k <= 0:
        return []

    q_emb = client.embeddings.create(
        model="text-embedding-3-small",
        input=[q]
    ).data[0].embedding
    query = np.array([q_emb], dtype=np.float32)  # faiss operates on float32
    _, hits = st.session_state.index.search(query, k)
    # Keep the >= 0 filter as a second guard against padded "no result" labels.
    return [chunks[i] for i in hits[0] if i >= 0]

def ask_llm(user_q):
    """Answer *user_q* with gpt-4o-mini, grounded in retrieved PDF chunks.

    The prompt sent to the model consists of, in order: a fixed system
    instruction, the most recent stored chat messages, and a final user
    message containing the retrieved context followed by the question.

    Args:
        user_q: The question typed by the user.

    Returns:
        The content of the first choice of the chat completion.
    """
    retrieved = top_k_chunks(user_q)
    context = "\n".join(retrieved)

    system_msg = {
        "role": "system",
        "content": "You are a helpful assistant. Use the context to answer.",
    }
    # Only the 20 most recent stored messages are replayed; each exchange is
    # stored as two messages (user + assistant).
    history = st.session_state.messages[-20:]
    user_msg = {
        "role": "user",
        "content": f"Context:\n{context}\n\nQuestion: {user_q}",
    }
    conversation = [system_msg, *history, user_msg]

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# --- Main page: upload a PDF and index it ------------------------------------
st.title("🧠  RAG Chatbot")
uploaded = st.file_uploader("📄  Upload a PDF", type="pdf", key="pdf_up")
if uploaded is not None:
    # getvalue() returns the whole buffer independent of the current read
    # position, unlike read(), which returns nothing once the stream has
    # already been consumed.
    pdf_bytes = uploaded.getvalue()
    # Index when the bytes differ from what is stored: this covers the first
    # upload and a later replacement by a different file, while skipping the
    # costly re-embedding on ordinary reruns with the same file.
    if pdf_bytes != st.session_state.pdf_bytes:
        with st.spinner("Extracting & embedding …"):
            text = extract_text_from_pdf(BytesIO(pdf_bytes))
            # Drop blank chunks so nothing empty is sent for embedding.
            chunks = [c for c in chunk(text) if c.strip()]
            if chunks:
                embeddings = embed(chunks)
                index = build_index(embeddings)
        if chunks:
            # Commit to session state only after every step succeeded, so an
            # exception above never leaves pdf_bytes set without an index.
            st.session_state.chunks = chunks
            st.session_state.embeddings = embeddings
            st.session_state.index = index
            st.session_state.pdf_bytes = pdf_bytes
            st.success("Indexed! Ask away 👇")
        else:
            # E.g. a scanned PDF with no text layer: there is nothing to embed.
            st.error("No extractable text was found in this PDF, so it cannot be indexed.")

# Rebuild the index from stored embeddings if it is missing; skipped when no
# embeddings were ever stored, since build_index cannot work from None.
if (
    st.session_state.pdf_bytes
    and st.session_state.index is None
    and st.session_state.embeddings is not None
):
    st.session_state.index = build_index(st.session_state.embeddings)


# --- Chat interface ----------------------------------------------------------
# Chatting is only offered once an index exists; otherwise prompt for a PDF.
if not st.session_state.index:
    st.info("Upload a PDF to start chatting.")
else:
    # Replay the stored conversation so it stays visible across reruns.
    for past in st.session_state.messages:
        st.chat_message(past["role"]).markdown(past["content"])

    user_q = st.chat_input("Ask a question about the PDF")
    if user_q:
        st.chat_message("user").markdown(user_q)
        with st.spinner("Thinking…"):
            reply = ask_llm(user_q)
        st.chat_message("assistant").markdown(reply)

        # Record the exchange after answering: the stored history then holds
        # the bare question, without the retrieved context ask_llm prepends.
        st.session_state.messages.extend([
            {"role": "user", "content": user_q},
            {"role": "assistant", "content": reply},
        ])
