# Multiple PDF reading Chat-bot


In [5]:
# Install dependencies
!pip install -q streamlit pymupdf chromadb pyngrok faiss-cpu sentence-transformers mistralai==0.4.2 python-docx

# Configuring ngrok authtoken
!ngrok config add-authtoken 2xDRF1RvT6YJDp4CJkWMzyp1I65_6yk76D4JxHm1do76eWGb2

# Killing previous processes
!pkill -f streamlit || echo "No old Streamlit process"
!pkill -f ngrok || echo "No old ngrok process"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m16.5 MB/s[0m eta [3

In [6]:
%%writefile app.py
import streamlit as st
import fitz  # PyMuPDF for PDF
from docx import Document  # For DOCX
import tempfile
import os
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
from sentence_transformers import SentenceTransformer
import chromadb

# Page chrome: wide layout with a title banner.
st.set_page_config(page_title="RAG Multi-Document Chatbot", layout="wide")
st.title("🤖 RAG Multi-Document Chatbot")

# Sidebar inputs: the Mistral credential and the documents to index.
sidebar = st.sidebar
api_key = sidebar.text_input("🔑 Enter Mistral API Key", type="password")
uploaded_files = sidebar.file_uploader(
    "📄 Upload Documents (PDF, DOCX, TXT)",
    type=["pdf", "docx", "txt"],
    accept_multiple_files=True,
)

# Seed the per-session state exactly once; on later reruns of the script the
# values already stored for this browser session are left untouched.
for state_key, initial_value in (
    ("db", None),                        # vector-store collection, set after processing
    ("chunks", []),                      # text chunks of the processed documents
    ("history", []),                     # (role, message) pairs shown in the chat
    ("collection_name", "doc_chunks"),   # name of the collection used for storage
):
    if state_key not in st.session_state:
        st.session_state[state_key] = initial_value

# Function to extract text from different file types
def extract_text(file, file_type):
    """Return the plain text of an uploaded document, or "" on failure.

    The upload is copied to a temporary file so that the PDF/DOCX readers can
    open it by path. The temporary file is always removed before returning,
    including on early returns and when an exception is raised.

    Args:
        file: Uploaded file object providing ``getbuffer()`` and ``name``.
        file_type: File extension without the dot ("pdf", "docx" or "txt").

    Returns:
        The extracted text. An empty string is returned (after reporting the
        problem with ``st.error``) when the PDF has no pages, the type is not
        supported, or any error occurs while reading the document.
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_type}") as tmp:
            # Record the path before writing so cleanup still happens if the
            # write itself fails.
            tmp_path = tmp.name
            tmp.write(file.getbuffer())

        if file_type == "pdf":
            with fitz.open(tmp_path) as doc:
                if len(doc) == 0:
                    st.error(f"PDF {file.name} is empty or corrupted.")
                    return ""
                return "".join(page.get_text() for page in doc)

        if file_type == "docx":
            doc = Document(tmp_path)
            # Skip blank paragraphs so the text is not padded with empty lines.
            return "\n".join(para.text for para in doc.paragraphs if para.text.strip())

        if file_type == "txt":
            with open(tmp_path, "r", encoding="utf-8") as f:
                return f.read()

        st.error(f"Unsupported file type: {file_type}")
        return ""
    except Exception as e:
        # Boundary handler: one unreadable upload must not abort the batch.
        st.error(f"Error processing {file.name}: {str(e)}")
        return ""
    finally:
        # delete=False means nothing else removes the file for us.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                # Already gone or not removable; nothing useful to do here.
                pass

# Document processing
# Runs only on the script pass in which the sidebar button was clicked:
# extract text from every upload, split it into fixed-size chunks, embed the
# chunks and store them in a freshly created ChromaDB collection.
if uploaded_files and api_key and st.sidebar.button("Process Documents"):
    with st.spinner("Processing Documents..."):
        CHUNK_SIZE = 2000       # characters per chunk
        ADD_BATCH_SIZE = 500    # chunks sent to ChromaDB per add() call

        all_chunks = []
        processed_docs = 0      # uploads that actually yielded text
        total_docs = len(uploaded_files)
        for position, file in enumerate(uploaded_files, start=1):
            file_type = file.name.split(".")[-1].lower()
            st.info(f"Extracting text from {file_type.upper()} {position}/{total_docs}: {file.name}")
            text = extract_text(file, file_type)
            if not text:
                continue  # Skip to the next file if text extraction fails
            all_chunks.extend(
                text[start:start + CHUNK_SIZE]
                for start in range(0, len(text), CHUNK_SIZE)
            )
            processed_docs += 1
        st.session_state.chunks = all_chunks

        if not all_chunks:
            st.error("No valid content extracted from documents.")
        else:
            st.info(f"Total chunks to embed: {len(all_chunks)}")
            # Embeddings: keep the model in the session so it is loaded once
            # instead of on every script rerun.
            if "embedder" not in st.session_state:
                st.session_state.embedder = SentenceTransformer("all-MiniLM-L6-v2")
            embeddings = st.session_state.embedder.encode(all_chunks, show_progress_bar=True)
            vectors = embeddings.tolist()

            # ChromaDB (persistent storage)
            client = chromadb.PersistentClient(path="./chroma_db")
            collection_name = st.session_state.collection_name
            # list_collections() yields Collection objects on some Chroma
            # releases and plain names on others; accept both.
            existing = {getattr(col, "name", col) for col in client.list_collections()}
            if collection_name in existing:
                # Start from scratch so chunks of an earlier run do not linger.
                client.delete_collection(collection_name)
            collection = client.create_collection(collection_name)

            # Insert in batches: one add() per chunk is needlessly slow.
            for start in range(0, len(all_chunks), ADD_BATCH_SIZE):
                stop = min(start + ADD_BATCH_SIZE, len(all_chunks))
                collection.add(
                    documents=all_chunks[start:stop],
                    embeddings=vectors[start:stop],
                    ids=[str(n) for n in range(start, stop)],
                )
            st.session_state.db = collection
            # Report the files that contributed text, not merely the uploads.
            st.success(f"Processed {processed_docs} documents and indexed {len(all_chunks)} chunks.")

# Chat interface
# Shown once an API key is present and documents have been indexed: replay
# the stored conversation, then answer a new question with the most similar
# chunks supplied to the Mistral model as context.
if api_key and st.session_state.db is not None:
    for role, msg in st.session_state.history:
        with st.chat_message(role):
            st.write(msg)
    user_input = st.chat_input("Ask a question about your documents...")
    if user_input:
        try:
            # Reuse the model across questions; loading it takes seconds.
            if "embedder" not in st.session_state:
                st.session_state.embedder = SentenceTransformer("all-MiniLM-L6-v2")
            # encode() gets a one-element list, so take the single vector back out.
            q_emb = st.session_state.embedder.encode([user_input]).tolist()[0]

            collection = st.session_state.db
            # Never request more neighbours than the collection holds.
            n_results = min(4, collection.count())
            results = collection.query(query_embeddings=[q_emb], n_results=n_results)
            context = "\n\n".join(results["documents"][0])

            client = MistralClient(api_key=api_key)
            messages = [
                ChatMessage(role="system", content="You are a helpful assistant. Use the provided context from documents to answer."),
                ChatMessage(role="user", content=f"Context:\n{context}\n\nQuestion: {user_input}")
            ]
            response = client.chat(model="mistral-small-latest", messages=messages)
            answer = response.choices[0].message.content
        except Exception as e:
            st.error(f"Error generating response: {str(e)}")
        else:
            st.session_state.history.append(("user", user_input))
            st.session_state.history.append(("assistant", answer))
            # Limit history to last 50 entries (25 exchanges)
            if len(st.session_state.history) > 50:
                st.session_state.history = st.session_state.history[-50:]
            # Rerun outside the try block so the history loop above renders
            # the new exchange and the rerun is never mistaken for an error.
            st.rerun()
elif not api_key:
    st.info("Enter your Mistral API key in the sidebar.")
elif not uploaded_files:
    st.info("Upload documents (PDF, DOCX, TXT) and click 'Process Documents'.")
else:
    # Files are uploaded but nothing has been indexed yet.
    st.info("Click 'Process Documents' in the sidebar to index the uploaded files.")

Writing app.py


In [7]:
import atexit
import subprocess
import time

import requests
from pyngrok import ngrok

STREAMLIT_PORT = 8501
STARTUP_ATTEMPTS = 10   # health-check polls before giving up

# Start Streamlit
process = subprocess.Popen(["streamlit", "run", "app.py", f"--server.port={STREAMLIT_PORT}"])
# Register cleanup immediately so the child is stopped on every exit path.
atexit.register(process.terminate)

# Wait for Streamlit to start
started = False
for _ in range(STARTUP_ATTEMPTS):
    if process.poll() is not None:
        break  # Streamlit already exited; further polling cannot succeed.
    try:
        # Any HTTP response means the server is up; the timeout keeps a hung
        # connection from blocking the cell forever.
        requests.get(f"http://localhost:{STREAMLIT_PORT}", timeout=2)
    except requests.exceptions.RequestException:
        time.sleep(1)
    else:
        started = True
        break

if not started:
    print("Streamlit failed to start")
    process.terminate()
    # SystemExit stops the cell (or script) without shutting down a notebook
    # kernel the way the interactive exit() helper does.
    raise SystemExit(1)

# Start ngrok
try:
    public_url = ngrok.connect(STREAMLIT_PORT)
except Exception as e:
    print(f"Error starting ngrok: {str(e)}")
    process.terminate()
    raise SystemExit(1)
else:
    print(f"🌐 Your app is live at: {public_url}")

🌐 Your app is live at: NgrokTunnel: "https://73d8-34-133-229-21.ngrok-free.app" -> "http://localhost:8501"
