Task - 1
Text extraction from image

In [3]:
# Install dependencies
!pip install -q streamlit pymupdf chromadb pyngrok faiss-cpu sentence-transformers mistralai==0.4.2 python-docx

# Configuring ngrok authtoken
!ngrok config add-authtoken 2xDRF1RvT6YJDp4CJkWMzyp1I65_6yk76D4JxHm1do76eWGb2

# Killing previous processes
!pkill -f streamlit || echo "No old Streamlit process"
!pkill -f ngrok || echo "No old ngrok process"

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
^C
^C


In [4]:
%%writefile app.py
import streamlit as st
import fitz  # PyMuPDF for PDF
from docx import Document  # For DOCX
import tempfile
import os
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
from sentence_transformers import SentenceTransformer
import chromadb

st.set_page_config(page_title="RAG Multi-Document Chatbot", layout="wide")
st.title("🤖 RAG Multi-Document Chatbot")

# Sidebar for API key and file uploads
st.sidebar.markdown("### Configuration")
api_key = st.sidebar.text_input("🔑 Enter Mistral API Key", type="password")
st.sidebar.markdown("---")
uploaded_files = st.sidebar.file_uploader(
    "📄 Upload Documents (PDF, DOCX, TXT)",
    type=["pdf", "docx", "txt"],
    accept_multiple_files=True
)
st.sidebar.markdown("---")
process_btn = st.sidebar.button("Process Documents")


@st.cache_resource
def _load_embedder():
    """
    Load the sentence-embedding model.

    Wrapped in st.cache_resource so the model is created once and reused,
    instead of being loaded again for every new browser session.
    """
    return SentenceTransformer("all-MiniLM-L6-v2")


# Session state defaults. Each key is initialised only when missing so that
# values survive Streamlit's script reruns.
if "db" not in st.session_state:
    st.session_state.db = None  # ChromaDB collection once documents are indexed
if "chunks" not in st.session_state:
    st.session_state.chunks = []  # labelled text chunks of the processed documents
if "history" not in st.session_state:
    st.session_state.history = []  # list of (role, message) tuples
if "collection_name" not in st.session_state:
    st.session_state.collection_name = "doc_chunks"
if "embedder" not in st.session_state:
    # Kept in session state because the rest of the app reads it from there;
    # the underlying model object comes from the shared cache.
    st.session_state.embedder = _load_embedder()
# Function to extract text from different file types
def extract_text(file, file_type):
    """
    Extract text from an uploaded PDF, DOCX, or TXT file.

    The upload is written to a temporary file so the parsers can open it by
    path. The temporary file is always deleted before returning, including on
    early returns and on errors.

    Args:
        file: Uploaded file object exposing ``getbuffer()`` and ``name``.
        file_type: File extension without the dot: "pdf", "docx" or "txt".

    Returns:
        The extracted text, or an empty string on failure. Failures are
        reported to the user through ``st.error`` rather than raised.
    """
    # Reject unknown types before touching the disk.
    if file_type not in ("pdf", "docx", "txt"):
        st.error(f"Unsupported file type: {file_type}")
        return ""

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_type}") as tmp:
            # Record the path first so cleanup still happens if the write fails.
            tmp_path = tmp.name
            tmp.write(file.getbuffer())

        if file_type == "pdf":
            with fitz.open(tmp_path) as doc:
                if len(doc) == 0:
                    st.error(f"PDF {file.name} is empty or corrupted.")
                    return ""
                return "".join(page.get_text() for page in doc)

        if file_type == "docx":
            doc = Document(tmp_path)
            return "\n".join(para.text for para in doc.paragraphs if para.text.strip())

        # TXT: "utf-8-sig" reads plain UTF-8 and also strips a leading BOM,
        # which would otherwise end up inside the first chunk.
        with open(tmp_path, "r", encoding="utf-8-sig") as f:
            return f.read()
    except Exception as e:
        st.error(f"Error processing {file.name}: {str(e)}")
        return ""
    finally:
        # Runs on every exit path, so no temporary file is left behind.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # best effort: the file may already be gone

def chunk_text(text, chunk_size=2000, overlap=400):
    """
    Split text into consecutive chunks of at most ``chunk_size`` characters.

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one, so neighbouring chunks share ``overlap`` characters. Chunking stops
    as soon as a chunk reaches the end of the text, so no trailing chunk that
    is fully contained in the previous one is produced.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Characters shared between neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap >= chunk_size would never advance and loop forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    text_len = len(text)
    chunks = []
    start = 0
    while start < text_len:
        end = min(start + chunk_size, text_len)
        chunks.append(text[start:end])
        if end == text_len:
            # The tail is covered; another chunk would only repeat part of it.
            break
        start += step
    return chunks


# Document processing: runs only on the rerun triggered by the
# "Process Documents" button, and only when files and an API key are present.
if uploaded_files and api_key and process_btn:
    with st.spinner("Processing Documents..."):
        st.session_state.chunks = []
        processed_count = 0  # documents that actually contributed chunks
        for i, file in enumerate(uploaded_files):
            file_type = file.name.split(".")[-1].lower()
            doc_label = f"Document {i+1}: {file.name}"
            if file.size > 10 * 1024 * 1024:  # 10MB size limit
                st.warning(f"Skipping {file.name}: File too large.")
                continue
            st.info(f"Extracting text from {file_type.upper()} {i+1}/{len(uploaded_files)}: {file.name}")
            text = extract_text(file, file_type)
            if not text or not text.strip():
                # Extraction failed or produced only whitespace: nothing to index.
                continue
            # Chunking with overlap for better context
            chunks = chunk_text(text, chunk_size=2000, overlap=400)
            # Prefix every chunk with its source so answers can be attributed.
            st.session_state.chunks.extend(f"[{doc_label}]\n{chunk}" for chunk in chunks)
            processed_count += 1

        if not st.session_state.chunks:
            st.error("No valid content extracted from documents.")
        else:
            all_chunks = st.session_state.chunks
            st.info(f"Total chunks to embed: {len(all_chunks)}")
            # Embeddings
            embedder = st.session_state.embedder
            embeddings = embedder.encode(all_chunks, show_progress_bar=True)
            # ChromaDB (persistent storage)
            client = chromadb.PersistentClient(path="./chroma_db")
            # Clean up existing collection. Depending on the installed chromadb
            # release, list_collections() yields Collection objects or plain
            # name strings; getattr handles both.
            existing_names = {getattr(col, "name", col) for col in client.list_collections()}
            if st.session_state.collection_name in existing_names:
                client.delete_collection(st.session_state.collection_name)
            collection = client.create_collection(st.session_state.collection_name)
            # Add in fixed-size batches: a single add() with every chunk can
            # exceed Chroma's maximum batch size for large uploads.
            batch_size = 1000
            for start in range(0, len(all_chunks), batch_size):
                stop = min(start + batch_size, len(all_chunks))
                collection.add(
                    documents=all_chunks[start:stop],
                    embeddings=[emb.tolist() for emb in embeddings[start:stop]],
                    ids=[str(j) for j in range(start, stop)]
                )
            st.session_state.db = collection
            st.success(f"Processed {processed_count} documents and indexed {len(all_chunks)} chunks.")

# Chat interface: available once an API key is entered and documents are indexed.
if api_key and st.session_state.db is not None:
    # Replay the conversation so far; Streamlit redraws the page on every rerun.
    for role, msg in st.session_state.history:
        with st.chat_message(role):
            st.write(msg)
    user_input = st.chat_input("Ask a question about your documents...")
    if user_input:
        if len(user_input.strip()) < 3:
            st.warning("Please enter a more specific question.")
        else:
            try:
                embedder = st.session_state.embedder
                collection = st.session_state.db
                q_emb = embedder.encode([user_input])[0].tolist()
                # Never request more neighbours than the collection holds.
                n_results = max(1, min(4, collection.count()))
                results = collection.query(query_embeddings=[q_emb], n_results=n_results)
                context = "\n\n".join(results["documents"][0])
                client = MistralClient(api_key=api_key)
                # Prompt engineering for RAG
                system_prompt = (
                    "You are a helpful assistant. When answering, perform all reasoning, outlining, and analysis internally. "
                    "Only display the final answer to the user, without showing your thought process or intermediate steps. "
                    "If the context does not contain enough information to answer, say: 'I could not find the answer in the uploaded documents.'"
                )
                # Last 3 exchanges (user + assistant), flattened into plain text.
                chat_history = "".join(
                    f"{'User' if role == 'user' else 'Assistant'}: {msg}\n"
                    for role, msg in st.session_state.history[-6:]
                )
                messages = [
                    ChatMessage(role="system", content=system_prompt),
                    ChatMessage(
                        role="user",
                        content=(
                            f"Context:\n{context}\n\n"
                            f"Chat History:\n{chat_history}\n"
                            f"User Question: {user_input}"
                        )
                    )
                ]
                response = client.chat(model="mistral-small-latest", messages=messages)
                answer = response.choices[0].message.content
                st.session_state.history.append(("user", user_input))
                st.session_state.history.append(("assistant", answer))
                # Limit history to last 50 entries (25 exchanges)
                if len(st.session_state.history) > 50:
                    st.session_state.history = st.session_state.history[-50:]
                # Rerun so the new exchange is rendered by the replay loop above.
                st.rerun()
            except Exception as e:
                st.error(f"Error generating response: {str(e)}")
elif not api_key:
    st.info("Enter your Mistral API key in the sidebar.")
else:
    # Key present but nothing indexed yet: covers both "no files uploaded" and
    # "files uploaded but 'Process Documents' not clicked".
    st.info("Upload documents (PDF, DOCX, TXT) and click 'Process Documents'.")

Overwriting app.py


In [6]:
import subprocess
import time
import requests
from pyngrok import ngrok

import atexit

# Start Streamlit as a background process on port 8501
process = subprocess.Popen(["streamlit", "run", "app.py", "--server.port=8501"])
# Register cleanup immediately so the child is stopped on any later failure too.
atexit.register(process.terminate)

# Wait for Streamlit to start: poll the local port for up to ~10 attempts
for _ in range(10):
    if process.poll() is not None:
        # The process already exited; further polling cannot succeed.
        raise RuntimeError("Streamlit failed to start")
    try:
        # A timeout keeps a half-open server from blocking the cell forever.
        requests.get("http://localhost:8501", timeout=2)
        break
    except requests.exceptions.RequestException:
        # Only network errors mean "not up yet"; KeyboardInterrupt still propagates.
        time.sleep(1)
else:
    process.terminate()
    # Raise instead of exit(1): exit() in a notebook acts on the kernel itself,
    # whereas an exception simply stops this cell (and "Run All").
    raise RuntimeError("Streamlit failed to start")

# Start ngrok tunnel to the local Streamlit port
try:
    public_url = ngrok.connect(8501)
except Exception as e:
    process.terminate()
    raise RuntimeError(f"Error starting ngrok: {str(e)}") from e

print(f"🌐 Your app is live at: {public_url}")

🌐 Your app is live at: NgrokTunnel: "https://00bd-34-55-248-241.ngrok-free.app" -> "http://localhost:8501"
