In [18]:
pip install streamlit groq pdfplumber python-docx -q


Note: you may need to restart the kernel to use updated packages.


In [19]:
%%writefile groq_llm.py
import os
import requests
from dotenv import load_dotenv

# Load variables from a .env file (via python-dotenv) before reading the key,
# so the key can come either from that file or from the real environment.
load_dotenv()
api_key = os.environ.get("GROQ_API_KEY")
if api_key is None or api_key == "":
    # Fail at import time: every request in this module needs the key.
    raise ValueError("GROQ_API_KEY not found in environment.")

def ask_groq(query, context, model="llama3-8b-8192", timeout=60):
    """Ask the Groq chat-completions endpoint a question about a piece of text.

    Args:
        query: The user's question; interpolated into the prompt.
        context: Document text the model should answer from; interpolated
            into the prompt ahead of the question.
        model: Model identifier sent in the request body. Defaults to the
            value that used to be hard-coded here.
        timeout: Seconds handed to ``requests.post`` as its timeout, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        Exception: If the API answers with any status code other than 200;
            the message carries the status code and the response body.
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.post`` on connection failures or timeouts.
    """
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    prompt = f"""
You are a financial analyst assistant. The user may ask about specific section numbers or bullet points (e.g. '5.3.3', 'Section 6', etc.).
Use the document structure to find and answer the question clearly.

Context:
{context}

Question:
{query}

If the user references a section number or pointer, locate the relevant content and explain it step-by-step.
"""
    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant that answers questions based on context."},
            {"role": "user", "content": prompt}
        ]
    }
    # Without an explicit timeout, requests waits forever on a hung connection.
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Groq API Error: {response.status_code} - {response.text}")
    return response.json()["choices"][0]["message"]["content"]


Overwriting groq_llm.py


In [20]:
%%writefile doc_utils.py
import pdfplumber
import docx
import re

def extract_text_from_pdf(uploaded_file):
    """Return the text of every page of a PDF as one string.

    Each page that yields text is preceded by a ``--- Page Break ---``
    marker line; pages for which ``extract_text()`` returns nothing are
    skipped. Leading and trailing whitespace is stripped from the result.
    """
    pieces = []
    with pdfplumber.open(uploaded_file) as pdf:
        for page in pdf.pages:
            content = page.extract_text()
            if not content:
                continue
            pieces.append("\n--- Page Break ---\n")
            pieces.append(content)
    return "".join(pieces).strip()

def extract_text_from_docx(uploaded_file):
    """Return the text of a DOCX file, one paragraph per line.

    Only the entries of ``Document.paragraphs`` are read; their ``text``
    values are joined with newlines in document order.
    """
    document = docx.Document(uploaded_file)
    lines = []
    for paragraph in document.paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)

def extract_headings(text):
    """Return the numbered heading lines found in ``text``.

    A heading is a line that starts with a dotted section number
    (``1``, ``2.1``, ``5.3.3``, ...) followed by horizontal whitespace and
    at least one more character on the same line.

    Args:
        text: The document text to scan.

    Returns:
        A list of the matching lines, in order of appearance (empty if
        there are none).
    """
    # The separator is `[^\S\r\n]+` (whitespace other than line breaks) rather
    # than `\s+`: `\s` also matches "\n", which made a line containing only a
    # number (e.g. a page number) swallow the following line as its "title".
    return re.findall(r'^\d+(?:\.\d+)*[^\S\r\n]+.+', text, re.MULTILINE)


Overwriting doc_utils.py


In [None]:
%%writefile app.py
import streamlit as st
from doc_utils import extract_text_from_pdf, extract_text_from_docx, extract_headings
from groq_llm import ask_groq

# Sizing for the naive chunked prompting below: the document is cut into
# CHUNK_SIZE-character slices and only the first MAX_CHUNKS slices are sent
# to the model (one request per slice).
CHUNK_SIZE = 3000
MAX_CHUNKS = 3

st.set_page_config(page_title="Doc Chatbot ", layout="centered")
st.title("📄🤖 Structured Document Q&A ")

uploaded_file = st.file_uploader("Upload a PDF or DOCX file", type=["pdf", "docx"])

if uploaded_file:
    # Pick the extractor from the MIME type reported for the upload.
    if uploaded_file.type == "application/pdf":
        document_text = extract_text_from_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        document_text = extract_text_from_docx(uploaded_file)
    else:
        st.error("Unsupported file type.")
        st.stop()

    # With no extracted text there are no chunks to query, which would
    # otherwise render an empty answer without any explanation.
    if not document_text.strip():
        st.warning("No extractable text was found in this file (it may contain only images).")
        st.stop()

    st.success("✅ File uploaded and processed!")

    query = st.text_input("Ask a question based on the uploaded document:")
    if query:
        chunks = [document_text[i:i + CHUNK_SIZE] for i in range(0, len(document_text), CHUNK_SIZE)]
        if len(chunks) > MAX_CHUNKS:
            # Tell the user that the tail of the document is not consulted.
            st.warning(
                f"The document is long; only the first {MAX_CHUNKS * CHUNK_SIZE} "
                "characters are used to answer."
            )
        with st.spinner("Getting answer "):
            try:
                answers = [ask_groq(query, chunk) for chunk in chunks[:MAX_CHUNKS]]
            except Exception as exc:
                # Show API/network failures in the page instead of a raw traceback.
                st.error(f"Could not get an answer: {exc}")
            else:
                final_answer = "\n---\n".join(answers)
                st.markdown("### 📢 Answer")
                st.write(final_answer)


Overwriting app.py
