### Install Required Libraries

In [24]:
!pip install easyocr
!pip install torch torchvision torchaudio  # already needed
!pip install transformers
!pip install sentence-transformers
!pip install rank_bm25
!pip install python-dotenv
!pip install groq
!pip install pillow
!pip install pymupdf




### Load .env and Groq API Key

In [25]:
import os
from dotenv import load_dotenv

# Merge variables from a .env file into the process environment, then read the
# Groq key from it. The value is None when GROQ_API_KEY is not set.
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")


### Load EasyOCR Model for Handwriting OCR

In [26]:
import easyocr
# Shared English OCR reader, built once and reused by easyocr_image below.
# gpu=False keeps inference on the CPU (slower, but needs no CUDA device).
reader = easyocr.Reader(['en'], gpu=False)


Using CPU. Note: This module is much faster with a GPU.
Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

### Universal Text Extractor (Image / Typed PDF / Scanned Handwritten PDF)

In [27]:
from pathlib import Path
from PIL import Image, ImageEnhance
import fitz  # PyMuPDF
import numpy as np

def easyocr_image(image: Image.Image):
    """Run the shared EasyOCR reader on a PIL image and return its text.

    The image is converted to grayscale and its contrast is doubled before
    recognition. The recognised strings are joined with newlines.
    """
    grayscale = image.convert("L")
    boosted = ImageEnhance.Contrast(grayscale).enhance(2.0)
    pixels = np.array(boosted)
    # detail=0 returns plain strings only; paragraph=True merges nearby lines.
    paragraphs = reader.readtext(pixels, detail=0, paragraph=True)
    return "\n".join(paragraphs)

def extract_text(file_path):
    """Extract text from a PDF or an image file.

    PDF pages are handled one by one: a page whose embedded text is longer
    than 50 characters (after stripping) is taken as-is; any other page is
    rendered at 400 dpi and passed through ``easyocr_image``. Image files
    (.jpg, .jpeg, .png) are always passed through ``easyocr_image``.

    Args:
        file_path: Path to the input file; the type is chosen from its
            extension, case-insensitively.

    Returns:
        The extracted text. For PDFs, the per-page texts joined by newlines
        with leading/trailing whitespace removed.

    Raises:
        ValueError: If the extension is not .pdf, .jpg, .jpeg or .png.
    """
    ext = Path(file_path).suffix.lower()

    if ext == ".pdf":
        page_texts = []
        # The context manager closes the document even if OCR raises midway.
        with fitz.open(file_path) as doc:
            for page in doc:
                typed_text = page.get_text().strip()
                if len(typed_text) > 50:
                    page_texts.append(typed_text)
                else:
                    # Too little embedded text: treat the page as a scan.
                    pix = page.get_pixmap(dpi=400)
                    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                    page_texts.append(easyocr_image(img))
        return "\n".join(page_texts).strip()

    if ext in (".jpg", ".jpeg", ".png"):
        # Close the underlying file handle once OCR has consumed the pixels.
        with Image.open(file_path) as img:
            return easyocr_image(img)

    raise ValueError(f"Unsupported file type: {ext}")


### Chunk the Extracted Text

In [28]:
def chunk_text(text, chunk_size=100, overlap=20):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: Input string; it is split on any whitespace.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of chunk strings, each with its words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either crash range() or silently drop all text.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This window already reaches the last word; any further window would
        # only repeat words contained in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks


### Create Vector Embeddings and BM25 Index

In [29]:
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi

# Sentence-embedding model shared by build_indexes and adaptive_retrieve, so
# chunk vectors and query vectors come from the same model.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def build_indexes(chunks):
    """Build the dense and keyword indexes for a list of text chunks.

    Args:
        chunks: Non-empty sequence of chunk strings.

    Returns:
        A tuple ``(embeddings, bm25)``: the output of
        ``embedding_model.encode(chunks)`` and a ``BM25Okapi`` index built
        over the whitespace-tokenized chunks.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if len(chunks) == 0:
        # BM25Okapi divides by the corpus size; fail early with a clear message.
        raise ValueError("chunks must contain at least one text chunk")

    embeddings = embedding_model.encode(chunks)
    # split() with no argument matches how the query is tokenized at retrieval
    # time and never produces empty-string tokens.
    tokenized_chunks = [chunk.split() for chunk in chunks]
    bm25 = BM25Okapi(tokenized_chunks)
    return embeddings, bm25


### Adaptive Retrieval (Vector + Keyword Fusion)


In [30]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def adaptive_retrieve(query, chunks, embeddings, bm25, top_k=3, alpha=0.5):
    """Return the ``top_k`` chunks ranked by a blend of dense and BM25 scores.

    Cosine similarity between the query embedding and ``embeddings`` and the
    BM25 scores of the whitespace-split query are each min-max normalized,
    then combined as ``alpha * dense + (1 - alpha) * keyword``.

    Args:
        query: Question string.
        chunks: Chunk strings, index-aligned with ``embeddings`` and ``bm25``.
        embeddings: Chunk embedding matrix.
        bm25: BM25 index exposing ``get_scores``.
        top_k: Number of chunks to return.
        alpha: Weight given to the dense score.

    Returns:
        A list of chunk strings, highest combined score first.
    """
    def _min_max(scores):
        # The small epsilon avoids a zero division when all scores are equal.
        lowest = np.min(scores)
        return (scores - lowest) / (np.max(scores) - lowest + 1e-8)

    query_vector = embedding_model.encode([query])[0]
    semantic = _min_max(cosine_similarity([query_vector], embeddings)[0])
    lexical = _min_max(bm25.get_scores(query.split()))

    blended = alpha * semantic + (1 - alpha) * lexical
    ranked = np.argsort(blended)[::-1]
    return [chunks[idx] for idx in ranked[:top_k]]


### Ask Groq API

In [31]:
from groq import Groq

# Single Groq client reused by every ask_groq call, authenticated with GROQ_API_KEY.
groq_client = Groq(api_key=GROQ_API_KEY)

def ask_groq(prompt, model="llama3-70b-8192", system_prompt="You are a helpful assistant."):
    """Send a single-turn chat request to Groq and return the reply text.

    Args:
        prompt: Content of the user message.
        model: Groq model id to query. The default is the id this notebook
            has always used.
        system_prompt: Content of the system message.

    Returns:
        The message content of the first choice in the response.
    """
    # NOTE(review): hosted model ids get retired over time; confirm the default
    # id is still served by Groq, or pass a current one via `model`.
    response = groq_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content


### Preview Text

In [32]:
# Sanity-check the extraction step before building any index on top of it.
file_path = "Akib Hasan 2011892642.pdf"  # Your uploaded file
# Runs the full extraction; pages with little embedded text go through OCR.
extracted_text = extract_text(file_path)

print("📝 Preview Extracted Text:\n")
print(extracted_text[:1000])  # Show first 1000 characters




📝 Preview Extracted Text:

Cwe } Iio [03 Akb Hlaoam See : 22 Appianmext-2 T0 ; 26/4892642 9k) Enxplai ~ki We"need Yood, ;  , (6) Nam € Ae` enexttial and MOn enpewtial amino acidb ~ ;; Lmlysi $ : Anc (:} 1,, | 0.266.9)61 6' ' W} ;, 4 Food ia ' #e: mox-ncwtee pf 'enendi: It in @nn  emtial (on W to ake {ood 2o beeawe: 3 . he_Poin of: body ) Photection lenendj 1  dtovt ladairt . dineanea . ) ' healky , nAxt" otnont bonen , awd l cl the Yeell blood; , F Fcodo Jien w al 1 0 cf ,f 356360 Iutfve JeA-iuezerVe # ot bodj: Foo & fy Dlppa; en enengt_ka Piad-etion pf hest and 'Jwf)k . 3 all 04 ackiv tien iX Ow b6lj' # in (apen enh enkd fa {nonk 6( human 46lj' i4 i @n^ enticy Jon   d:]y wean and /eci in O ve-' 18 alo Phokeetn ot bodj 8hom
Iv6 CH 1081 ; : (r
{^1ii ;',,1') , ' c{', ~ v; i| Nan iol tjpen '{( ofvdlneaser, exas) pke : K }   :0c wVJ ' ~hicketo , etc Ir deneal Wodj - i{ ia Ixollved ;n Ane {nekon o al #ke bol- Plocenlen. Food i indinedly Reldded t6 #e * meta_bol}nm Ow&I vAj' Food  ha 9' 0 mi

### Full RAG Flow

In [34]:
# 1. Extract text from the document (PDF or image).
# NOTE(review): this repeats the extraction already done for the same file in
# the preview cell; reusing `extracted_text` would avoid a second OCR pass.
file_path = "Akib Hasan 2011892642.pdf"  # or "note.jpg", etc.
text = extract_text(file_path)
# 2. Split into overlapping word chunks and build the embedding + BM25 indexes.
chunks = chunk_text(text)
embeddings, bm25 = build_indexes(chunks)

# 3. Retrieve the chunks that score highest for the question.
query = "What is diabetes?"
retrieved = adaptive_retrieve(query, chunks, embeddings, bm25)

# 4. Put the retrieved chunks into the prompt as context and ask the LLM.
context = "\n".join(retrieved)
final_prompt = f"Use the context below to answer the question:\n\nContext:\n{context}\n\nQuestion: {query}"

response = ask_groq(final_prompt)
print(response)


  return forward_call(*args, **kwargs)


According to the provided context, diabetes is a condition where the body has abnormally high levels of glucose in the blood, making it difficult to metabolize glucose and regulate blood sugar levels.
