In [35]:
import os, re, glob, json
import ollama
from pymilvus import MilvusClient

from pypdf import PdfReader

In [36]:
# ---------- Config ----------
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "phi3:latest")
EMBED_MODEL  = os.getenv("EMBED_MODEL",  "nomic-embed-text")
DB_PATH      = os.getenv("MILVUS_LITE_PATH", "./milvus.db")
COLLECTION   = os.getenv("MILVUS_COLLECTION", "docs")
CHUNK_CHARS  = int(os.getenv("CHUNK_CHARS", "1200"))
CHUNK_OVER   = int(os.getenv("CHUNK_OVERLAP", "200"))
TOP_K        = int(os.getenv("TOP_K", "5"))

In [85]:

def read_pdf(path:str):

    '''
    Read input PDF file, pagewise, and return text :str
    '''
    
    out = []
    r = PdfReader(path)
    
    for page in r.pages:

        t = page.extract_text() or ""
        out.append(t)

    return "\n".join(out)


def load_docs(folder: str):

    '''
    Load the .pdf and .txt documents
    '''
    docs = []

    for path in glob.glob(os.path.join(index,"**","*"),recursive=True):

        if not os.path.isfile(path):

            continue

        ext = os.path.splitext(path)[1].lower().strip()

        try:
            if ext in [".pdf"]:
    
                text = read_pdf(path)
    
            elif ext in [".txt"]:
    
                continue
    
            else:
                continue
    
            if text.strip():
    
                docs.append({"doc_id": os.path.basename(path),
                            "text": text
                            })
        except Exception as e:
            print(e)
            print(f"[skip] {path}")
            
            
    return docs

            

        
    
def chunk_text(text: str, size=1200, overlap=200):

    '''
    Chunk the input text for the minimum given size (default: 1200 chars), with overlap (default: 200 chars) for the next
    '''

    sents = re.split(r'(?<=[\.\!\?])\s+', text.strip())

    # print("sentence level split text loss chgeck")
    # print(len(text),[len(i) for i in sents], sum([len(i) for i in sents]) )

    chunks, cur = [], ""

    for s in sents:

        
        if len(s)+len(cur) <= size:

            cur = cur+(" " if cur else "") + s

        else:

            if cur:
                chunks.append(cur)

            tail = cur[-overlap:] if overlap >0 else ""

            cur = (tail + " " + s).strip()

    if cur:
        chunks.append(cur)

    return chunks

            
                

def index_folder(folder: str):

    '''

    '''
    
    docs = load_docs(folder)

    if not docs:

        print("[index] no documents found")

    print([i["doc_id"] for i in docs])


    #---------Chunk & Embed Documents -------------------------------

    for d in docs:

        chunks = chunk_text(d["text"], CHUNK_CHARS, CHUNK_OVER)
        print([len(i) for i in chunks])
        

    

    

In [86]:
# agent initialize

# if __name__ == "__main__":

#     import argparse

#     ap = argparse.ArgumentParser()

#     ap.add_argument("--index",type= str,help="Path to folder of docs to ingest")
#     ap.add_argument("--ask", type= str, help="ASk a question, to get an answer based on documents using LLM")

#     args = ap.parse_args()

#     if args.index:
#         index_folder(args.index)
#     if args.ask:
#         print(chat_once(args.ask))


index = "data"
ask = "Tell me about radhika"

if index:
    index_folder(index)
# if ask:
#     print(chat_once(ask))




['Ramesh_Naik Resume.pdf', 'Abhishek_Prasanna_Walavalkar Resume.pdf', 'Radhika_Ganesh Resume.pdf']
[1079, 1187, 1154, 897, 837, 809]
[554, 1261, 1156, 1133, 1115, 816]
[2119, 1151, 1106, 1015, 853]
