In [2]:
import os
import glob
import fitz  # pymupdf
import faiss
import numpy as np
import pickle
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from dotenv import load_dotenv
from tqdm.auto import tqdm


In [3]:
# Pull settings from a local .env file into the environment
load_dotenv()

# Project layout: everything hangs off the current working directory
BASE_DIR = os.getcwd()
PDF_DIR = os.path.join(BASE_DIR, 'pdfs')
DATA_DIR = os.path.join(BASE_DIR, 'rag_data')
INDEX_DIR = os.path.join(DATA_DIR, 'faiss_index')
META_PATH = os.path.join(DATA_DIR, 'metadata.pkl')

# Make sure each working folder exists before anything reads or writes it
for _folder in (PDF_DIR, INDEX_DIR, DATA_DIR):
    os.makedirs(_folder, exist_ok=True)


In [4]:
# Configure and instantiate the BAAI BGE embedding model (runs on CPU,
# asks the encoder to return normalized embeddings)
model_name = "BAAI/bge-base-en-v1.5"
encode_kwargs = {'normalize_embeddings': True}
model_kwargs = {'device': 'cpu'}
embeddings_model = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)


  embeddings_model = HuggingFaceBgeEmbeddings(


In [5]:
def extract_text_from_pdf(path: str) -> str:
    """Extract plain text from a PDF file using PyMuPDF (fitz).

    Each page's text is split into lines, every line is stripped, and blank
    lines are dropped. Pages that contain no text after this cleanup are
    skipped, so the result never contains empty page sections.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The cleaned text of all non-empty pages. Lines within a page are
        separated by a single newline, pages by one blank line. An empty
        string is returned when no page yields any text.
    """
    text_parts = []
    # The context manager closes the document even if a page fails to parse,
    # so a bad PDF does not leak an open file handle.
    with fitz.open(path) as doc:
        for page in doc:
            page_text = page.get_text("text") or ""
            cleaned = "\n".join(
                stripped
                for stripped in (line.strip() for line in page_text.splitlines())
                if stripped
            )
            # Test AFTER cleanup: a whitespace-only page must not contribute
            # an empty section (which would add stray blank lines to the output).
            if cleaned:
                text_parts.append(cleaned)
    return "\n\n".join(text_parts)

# Gather every PDF under PDF_DIR (sorted for a stable order) and build one
# record per file holding its name, extracted text, and full path.
pdf_paths = sorted(glob.glob(os.path.join(PDF_DIR, "*.pdf")))
all_docs = [
    {
        'id': os.path.basename(pdf_path),
        'text': extract_text_from_pdf(pdf_path),
        'source': pdf_path,
    }
    for pdf_path in tqdm(pdf_paths, desc='Reading PDFs')
]


Reading PDFs:   0%|          | 0/16 [00:00<?, ?it/s]

In [7]:
from typing import List

def chunk_text(text: str, chunk_size: int = 800, overlap: int = 200) -> List[str]:
    """Split text into overlapping, word-aligned chunks.

    ``chunk_size`` and ``overlap`` are character budgets. They are converted
    to word counts using the average word length of ``text``, with floors of
    50 words per chunk and 10 words of overlap. The overlap is additionally
    capped at one word less than the chunk length so every chunk starts
    strictly after the previous one and the function always terminates.

    Args:
        text: Input text; split on whitespace.
        chunk_size: Approximate chunk length in characters.
        overlap: Approximate overlap between consecutive chunks in characters.

    Returns:
        A list of chunks, each a single-space-joined run of words. Empty when
        ``text`` is empty or contains only whitespace.
    """
    tokens = text.split() if text else []
    # Whitespace-only text yields no tokens; bail out before dividing by len(tokens).
    if not tokens:
        return []

    total = len(tokens)
    avg_word_len = max(1, sum(len(w) for w in tokens) / total)  # Average word length
    words_per_chunk = max(50, int(chunk_size / avg_word_len))  # Number of words per chunk
    overlap_words = max(10, int(overlap / avg_word_len))  # Number of words to overlap
    # An overlap >= the chunk length would keep `start` from advancing (infinite loop).
    overlap_words = min(overlap_words, words_per_chunk - 1)
    step = words_per_chunk - overlap_words  # always >= 1

    chunks = []
    for start in range(0, total, step):
        end = min(total, start + words_per_chunk)
        chunks.append(" ".join(tokens[start:end]))
        if end == total:
            # The final chunk already reaches the last word; stop here.
            break
    return chunks


In [8]:
# Flatten the documents into per-chunk records; each entry of all_docs is
# expected to provide 'id' and 'text'.
chunked_docs = []
for doc in all_docs:
    chunked_docs.extend(
        {
            'chunk_id': f"{doc['id']}_chunk_{idx}",
            'text': piece,
            'source': doc['id'],
            'chunk_index': idx,
        }
        for idx, piece in enumerate(chunk_text(doc['text']))
    )


In [9]:
# Embed every chunk and pack the vectors into a float32 matrix for FAISS.
texts = [c['text'] for c in chunked_docs]  # Extracting the text from chunked_docs
if not texts:
    # With no chunks the array below would be 1-D and the later
    # `emb_matrix.shape[1]` lookup would fail with an unhelpful error.
    raise ValueError(
        "No text chunks to embed; check that the PDF folder contains PDFs with extractable text."
    )
embeddings = embeddings_model.embed_documents(texts)  # Generate embeddings for each chunk
emb_matrix = np.array(embeddings, dtype='float32')  # One row per chunk
faiss.normalize_L2(emb_matrix)  # Normalizes the rows in place (no return value is used)


In [10]:
# Create the FAISS index using Inner Product (IP)
index = faiss.IndexFlatIP(emb_matrix.shape[1])

# Add the embeddings to the index
index.add(emb_matrix)

# Save the FAISS index to disk
faiss.write_index(index, os.path.join(INDEX_DIR, 'faiss.index'))


In [11]:
# Save the chunked documents (metadata) to pickle for later use
with open(META_PATH, 'wb') as f:
    pickle.dump(chunked_docs, f)


In [12]:
# Save embeddings to a file (optional, to load later in Streamlit)
with open('embeddings.pkl', 'wb') as f:
    pickle.dump(emb_matrix, f)
