In [None]:
import os
import glob
from PyPDF2 import PdfReader
from pathlib import Path
from typing import List, Dict
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import faiss

LOADING FILES FROM FOLDER

In [None]:
# Root directory that is searched (recursively) for PDF files to ingest.
Folder_Name = "./Folder"

def load_text(path):
    """Extract the text of a PDF file as a single string.

    Each page's text is collected in order and the pages are joined with
    newlines; leading and trailing whitespace is stripped from the result.
    Pages for which no text is extracted contribute an empty string.

    Reading is best-effort: any exception is reported with ``print`` and the
    text gathered from the pages processed so far (possibly none) is returned.

    Args:
        path: Location of the PDF file, passed straight to ``PdfReader``.

    Returns:
        The concatenated page text, or ``""`` if nothing could be read.
    """
    page_texts = []
    try:
        for pdf_page in PdfReader(path).pages:
            extracted = pdf_page.extract_text()
            # Normalise a missing/empty extraction to an empty string.
            page_texts.append(extracted if extracted else "")
    except Exception as e:
        print(f"Error reading {path}: {e}")
    combined = "\n".join(page_texts)
    return combined.strip()

def load_pdf(folder_name):
    """Load the text of every PDF found under ``folder_name`` (recursively).

    Files whose extracted text is empty are skipped.

    Args:
        folder_name: Directory to search for ``*.pdf`` files.

    Returns:
        A list of ``{"text": ..., "source": ...}`` dicts, one per PDF that
        yielded text, ordered by path. ``source`` is the file path as a string.
    """
    docs = []
    # rglob yields entries in arbitrary filesystem order; sorting keeps the
    # document order (and everything derived from it downstream) reproducible.
    for pdf_path in sorted(Path(folder_name).rglob("*.pdf")):
        text = load_text(pdf_path)
        if text:
            docs.append({"text": text, "source": str(pdf_path)})
    return docs

CHUNKING

In [None]:
# Read every PDF under the configured folder into a list of {"text", "source"} dicts.
corpus = load_pdf(Folder_Name)

In [None]:
# Text splitter used to cut each document into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    length_function=len,  # chunk lengths are measured with len(), i.e. in characters
    chunk_overlap=100,
    chunk_size=800,
)

In [None]:
# Flatten the corpus into one list of chunk records. "chunk_id" is the
# chunk's position within its own document (it restarts at 0 for each file).
chunks = []
for doc in corpus:
    chunks.extend(
        {"text": piece, "source": doc["source"], "chunk_id": position}
        for position, piece in enumerate(splitter.split_text(doc["text"]))
    )

GENERATE EMBEDDINGS

In [None]:
# Fail early with a clear message instead of an obscure downstream error
# when no chunks were produced (e.g. the folder is missing or has no text).
if not chunks:
    raise ValueError(
        "No text chunks to embed; check that Folder_Name contains readable PDFs."
    )

# Build the list of chunk texts once so both representations below are
# computed from the same inputs in the same order (row i <-> chunks[i]).
chunk_texts = [chunk["text"] for chunk in chunks]

# Sentence-transformer embeddings for every chunk.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embed_model.encode(chunk_texts)

# TF-IDF representation of the same chunks.
# Fixed: the class is TfidfVectorizer (as imported), not "TfidVectorizer",
# which raised NameError.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(chunk_texts)