In [1]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import MessagesPlaceholder
from langchain.chains import create_history_aware_retriever
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import os

In [2]:
# Read key/value pairs from a local .env file into the process environment
# so that later cells can pick them up through os.getenv / os.environ.
load_dotenv()

True

In [3]:
# Make the Hugging Face token visible to libraries that read it from the
# process environment. os.environ only accepts strings, so writing back the
# None that os.getenv returns for a missing variable would raise TypeError;
# only re-export the token when it is actually defined.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token
else:
    print("HF_TOKEN is not set; continuing without Hugging Face authentication.")

In [4]:
# Embedding model used for every FAISS index built in this notebook.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

  embeddings = HuggingFaceEmbeddings(model_name = 'all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Smoke-test the PDF loader on a single textbook before indexing the folder.
loader = PyPDFLoader("src/TextBooks/sem4/Theory of Computation.pdf")
docs = loader.load()

# Preview only the first couple of entries: echoing the whole list would dump
# every page of the book into the notebook output and bloat the saved file.
print(f"Loaded {len(docs)} documents")
docs[:2]

[Document(metadata={'producer': 'Adobe Acrobat Pro 10.1.2', 'creator': 'Adobe Acrobat Pro 10.1.2', 'creationdate': '2012-01-26T18:18:45-05:00', 'moddate': '2012-01-26T18:18:45-05:00', 'title': '', 'source': 'src/TextBooks/sem4/Theory of Computation.pdf', 'total_pages': 550, 'page': 0, 'page_label': '1'}, page_content='INTRODUCTION TO\nAutomata Theory,\nLanguages, and\nComputation\n3rd Edition\nhopcroft_titlepgs  5/8/06  12:43 PM  Page 1'),
 Document(metadata={'producer': 'Adobe Acrobat Pro 10.1.2', 'creator': 'Adobe Acrobat Pro 10.1.2', 'creationdate': '2012-01-26T18:18:45-05:00', 'moddate': '2012-01-26T18:18:45-05:00', 'title': '', 'source': 'src/TextBooks/sem4/Theory of Computation.pdf', 'total_pages': 550, 'page': 1, 'page_label': '2'}, page_content='INTRODUCTION TO\nAutomata Theory,\nLanguages, and\nComputation\nJOHN E. HOPCROFT\nCornell University\nRAJEEV MOTWANI\nStanford University\nJEFFREY D. ULLMAN\nStanford University\n3rd Edition\nhopcroft_titlepgs  5/8/06  12:43 PM  Page 2'

In [14]:
# List the raw contents of the semester-4 textbook folder to check which
# files are available for indexing.
books = os.listdir("src/TextBooks/sem4")
books

['Data Communication.pdf',
 'Design and Analysis of Algorithms.pdf',
 'Linear Algebra.pdf',
 'Operating Systems.pdf',
 'Software Engineering.pdf',
 'Theory of Computation.pdf']

In [15]:
# Build one FAISS index per textbook PDF and save it under rag_dir/<book name>.
books_dir = "src/TextBooks/sem4"
rag_dir = "src/RAG/sem4"

os.makedirs(rag_dir, exist_ok=True)

# The splitter settings are identical for every book, so create it once
# instead of once per loop iteration.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=250)

# os.listdir returns every directory entry, including sub-folders and stray
# non-PDF files that PyPDFLoader cannot parse; keep regular *.pdf files only.
# Sorting makes the processing order reproducible across platforms.
books = sorted(
    entry
    for entry in os.listdir(books_dir)
    if entry.lower().endswith(".pdf")
    and os.path.isfile(os.path.join(books_dir, entry))
)

for book in books:
    book_path = os.path.join(books_dir, book)
    book_name = os.path.splitext(book)[0]

    loader = PyPDFLoader(book_path)
    docs = loader.load()
    final_docs = text_splitter.split_documents(docs)

    # A PDF without a text layer (e.g. a scan) produces no chunks, and
    # FAISS.from_documents cannot build an index from an empty list.
    # Skip such books instead of aborting the whole run.
    if not final_docs:
        print(f"Skipping {book_name}: no extractable text found in {book_path}")
        continue

    vector_db = FAISS.from_documents(final_docs, embeddings)
    vector_db.save_local(os.path.join(rag_dir, book_name))

    print(f"FAISS index saved for {book_name} at {rag_dir}/{book_name}")

FAISS index saved for Data Communication at src/RAG/sem4/Data Communication
FAISS index saved for Design and Analysis of Algorithms at src/RAG/sem4/Design and Analysis of Algorithms


Ignoring wrong pointing object 61 0 (offset 0)
Ignoring wrong pointing object 369 0 (offset 0)
Ignoring wrong pointing object 390 0 (offset 0)
Ignoring wrong pointing object 972 0 (offset 0)
Ignoring wrong pointing object 1145 0 (offset 0)
Ignoring wrong pointing object 1151 0 (offset 0)
Ignoring wrong pointing object 1483 0 (offset 0)
Ignoring wrong pointing object 1630 0 (offset 0)
Ignoring wrong pointing object 1918 0 (offset 0)
Ignoring wrong pointing object 2313 0 (offset 0)
Ignoring wrong pointing object 2326 0 (offset 0)
Ignoring wrong pointing object 2875 0 (offset 0)
Ignoring wrong pointing object 3045 0 (offset 0)
Ignoring wrong pointing object 3352 0 (offset 0)


FAISS index saved for Linear Algebra at src/RAG/sem4/Linear Algebra
FAISS index saved for Operating Systems at src/RAG/sem4/Operating Systems
FAISS index saved for Software Engineering at src/RAG/sem4/Software Engineering
FAISS index saved for Theory of Computation at src/RAG/sem4/Theory of Computation


In [6]:
from pdf2image import convert_from_path

# Scanned PDF that the OCR cells below turn into a searchable PDF.
pdf_path = "scanned.pdf"


def pdf_to_images(pdf_path, dpi=500, poppler_path=None):
    """Render the pages of a PDF as images via pdf2image.

    Args:
        pdf_path: Path of the PDF file to convert.
        dpi: Rendering resolution forwarded to convert_from_path.
        poppler_path: Optional folder containing the Poppler executables.
            Leave as None when Poppler is on PATH; pass the folder
            explicitly (typically needed on Windows) otherwise.

    Returns:
        The value returned by convert_from_path (one image per page).
    """
    # Keyword arguments keep the call correct regardless of the positional
    # order of convert_from_path's parameters.
    return convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)


images = pdf_to_images(pdf_path)


In [7]:
import os

import pytesseract
from PIL import Image

# Location of the Tesseract executable. Defaults to the standard Windows
# install path; set the TESSERACT_CMD environment variable to override it
# without editing the notebook.
tesseract_cmd = os.getenv(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

# Only override pytesseract's setting when the file really exists. On machines
# without that path, pytesseract keeps its own default, so an install that is
# reachable through PATH keeps working.
if os.path.isfile(tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

def image_to_text(image):
    """Run Tesseract OCR on one page image and return the result as PDF data.

    Despite the name, this does not return plain text: it asks pytesseract
    for the 'pdf' output format and returns that result unchanged.

    Args:
        image: The page image to OCR.

    Returns:
        The output of pytesseract.image_to_pdf_or_hocr for the image.
    """
    ocr_pdf = pytesseract.image_to_pdf_or_hocr(image, extension='pdf')
    return ocr_pdf


In [8]:
from io import BytesIO
from PyPDF2 import PdfMerger
import pytesseract

# OCR every page image into its own in-memory PDF. BytesIO gives the merger
# a file-like object to read from without touching the disk.
ocr_pdfs = [BytesIO(pytesseract.image_to_pdf_or_hocr(image, extension='pdf')) for image in images]

merged_pdf_path = "output_searchable.pdf"

# Concatenate the per-page PDFs in page order and write the result to disk.
# The try/finally guarantees the merger is closed even when appending or
# writing fails part-way through.
merger = PdfMerger()
try:
    for ocr_pdf in ocr_pdfs:
        merger.append(ocr_pdf)

    with open(merged_pdf_path, "wb") as f:
        merger.write(f)
finally:
    merger.close()

print(f"Searchable PDF saved as {merged_pdf_path}")



Searchable PDF saved as output_searchable.pdf
