In [1]:
import os
from pathlib import Path
import PyPDF2
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re
import warnings

warnings.filterwarnings('ignore', category=Warning)

class PDFSemanticSearch:
    def __init__(self, folder_path, chunk_size=50):
        self.folder_path = Path(folder_path)
        self.chunk_size = chunk_size
        self.model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
        self.text_chunks = []
        self.embeddings = None
        self.sources = []
        
    def extract_text_from_pdf(self, pdf_path):
        with open(pdf_path, 'rb') as file:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                reader = PyPDF2.PdfReader(file)
                text = ""
                for page in reader.pages:
                    text += page.extract_text()
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    
    def split_into_chunks(self, text):
        sentences = re.split(r'([.!?])\s+', text)
        sentences = [''.join(i) for i in zip(sentences[0::2], sentences[1::2] + [''])]
        
        chunks = []
        current_chunk = []
        current_word_count = 0
        
        for sentence in sentences:
            sentence_words = sentence.split()
            sentence_word_count = len(sentence_words)
            
            if current_word_count + sentence_word_count > self.chunk_size * 1.5 and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_word_count = 0
            
            current_chunk.append(sentence)
            current_word_count += sentence_word_count
            
            if current_word_count >= self.chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_word_count = 0
        
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        
        return chunks
    
    def process_all_pdfs(self):
        for pdf_file in self.folder_path.glob('*.pdf'):
            try:
                text = self.extract_text_from_pdf(pdf_file)
                chunks = self.split_into_chunks(text)
                self.text_chunks.extend(chunks)
                self.sources.extend([pdf_file.name] * len(chunks))
                print(f"Processed: {pdf_file.name}, extracted {len(chunks)} chunks")
            except Exception as e:
                print(f"Error processing {pdf_file.name}: {str(e)}")
        
        if self.text_chunks:
            self.embeddings = self.model.encode(self.text_chunks)
            print(f"Total chunks processed: {len(self.text_chunks)}")
        else:
            print("No valid text found!")
    
    def search(self, query, top_k=5):
        if self.embeddings is None:
            print("Please process PDF files first!")
            return []
        
        query_embedding = self.model.encode([query])
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        
        results = []
        for idx in top_indices:
            results.append({
                'score': similarities[idx],
                'text': self.text_chunks[idx],
                'source': self.sources[idx]
            })
        
        return results

  from .autonotebook import tqdm as notebook_tqdm





In [None]:
searcher = PDFSemanticSearch(r'D:\Admin\Desktop\RC11\bookslist', chunk_size=50)
searcher.process_all_pdfs()

unknown widths : 
[0, IndirectObject(407, 0, 2551492679952)]


Processed: 10.4324_9780203764077_previewpdf.pdf, extracted 398 chunks
Processed: 2053951716674238.pdf, extracted 85 chunks


unknown widths : 
[0, IndirectObject(419, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(421, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(423, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(419, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(413, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(415, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(417, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(415, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(407, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(413, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(411, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(413, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(407, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(413, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(411, 0, 2551492679952)]
unknown widths : 
[0, IndirectObject(413, 0, 2551492679952)]
unknown widths : 
[0, In

Processed: 247 Late Capitalism and the Ends of Sleep.pdf, extracted 594 chunks
Processed: Algorithmic_culture.pdf, extracted 172 chunks
Processed: BraveNewWorld-1.pdf, extracted 1128 chunks
Processed: Christian-Alignment-Problem-Intro-and-Ch1.pdf, extracted 254 chunks
Processed: Cory_Doctorow_-_Down_and_Out_in_the_Magic_Kingdom.pdf, extracted 814 chunks
Processed: Cory_Doctorow_-_Little_Brother.pdf, extracted 2122 chunks
Processed: Critique, Social Media and the Information Society.pdf, extracted 398 chunks
Processed: Critique_of_Everyday_Life_-_Henri_Lefebvre.pdf, extracted 5299 chunks
Processed: Dave_Eggers_The_Circle.pdf, extracted 2385 chunks
Processed: david-foster-wallace-infinite-jest-v2-0.pdf, extracted 9304 chunks
Processed: deleuze_control.pdf, extracted 4 chunks
Processed: feed.pdf, extracted 869 chunks
Processed: FERNNO-2v1.pdf, extracted 1149 chunks
Processed: foucault-panopticism.pdf, extracted 270 chunks


unknown widths : 
[0, IndirectObject(1239, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1229, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1227, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1241, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1227, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1229, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1247, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1244, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1249, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1241, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1227, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1244, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1258, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1229, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1261, 0, 2551492622032)]
unknown widths : 
[0, IndirectObject(1268, 0, 2551492622032)]
unknown 

Processed: Horkheimer_Max_Adorno_Theodor_W_Dialectic_of_Enlightenment_Philosophical_Fragments.pdf, extracted 2076 chunks
Processed: L-0003887079-pdf.pdf, extracted 102 chunks
Processed: Lovereading-_A_Visit_from_the_Goon_Squad_by_Jennifer_Egan.pdf, extracted 22 chunks
Processed: Neal Stephenson - Snow Crash.pdf, extracted 2668 chunks
Processed: Network Culture Politics for the Information Age Tiziana Terranova.pdf, extracted 1295 chunks
Processed: Neuromancer - William Gibson.pdf, extracted 1427 chunks
Processed: New Dark Age.pdf, extracted 1483 chunks
Processed: Noble-Conclusion-2018.pdf, extracted 82 chunks
Processed: Noble-FutureInformationCulture-2018.pdf, extracted 110 chunks
Processed: Noble-FutureKnowledgePublic-2018.pdf, extracted 113 chunks
Processed: Noble-Introduction-2018.pdf, extracted 86 chunks
Processed: Noble-SearchingBlackGirls-2018.pdf, extracted 220 chunks
Processed: Noble-SearchingPeopleCommunities-2018.pdf, extracted 45 chunks
Processed: Noble-SearchingProtectionsS

In [3]:
query = "person"  
results = searcher.search(query, top_k=5)


for i, result in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(f"Similarity score: {result['score']:.4f}")
    print(f"Source file: {result['source']}")
    print(f"Content: {result['text']}\n")
    print("-" * 80)


Result 1:
Similarity score: 0.4135
Source file: Algorithmic_culture.pdf
Content: San Diego, CA: Harcourt. Milgram S (2010) Some conditions on obedience and disobedience to authority (3rd edn). In: Blass T (ed.) The Individual in a Social World: Essays and Experiments. London: Pinter & Martin Ltd, pp.128–150. by guest on July 7, 2015 ecs.sagepub.com Downloaded from Striphas 411 Milgram S and Toch H (2010) Crowds.

--------------------------------------------------------------------------------

Result 2:
Similarity score: 0.4079
Source file: Algorithmic_culture.pdf
Content: In: Blass T (ed.) The Individual in a Social World: Essays and Experiments (3rd edn). London: Pinter & Martin Ltd, pp.237–305. Moretti F (2005) Graphs, Maps, Trees: Abstract Models for a Literary History. London; New York: Verso. Olson M (1971) The Logic of Collective Action: Public Goods and the Theory of Groups (Revised edn).

--------------------------------------------------------------------------------

Result