In [7]:
!pip install pypdf2 pdf2image pdfplumber pytesseract opencv-python-headless pillow sentence-transformers faiss-cpu nltk pymupdf




In [9]:
import os
import io
import fitz  # PyMuPDF provides 'fitz'
import pdfplumber
import numpy as np
from PIL import Image
import cv2
from google.colab import files
import faiss
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any
import json

class DocumentChunker:
    """Split text into overlapping, word-aligned chunks.

    ``chunk_size`` is a budget in characters: every word counts as its own
    length plus one separating space. ``chunk_overlap`` is the number of
    trailing *words* of a finished chunk that are repeated at the start of
    the next chunk. A value of zero (or less) disables the overlap.
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def create_chunks(self, text: str) -> List[str]:
        """Return the chunks of *text* in reading order.

        Args:
            text: Raw text; it is split on arbitrary whitespace.

        Returns:
            A list of space-joined chunks. Empty or whitespace-only input
            yields an empty list. Text shorter than ``chunk_size`` yields a
            single chunk.
        """
        if not text:
            return []

        words = text.split()
        if not words:
            return []

        chunks = []
        current_chunk = []
        current_length = 0
        # True while the pending chunk holds words not yet emitted. This keeps
        # a trailing chunk made only of overlap words from being duplicated.
        has_new_words = False

        for word in words:
            current_chunk.append(word)
            current_length += len(word) + 1  # +1 for the joining space
            has_new_words = True

            if current_length >= self.chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk, current_length = self._carry_over(current_chunk)
                has_new_words = False

        if has_new_words:
            chunks.append(' '.join(current_chunk))

        return chunks

    def _carry_over(self, finished_chunk: List[str]):
        """Return ``(words, length)`` to seed the chunk after *finished_chunk*."""
        if self.chunk_overlap <= 0:
            # ``lst[-0:]`` would be the whole list, so handle "no overlap" here.
            return [], 0

        overlap_words = finished_chunk[-self.chunk_overlap:]
        overlap_length = sum(len(word) + 1 for word in overlap_words)

        # An overlap that already fills the budget would force a flush on
        # every following word and let chunks grow without bound, so drop
        # leading overlap words until there is room for new content.
        start = 0
        while start < len(overlap_words) and overlap_length >= self.chunk_size:
            overlap_length -= len(overlap_words[start]) + 1
            start += 1

        return overlap_words[start:], overlap_length

class VectorStore:
    """FAISS-backed store that maps sentence embeddings to text chunks.

    ``chunks[i]`` and ``metadata[i]`` describe the vector stored at row ``i``
    of the FAISS index, so the three are always extended together.
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        # Remembered so that save() can record which encoder built the index.
        self.model_name = model_name
        self.encoder = SentenceTransformer(model_name)
        self.index = None
        self.chunks = []
        self.metadata = []

    def add_documents(self, chunks: List[str], metadata: List[Dict[str, Any]] = None):
        """Embed *chunks* and append them to the index.

        Args:
            chunks: Texts to index. An empty list is a no-op.
            metadata: Optional list with one dict per chunk. When omitted,
                ``{"index": <row>}`` is generated for every chunk.

        Raises:
            ValueError: If *metadata* is given and its length differs from
                the number of chunks (the store would become misaligned).
        """
        if not chunks:
            return

        if metadata is not None and len(metadata) != len(chunks):
            raise ValueError(
                f"metadata has {len(metadata)} entries but {len(chunks)} chunks were given"
            )

        # FAISS expects a contiguous float32 matrix.
        embeddings = np.ascontiguousarray(self.encoder.encode(chunks), dtype='float32')

        if self.index is None:
            dimension = embeddings.shape[1]
            self.index = faiss.IndexFlatL2(dimension)

        self.index.add(embeddings)

        start_idx = len(self.chunks)
        self.chunks.extend(chunks)

        if metadata is None:
            metadata = [{"index": i} for i in range(start_idx, start_idx + len(chunks))]
        self.metadata.extend(metadata)

    def search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
        """Return up to *k* nearest chunks for *query*, closest first.

        Each result is a dict with the keys ``'chunk'``, ``'metadata'`` and
        ``'distance'`` (the L2 distance reported by the index). An empty
        store, or a non-positive *k*, yields an empty list.
        """
        if self.index is None or not self.chunks or k <= 0:
            return []

        query_embedding = np.ascontiguousarray(self.encoder.encode([query]), dtype='float32')
        distances, indices = self.index.search(query_embedding, k)

        results = []
        for idx, distance in zip(indices[0], distances[0]):
            idx = int(idx)
            # FAISS pads missing hits with label -1 when fewer than k vectors
            # exist; a negative index would otherwise wrap to the last chunk.
            if 0 <= idx < len(self.chunks):
                results.append({
                    'chunk': self.chunks[idx],
                    'metadata': self.metadata[idx],
                    'distance': float(distance)
                })

        return results

    def save(self, path: str):
        """Write ``<path>_index.faiss`` and ``<path>_data.json``.

        Nothing is written when no documents have been added yet.
        """
        if self.index is not None:
            faiss.write_index(self.index, f"{path}_index.faiss")
            with open(f"{path}_data.json", 'w', encoding='utf-8') as f:
                json.dump({
                    'chunks': self.chunks,
                    'metadata': self.metadata,
                    # Lets load() rebuild the matching encoder.
                    'model_name': getattr(self, 'model_name', None)
                }, f, ensure_ascii=False)

    @classmethod
    def load(cls, path: str, model_name: str = None):
        """Rebuild a store from the files written by :meth:`save`.

        Args:
            path: Same prefix that was passed to :meth:`save`.
            model_name: Encoder to use for queries. Defaults to the model
                name recorded in the JSON file, or to the class default for
                files that do not record one.
        """
        index = faiss.read_index(f"{path}_index.faiss")
        with open(f"{path}_data.json", 'r', encoding='utf-8') as f:
            data = json.load(f)

        resolved_model = model_name or data.get('model_name')
        instance = cls(resolved_model) if resolved_model else cls()
        instance.index = index
        instance.chunks = data['chunks']
        instance.metadata = data['metadata']
        return instance

def extract_all_from_pdf(pdf_path):
    """Extract page text, tables and embedded images from a PDF.

    Text and tables are read with pdfplumber; images are read with PyMuPDF.
    Extraction is best effort: failures are printed and whatever was
    collected up to that point is still returned.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with three lists:
        ``'text'``   -- ``{'page', 'content'}`` for each page that has text,
        ``'tables'`` -- ``{'page', 'content'}`` for each extracted table,
        ``'images'`` -- ``{'page', 'index', 'image'}`` with a PIL image.
        Page numbers are 1-based.
    """
    extracted_data = {
        'text': [],
        'tables': [],
        'images': []
    }

    # Text and tables. Kept in its own try block so that a failure here does
    # not prevent the image pass below from running.
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                if text:
                    extracted_data['text'].append({
                        'page': page_num,
                        'content': text
                    })

                tables = page.extract_tables()
                if tables:
                    extracted_data['tables'].extend(
                        {'page': page_num, 'content': table} for table in tables
                    )
    except Exception as e:
        print(f"Error during extraction: {str(e)}")

    # Embedded images. The context manager closes the document even when an
    # exception interrupts the loop.
    try:
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                for img_index, img in enumerate(page.get_images()):
                    try:
                        xref = img[0]
                        base_image = doc.extract_image(xref)
                        image = Image.open(io.BytesIO(base_image["image"]))
                    except Exception as e:
                        # One unreadable image should not discard the others.
                        print(f"Error extracting image {img_index} on page {page_num}: {str(e)}")
                        continue

                    extracted_data['images'].append({
                        'page': page_num,
                        'index': img_index,
                        'image': image
                    })
    except Exception as e:
        print(f"Error during extraction: {str(e)}")

    return extracted_data

def process_pdf_complete(pdf_path: str, output_dir: str, chunk_size: int = 500):
    """
    Process PDF by extracting content, chunking, and storing in a vector store.

    Only the extracted page text is chunked and indexed; extracted tables and
    images are not added to the store.

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Directory (created if missing) that receives the saved
            vector store files, using the ``vector_store`` file prefix.
        chunk_size: Chunk size forwarded to ``DocumentChunker``.

    Returns:
        The populated ``VectorStore``. Its index is ``None`` when the PDF
        produced no text chunks.
    """
    print(f"Processing PDF: {pdf_path}")

    extracted_data = extract_all_from_pdf(pdf_path)
    chunker = DocumentChunker(chunk_size=chunk_size)

    # Combine text chunks
    chunks = []
    metadata = []

    for item in extracted_data['text']:
        page_chunks = chunker.create_chunks(item['content'])
        chunks.extend(page_chunks)
        # One independent dict per chunk: ``[d] * n`` would alias a single
        # dict, so editing one chunk's metadata would change all of them.
        metadata.extend({'page': item['page'], 'type': 'text'} for _ in page_chunks)

    print(f"Created {len(chunks)} chunks from text.")

    # Initialize and populate vector store
    vector_store = VectorStore()
    vector_store.add_documents(chunks, metadata)

    # Save vector store to disk
    os.makedirs(output_dir, exist_ok=True)
    vector_store.save(os.path.join(output_dir, 'vector_store'))

    if vector_store.index is None:
        # save() writes nothing for an empty store, so do not claim success.
        print(f"No text chunks were extracted; nothing was saved to {output_dir}")
    else:
        print(f"Vector store saved to {output_dir}")
    return vector_store

def main():
    """Colab entry point: upload PDFs, index each one, and run a demo query.

    Every uploaded file is processed into its own ``extracted_<name>``
    directory. Any error raised while handling a file is printed and the
    loop moves on to the next file.
    """
    print("Upload a PDF file:")
    uploaded = files.upload()

    # The demo query is the same for every uploaded file.
    query = "What is the main topic?"

    for filename in uploaded:
        stem = os.path.splitext(filename)[0]
        output_dir = f'extracted_{stem}'
        try:
            vector_store = process_pdf_complete(filename, output_dir)

            print("\nTesting vector search...")
            results = vector_store.search(query, k=3)

            print(f"\nTop 3 results for query: '{query}'")
            i = 0
            for result in results:
                print(f"\nResult {i+1}:")
                print(f"Content: {result['chunk'][:200]}...")
                print(f"Page: {result['metadata']['page']}")
                print(f"Type: {result['metadata']['type']}")
                print(f"Distance: {result['distance']:.4f}")
                i += 1

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

# Run the interactive upload-and-index demo only when this code is executed
# directly (as a script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()


Upload a PDF file:


Saving Tables, Charts, and Graphs with Examples from History, Economics, Education, Psychology, Urban Affairs and Everyday Life - 2017-2018.pdf to Tables, Charts, and Graphs with Examples from History, Economics, Education, Psychology, Urban Affairs and Everyday Life - 2017-2018 (2).pdf
Processing PDF: Tables, Charts, and Graphs with Examples from History, Economics, Education, Psychology, Urban Affairs and Everyday Life - 2017-2018 (2).pdf
Created 20 chunks from text.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store saved to extracted_Tables, Charts, and Graphs with Examples from History, Economics, Education, Psychology, Urban Affairs and Everyday Life - 2017-2018 (2)

Testing vector search...

Top 3 results for query: 'What is the main topic?'

Result 1:
Content: Tables, Charts, and Graphs with Examples from History, Economics, Education, Psychology, Urban Affairs and Everyday Life REVISED: MICHAEL LOLKUS 2018...
Page: 1
Type: text
Distance: 1.2984

Result 2:
Content: Example from Education What percent of the total class received grades of 72 or 77? Which grade showed the largest difference between males and females?...
Page: 18
Type: text
Distance: 1.4355

Result 3:
Content: Example from Everyday Life The following chart shows how a family spends its yearly income of $31,000. How much money does this family spend on transportation? Family Budget of $31,000 19% 25% Other R...
Page: 9
Type: text
Distance: 1.6036
