In [1]:
import pytesseract

In [8]:
pip install pdf2image


Collecting pdf2image
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Using cached pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install pymupdf


Collecting pymupdf
  Using cached PyMuPDF-1.24.14-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Using cached PyMuPDF-1.24.14-cp39-abi3-macosx_11_0_arm64.whl (18.4 MB)
Installing collected packages: pymupdf
Successfully installed pymupdf-1.24.14
Note: you may need to restart the kernel to use updated packages.


In [10]:
from pdf2image import convert_from_path

images = convert_from_path("/Users/aruniga.baskaran/Downloads/Mixed Multipage Document.pdf", dpi=300)
for i, img in enumerate(images):
    img.save(f"page_{i + 1}.png")


In [11]:
import cv2

resized_images = []
for i in range(len(images)):
    image = cv2.imread(f'page_{i + 1}.png', cv2.IMREAD_GRAYSCALE)
    resized = cv2.resize(image, (50, 50))  # Reduce resolution to 50x50 pixels
    resized_images.append(resized)


In [12]:
import numpy as np

flattened_images = [img.flatten() for img in resized_images]


In [13]:
flattened_images

[array([255, 255, 255, ..., 255, 255, 255], dtype=uint8),
 array([255, 255, 255, ..., 255, 255, 255], dtype=uint8),
 array([255, 255, 255, ..., 255, 255, 255], dtype=uint8),
 array([255, 255, 255, ..., 255, 255, 255], dtype=uint8),
 array([255, 255, 255, ..., 255, 255, 255], dtype=uint8),
 array([255, 255, 255, ..., 255, 255, 255], dtype=uint8),
 array([255, 255, 255, ..., 255, 255, 255], dtype=uint8)]

In [14]:
def mse(image1, image2):
    return np.mean((image1 - image2) ** 2)

similarities = []
for i in range(len(flattened_images) - 1):
    similarity = mse(flattened_images[i], flattened_images[i + 1])
    similarities.append(similarity)


In [15]:
breakpoints = []
threshold = 500  # Adjust based on testing
for i, similarity in enumerate(similarities):
    if similarity > threshold:
        breakpoints.append(i + 1)  # Page i+1 starts a new document


In [16]:
documents = []
start = 0
for bp in breakpoints:
    documents.append(range(start, bp))
    start = bp
documents.append(range(start, len(flattened_images)))  # Last document


Document 1: Start Page = 1, End Page = 7


In [18]:
import pymupdf # imports the pymupdf library
doc = pymupdf.open("/Users/aruniga.baskaran/Downloads/Mixed Multipage Document.pdf") # open a document
for page in doc: # iterate the document pages
  text = page.get_text() # get plain text encoded as UTF-8


In [7]:
!pip list


Package            Version
------------------ ---------
annotated-types    0.7.0
blis               1.0.1
catalogue          2.0.10
certifi            2024.8.30
charset-normalizer 3.4.0
click              8.1.7
cloudpathlib       0.20.0
confection         0.1.5
cymem              2.0.10
en_core_web_sm     3.8.0
idna               3.10
Jinja2             3.1.4
langcodes          3.5.0
language_data      1.3.0
marisa-trie        1.2.1
markdown-it-py     3.0.0
MarkupSafe         3.0.2
mdurl              0.1.2
murmurhash         1.0.11
numpy              2.0.2
opencv-python      4.10.0.84
packaging          24.2
pdf2image          1.17.0
pillow             11.0.0
pip                24.2
preshed            3.0.9
pydantic           2.10.3
pydantic_core      2.27.1
Pygments           2.18.0
PyMuPDF            1.24.14
pytesseract        0.3.13
requests           2.32.3
rich               13.9.4
setuptools         75.6.0
shellingham        1.5.4
smart-open         7.0.5
spacy              3.8.2

In [1]:
## k-means clustering

In [3]:
import numpy as np
from pdf2image import convert_from_path
from sklearn.cluster import KMeans
import cv2

# Step 1: Convert PDF to Images
pdf_path = "/Users/aruniga.baskaran/Downloads/Mixed Multipage Document.pdf"
images = convert_from_path(pdf_path, dpi=100)

# Step 2: Preprocess Images (Resize and Flatten)
processed_images = []
for img in images:
    img_resized = img.resize((50, 50))  # Resize to 50x50 for faster comparison
    img_array = np.array(img_resized).flatten()  # Flatten into a 1D array
    processed_images.append(img_array)

processed_images = np.array(processed_images)

# Step 3: Apply Clustering Algorithm
num_clusters = 3  # Adjust based on the number of document types
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
labels = kmeans.fit_predict(processed_images)

# Step 4: Group Pages by Clusters
clusters = {i: [] for i in range(num_clusters)}
for page_num, label in enumerate(labels):
    clusters[label].append(page_num + 1)  # Convert to 1-based page indexing

# Step 5: Output the Groups
for cluster_id, pages in clusters.items():
    print(f"Document Type {cluster_id + 1}: Pages {pages}")


Document Type 1: Pages [2, 3]
Document Type 2: Pages [4, 5, 6, 7]
Document Type 3: Pages [1]


In [2]:
import fitz  # PyMuPDF
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def group_pages_by_visual_layout(pdf_path, resolution=150, similarity_threshold=0.95):
    """
    Group similar pages in a PDF based on their visual layout.

    Parameters:
    - pdf_path (str): Path to the input PDF file.
    - resolution (int): DPI for rendering pages into images.
    - similarity_threshold (float): Threshold for grouping similar pages (0-1 scale).

    Returns:
    - List of grouped page ranges based on visual similarity.
    """
    # Open the PDF file
    doc = fitz.open(pdf_path)

    # Render pages into pixmaps and convert them to feature arrays
    page_features = []
    for page_num in range(len(doc)):
        pixmap = doc[page_num].get_pixmap(dpi=resolution)
        # Normalize pixel data to reduce size and focus on layout
        img = np.frombuffer(pixmap.samples, dtype=np.uint8).astype(float)
        img = img / 255.0  # Normalize pixel values to [0, 1]
        page_features.append(img)

    doc.close()

    # Compare page features and calculate similarity matrix
    n_pages = len(page_features)
    similarity_matrix = np.zeros((n_pages, n_pages))

    for i in range(n_pages):
        for j in range(i, n_pages):
            sim = cosine_similarity(
                page_features[i].reshape(1, -1), page_features[j].reshape(1, -1)
            )[0, 0]
            similarity_matrix[i, j] = sim
            similarity_matrix[j, i] = sim

    # Group pages based on similarity
    groups = []
    visited = set()

    for i in range(n_pages):
        if i in visited:
            continue
        group = [i + 1]  # Pages are 1-indexed
        visited.add(i)
        for j in range(i + 1, n_pages):
            if j not in visited and similarity_matrix[i, j] >= similarity_threshold:
                group.append(j + 1)
                visited.add(j)
        groups.append(group)

    # Merge groups into page ranges
    grouped_ranges = []
    for group in groups:
        grouped_ranges.append((group[0], group[-1]))

    return grouped_ranges

# Example usage
pdf_path = "/Users/aruniga.baskaran/Downloads/Mixed Multipage Document.pdf"  # Replace with your input PDF file
groups = group_pages_by_visual_layout(pdf_path)

# Display results
print("Grouped page ranges based on visual layout:")
for group_id, (start, end) in enumerate(groups, start=1):
    print(f"Group {group_id}: Start Page {start}, End Page {end}")


Grouped page ranges based on visual layout:
Group 1: Start Page 1, End Page 3
Group 2: Start Page 4, End Page 7


In [11]:
import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_path
from skimage.metrics import structural_similarity as ssim
import re

# Convert PDF to images
def pdf_to_images(pdf_path):
    pages = convert_from_path(pdf_path, 300)  # Convert with 300 DPI for better quality
    return [np.array(page) for page in pages]  # Convert PIL images to NumPy arrays

# Preprocess image: Convert to grayscale and enhance for OCR
def preprocess_image(image):
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # Convert image to grayscale
    # Enhance contrast and remove noise to improve OCR accuracy
    gray_image = cv2.equalizeHist(gray_image)  # Histogram equalization for contrast enhancement
    return gray_image

# Compare pixel intensity similarity using SSIM
def compare_intensity_similarity(top_half, bottom_half):
    similarity_index, _ = ssim(top_half, bottom_half, full=True)  # SSIM between top and bottom halves
    return similarity_index

# Compare mean pixel intensities
def compare_mean_intensity(top_half, bottom_half):
    mean_top = np.mean(top_half)
    mean_bottom = np.mean(bottom_half)
    intensity_diff = np.abs(mean_top - mean_bottom)
    return intensity_diff

# Extract text from image using OCR
def extract_text(image):
    text = pytesseract.image_to_string(image)
    print("Extracted Text:", text)  # Debug: Print the extracted text to inspect
    return text

# Check if the text matches Aadhaar format
def is_aadhaar(text):
    aadhaar_pattern = r'\d{4}\s\d{4}\s\d{4}'  # Aadhaar format: 0000 0000 0000
    return bool(re.search(aadhaar_pattern, text))

# Check if the text matches PAN format
def is_pan(text):
    # Improved PAN regex pattern for better matching
    pan_pattern = r'[A-Z]{5}[0-9]{4}[A-Z]{1}'  # PAN format: AAAAA1234A
    return bool(re.search(pan_pattern, text))

# Function to classify the document as Aadhaar or PAN
def classify_document(image):
    # Step 1: Preprocess the image
    gray_image = preprocess_image(image)

    # Step 2: Divide the image into two halves (top and bottom)
    height, width = gray_image.shape

    # Ensure both halves are of the same size by adjusting if height is odd
    half_height = height // 2
    if height % 2 != 0:  # If the height is odd, adjust to ensure even split
        half_height += 1

    top_half = gray_image[:half_height, :]
    bottom_half = gray_image[half_height:, :]

    # Resize bottom half to match top half in case the height was adjusted
    bottom_half = cv2.resize(bottom_half, (top_half.shape[1], top_half.shape[0]))

    # Step 3: Compare pixel intensity using SSIM
    ssim_similarity = compare_intensity_similarity(top_half, bottom_half)

    # Step 4: Compare mean pixel intensities (alternative to SSIM)
    mean_intensity_diff = compare_mean_intensity(top_half, bottom_half)

    # Step 5: Classification based on threshold
    ssim_threshold = 0.9  # Threshold for SSIM similarity
    mean_intensity_threshold = 10  # Threshold for mean intensity difference

    if ssim_similarity > ssim_threshold or mean_intensity_diff < mean_intensity_threshold:
        print("Document has similar intensity on both sides (Aadhaar/PAN likely)")

        # Step 6: Extract text from the image
        text = extract_text(image)

        # Step 7: Further classify based on text
        if is_aadhaar(text):
            print("Document is Aadhaar")
        elif is_pan(text):
            print("Document is PAN")
        else:
            print("Document is of unknown type")
    else:
        print("Document has different intensity on both sides (Not Aadhaar/PAN)")

# Main function to process PDF and classify
def process_and_classify_pdf(pdf_path):
    images = pdf_to_images(pdf_path)  # Convert PDF pages to images

    for i, image in enumerate(images):
        print(f"Classifying page {i+1}")
        classify_document(image)

# Example usage
pdf_path = '/Users/aruniga.baskaran/Downloads/Mixed Multipage Document.pdf'  # Path to the PDF document
process_and_classify_pdf(pdf_path)


Classifying page 1
Document has similar intensity on both sides (Aadhaar/PAN likely)
Extracted Text: 
Document is of unknown type
Classifying page 2
Document has similar intensity on both sides (Aadhaar/PAN likely)
Extracted Text:  GOVERNMENTOFINDIC,
aa GM

Salim Khan

wy fafel/DOB: 01/01/1998
es/ MALE

Mobile No: 9555679554

5475 7089 7656 >

VID : 9195 7699 1241 4064

Issue Date: 19/01/2018

Download Date: 22/12/2020


Document is Aadhaar
Classifying page 3
Document has similar intensity on both sides (Aadhaar/PAN likely)
Extracted Text: Batra

, OR
- 110074

Address :

S/O Abdur Rahim, F-70-C, KH NO-584,
CHHATTARPUR EXTN, Chattar Pur, South
Delhi,

Delhi - 110074

5475 7089 7656
VID : 9195 7699 1241 4064


Document is Aadhaar
Classifying page 4
Document has different intensity on both sides (Not Aadhaar/PAN)
Classifying page 5
Document has different intensity on both sides (Not Aadhaar/PAN)
Classifying page 6
Document has different intensity on both sides (Not Aadhaar/PAN)
Classifyi

In [None]:
pip uninstall -y numpy


In [18]:
!pip install numpy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
!pip install numpy==1.26.4

[0mCollecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl (13.7 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.1.3
    Uninstalling numpy-2.1.3:
      Successfully uninstalled numpy-2.1.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blis 1.0.1 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
thinc 8.3.2 requires numpy<2.1.0,>=2.0.0; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update

In [2]:
!pip uninstall -y scikit-learn


Found existing installation: scikit-learn 1.5.2
Uninstalling scikit-learn-1.5.2:
  Successfully uninstalled scikit-learn-1.5.2


In [3]:
!pip install numpy --no-cache-dir

Collecting numpy
  Downloading numpy-2.1.3-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Downloading numpy-2.1.3-cp312-cp312-macosx_14_0_arm64.whl (5.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m180.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.2 requires numpy<2.1.0,>=2.0.0; python_version >= "3.9", but you have numpy 2.1.3 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.1.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
!python3 -m pip install scikit-learn



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [9]:
!pip3 install -U scikit-learn scipy matplotlib



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
