<a href="https://colab.research.google.com/github/khaliesahazmin/DataExtraction/blob/main/DataExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install -y tesseract-ocr poppler-utils
!pip install pytesseract pdf2image sentence-transformers


from google.colab import files
from pdf2image import convert_from_path
import pytesseract
import cv2
import numpy as np
from PIL import Image
import json

# Prompt for the first PDF; the keys of the returned mapping are used as
# file paths below.
uploaded = files.upload()
pdf_file = list(uploaded)[0]

# Rasterise every page of the PDF at 400 DPI
pages = convert_from_path(pdf_file, dpi=400)

# Preprocessing function
def preprocess_image(image):
    """Binarise and denoise a page image before it is passed to Tesseract.

    Args:
        image: An RGB page image (a PIL image, or anything ``np.array`` turns
            into an H x W x 3 array).

    Returns:
        PIL.Image.Image: A single-channel image in which pixels brighter than
        150 are white (255) and all others black (0), median-filtered to
        remove isolated specks.
    """
    rgb = np.array(image)
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    # 3x3 median filter removes salt-and-pepper noise left by thresholding.
    denoised = cv2.medianBlur(binary, 3)
    # No MORPH_GRADIENT step here: the gradient is dilation minus erosion, so
    # it keeps only the outlines of the glyphs as white lines on a black
    # background. That is not sharpening, and Tesseract 4.x works best with
    # solid dark text on a light background.
    return Image.fromarray(denoised)

# OCR text extraction
def extract_text_from_pages(pages):
    """Run OCR on every page and return all recognised text as one string.

    Each page is passed through ``preprocess_image`` before OCR. The per-page
    strings are concatenated in page order with nothing inserted between them.
    An empty ``pages`` iterable yields an empty string.
    """
    return "".join(
        pytesseract.image_to_string(preprocess_image(page_image))
        for page_image in pages
    )

def _write_extracted_text(json_path, text):
    """Write *text* under the "extracted_text" key of a new JSON file."""
    with open(json_path, "w") as out_file:
        json.dump({"extracted_text": text}, out_file)


# Document 1: OCR the pages loaded earlier and persist the text
text1 = extract_text_from_pages(pages)
_write_extracted_text("output1.json", text1)
print("✅ Document 1 extracted.\n")


# Document 2: upload, rasterise at 400 DPI, OCR and persist
uploaded = files.upload()
pdf_file2 = list(uploaded)[0]
pages2 = convert_from_path(pdf_file2, dpi=400)
text2 = extract_text_from_pages(pages2)
_write_extracted_text("output2.json", text2)
print("✅ Document 2 extracted.\n")


from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load extracted text
with open("output1.json") as f1, open("output2.json") as f2:
    doc1 = json.load(f1)["extracted_text"]
    doc2 = json.load(f2)["extracted_text"]


def _embed_document(model, text):
    """Return a single embedding vector for *text*, regardless of its length.

    The sentence-transformer truncates any input longer than its
    ``max_seq_length`` tokens, so encoding a whole document as one string
    would only represent its beginning. The text is therefore split into
    word chunks, each chunk is encoded separately, and the unit-normalised
    chunk embeddings are averaged.

    Args:
        model: A loaded ``SentenceTransformer``.
        text: The document text (may be empty).

    Returns:
        A 1-D numpy array: the mean of the chunk embeddings.
    """
    token_limit = getattr(model, "max_seq_length", None) or 384
    # Heuristic: budget about two tokens per word so a chunk normally stays
    # under the model's token limit; an unusually token-heavy chunk only
    # loses its tail.
    words_per_chunk = max(1, token_limit // 2)
    words = text.split()
    chunks = [
        " ".join(words[start:start + words_per_chunk])
        for start in range(0, len(words), words_per_chunk)
    ] or [""]  # keep one (empty) chunk so empty OCR output still encodes
    return model.encode(chunks, normalize_embeddings=True).mean(axis=0)


# BERT Similarity (cosine similarity of the pooled document embeddings)
bert_model = SentenceTransformer('all-mpnet-base-v2')
embedding1 = _embed_document(bert_model, doc1)
embedding2 = _embed_document(bert_model, doc2)
bert_sim = util.cos_sim(embedding1, embedding2).item() * 100

# TF-IDF Similarity (vocabulary is fitted on the two documents only)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([doc1, doc2])
tfidf_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0] * 100

# Display results
print(f"🤖 BERT Similarity: {bert_sim:.2f}%")
print(f"📊 TF-IDF Similarity: {tfidf_sim:.2f}%")




Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 34 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 1s (286 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126102 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading p