# Intelligent document processing

Helps in processing unstructured and semi-structured data in documents.

For testing purposes, we will use only a small set of five resumes.

Resumes were obtained from: [Kaggle](https://www.kaggle.com/datasets/snehaanbhawal/resume-dataset)

## Installations and processing functions:

In [1]:
# System package providing poppler's command-line tools.
# NOTE(review): presumably needed by pdf2image's convert_from_path — confirm.
!sudo apt-get install -y poppler-utils

# Python packages imported by the later cells (cv2, matplotlib, numpy, pdf2image).
! pip install opencv-python matplotlib numpy pdf2image
# NOTE(review): this pip package looks redundant with the apt install above — confirm it is needed.
! pip install poppler-utils
# OCR binding (pytesseract) and the PIL image library it is used with below.
! pip install pytesseract pillow

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.10).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [2]:
import matplotlib.pyplot as plt
import cv2
import numpy as np

In [3]:
def display_image(image, title="Image"):
    """Show an OpenCV image inline using matplotlib.

    Args:
        image: Image array as handled by OpenCV. A 2-D array is treated as a
            single-channel (grayscale or binarized) image; any other array is
            passed through a BGR -> RGB conversion before display.
        title: Caption drawn above the figure.
    """
    plt.figure(figsize=(7, 7))
    if image.ndim == 2:
        # cv2.cvtColor(..., COLOR_BGR2RGB) rejects single-channel input, so
        # grayscale/binarized images are drawn directly with a gray colormap.
        plt.imshow(image, cmap='gray')
    else:
        # OpenCV keeps colour channels in BGR order; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title(title)
    plt.axis('off')
    plt.show()

In [4]:
def convert_to_grayscale(image):
    """Return a single-channel grayscale version of a BGR image."""
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return grayscale

def reduce_noise(gray_image):
    """Smooth the image with a 5x5 Gaussian blur and return the result."""
    kernel_size = (5, 5)
    # A sigma of 0 lets OpenCV derive the standard deviation from the kernel size.
    blurred = cv2.GaussianBlur(gray_image, kernel_size, 0)
    return blurred

In [5]:
def binarize_image(blur_reduced_image, block_size=11, constant_c=4):
    """Binarize a grayscale image with Gaussian adaptive thresholding.

    The output is inverted (``THRESH_BINARY_INV``), so text becomes white
    (255) on a black (0) background.

    Args:
        blur_reduced_image: Single-channel image to threshold.
        block_size: Side length of the pixel neighbourhood used to compute
            each local threshold. Defaults to 11, the value previously
            hard-coded here.
        constant_c: Constant subtracted from the weighted neighbourhood mean.
            Defaults to 4, the value previously hard-coded here.

    Returns:
        The thresholded image as returned by ``cv2.adaptiveThreshold``.
    """
    return cv2.adaptiveThreshold(
        blur_reduced_image,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,  # Invert the colors (text becomes white)
        block_size,
        constant_c,
    )

In [6]:
def deskew_image(image):
    """
    Corrects the skew of an image by finding the minimum area rectangle
    of the text block and rotating accordingly.

    Args:
        image: Single-channel image in which text pixels are non-zero
            (for example the inverted output of ``binarize_image``).

    Returns:
        The image rotated once about its centre by the detected skew angle,
        with the same width and height as the input. If the image has no
        non-zero pixels it is returned unchanged.
    """
    # Find all non-zero (white) pixels
    coords = cv2.findNonZero(image)
    if coords is None:
        # A blank page has no text block to measure; cv2.minAreaRect would
        # fail on the missing point set, so leave the image untouched.
        print("No foreground pixels found; skipping deskew")
        return image

    # Get the minimum area bounding rectangle
    # It returns (center(x,y), (width, height), angle of rotation)
    rect = cv2.minAreaRect(coords)
    angle = rect[-1] - 90

    # The `cv2.minAreaRect` angle has a specific range.
    # We need to adjust it for our rotation.
    # NOTE(review): the angle convention of cv2.minAreaRect differs between
    # OpenCV releases; this normalisation is kept exactly as it was — confirm
    # it against the installed OpenCV version.
    if angle < -45:
        angle = -(90 + angle)

    # Rotate exactly once. The earlier version applied a second warpAffine
    # with the same angle to the already rotated result, which turned the
    # page by twice the detected skew.
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h),
                             flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)
    print(f"Detected skew angle: {angle:.2f} degrees")

    return rotated

In [7]:
def process_one_image(image):
    """Run the preprocessing chain on one page image and return the result.

    The stages run in order: grayscale conversion, noise reduction,
    binarization and deskewing. A progress message is printed after each
    stage finishes.
    """
    pipeline = (
        (convert_to_grayscale, "Converted image to grayscale.."),
        (reduce_noise, "Reduced noise in the image.."),
        (binarize_image, "Binarized the image.."),
        (deskew_image, "Corrected image orientation.."),
    )

    result = image
    for stage, done_message in pipeline:
        result = stage(result)
        print(done_message)
    return result

## Prepping the resumes for extraction:

In [8]:
import time
import os
import zipfile
from pdf2image import convert_from_path

# Convert the first page of every PDF resume into a preprocessed PNG image.
output_folder_path = "/content/processed_images"

# os.makedirs() always returns None, so its result cannot tell us whether the
# folder was created; check for the folder before creating it instead.
folder_already_existed = os.path.isdir(output_folder_path)
os.makedirs(output_folder_path, exist_ok=True)
if not folder_already_existed:
    print(f"Created folder: {output_folder_path}")

resumes_folder = '/content/Resumes'

for resume_name in os.listdir(resumes_folder):
    # Accept ".PDF" as well as ".pdf".
    if not resume_name.lower().endswith('.pdf'):
        continue

    print(f"Processing resume: {resume_name}")
    resume_path = os.path.join(resumes_folder, resume_name)
    # Strip only the trailing extension; str.replace('.pdf', ...) would also
    # rewrite a ".pdf" that appears in the middle of the file name.
    base_name = os.path.splitext(resume_name)[0]

    # Convert the first page of the PDF to an image
    try:
        pages = convert_from_path(resume_path, first_page=1, last_page=1)
        if pages:
            image = cv2.cvtColor(np.array(pages[0]), cv2.COLOR_RGB2BGR)
            processed_image = process_one_image(image)
            output_path = os.path.join(output_folder_path, base_name + '.png')
            # cv2.imwrite reports failure through its return value rather
            # than by raising, so check it before claiming success.
            if cv2.imwrite(output_path, processed_image):
                print(f"Saved processed image to: {output_path}")
            else:
                print(f"Could not write processed image to: {output_path}")
        else:
            print(f"Could not convert the first page of {resume_name} to an image.")
    except Exception as e:
        # Per-file boundary: report the problem and carry on with the next resume.
        print(f"Error processing {resume_name}: {e}")
    print("-"*50)


print("Processing images is completed.")

Processing resume: Resume4.pdf
Converted image to grayscale..
Reduced noise in the image..
Binarized the image..
Detected skew angle: -1.38 degrees
Corrected image orientation..
Saved processed image to: /content/processed_images/Resume4.png
--------------------------------------------------
Processing resume: Resume2.pdf
Converted image to grayscale..
Reduced noise in the image..
Binarized the image..
Detected skew angle: -0.00 degrees
Corrected image orientation..
Saved processed image to: /content/processed_images/Resume2.png
--------------------------------------------------
Processing resume: Resume1.pdf
Converted image to grayscale..
Reduced noise in the image..
Binarized the image..
Detected skew angle: -0.42 degrees
Corrected image orientation..
Saved processed image to: /content/processed_images/Resume1.png
--------------------------------------------------
Processing resume: Resume5.pdf
Converted image to grayscale..
Reduced noise in the image..
Binarized the image..
Detected

## Text extraction using Tesseract:

In [9]:
from PIL import Image
import pytesseract
import time

# Run Tesseract OCR over the preprocessed page images and save one .txt per image.
input_folder_path = "/content/processed_images"
output_folder_path = "/content/tesseract_output"
start_time = time.time()

# os.makedirs() always returns None, so its result cannot tell us whether the
# folder was created; check for the folder before creating it instead.
folder_already_existed = os.path.isdir(output_folder_path)
os.makedirs(output_folder_path, exist_ok=True)
if not folder_already_existed:
    print(f"Created folder: {output_folder_path}")

image_names = os.listdir(input_folder_path)
total_images = len(image_names)
print(f"Total images in folder: {total_images}")

# Only the first 20 entries are processed; the progress counter uses the size
# of this capped list so "i/N" stays accurate for larger folders.
images_to_process = image_names[:20]

for i, image_name in enumerate(images_to_process, 1):
    print(f"Processing image {i}/{len(images_to_process)}: {image_name}")
    image_path = os.path.join(input_folder_path, image_name)
    print("Extracting text from image..")
    # Close the image file as soon as OCR has finished with it.
    with Image.open(image_path) as page_image:
        text = pytesseract.image_to_string(page_image)

    # Swap only the trailing extension, whatever it is, for ".txt".
    base_name, _ = os.path.splitext(image_name)
    output_path = os.path.join(output_folder_path, base_name + ".txt")
    # OCR output may contain non-ASCII characters; pin the encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"Saved extracted text to {output_path}")
    print("-"*50)

print("Text Extraction Completed.")
print(f"Total time taken: {time.time() - start_time} seconds")

Total images in folder: 5
Processing image 1/5: Resume2.png
Extracting text from image..
Saved extracted text to /content/tesseract_output/Resume2.txt
--------------------------------------------------
Processing image 2/5: Resume4.png
Extracting text from image..
Saved extracted text to /content/tesseract_output/Resume4.txt
--------------------------------------------------
Processing image 3/5: Resume1.png
Extracting text from image..
Saved extracted text to /content/tesseract_output/Resume1.txt
--------------------------------------------------
Processing image 4/5: Resume5.png
Extracting text from image..
Saved extracted text to /content/tesseract_output/Resume5.txt
--------------------------------------------------
Processing image 5/5: Resume3.png
Extracting text from image..
Saved extracted text to /content/tesseract_output/Resume3.txt
--------------------------------------------------
Text Extraction Completed.
Total time taken: 86.29071760177612 seconds


## Now that all the text is in .txt files, we can pass all the info into an LLM and extract "information" from our "data"

In [10]:
# Instruction prompt for the LLM extraction step. The OCR text of a single
# resume is appended directly after the final line ("Here is the extracted
# text:") before the request is sent.
prompt = """
Extract key information from the given resume text.
Information to be extracted: Position, skills, summary, work_experience.

The text has been extracted from a resume using Tesseract OCR. Use only this text to extract information.
Do NOT make up or generate any data. If a field is not present in the text, leave it as a blank string ("").

For the "work_experience" field, summarize the person's experience into a short paragraph highlighting their key roles, achievements, and duration, based only on the extracted text.

Always give your response in the following JSON format:

{
    "Position": "",
    "skills": "",
    "summary": "",
    "work_experience": ""
}

Respond strictly in the specified JSON format without adding any extra commentary or explanation.

Here is the extracted text:
"""

In [11]:
from google import genai
from google.colab import userdata
from PIL import Image
import json
import time

# Shared google-genai client; the API key is looked up under the name
# 'google_api_key' through google.colab's userdata.
genai_client = genai.Client(api_key=userdata.get('google_api_key'))

In [12]:
import os
import time
import json
from PIL import Image

# Send each page image plus its OCR text to the model and save the parsed JSON.
image_folder_path = "/content/processed_images"
text_folder_path = "/content/tesseract_output"
output_folder_path = "/content/json_output"

# Pause inserted between consecutive model requests.
# NOTE(review): presumably there to stay under an API rate limit — confirm.
seconds_between_requests = 60

start_time = time.time()

os.makedirs(output_folder_path, exist_ok=True)
print(f"Ensured folder exists: {output_folder_path}")

image_names = os.listdir(image_folder_path)
total_images = len(image_names)
print(f"Total images in folder: {total_images}")

# Only the first 20 entries are processed; the progress counter uses the size
# of this capped list so "i/N" stays accurate for larger folders.
images_to_process = image_names[:20]

for i, image_name in enumerate(images_to_process, 1):
    # Wait before every request except the first. Doing it here (instead of
    # at the end of the loop body) keeps the pause in place after a skipped
    # file and avoids a pointless wait after the last one.
    if i > 1:
        time.sleep(seconds_between_requests)

    print(f"Processing image {i}/{len(images_to_process)}: {image_name}")
    image_path = os.path.join(image_folder_path, image_name)
    print(f"Loading image: {image_path}")

    # Handle both .png and .jpg
    base_name, _ = os.path.splitext(image_name)
    text_path = os.path.join(text_folder_path, base_name + ".txt")

    print(f"Loading extracted text: {text_path}")
    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()

    print("Extracting information from image and text..")

    prompt_with_text = prompt + text

    # Keep the image file open only for the duration of the request; the
    # context manager closes it afterwards.
    with Image.open(image_path) as image:
        contents = [
            image,
            {"text": prompt_with_text}
        ]
        response = genai_client.models.generate_content(
            model='gemini-2.5-flash',
            contents=contents
        )

    # Access the usage_metadata attribute
    usage_metadata = response.usage_metadata
    print(f"Input Token Count: {usage_metadata.prompt_token_count}")
    print(f"Thoughts Token Count: {usage_metadata.thoughts_token_count}")
    print(f"Output Token Count: {usage_metadata.candidates_token_count}")
    print(f"Total Token Count: {usage_metadata.total_token_count}")

    # ---- Safe response parsing ----
    # Prefer the convenience `text` attribute; otherwise fall back to the
    # first part of the first candidate.
    response_text = None
    if hasattr(response, "text") and response.text:
        response_text = response.text
    elif hasattr(response, "candidates") and response.candidates:
        parts = response.candidates[0].content.parts
        if parts and hasattr(parts[0], "text"):
            response_text = parts[0].text

    if not response_text:
        print("⚠️ No text returned from model. Skipping this file.")
        continue

    # Clean and parse JSON safely: drop any Markdown code fences the model
    # wrapped around the JSON payload.
    response_text = response_text.replace('```json', '').replace('```', '').strip()

    try:
        extracted_information = json.loads(response_text)
    except json.JSONDecodeError as e:
        print(f"⚠️ Failed to decode JSON for {image_name}: {e}")
        continue

    # Save JSON with correct name
    output_path = os.path.join(output_folder_path, base_name + ".json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(extracted_information, f, indent=4)

    print(f"Saved extracted information to {output_path}")
    print("-" * 50)

print("Information Extraction Completed.")
print(f"Total time taken: {time.time() - start_time} seconds")

Ensured folder exists: /content/json_output
Total images in folder: 5
Processing image 1/5: Resume2.png
Loading image: /content/processed_images/Resume2.png
Loading extracted text: /content/tesseract_output/Resume2.txt
Extracting information from image and text..
Input Token Count: 1236
Thoughts Token Count: 1656
Output Token Count: 423
Total Token Count: 3315
Saved extracted information to /content/json_output/Resume2.json
--------------------------------------------------
Processing image 2/5: Resume4.png
Loading image: /content/processed_images/Resume4.png
Loading extracted text: /content/tesseract_output/Resume4.txt
Extracting information from image and text..
Input Token Count: 1243
Thoughts Token Count: 1762
Output Token Count: 300
Total Token Count: 3305
Saved extracted information to /content/json_output/Resume4.json
--------------------------------------------------
Processing image 3/5: Resume1.png
Loading image: /content/processed_images/Resume1.png
Loading extracted text: /