In [11]:
import os
import cv2
import numpy as np
import fitz  # PyMuPDF
import pytesseract

def convert_to_grayscale(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

def reduce_noise(image):
    return cv2.GaussianBlur(image, (5, 5), 0)

def binarize_image(image):
    return cv2.adaptiveThreshold(
        image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )

def deskew_image(image):
    coords = np.column_stack(np.where(image > 0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    return cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

def process_one_image(image):
    image = convert_to_grayscale(image)
    print("Converted image to grayscale..")
    image = reduce_noise(image)
    print("Reduced noise in the image..")
    image = binarize_image(image)
    print("Binarized the image..")
    image = deskew_image(image)
    print("Corrected image orientation..")
    return image

In [12]:
pdf_folder = "Resume"
image_output_folder = "ResumeImages"
os.makedirs(image_output_folder, exist_ok=True)

pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')]
all_image_paths = []

for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    doc = fitz.open(pdf_path)
    page = doc.load_page(0)  # first page (0-indexed)
    pix = page.get_pixmap()
    img_path = os.path.join(image_output_folder, f"{os.path.splitext(pdf_file)[0]}_page_1.png")
    pix.save(img_path)
    all_image_paths.append(img_path)
    doc.close()
print(f"Converted {len(all_image_paths)} PDFs to images.")

Converted 5 PDFs to images.


In [13]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update path if needed

ocr_results = {}
for img_path in all_image_paths:
    image = cv2.imread(img_path)
    processed_img = process_one_image(image)
    text = pytesseract.image_to_string(processed_img)
    ocr_results[img_path] = text
    print(f"Extracted text from {img_path}:\n{text[:200]}...\n")  # Show first 200 chars

Converted image to grayscale..
Reduced noise in the image..
Binarized the image..
Corrected image orientation..
Extracted text from ResumeImages\10030015_page_1.png:
RN Oe RE TES Bee ne oes Se ee eS Ce eee A
Stas pa ca wt pee een Sey ch eee,

© ames beget ma me cect a aa

4 Chacr ake amet wag and ie mae rg

Exgmoriy Yams Se. Sperbe Agr S42 O48 2006
Compey Nema 6 a...

Converted image to grayscale..
Reduced noise in the image..
Binarized the image..
Corrected image orientation..
Extracted text from ResumeImages\10674770_page_1.png:
oom

(Bache of Sccmon: Acoma by 240 Uertyof ar Com 74 Cry, Se Arm.
(atcha of Sons, Rae Alemmaton fmt, ey 2010 Paaem Abemnaacn Toe
Cyr

‘Sad eee

Aap 216 Osa O14 Compeny Res FC Se

(ected of mpm on oct...

Converted image to grayscale..
Reduced noise in the image..
Binarized the image..
Corrected image orientation..
Extracted text from ResumeImages\10889157_page_1.png:
ee enn ene ee eee ee En ese ee OS ee ee Oe
sees omy Ree ed ek Soe Nemo. ec, Rm Accs dg aac L

In [14]:
ocr_output_folder = "ResumeOCR"
os.makedirs(ocr_output_folder, exist_ok=True)

for img_path, text in ocr_results.items():
    base_name = os.path.basename(img_path).replace('.png', '.txt')
    out_path = os.path.join(ocr_output_folder, base_name)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text)
print(f"OCR results saved to {ocr_output_folder}")

OCR results saved to ResumeOCR
