In [19]:
from PIL import Image
import numpy as np
import pytesseract

In [20]:
import os

# Location of the Tesseract executable that pytesseract invokes.
# The Windows path below stays the default; set the TESSERACT_CMD environment
# variable to point at a different install without editing the notebook.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD")
    or r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

# Full OCR Pipeline: Automated text extraction 

* Image loading from folder
* Preprocessing: Upscaling, Sharpening, Adaptive Thresholding (morphological opening and closing are planned but not yet applied in `preprocess_image`)
* Text Extraction (Tesseract)
* Postprocessing: Cleaning the extracted text
* Storing Results

### 1. Convert the CV PDF into images

In [None]:
from pdf2image import convert_from_path
import os

# Input PDF and the Poppler binaries directory handed to pdf2image.
pdf_path = "C:/Users/aalrassi/Documents/anastasia_learning/DL_indep/OCR_Project/Data Science CV.pdf"
poppler_path = "C:/Users/aalrassi/Downloads/Release-25.07.0-0/poppler-25.07.0/Library/bin"

# Output folder for the rendered page images (created if missing).
image_folder = "samples"
os.makedirs(image_folder, exist_ok=True)

# Render every PDF page at 300 DPI and store it as cv_page<N>.png (N starts at 1).
pages = convert_from_path(pdf_path, dpi=300, poppler_path=poppler_path)
for page_number, page in enumerate(pages, start=1):
    image_path = os.path.join(image_folder, f"cv_page{page_number}.png")
    page.save(image_path, "PNG")
    print(f"Saved {image_path}")

### 2. Preprocessing

In [None]:
import cv2
import numpy as np
import os

# Where the OCR output goes: a "results" folder holding cv_text.csv.
results_folder = "results"
output_csv = os.path.join(results_folder, "cv_text.csv")

# Make sure the folder is there before anything tries to write into it.
os.makedirs(results_folder, exist_ok=True)
def preprocess_image(img_path):
    """Load an image from disk and binarise it for OCR.

    Steps, in order:
      1. Read the file in colour with ``cv2.imread``.
      2. Upscale to 1200 px wide (bicubic) when the image is narrower;
         wider images are left at their original size.
      3. Sharpen with a 3x3 kernel.
      4. Convert to grayscale.
      5. Apply Gaussian adaptive thresholding (block size 15, constant 8).

    Args:
        img_path: Path of the image file to read.

    Returns:
        The thresholded image produced by ``cv2.adaptiveThreshold``
        (binary mode, maximum value 255).

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read ``img_path``.
    """
    # Load image in color. cv2.imread signals failure by returning None
    # instead of raising, so check explicitly to get a useful error message.
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")

    # Resize only if width < 1200px, keeping the aspect ratio
    h, w = img.shape[:2]
    target_width = 1200
    if w < target_width:
        scale = target_width / w
        img = cv2.resize(img, (target_width, int(h * scale)), interpolation=cv2.INTER_CUBIC)

    # Sharpening: centre weight 5, four direct neighbours weighted -1
    sharpen_kernel = np.array([[0, -1, 0],
                               [-1, 5, -1],
                               [0, -1, 0]])
    img = cv2.filter2D(img, -1, sharpen_kernel)

    # Convert to grayscale for adaptive thresholding
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Adaptive thresholding (Gaussian)
    preprocessed_img = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, 8
    )

    return preprocessed_img


### 3. Text extraction

In [None]:
import easyocr, pandas as pd,csv

import re


def _natural_key(name):
    """Sort key that compares digit runs numerically (cv_page2 before cv_page10)."""
    # re.split with a capturing group alternates text, digits, text, ...,
    # so ints are only ever compared with ints and strings with strings.
    parts = [int(part) if part.isdigit() else part for part in re.split(r"(\d+)", name)]
    # The raw name breaks ties (e.g. "page01" vs "page1") deterministically.
    return parts, name


# Maps image file name -> text extracted from that image.
ocr_results = {}

# Process all images in folder, in page order. A plain sorted() is
# lexicographic and would put cv_page10.png ahead of cv_page2.png, which would
# misalign the CSV rows with the document's page order.
image_names = [
    name for name in os.listdir(image_folder)
    if name.lower().endswith((".png", ".jpg", ".jpeg"))
]
for filename in sorted(image_names, key=_natural_key):
    img_path = os.path.join(image_folder, filename)
    preprocessed_img = preprocess_image(img_path)

    # Convert OpenCV image to PIL image
    pil_img = Image.fromarray(preprocessed_img)

    # OCR with Tesseract page segmentation mode 3
    text = pytesseract.image_to_string(pil_img, config='--psm 3')
    ocr_results[filename] = text

# Save results to CSV: one row per image, header first
output_csv = os.path.join(results_folder, "cv_text.csv")
with open(output_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Page", "Extracted_Text"])
    writer.writerows(ocr_results.items())

print(f"OCR completed. Results saved to {output_csv}")

ValueError: If using all scalar values, you must pass an index

In [None]:
import os
import cv2

# Folder that receives a copy of every preprocessed page, for visual inspection.
preprocessed_folder = "preprocessed_images"
os.makedirs(preprocessed_folder, exist_ok=True)

for filename in sorted(os.listdir("samples")):
    # Only image files are processed; anything else in the folder is ignored.
    if not filename.lower().endswith((".png", ".jpg", ".jpeg")):
        continue

    img_path = os.path.join("samples", filename)
    preprocessed_img = preprocess_image(img_path)

    # Save preprocessed image under the same file name.
    save_path = os.path.join(preprocessed_folder, filename)
    # cv2.imwrite reports a failed write through its boolean return value,
    # so check it instead of announcing a save that never happened.
    if not cv2.imwrite(save_path, preprocessed_img):
        raise OSError(f"Could not write preprocessed image: {save_path}")
    print(f"Saved preprocessed image: {save_path}")

Saved preprocessed image: preprocessed_images\cv_page1.png
Saved preprocessed image: preprocessed_images\cv_page2.png


### Checking accuracy

In [None]:
import Levenshtein
import pandas as pd

def char_accuracy(ocr, gt):
    """Return the character-level accuracy (in percent) of ``ocr`` against ``gt``.

    Computed as ``(1 - levenshtein_distance / len(gt)) * 100``. The value can be
    negative when the edit distance exceeds the ground-truth length.

    An empty ground truth would divide by zero, so it is handled explicitly:
    100.0 when ``ocr`` is empty as well, 0.0 otherwise.
    """
    if not gt:
        return 100.0 if not ocr else 0.0
    return (1 - Levenshtein.distance(ocr, gt) / len(gt)) * 100


# Load OCR and ground-truth CSVs
ocr_df = pd.read_csv(r'results/cv_text.csv')
gt_df = pd.read_csv(r'results/true_text.csv')

# read_csv loads empty cells as NaN; map them back to "" so a page without
# text is not scored against the literal string "nan".
ocr_pages = ocr_df['Extracted_Text'].fillna("").astype(str)
gt_pages = gt_df['Extracted_Text'].fillna("").astype(str)

# Combine all pages into a single string
ocr_text = " ".join(ocr_pages)
gt_text = " ".join(gt_pages)

# Compute overall character-level accuracy
accuracy = char_accuracy(ocr_text, gt_text)
print(f"Overall OCR Accuracy: {accuracy:.2f}%")

# Optional: per-page accuracy. Pages are paired by row position, so warn when
# the two files disagree on the page count (zip would silently drop the rest).
if len(ocr_pages) != len(gt_pages):
    print(f"Warning: {len(ocr_pages)} OCR pages vs {len(gt_pages)} ground-truth pages; "
          "only the common leading pages are compared.")

for i, (ocr, gt) in enumerate(zip(ocr_pages, gt_pages), 1):
    page_acc = char_accuracy(ocr, gt)
    print(f"Page {i} accuracy: {page_acc:.2f}%")


Overall OCR Accuracy: 48.72%
Page 1 accuracy: 52.38%
Page 2 accuracy: 43.74%


In [None]:
# Display the OCR results DataFrame that was read from results/cv_text.csv.
ocr_df

Unnamed: 0,Page,Extracted_Text
0,cv_page1.png,CONTACT\nMobile: +961 81 393 583\n\nGmail: ana...
1,cv_page2.png,"SKILLS\n\nFluent in English and Arabic,\nwith ..."
