In [1]:
import fitz  # PyMuPDF
import os
import re
import pytesseract
from PIL import Image

In [2]:
import pytesseract

# Path to the Tesseract executable that pytesseract shells out to.
# Defaults to the Windows install path originally hard-coded here; set the
# TESSERACT_CMD environment variable to override it on another machine
# instead of editing the notebook.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r"C:\Program Files\Tesseract-OCR\tesseract.exe",
)

PDF_PATH = "pdfs/Promotion Policy - 2025.pdf"  # PDF to parse (relative path)
IMG_OUTPUT_DIR = "images"  # directory that receives the extracted images
os.makedirs(IMG_OUTPUT_DIR, exist_ok=True)

In [3]:
# Open the PDF at PDF_PATH with PyMuPDF; later cells index into `doc` by page.
doc = fitz.open(PDF_PATH)

In [4]:
# Accumulators filled by the extraction loop. Re-run this cell before
# re-running that loop, otherwise the loop appends duplicate records.
sections, images = [], []  # section records / image records
current_section = None  # section currently collecting text; None until a heading is seen

In [5]:
def is_section_heading(span, min_size=11, color=0xC00000, flags=20):
    """Return True when a text span is styled like a section heading.

    A span qualifies when its font size is at least ``min_size``, its colour
    equals ``color`` and its font flags equal ``flags`` exactly. The defaults
    are the values previously hard-coded for this PDF: colour 12582912
    (0xC00000) and flags 20, which in PyMuPDF's span flag bits is
    16 (bold) + 4 (serifed).

    No limit on the number of words is applied here; callers that want to
    restrict headings to short titles check the text length themselves.

    Args:
        span: Span dict as produced by ``page.get_text("dict")``; must contain
            the keys ``"size"``, ``"color"`` and ``"flags"``.
        min_size: Smallest font size accepted as a heading.
        color: Exact integer colour value a heading span must have.
        flags: Exact font-flags value a heading span must have.

    Returns:
        bool: True if all three style criteria match, otherwise False.

    Raises:
        KeyError: If ``span`` lacks one of the required keys.
    """
    return (
        span["size"] >= min_size
        and span["color"] == color
        and span["flags"] == flags
    )


In [6]:
# Walk every page once: first split its text into sections delimited by
# heading-styled spans, then export the page's large embedded images, OCR
# them, and link each one to the most recently detected section.
# NOTE: this cell appends to the module-level `sections` and `images` lists,
# so re-running it without resetting them duplicates every record.
for page_num in range(len(doc)):
    page = doc[page_num]
    blocks = page.get_text("dict")["blocks"]

    for block in blocks:
        # Blocks without a "lines" entry contribute no text and are skipped.
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span["text"].strip()

                # Headings are heading-styled spans of at most six words.
                if is_section_heading(span) and len(text.split()) <= 6:
                    # Start new section
                    current_section = {
                        "title": text,
                        "content": "",
                        "page": page_num + 1,  # 1-based page number
                        "related_images": []
                    }
                    sections.append(current_section)
                elif current_section and text:
                    # Text seen before the first heading is dropped because
                    # there is no section to attach it to yet.
                    current_section["content"] += text + " "

    # Extract images from page
    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        pix = fitz.Pixmap(doc, xref)

        print(f"Page {page_num + 1} — Image {img_index + 1}: {pix.width}x{pix.height}")

        if pix.width < 200 or pix.height < 200:
            continue  # Skip small (logo/footer) images

        # PNG output needs a grayscale or RGB pixmap, so CMYK must be converted
        # first. Subtracting pix.alpha counts colour components only: RGB with
        # alpha (n == 4) is left alone, while plain CMYK (n == 4, alpha == 0)
        # is converted — the previous `pix.n > 4` test missed that case.
        if pix.n - pix.alpha > 3:
            pix = fitz.Pixmap(fitz.csRGB, pix)

        img_filename = f"{IMG_OUTPUT_DIR}/pg{page_num + 1}_img{img_index + 1}.png"
        pix.save(img_filename)

        # OCR to extract text; the context manager closes the image file
        # handle as soon as Tesseract has read it.
        with Image.open(img_filename) as saved_image:
            ocr_text = pytesseract.image_to_string(saved_image)

        # Link image to the last detected section. Images are handled after
        # all of the page's text, so this is the last heading on (or before)
        # the page.
        linked_title = current_section["title"] if current_section else "Unknown"

        images.append({
            "page": page_num + 1,
            "image_path": img_filename,
            "ocr_text": ocr_text,
            "linked_section_title": linked_title
        })

        # Add image to section
        if current_section:
            current_section["related_images"].append(img_filename)



Page 1 — Image 1: 152x80
Page 1 — Image 2: 1335x55
Page 2 — Image 1: 152x80
Page 2 — Image 2: 1335x55
Page 2 — Image 3: 103x43
Page 3 — Image 1: 152x80
Page 3 — Image 2: 1335x55
Page 3 — Image 3: 1153x930
Page 4 — Image 1: 152x80
Page 4 — Image 2: 1335x55
Page 5 — Image 1: 152x80
Page 5 — Image 2: 1335x55
Page 6 — Image 1: 152x80
Page 6 — Image 2: 1335x55
Page 7 — Image 1: 152x80
Page 7 — Image 2: 1335x55
Page 7 — Image 3: 808x696
Page 8 — Image 1: 152x80
Page 8 — Image 2: 1335x55
Page 9 — Image 1: 152x80
Page 9 — Image 2: 1335x55
Page 9 — Image 3: 957x1713
Page 10 — Image 1: 152x80
Page 10 — Image 2: 1335x55
Page 11 — Image 1: 152x80
Page 11 — Image 2: 1335x55
Page 11 — Image 3: 283x115
Page 11 — Image 4: 1085x1494


In [7]:
# Preview each detected section: a title banner followed by the first
# 300 characters of its accumulated text.
for section in sections:
    preview = section["content"][:300]
    print(f"\n=== {section['title']} ===\n{preview}")



=== Purpose ===
To set, define and communicate broadly Elsewedy Electric policy concerning promotions ensuring equal opportunity to all employees based on job evolution, performance, and business need. This Policy is a minimum standard; where local legislations define higher standards; the Group shall comply with t

=== Applicability ===
This policy applies to all the operating companies and subsidiaries directly or indirectly controlled by Elsewedy Electric, and all the geographical regions where Elsewedy Electric companies and subsidiaries are operating. 

=== The Performance and Talent Cycle ===
Performance Management and Reward Cycle Talent Management and Development Cycle 4.Employees' Performance Bonus 7.End of Year Performance Appraisal & The 25-Box Grid 8.Learning Needs Analysis (LNA) 13.Merit Increase 10.Talent Review Meetings The 9-Box Grid (nominations) 3.Business Goals Reviews 5.Eng

=== The 9-Box Grid of Talent Differentiation ===
The 9-Box Grid of Talent Differentiation i

In [8]:
# Summarise every image record: where it was saved, which section it was
# linked to, and the first 100 characters of its OCR text.
for record in images:
    location = f"{record['image_path']} (Page {record['page']})"
    print(f"{location} linked to section '{record['linked_section_title']}'")
    print("OCR Preview:", record['ocr_text'][:100], "\n")


images/pg3_img3.png (Page 3) linked to section 'The 9-Box Grid of Talent Differentiation'
OCR Preview: Potential of growing in the organization and move up the ladder based on the Employee's

Capabilitie 

images/pg7_img3.png (Page 7) linked to section 'The band promotion'
OCR Preview: 9B Grid of Management (MRG + Sr. MGR)

Key Talent
(MGR + Sr.
MGR)

9B Grid of Sr. Professionals (TL  

images/pg9_img3.png (Page 9) linked to section 'The band promotion'
OCR Preview: Promotion Cycle Flow Chart

Submitting
Promotion Request
to the department

head

llow the Promotion 

images/pg11_img4.png (Page 11) linked to section 'Grading and Titling Matrix'
OCR Preview: Band

Bands Grades

CEO
ship
Senior Managing Director
Positions CLevel B
(SP)
General Manager
ector  



In [9]:
# Select a single page for the manual crop in the following cells.
# Indexing is 0-based, so page_num = 1 is the page the extraction loop
# reports as "Page 2". Note that this overwrites the loop's `page_num`/`page`.
page_num = 1

page = doc[page_num]

In [10]:
# blocks = page.get_text("dict")["blocks"]
# for i, block in enumerate(blocks):
#     print(f"Block {i}: {block['bbox']}, type: {block.get('type', 'text')}")


In [11]:
# Bounding rectangle of the selected page; the crop band is derived from it.
page_rect = page.rect

In [12]:
# Vertical band to crop, expressed as fractions of the page height measured
# from the top edge: it starts 47.5% of the way down and ends 73% down.
start_y = page_rect.y0 + page_rect.height * 0.475
end_y = page_rect.y0 + page_rect.height * 0.73

# Full-width rectangle covering that band: (left, top, right, bottom).
target_rect = fitz.Rect(page_rect.x0, start_y, page_rect.x1, end_y)

In [13]:
# Render only the `target_rect` band of the page at 300 dpi and save it in the
# same output directory as the automatically extracted images (IMG_OUTPUT_DIR
# rather than a second hard-coded "images/" literal).
# The "_bottom_quarter" suffix is kept for compatibility with existing files;
# the actual region is whatever `target_rect` describes.
pix = page.get_pixmap(clip=target_rect, dpi=300)
pix.save(f"{IMG_OUTPUT_DIR}/pg{page_num + 1}_bottom_quarter.png")


In [14]:
# Display the collected image records (page, image_path, ocr_text, linked_section_title).
images

[{'page': 3,
  'image_path': 'images/pg3_img3.png',
  'ocr_text': "Potential of growing in the organization and move up the ladder based on the Employee's\n\nCapabilities\n\n(competencies, skills, knowledge, al\n\n)\n\nies, experience\n\nHOW\n\nBOTIOM\nDoes not demonstrate the\n\n2\n«2\n3s\n32\n£2\n§\nge\nS28\nRoe\n\n£z\noS\n23\n2\nos\n8\n\nMID\nFully and consistently\ndemonstrates the expected\n\ncapabilities required for the\ncurrent band in comparison to aualifvina them to move to the\n\nexpected capabilities as\n\n8\n\noutstanding capabi\n\nconsistently as peers\n\nThe distribution % reflects an example of a BU achieved 100% results over the last 2-3 years\n\n|\nFA\n5\n:\nFa\n\nLow Performer (LP)\n\nExit\n\nNo potential and below average\nperformance who need to be\nmoved rather quickly to another box\nor exist plans need to be set\n\n2-3 Years GOALS Achievements Rating\n\nRIGHT\nLEFT MID One of the very best at delivering\nDoes not deliver expected resultsas Fully delivers expecte

In [15]:
# Manually OCR one extra image and describe where it belongs.
# NOTE(review): the extraction loop never writes images/pg2_img1.png (all of
# page 2's embedded images fall below its 200x200 size threshold), so this
# file must already exist on disk — confirm how it is produced.
extracted_img_page = 2
extracted_img_path = "images/pg2_img1.png"
extracted_img_section = "The Performance and Talent Cycle"

# The context manager closes the image file handle once OCR has finished.
with Image.open(extracted_img_path) as extracted_img:
    ocr_text = pytesseract.image_to_string(extracted_img)

print("Extracted OCR Text:\n", ocr_text)


Extracted OCR Text:
 ELSEWEDY
ELECTRIC

eee 9.Training

Calendar

11.PIP
(Performance
Improvement
Plans) for
Average Talent

12.Exit Plans for
Low Performers

14, HiPo
confirmation TAC
for Promotions

15. Promotions
(hiring from within
using HiPo)

16. HiPo TAC
for Rentention &
Succession
(confirmations)

17.Retention
Plans (PDP) for
HiPo + Watch-
Listers

9.Acceleration'

Development

Programs for HiPo
+ Watch-Listers

(Academies)

18.Key Jobs
Succession Plans
(using HiPo +
Watch-Listers)

Talent Management and Development Cycle

8.Learning
Needs Analysis
(LNA)

7.End of Year
Performance
Appraisal & The
25-Box Grid

The 9-Box Grid
(nominations)

6.MSF 360
(Multi-source
Feedback)

13.Merit Increase

5.Engagement

Survey

1.Annual Plan
and Budget
(Manpower
Planning)

2.Goals' Setting
(business +
employees)

3.Business Goals
Reviews

4.Employees'
Performance
Bonus

Performance Management and Reward Cycle



In [16]:
# Record for the manually extracted image, using the same keys as the records
# built by the extraction loop: "page", "image_path", "ocr_text" and
# "linked_section_title". The page number used to be stored under "path",
# which left this record without a "page" key and made any code reading
# img["page"] fail on it.
extracted_data = {
    "page": extracted_img_page,
    "image_path": extracted_img_path,
    "ocr_text": ocr_text,
    "linked_section_title": extracted_img_section
}

images.append(extracted_data)

In [17]:
# Display the image records again, now including the manually appended one.
images

[{'page': 3,
  'image_path': 'images/pg3_img3.png',
  'ocr_text': "Potential of growing in the organization and move up the ladder based on the Employee's\n\nCapabilities\n\n(competencies, skills, knowledge, al\n\n)\n\nies, experience\n\nHOW\n\nBOTIOM\nDoes not demonstrate the\n\n2\n«2\n3s\n32\n£2\n§\nge\nS28\nRoe\n\n£z\noS\n23\n2\nos\n8\n\nMID\nFully and consistently\ndemonstrates the expected\n\ncapabilities required for the\ncurrent band in comparison to aualifvina them to move to the\n\nexpected capabilities as\n\n8\n\noutstanding capabi\n\nconsistently as peers\n\nThe distribution % reflects an example of a BU achieved 100% results over the last 2-3 years\n\n|\nFA\n5\n:\nFa\n\nLow Performer (LP)\n\nExit\n\nNo potential and below average\nperformance who need to be\nmoved rather quickly to another box\nor exist plans need to be set\n\n2-3 Years GOALS Achievements Rating\n\nRIGHT\nLEFT MID One of the very best at delivering\nDoes not deliver expected resultsas Fully delivers expecte

In [18]:
import sys
import os

# Put the parent directory on the import path so the `utils` package imported
# below can be found (presumably it lives one level above the working
# directory — confirm).
sys.path.append(os.path.abspath(".."))

# Project helpers imported from utils.database_utils; judging by their names
# they create the table and store the extracted sections/images — their
# implementation is not shown here.
from utils.database_utils import (
    insert_sections_into_db,
    create_table_if_not_exists,
    insert_images_into_db,
)