Skip to content

Commit

Permalink
Merge pull request #413 from yob/skip-characters-outside-crobox
Browse files Browse the repository at this point in the history
Skip text drawn outside the MediaBox
  • Loading branch information
yob committed Dec 24, 2021
2 parents 5f9dd30 + e7fa5ec commit 5c752c3
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 6 deletions.
1 change: 1 addition & 0 deletions lib/pdf/reader.rb
Expand Up @@ -273,6 +273,7 @@ def root

require 'pdf/reader/resource_methods'
require 'pdf/reader/buffer'
require 'pdf/reader/bounding_rectangle_runs_filter'
require 'pdf/reader/cid_widths'
require 'pdf/reader/cmap'
require 'pdf/reader/encoding'
Expand Down
16 changes: 16 additions & 0 deletions lib/pdf/reader/bounding_rectangle_runs_filter.rb
@@ -0,0 +1,16 @@
# coding: utf-8
# typed: strict
# frozen_string_literal: true

class PDF::Reader

# Filter out text/characters that are positioned outside a rectangle. Usually the page
# MediaBox or CropBox, but could be a user-specified rectangle too
class BoundingRectangleRunsFilter
  class << self
    # Keep only the runs whose origin point +rect+ reports as contained.
    #
    # runs - collection of objects that respond to #origin
    # rect - object that responds to #contains?(point)
    #
    # Returns the filtered collection; +runs+ itself is not modified.
    def runs_within_rect(runs, rect)
      runs.reject do |text_run|
        !rect.contains?(text_run.origin)
      end
    end
  end
end
end

3 changes: 2 additions & 1 deletion lib/pdf/reader/page_layout.rb
Expand Up @@ -21,9 +21,10 @@ def initialize(runs, mediabox)
# PDF::Reader::Rectangle at some point
PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")

@mediabox = process_mediabox(mediabox)
runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
@mediabox = process_mediabox(mediabox)
runs = BoundingRectangleRunsFilter.runs_within_rect(runs, @mediabox)
@runs = merge_runs(runs)
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
Expand Down
5 changes: 5 additions & 0 deletions lib/pdf/reader/rectangle.rb
Expand Up @@ -51,6 +51,11 @@ def width
bottom_right.x - bottom_left.x
end

# True when +point+ lies inside this rectangle or exactly on one of its edges.
#
# point - object responding to #x and #y
#
# The comparisons are inclusive (>= / <=) against the bottom_left and
# top_right corners, checked in the order: left, right, bottom, top.
def contains?(point)
  return false unless point.x >= bottom_left.x
  return false unless point.x <= top_right.x
  return false unless point.y >= bottom_left.y

  point.y <= top_right.y
end

# A pdf-style 4-number array
def to_a
[
Expand Down
17 changes: 12 additions & 5 deletions lib/pdf/reader/text_run.rb
Expand Up @@ -7,13 +7,12 @@ class PDF::Reader
class TextRun
include Comparable

attr_reader :x, :y, :width, :font_size, :text
attr_reader :origin, :width, :font_size, :text

alias :to_s :text

def initialize(x, y, width, font_size, text)
@x = x
@y = y
@origin = PDF::Reader::Point.new(x, y)
@width = width
@font_size = font_size
@text = text
Expand All @@ -35,12 +34,20 @@ def <=>(other)
end
end

# The x coordinate of this run, read from its @origin point.
def x
@origin.x
end

# The y coordinate of this run, read from its @origin point.
def y
@origin.y
end

# Memoized sum of the run's x coordinate and its width, cached in @endx.
#
# NOTE(review): the two assignments below look like the before/after lines of
# a rendered diff; once the first has set @endx the second `||=` does
# nothing. Confirm that only one of them belongs in the real file.
def endx
@endx ||= x + width
@endx ||= @origin.x + width
end

# Memoized sum of the run's y coordinate and its font size, cached in @endy.
#
# NOTE(review): the two assignments below look like the before/after lines of
# a rendered diff; once the first has set @endy the second `||=` does
# nothing. Confirm that only one of them belongs in the real file.
def endy
@endy ||= y + font_size
@endy ||= @origin.y + font_size
end

def mean_character_width
Expand Down
7 changes: 7 additions & 0 deletions rbi/pdf-reader.rbi
Expand Up @@ -43,6 +43,13 @@ module PDF
sig { returns(T::Hash[Symbol, T.untyped]) }
def root; end

# Type stub: runs_within_rect takes an array of text runs plus a rectangle and
# returns an array of text runs.
class BoundingRectangleRunsFilter
  extend T::Sig

  sig do
    params(
      runs: T::Array[PDF::Reader::TextRun],
      rect: PDF::Reader::Rectangle
    ).returns(T::Array[PDF::Reader::TextRun])
  end
  def self.runs_within_rect(runs, rect); end
end

class Buffer
TOKEN_WHITESPACE = [0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20]
TOKEN_DELIMITER = [0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F]
Expand Down
4 changes: 4 additions & 0 deletions spec/integration_spec.rb
Expand Up @@ -1514,4 +1514,8 @@
end
end
end

# Pending example: `it` is given no block, so no assertions run yet.
context "PDF with glyphs positioned outside the MediaBox" do
  it "skips the characters off the page"
end
end

0 comments on commit 5c752c3

Please sign in to comment.