{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":21716,"defaultBranch":"main","name":"pdf-reader","ownerLogin":"yob","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2008-06-03T01:11:36.000Z","ownerAvatar":"https://avatars.githubusercontent.com/u/8132?v=4","public":true,"private":false,"isOrgOwned":false},"refInfo":{"name":"","listCacheKey":"v0:1714804917.0","currentOid":""},"activityList":{"items":[{"before":"2d4b750c0013e9037352a07645c9c111dd375787","after":"c049945d09742d0e35a7423c3242ff356240162b","ref":"refs/heads/main","pushedAt":"2024-05-04T06:41:57.000Z","pushType":"pr_merge","commitsCount":2,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Merge pull request #537 from yob/hosted\n\nRun jobs on a hosted queue","shortMessageHtmlLink":"Merge pull request #537 from yob/hosted"}},{"before":"d527a17e7b1077f6256de97de8916324f63d5000","after":null,"ref":"refs/heads/hosted","pushedAt":"2024-05-04T06:41:57.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"}},{"before":"25347999ba8f2b3c1f3f499c5301ca9ffd00d4d5","after":"d527a17e7b1077f6256de97de8916324f63d5000","ref":"refs/heads/hosted","pushedAt":"2024-05-04T06:35:10.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Run jobs on a hosted queue","shortMessageHtmlLink":"Run jobs on a hosted queue"}},{"before":"372df3442dbd108af906835d5202638d8c4b5e2a","after":"25347999ba8f2b3c1f3f499c5301ca9ffd00d4d5","ref":"refs/heads/hosted","pushedAt":"2024-05-04T06:34:04.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Run jobs on a hosted queue","shortMessageHtmlLink":"Run jobs on a hosted queue"}},{"before":null,"after":"372df3442dbd108af906835d5202638d8c4b5e2a","ref":"refs/heads/hosted","pushedAt":"2024-05-04T06:32:49.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Run jobs on a hosted queue","shortMessageHtmlLink":"Run jobs on a hosted queue"}},{"before":"7b73e6c3ba81ff9a52bb0502de3f97daee27bba5","after":"2d4b750c0013e9037352a07645c9c111dd375787","ref":"refs/heads/main","pushedAt":"2024-02-21T12:04:50.000Z","pushType":"pr_merge","commitsCount":2,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Merge pull request #535 from erinnachen/add-stringio-to-sig\n\nAllow StringIO type for PDF::Reader input","shortMessageHtmlLink":"Merge pull request #535 from erinnachen/add-stringio-to-sig"}},{"before":"63bc561f083cc38aab7353dd6bb7b583119f8ee3","after":"7b73e6c3ba81ff9a52bb0502de3f97daee27bba5","ref":"refs/heads/main","pushedAt":"2024-02-05T23:11:48.000Z","pushType":"pr_merge","commitsCount":2,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Merge pull request #532 from cprodhomme/patch-1\n\nFix indent in readme","shortMessageHtmlLink":"Merge pull request #532 from cprodhomme/patch-1"}},{"before":"78659ebd951b517b2bb08843facb973d57d619a8","after":null,"ref":"refs/heads/ruby-3-3","pushedAt":"2024-01-01T00:21:02.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"}},{"before":"bbe3ff632ad4ee76aa40ce603aa31bd22d290a77","after":"63bc561f083cc38aab7353dd6bb7b583119f8ee3","ref":"refs/heads/main","pushedAt":"2024-01-01T00:21:01.000Z","pushType":"pr_merge","commitsCount":2,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Merge pull request #531 from yob/ruby-3-3\n\nRun CI on MRI 3.3 final","shortMessageHtmlLink":"Merge pull request #531 from yob/ruby-3-3"}},{"before":null,"after":"78659ebd951b517b2bb08843facb973d57d619a8","ref":"refs/heads/ruby-3-3","pushedAt":"2024-01-01T00:08:41.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Run Ci on MRI 3.3 final","shortMessageHtmlLink":"Run Ci on MRI 3.3 final"}},{"before":"9fa9948e258a035682b29a134b517119e0733270","after":"12cd886c8dea838b1b7b1fea01b769220f1b64d2","ref":"refs/heads/stringscanner","pushedAt":"2023-12-28T14:00:03.000Z","pushType":"push","commitsCount":5,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"PDF names can be empty","shortMessageHtmlLink":"PDF names can be empty"}},{"before":"936de65769c507d140bd2e304e3e5358e0f53bcc","after":"9fa9948e258a035682b29a134b517119e0733270","ref":"refs/heads/stringscanner","pushedAt":"2023-12-26T22:51:06.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"allocate slightly fewer objects in BufferNew","shortMessageHtmlLink":"allocate slightly fewer objects in BufferNew"}},{"before":"e26cf59e6f219ae9e569235264565334daec660d","after":null,"ref":"refs/heads/bar","pushedAt":"2023-12-26T01:36:33.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"}},{"before":"1d935520b7434dc7d24f474a4b986bc212314ed8","after":null,"ref":"refs/heads/chunky-bacon","pushedAt":"2023-12-26T01:36:19.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"}},{"before":"efff95a896a31da0526b01e43c8513dd72ae9a8a","after":null,"ref":"refs/heads/tets-build","pushedAt":"2023-12-26T01:36:17.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"}},{"before":"b5dbae93b40b6263e816e5003f3a8369d0287b6c","after":"bbe3ff632ad4ee76aa40ce603aa31bd22d290a77","ref":"refs/heads/main","pushedAt":"2023-12-26T01:28:13.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"prepare for v2.12.0 release","shortMessageHtmlLink":"prepare for v2.12.0 release"}},{"before":"65623e79b117fee4327080ce6f959c4c383286a0","after":null,"ref":"refs/heads/fix-utf16-surrogate-pairs","pushedAt":"2023-12-26T00:53:57.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"}},{"before":"7762166a56e0bac6c9589a95647e6e6c4cd744f9","after":"b5dbae93b40b6263e816e5003f3a8369d0287b6c","ref":"refs/heads/main","pushedAt":"2023-12-26T00:53:56.000Z","pushType":"pr_merge","commitsCount":2,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Merge pull request #529 from yob/fix-utf16-surrogate-pairs\n\nFix decoding of some UTF-16 strings that use surrogate pairs","shortMessageHtmlLink":"Merge pull request #529 from yob/fix-utf16-surrogate-pairs"}},{"before":"8ea1f8c6c82684e0e84a2592d7b4ba56f5bab429","after":"65623e79b117fee4327080ce6f959c4c383286a0","ref":"refs/heads/fix-utf16-surrogate-pairs","pushedAt":"2023-12-26T00:49:01.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Fix decoding of some UTF-16 strings that use surrogate pairs\n\nWhen extracting PDF text to UTF-8, some fonts use a ToUnicode mapping\nthat's defined using a CMap table. CMap tables define unicode using\nUTF-16 and for reasons, we unwisely do the decoding of UTF16 to\ncodepoints ourselves instead of deferring to a library.\n\nTurns out we had a boundary bug, where some codepoints that get encoded\nwith the surrogate pair 0xD800 or 0xDBFF weren't detected as surrogate\npairs and were decoded incorrectly.\n\nThis would usually manifest as an incompatible encoding error while\nextracting text:\n\n /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `strip': invalid byte sequence in UTF-8 (Encoding::CompatibilityError)\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `block in interesting_rows'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `map'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `interesting_rows'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:46:in `to_s'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page.rb:121:in `text'\n from bin/pdf_text:12:in `block in
'\n from bin/pdf_text:11:in `each'\n from bin/pdf_text:11:in `
'\n\nI believe Unicode codepoints in the range 0x10000 (decimal 65536) to\n0x103FF (decimal 66559) were impacted, a total of 1023 codepoints.\nTechnically higher codepoints were also impacted, but in an unallocated\nrange). They're mostly ancient languages and numbers, like [Aegean\nNumbers](https://en.wikipedia.org/wiki/Aegean_Numbers_(Unicode_block)),\n[Ancient\nGreek](https://en.wikipedia.org/wiki/Ancient_Greek_Numbers_(Unicode_block)),\n[Phaistos\nDisc](https://en.wikipedia.org/wiki/Phaistos_Disc_(Unicode_block)), and\n[Old\nPersian](https://en.wikipedia.org/wiki/Old_Persian_(Unicode_block)).\n\n (65536...66559).to_a.map { |c| [c].pack(\"U*\") }.each_slice(20) { |s| puts s.join(\" \" )}\n ๐€€ ๐€ ๐€‚ ๐€ƒ ๐€„ ๐€… ๐€† ๐€‡ ๐€ˆ ๐€‰ ๐€Š ๐€‹ ๐€ ๐€Ž ๐€ ๐€ ๐€‘ ๐€’ ๐€“\n ๐€” ๐€• ๐€– ๐€— ๐€˜ ๐€™ ๐€š ๐€› ๐€œ ๐€ ๐€ž ๐€Ÿ ๐€  ๐€ก ๐€ข ๐€ฃ ๐€ค ๐€ฅ ๐€ฆ\n ๐€จ ๐€ฉ ๐€ช ๐€ซ ๐€ฌ ๐€ญ ๐€ฎ ๐€ฏ ๐€ฐ ๐€ฑ ๐€ฒ ๐€ณ ๐€ด ๐€ต ๐€ถ ๐€ท ๐€ธ ๐€น ๐€บ\n ๐€ผ ๐€ฝ ๐€ฟ ๐€ ๐ ๐‚ ๐ƒ ๐„ ๐… ๐† ๐‡ ๐ˆ ๐‰ ๐Š ๐‹ ๐Œ ๐\n ๐ ๐‘ ๐’ ๐“ ๐” ๐• ๐– ๐— ๐˜ ๐™ ๐š ๐› ๐œ ๐\n\n ๐‚€ ๐‚ ๐‚‚ ๐‚ƒ ๐‚„ ๐‚… ๐‚† ๐‚‡ ๐‚ˆ ๐‚‰ ๐‚Š ๐‚‹\n ๐‚Œ ๐‚ ๐‚Ž ๐‚ ๐‚ ๐‚‘ ๐‚’ ๐‚“ ๐‚” ๐‚• ๐‚– ๐‚— ๐‚˜ ๐‚™ ๐‚š ๐‚› ๐‚œ ๐‚ ๐‚ž ๐‚Ÿ\n ๐‚  ๐‚ก ๐‚ข ๐‚ฃ ๐‚ค ๐‚ฅ ๐‚ฆ ๐‚ง ๐‚จ ๐‚ฉ ๐‚ช ๐‚ซ ๐‚ฌ ๐‚ญ ๐‚ฎ ๐‚ฏ ๐‚ฐ ๐‚ฑ ๐‚ฒ ๐‚ณ\n ๐‚ด ๐‚ต ๐‚ถ ๐‚ท ๐‚ธ ๐‚น ๐‚บ ๐‚ป ๐‚ผ ๐‚ฝ ๐‚พ ๐‚ฟ ๐ƒ€ ๐ƒ ๐ƒ‚ ๐ƒƒ ๐ƒ„ ๐ƒ… ๐ƒ† ๐ƒ‡\n ๐ƒˆ ๐ƒ‰ ๐ƒŠ ๐ƒ‹ ๐ƒŒ ๐ƒ ๐ƒŽ ๐ƒ ๐ƒ ๐ƒ‘ ๐ƒ’ ๐ƒ“ ๐ƒ” ๐ƒ• ๐ƒ– ๐ƒ— ๐ƒ˜ ๐ƒ™ ๐ƒš ๐ƒ›\n ๐ƒœ ๐ƒ ๐ƒž ๐ƒŸ ๐ƒ  ๐ƒก ๐ƒข ๐ƒฃ ๐ƒค ๐ƒฅ ๐ƒฆ ๐ƒง ๐ƒจ ๐ƒฉ ๐ƒช ๐ƒซ ๐ƒฌ ๐ƒญ ๐ƒฎ ๐ƒฏ\n ๐ƒฐ ๐ƒฑ ๐ƒฒ ๐ƒณ ๐ƒด ๐ƒต ๐ƒถ ๐ƒท ๐ƒธ ๐ƒน ๐ƒบ ๐„€ ๐„ ๐„‚\n ๐„‡ ๐„ˆ ๐„‰ ๐„Š ๐„‹ ๐„Œ ๐„ ๐„Ž ๐„ ๐„ ๐„‘ ๐„’ ๐„“ ๐„” ๐„• ๐„– ๐„—\n ๐„˜ ๐„™ ๐„š ๐„› ๐„œ ๐„ ๐„ž ๐„Ÿ ๐„  ๐„ก ๐„ข ๐„ฃ ๐„ค ๐„ฅ ๐„ฆ ๐„ง ๐„จ ๐„ฉ ๐„ช ๐„ซ\n ๐„ฌ ๐„ญ ๐„ฎ ๐„ฏ ๐„ฐ ๐„ฑ ๐„ฒ ๐„ณ ๐„ท ๐„ธ ๐„น ๐„บ ๐„ป ๐„ผ ๐„ฝ ๐„พ ๐„ฟ\n ๐…€ ๐… ๐…‚ ๐…ƒ ๐…„ ๐…… ๐…† ๐…‡ ๐…ˆ ๐…‰ ๐…Š ๐…‹ ๐…Œ ๐… ๐…Ž ๐… ๐… ๐…‘ ๐…’ ๐…“\n ๐…” ๐…• ๐…– ๐…— ๐…˜ ๐…™ ๐…š ๐…› ๐…œ ๐… ๐…ž ๐…Ÿ ๐…  ๐…ก ๐…ข ๐…ฃ ๐…ค ๐…ฅ ๐…ฆ ๐…ง\n ๐…จ ๐…ฉ ๐…ช ๐…ซ ๐…ฌ ๐…ญ ๐…ฎ ๐…ฏ ๐…ฐ ๐…ฑ ๐…ฒ ๐…ณ ๐…ด ๐…ต ๐…ถ ๐…ท ๐…ธ ๐…น ๐…บ ๐…ป\n ๐…ผ ๐…ฝ ๐…พ ๐…ฟ ๐†€ ๐† ๐†‚ ๐†ƒ ๐†„ ๐†… ๐†† ๐†‡ ๐†ˆ ๐†‰ ๐†Š ๐†‹ ๐†Œ ๐† ๐†Ž\n ๐† ๐†‘ ๐†’ ๐†“ ๐†” ๐†• ๐†– ๐†— ๐†˜ ๐†™ ๐†š ๐†› ๐†œ ๐† \n\n ๐‡ ๐‡‘ ๐‡’ ๐‡“ ๐‡” ๐‡• ๐‡– ๐‡— ๐‡˜ ๐‡™ ๐‡š ๐‡› ๐‡œ ๐‡ ๐‡ž ๐‡Ÿ\n ๐‡  ๐‡ก ๐‡ข ๐‡ฃ ๐‡ค ๐‡ฅ ๐‡ฆ ๐‡ง ๐‡จ ๐‡ฉ ๐‡ช ๐‡ซ ๐‡ฌ ๐‡ญ ๐‡ฎ ๐‡ฏ ๐‡ฐ ๐‡ฑ ๐‡ฒ ๐‡ณ\n ๐‡ด ๐‡ต ๐‡ถ ๐‡ท ๐‡ธ ๐‡น ๐‡บ ๐‡ป ๐‡ผ\n\n ๐Š€ ๐Š ๐Š‚ ๐Šƒ ๐Š„ ๐Š… ๐Š† ๐Š‡ ๐Šˆ ๐Š‰ ๐ŠŠ ๐Š‹ ๐ŠŒ ๐Š ๐ŠŽ ๐Š ๐Š ๐Š‘ ๐Š’ ๐Š“\n ๐Š” ๐Š• ๐Š– ๐Š— ๐Š˜ ๐Š™ ๐Šš ๐Š› ๐Šœ ๐Š  ๐Šก ๐Šข ๐Šฃ ๐Šค ๐Šฅ ๐Šฆ ๐Šง\n ๐Šจ ๐Šฉ ๐Šช ๐Šซ ๐Šฌ ๐Šญ ๐Šฎ ๐Šฏ ๐Šฐ ๐Šฑ ๐Šฒ ๐Šณ ๐Šด ๐Šต ๐Šถ ๐Šท ๐Šธ ๐Šน ๐Šบ ๐Šป\n ๐Šผ ๐Šฝ ๐Šพ ๐Šฟ ๐‹€ ๐‹ ๐‹‚ ๐‹ƒ ๐‹„ ๐‹… ๐‹† ๐‹‡ ๐‹ˆ ๐‹‰ ๐‹Š ๐‹‹ ๐‹Œ ๐‹ ๐‹Ž ๐‹\n ๐‹ ๐‹  ๐‹ก ๐‹ข ๐‹ฃ\n ๐‹ค ๐‹ฅ ๐‹ฆ ๐‹ง ๐‹จ ๐‹ฉ ๐‹ช ๐‹ซ ๐‹ฌ ๐‹ญ ๐‹ฎ ๐‹ฏ ๐‹ฐ ๐‹ฑ ๐‹ฒ ๐‹ณ ๐‹ด ๐‹ต ๐‹ถ ๐‹ท\n ๐‹ธ ๐‹น ๐‹บ ๐‹ป ๐Œ€ ๐Œ ๐Œ‚ ๐Œƒ ๐Œ„ ๐Œ… ๐Œ† ๐Œ‡ ๐Œˆ ๐Œ‰ ๐ŒŠ ๐Œ‹\n ๐ŒŒ ๐Œ ๐ŒŽ ๐Œ ๐Œ ๐Œ‘ ๐Œ’ ๐Œ“ ๐Œ” ๐Œ• ๐Œ– ๐Œ— ๐Œ˜ ๐Œ™ ๐Œš ๐Œ› ๐Œœ ๐Œ ๐Œž ๐ŒŸ\n ๐Œ  ๐Œก ๐Œข ๐Œฃ ๐Œญ ๐Œฎ ๐Œฏ ๐Œฐ ๐Œฑ ๐Œฒ ๐Œณ\n ๐Œด ๐Œต ๐Œถ ๐Œท ๐Œธ ๐Œน ๐Œบ ๐Œป ๐Œผ ๐Œฝ ๐Œพ ๐Œฟ ๐€ ๐ ๐‚ ๐ƒ ๐„ ๐… ๐† ๐‡\n ๐ˆ ๐‰ ๐Š ๐ ๐‘ ๐’ ๐“ ๐” ๐• ๐– ๐— ๐˜ ๐™ ๐š ๐›\n ๐œ ๐ ๐ž ๐Ÿ ๐  ๐ก ๐ข ๐ฃ ๐ค ๐ฅ ๐ฆ ๐ง ๐จ ๐ฉ ๐ช ๐ซ ๐ฌ ๐ญ ๐ฎ ๐ฏ\n ๐ฐ ๐ฑ ๐ฒ ๐ณ ๐ด ๐ต ๐ถ ๐ท ๐ธ ๐น ๐บ ๐Ž€ ๐Ž ๐Ž‚ ๐Žƒ\n ๐Ž„ ๐Ž… ๐Ž† ๐Ž‡ ๐Žˆ ๐Ž‰ ๐ŽŠ ๐Ž‹ ๐ŽŒ ๐Ž ๐ŽŽ ๐Ž ๐Ž ๐Ž‘ ๐Ž’ ๐Ž“ ๐Ž” ๐Ž• ๐Ž– ๐Ž—\n ๐Ž˜ ๐Ž™ ๐Žš ๐Ž› ๐Žœ ๐Ž ๐ŽŸ ๐Ž  ๐Žก ๐Žข ๐Žฃ ๐Žค ๐Žฅ ๐Žฆ ๐Žง ๐Žจ ๐Žฉ ๐Žช ๐Žซ\n ๐Žฌ ๐Žญ ๐Žฎ ๐Žฏ ๐Žฐ ๐Žฑ ๐Žฒ ๐Žณ ๐Žด ๐Žต ๐Žถ ๐Žท ๐Žธ ๐Žน ๐Žบ ๐Žป ๐Žผ ๐Žฝ ๐Žพ ๐Žฟ\n ๐€ ๐ ๐‚ ๐ƒ ๐ˆ ๐‰ ๐Š ๐‹ ๐Œ ๐ ๐Ž ๐ ๐ ๐‘ ๐’ ๐“\n ๐” ๐•","shortMessageHtmlLink":"Fix decoding of some UTF-16 strings that use surrogate pairs"}},{"before":"a353488bdc7dabe8cb52c490eb54168f864250be","after":"8ea1f8c6c82684e0e84a2592d7b4ba56f5bab429","ref":"refs/heads/fix-utf16-surrogate-pairs","pushedAt":"2023-12-26T00:26:30.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Fix decoding of some UTF-16 strings that use surrogate pairs\n\nWhen extracting PDF text to UTF-8, some fonts use a ToUnicode mapping\nthat's defined using a CMap table. CMap tables define unicode using\nUTF-16 and for reasons, we unwisely do the decoding of UTF16 to\ncodepoints ourselves instead of deferring to a library.\n\nTurns out we had a boundary bug, where some codepoints that get encoded\nwith the surrogate pair 0xD800 or 0xDBFF weren't detected as surrogate\npairs and were decoded incorrectly.\n\nThis would usually manifest as an incompatible encoding error while\nextracting text:\n\n /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `strip': invalid byte sequence in UTF-8 (Encoding::CompatibilityError)\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `block in interesting_rows'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `map'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `interesting_rows'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:46:in `to_s'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page.rb:121:in `text'\n from bin/pdf_text:12:in `block in
'\n from bin/pdf_text:11:in `each'\n from bin/pdf_text:11:in `
'","shortMessageHtmlLink":"Fix decoding of some UTF-16 strings that use surrogate pairs"}},{"before":"f08170fd3bf5e328552ecbd32cf74790c1bd906c","after":"a353488bdc7dabe8cb52c490eb54168f864250be","ref":"refs/heads/fix-utf16-surrogate-pairs","pushedAt":"2023-12-26T00:21:03.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Fix decoding of some UTF-16 strings that use surrogate pairs\n\nWhen extracting PDF text to UTF-8, some fonts use a ToUnicode mapping\nthat's defined using a CMap table. CMap tables define unicde using\nUTF-16 and for reasons, we unwisely do the decoding of UTF16 to\ncodepoints ourselves instead of deferring to a library.\n\nTurns out we had a boundary bug, where some codepoints that get encoded\nwith the surrogate pair 0xD800 or 0xDBFF weren't detected as surrogate\npairs and were deoded incorrectly.\n\nThis would usually manifest as an incompatible encoding error while\nextracting text:\n\n /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `strip': invalid byte sequence in UTF-8 (Encoding::CompatibilityError)\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `block in interesting_rows'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `map'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `interesting_rows'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:46:in `to_s'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page.rb:121:in `text'\n from bin/pdf_text:12:in `block in
'\n from bin/pdf_text:11:in `each'\n from bin/pdf_text:11:in `
'","shortMessageHtmlLink":"Fix decoding of some UTF-16 strings that use surrogate pairs"}},{"before":null,"after":"f08170fd3bf5e328552ecbd32cf74790c1bd906c","ref":"refs/heads/fix-utf16-surrogate-pairs","pushedAt":"2023-12-26T00:19:41.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Fix decoding of some UTF-16 strings that use surrogate pairs\n\nWhen extracting PDF text to UTF-8, some fonts use a ToUnicode mapping\nthat's defined using a CMap table. CMap tables define unicde using\nUTF-16 and for reasons, we unwisely do the decoding of UTF16 to\ncodepoints ourselves instead of deferring to a library.\n\nTurns out we had a boundary bug, where some codepoints that get encoded\nwith the surrogate pair 0xD800 or 0xDBFF weren't detected as surrogate\npairs and were deoded incorrectly.\n\nThis would usually manifest as an incompatible encoding error while\nextracting text:\n\n /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `strip': invalid byte sequence in UTF-8 (Encoding::CompatibilityError)\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `block in interesting_rows'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `map'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:66:in `interesting_rows'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page_layout.rb:46:in `to_s'\n from /home/jh/git/pdf-reader/lib/pdf/reader/page.rb:121:in `text'\n from bin/pdf_text:12:in `block in
'\n from bin/pdf_text:11:in `each'\n from bin/pdf_text:11:in `
'","shortMessageHtmlLink":"Fix decoding of some UTF-16 strings that use surrogate pairs"}},{"before":"b517d4532433694a275ba86161e3fdbec9776cb3","after":"936de65769c507d140bd2e304e3e5358e0f53bcc","ref":"refs/heads/stringscanner","pushedAt":"2023-12-25T12:15:13.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"allocate slightly fewer objects in BufferNew","shortMessageHtmlLink":"allocate slightly fewer objects in BufferNew"}},{"before":"ea4370c82545d7f8de229b426399cb4d9c4086a0","after":"7762166a56e0bac6c9589a95647e6e6c4cd744f9","ref":"refs/heads/main","pushedAt":"2023-12-25T12:14:54.000Z","pushType":"pr_merge","commitsCount":2,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Merge pull request #528 from yob/optimize-hexstring\n\nReduce allocations when parsing hex strings","shortMessageHtmlLink":"Merge pull request #528 from yob/optimize-hexstring"}},{"before":"803c1b6259f7d6d69f28d103195e7273b4d7ec04","after":null,"ref":"refs/heads/optimize-hexstring","pushedAt":"2023-12-25T12:14:54.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"}},{"before":"bec976aac55cefe31391e5c5772eac5af4f8f491","after":"803c1b6259f7d6d69f28d103195e7273b4d7ec04","ref":"refs/heads/optimize-hexstring","pushedAt":"2023-12-25T12:08:00.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"Reduce allocations when parsing hex strings\n\nRunning a script based on one shared by Aaron at [1], I noticed we allocate a\nsurprising number of objects when parsing hex strings.\n\nThe allocations.rb script (see below) when parsing a file with lots of hex\nstrings shows the hex_string method as the top source of allocations. We can\nfix that!\n\n-- before\n\n $ ruby allocations.rb | head -n 10\n sourcefile sourceline class count\n ------------------------------------------------------ ---------- --------------------------------------- -----\n /lib/pdf/reader/parser.rb 176 Array 65246\n /lib/pdf/reader/parser.rb 176 String 63124\n /lib/pdf/reader/parser.rb 177 String 53500\n /lib/pdf/reader/buffer.rb 362 String 41386\n /lib/pdf/reader/buffer.rb 384 String 27386\n /lib/pdf/reader/transformation_matrix.rb 20 Array 19238\n /lib/pdf/reader/page_state.rb 243 Array 14846\n /lib/pdf/reader/encoding.rb 143 Array 14336\n\n $ ruby benchmark.rb\n ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux]\n Warming up --------------------------------------\n 1.000 i/100ms\n Calculating -------------------------------------\n 1.973 (ยฑ 0.0%) i/s - 20.000 in 10.135409s\n {:ALLOCATIONS=>772349}\n\n-- after\n\n $ ruby allocations.rb | head -n 10\n sourcefile sourceline class count\n ------------------------------------------------------ ---------- --------------------------------------- -----\n /lib/pdf/reader/buffer.rb 362 String 41386\n /lib/pdf/reader/buffer.rb 384 String 27386\n /lib/pdf/reader/transformation_matrix.rb 20 Array 19238\n 8 String 17047\n /lib/pdf/reader/page_state.rb 243 Array 14846\n /lib/pdf/reader/encoding.rb 143 Array 14336\n /lib/pdf/reader/page_state.rb 342 PDF::Reader::TransformationMatrix 10743\n /lib/pdf/reader/transformation_matrix.rb 115 Array 10641\n\n $ ruby benchmark.rb\n ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux]\n Warming up --------------------------------------\n 1.000 i/100ms\n Calculating -------------------------------------\n 2.097 (ยฑ 0.0%) i/s - 21.000 in 10.017634s\n {:ALLOCATIONS=>561561}\n\n-- benchmark.rb\n\n $ cat benchmark.rb\n #!/bin/env ruby\n\n $LOAD_PATH << \"lib\"\n require \"pdf/reader\"\n require \"benchmark/ips\"\n\n def allocations\n x = GC.stat(:total_allocated_objects)\n yield\n GC.stat(:total_allocated_objects) - x\n end\n\n def go\n doc = PDF::Reader.new(File.join(File.dirname(__FILE__), \"spec/data/cairo-unicode.pdf\"))\n doc.pages.each do |page|\n page.text #extract the text but do nothing with it\n end\n end\n\n Benchmark.ips { |x|\n x.config(:time => 10, :warmup => 5)\n x.report {\n go\n }\n }\n p ALLOCATIONS: allocations { go }\n\n-- allocations.rb\n\n $ cat allocations.rb\n #!/bin/env ruby\n\n $LOAD_PATH << \"lib\"\n require \"pdf/reader\"\n require \"allocation_stats\"\n\n FILENAME = File.join(File.dirname(__FILE__), \"spec/data/cairo-unicode.pdf\")\n\n def go\n doc = PDF::Reader.new(FILENAME)\n doc.pages.each do |page|\n page.text #extract the text but do nothing with it\n end\n end\n\n stats = AllocationStats.trace { go }\n puts stats.allocations(alias_paths: true).group_by(:sourcefile, :sourceline, :class).sort_by_size.to_text\n\n[1] https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html","shortMessageHtmlLink":"Reduce allocations when parsing hex strings"}},{"before":null,"after":"bec976aac55cefe31391e5c5772eac5af4f8f491","ref":"refs/heads/optimize-hexstring","pushedAt":"2023-12-25T11:52:55.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"will it build on all rubies","shortMessageHtmlLink":"will it build on all rubies"}},{"before":"1da8a87199eb7287554ed229682bd7b6b5f9ae59","after":null,"ref":"refs/heads/jellyfish","pushedAt":"2023-12-25T11:42:49.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"}},{"before":"047c437dfb17c1611eeb1ded39dd56bac611cb66","after":"b517d4532433694a275ba86161e3fdbec9776cb3","ref":"refs/heads/stringscanner","pushedAt":"2023-12-25T11:40:59.000Z","pushType":"force_push","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"},"commit":{"message":"allocate slightly fewer objects in BufferNew","shortMessageHtmlLink":"allocate slightly fewer objects in BufferNew"}},{"before":"2f8e72598457b8ca22ad9de282ecb4e006fc16c2","after":null,"ref":"refs/heads/3-3-rc1","pushedAt":"2023-12-25T11:33:28.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"yob","name":"James Healy","path":"/yob","primaryAvatarUrl":"https://avatars.githubusercontent.com/u/8132?s=80&v=4"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"djE6ks8AAAAEQU2SgQA","startCursor":null,"endCursor":null}},"title":"Activity ยท yob/pdf-reader"}