/
pdf.c
5103 lines (4303 loc) · 166 KB
/
pdf.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright (C) 2013-2024 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
* Copyright (C) 2007-2013 Sourcefire, Inc.
*
* Authors: Nigel Horne, Török Edvin
*
* Also based on Matt Olney's pdf parser in snort-nrt.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
* TODO: Embedded fonts
* TODO: Predictor image handling
*/
#if HAVE_CONFIG_H
#include "clamav-config.h"
#endif
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <ctype.h>
#include <string.h>
#include <fcntl.h>
#include <stdlib.h>
#include <errno.h>
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <zlib.h>
#if HAVE_ICONV
#include <iconv.h>
#endif
#ifdef _WIN32
#include <stdint.h>
#endif
#include "clamav.h"
#include "others.h"
#include "pdf.h"
#include "pdfdecode.h"
#include "scanners.h"
#include "fmap.h"
#include "str.h"
#include "entconv.h"
#include "bytecode.h"
#include "bytecode_api.h"
#include "arc4.h"
#include "rijndael.h"
#include "textnorm.h"
#include "conv.h"
#include "json_api.h"
#ifdef CL_DEBUG
/*#define SAVE_TMP
*Save the file being worked on in tmp */
#endif
/* Upper bound on the number of objects recorded in pdf->objs.
 * pdf_findobj() and pdf_findobj_in_objstm() stop adding objects and set
 * BAD_PDF_TOOMANYOBJS in pdf->flags once pdf->nobjs reaches this value. */
#define MAX_PDF_OBJECTS (64 * 1024)
struct pdf_struct;
/* Forward declarations of static helpers that are used before their
 * definitions appear in this file. */
static const char *pdf_nextlinestart(const char *ptr, size_t len);
static const char *pdf_nextobject(const char *ptr, size_t len);
/* PDF statistics callbacks and related */
struct pdfname_action;
/* The callbacks below are only declared when HAVE_JSON is set.
 * Apart from pdf_export_json(), they all share the same
 * (pdf, obj, action) signature. */
#if HAVE_JSON
static void pdf_export_json(struct pdf_struct *);
static void ASCIIHexDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void ASCII85Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void EmbeddedFile_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void FlateDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Image_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void LZWDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void RunLengthDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void CCITTFaxDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void JBIG2Decode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void DCTDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void JPXDecode_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Crypt_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Standard_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Sig_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void JavaScript_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void OpenAction_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Launch_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Page_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Author_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Creator_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Producer_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void CreationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void ModificationDate_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Title_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Subject_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Keywords_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Pages_cb(struct pdf_struct *, struct pdf_obj *, struct pdfname_action *);
static void Colors_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void RichMedia_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void AcroForm_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
static void XFA_cb(struct pdf_struct *pdf, struct pdf_obj *obj, struct pdfname_action *act);
#endif
/* End PDF statistics callbacks and related */
/* NOTE(review): judging by their names and `key` parameters, the helpers
 * below read values out of a PDF dictionary buffer; their definitions are
 * outside this part of the file -- confirm there before relying on this. */
static int pdf_readint(const char *q0, int len, const char *key);
static const char *pdf_getdict(const char *q0, int *len, const char *key);
static char *pdf_readval(const char *q, int len, const char *key);
static char *pdf_readstring(const char *q0, int len, const char *key, unsigned *slen, const char **qend, bool noescape);
/**
 * @brief Check whether a buffer looks like the start of a cross-reference section.
 *
 * Leading spaces, line feeds and carriage returns are skipped first. The check
 * then succeeds if the data begins with the "xref" keyword, or if "/XRef"
 * occurs at or after that point with at least one more byte following it.
 *
 * @param xref Start of the candidate data.
 * @param eof  One past the last byte that may be examined.
 *
 * @return 0 if "xref" or "/XRef" was found.
 * @return -1 otherwise, including when fewer than five bytes remain after the
 *         leading whitespace.
 */
static int xrefCheck(const char *xref, const char *eof)
{
    static const char table_kw[]  = "xref";
    static const char stream_kw[] = "/XRef";
    const char *cursor;

    /* Step over leading blanks and line breaks. */
    for (; xref < eof; xref++) {
        if (*xref != ' ' && *xref != '\n' && *xref != '\r')
            break;
    }

    if (xref + 4 >= eof)
        return -1;

    if (0 == memcmp(xref, table_kw, sizeof(table_kw) - 1)) {
        cli_dbgmsg("cli_pdf: found xref\n");
        return 0;
    }

    /* could be xref stream */
    cursor = xref;
    while (cursor + 5 < eof) {
        if (0 == memcmp(cursor, stream_kw, sizeof(stream_kw) - 1)) {
            cli_dbgmsg("cli_pdf: found /XRef\n");
            return 0;
        }
        cursor++;
    }

    return -1;
}
/* define this to be noisy about things that we can't parse properly */
/* NOISY is explicitly undefined here, so unless this line is edited the
 * #else branch is taken and both macros below expand to nothing. */
#undef NOISY
#ifdef NOISY
/* Noisy build: forward to cli_infomsg() (using pdf->ctx) and cli_warnmsg(). */
#define noisy_msg(pdf, ...) cli_infomsg(pdf->ctx, __VA_ARGS__)
#define noisy_warnmsg(...) cli_warnmsg(__VA_ARGS__)
#else
/* Quiet build: the calls compile away and their arguments are not evaluated. */
#define noisy_msg(pdf, ...)
#define noisy_warnmsg(...)
#endif
/**
 * @brief Walk BACKwards over whitespace until a non-whitespace byte is reached.
 *
 * The bytes treated as whitespace are NUL, tab, line feed, form feed,
 * carriage return and space.
 *
 * @param q     Position to start from (the high end of the search space).
 * @param start Lowest address of the search space. The byte at `start` itself
 *              is never inspected.
 *
 * @return const char* Address of the first non-whitespace byte met while
 *         walking backwards, or `start` if only whitespace was seen before
 *         reaching it. If `q` is not above `start` to begin with, `q` is
 *         returned unchanged.
 */
static const char *findNextNonWSBack(const char *q, const char *start)
{
    for (; q > start; q--) {
        switch (*q) {
            case '\0':
            case '\t':
            case '\n':
            case '\f':
            case '\r':
            case ' ':
                /* still whitespace: keep stepping back */
                continue;
            default:
                return q;
        }
    }
    return q;
}
/**
 * @brief Walk FORwards over whitespace until a non-whitespace byte is reached.
 *
 * The bytes treated as whitespace are NUL, tab, line feed, form feed,
 * carriage return and space.
 *
 * @param q   Position to start from (the low end of the search space).
 * @param end One past the last byte of the search space; never dereferenced.
 *
 * @return const char* Address of the first non-whitespace byte at or after
 *         `q`, or `end` if only whitespace was seen before reaching it. If
 *         `q` is not below `end` to begin with, `q` is returned unchanged.
 */
static const char *findNextNonWS(const char *q, const char *end)
{
    for (; q < end; q++) {
        switch (*q) {
            case '\0':
            case '\t':
            case '\n':
            case '\f':
            case '\r':
            case ' ':
                /* still whitespace: keep stepping forward */
                continue;
            default:
                return q;
        }
    }
    return q;
}
/**
 * @brief Locate the data between the "stream" and "endstream" keywords.
 *
 * The first occurrence of "stream" in the buffer marks the beginning. One
 * line break directly after the keyword (either "\r\n" or "\n") is skipped
 * before the data is considered to start. If "endstream" cannot be found the
 * stream is treated as truncated and the data is reported as running to the
 * end of the provided buffer.
 *
 * @param start            Start address of the search space.
 * @param size             Size of the search space in bytes.
 * @param[out] stream      Receives the address of the first byte of stream
 *                         data; set to NULL when no stream start is found.
 * @param[out] stream_size Receives the size of the stream data; set to 0 when
 *                         no stream start is found.
 * @param newline_hack     When non-zero, an additional "\n" that follows a
 *                         "\r\n" pair is skipped as well (only if more than
 *                         two bytes remain after the pair).
 *
 * @return cl_error_t CL_SUCCESS  if both ends of the stream were found.
 * @return cl_error_t CL_BREAK    if no usable "stream" keyword was found.
 * @return cl_error_t CL_EFORMAT  if the start was found but not the end (truncated).
 * @return cl_error_t CL_EARG     if invalid args were provided.
 */
static cl_error_t find_stream_bounds(
    const char *start,
    size_t size,
    const char **stream,
    size_t *stream_size,
    int newline_hack)
{
    static const char stream_kw[]    = "stream";
    static const char endstream_kw[] = "endstream";

    const char *keyword;
    const char *data;
    const char *data_end;
    size_t remaining;

    if (NULL == start || 0 == size || NULL == stream || NULL == stream_size)
        return CL_EARG;

    *stream      = NULL;
    *stream_size = 0;

    /* Begin by finding the "stream" string that prefixes stream data. */
    keyword = cli_memstr(start, size, stream_kw, sizeof(stream_kw) - 1);
    if (NULL == keyword)
        return CL_BREAK;

    data = keyword + (sizeof(stream_kw) - 1);
    if ((size_t)(data - start) >= size) {
        /* The keyword is the very last thing in the buffer: no data at all. */
        return CL_BREAK;
    }
    remaining = size - (size_t)(data - start);

    /* Skip the line break that separates the keyword from the data. */
    if (remaining >= 2 && '\r' == data[0] && '\n' == data[1]) {
        data += 2;
        remaining -= 2;
        if (newline_hack && remaining > 2 && '\n' == data[0]) {
            data += 1;
            remaining -= 1;
        }
    } else if (remaining > 0 && '\n' == data[0]) {
        data += 1;
        remaining -= 1;
    }

    /* Pass back start of the stream data. */
    *stream = data;

    /* Now find the "endstream" string that suffixes stream data. */
    data_end = cli_memstr(data, remaining, endstream_kw, sizeof(endstream_kw) - 1);
    if (NULL != data_end) {
        *stream_size = (size_t)(data_end - data);
        return CL_SUCCESS;
    }

    /* Couldn't find "endstream", but that's ok --
     * -- we'll just count the rest of the provided buffer. */
    cli_dbgmsg("find_stream_bounds: Truncated stream found!\n");
    *stream_size = (size_t)((start + size) - data);
    return CL_EFORMAT;
}
/**
 * @brief Find the next *indirect* object in an object stream, add it to our
 *        list of objects, and increment pdf->nobjs.
 *
 * Indirect objects in a stream DON'T begin with "obj" and end with "endobj".
 * Instead, the stream starts with pairs of integers, an objid followed by an
 * offset, and each offset is relative to the first object (objstm->first).
 *
 * The object stream takes the following format:
 *
 *      <dictionary describing stream> objstm content endobjstm
 *
 * where content looks something like the following:
 *
 *      15 0 16 3 17 46 (ab)<</IDS 8 0 R/JavaScript 27 0 R/URLS 9 0 R>><</Names[(Test)28 0 R]>>
 *
 * In the above example, the literal string (ab) is indirect object # 15, and
 * begins at offset 0 of the set of objects. The next object, # 16, begins at
 * offset 3 and is a dictionary. The final object is also a dictionary,
 * beginning at offset 46.
 *
 * On each call one pair is consumed, starting at objstm->current_pair:
 *  - objstm->current is set to the position of the object just found
 *    (objstm->first plus the offset read from the pair),
 *  - objstm->current_pair is advanced to the next pair,
 *  - objstm->nobjs_found is incremented.
 * The size of the object is taken to run up to the offset of the next pair,
 * or to the end of the stream buffer if there is no next pair.
 *
 * @param pdf            Pdf struct that keeps track of all information found in the PDF.
 * @param objstm         Object stream state (buffer, position of the pairs, counters).
 * @param[out] obj_found Receives the newly added object on success. It is set
 *                       to NULL on every failure except CL_EARG.
 *
 * @return CL_SUCCESS if success
 * @return CL_BREAK   if the maximum number of objects was already reached
 * @return CL_EPARSE  if parsing error
 * @return CL_EMEM    if error allocating memory
 * @return CL_EARG    if invalid arguments (any pointer argument is NULL)
 */
int pdf_findobj_in_objstm(struct pdf_struct *pdf, struct objstm_struct *objstm, struct pdf_obj **obj_found)
{
    cl_error_t status      = CL_EPARSE;
    struct pdf_obj *obj    = NULL;
    unsigned long objid    = 0;
    unsigned long objoff   = 0;
    long temp_long         = 0;
    const char *index      = NULL;
    const char *stream_end = NULL;
    size_t bytes_remaining = 0;

    if (NULL == pdf || NULL == objstm || NULL == obj_found) {
        cli_warnmsg("pdf_findobj_in_objstm: invalid arguments\n");
        return CL_EARG;
    }

    /* Clear the output first so that every failure path below, including the
     * object-limit path, leaves it in a defined state. */
    *obj_found = NULL;

    if (pdf->nobjs >= MAX_PDF_OBJECTS) {
        pdf->flags |= 1 << BAD_PDF_TOOMANYOBJS;

        cli_dbgmsg("pdf_findobj_in_objstm: reached object maximum\n");
        status = CL_BREAK;
        goto done;
    }

    stream_end      = objstm->streambuf + objstm->streambuf_len;
    index           = objstm->streambuf + objstm->current_pair;
    bytes_remaining = objstm->streambuf_len - objstm->current_pair;

    obj = calloc(1, sizeof(*obj));
    if (!obj) {
        cli_warnmsg("pdf_findobj_in_objstm: out of memory finding objects in stream\n");
        status = CL_EMEM;
        goto done;
    }

    /* This object is in a stream, not in the regular map buffer. */
    obj->objstm = objstm;

    /* objstm->current_pair points directly to the objid */
    if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
        /* Failed to find objid */
        cli_dbgmsg("pdf_findobj_in_objstm: Failed to find objid for obj in object stream\n");
        status = CL_EPARSE;
        goto done;
    } else if (temp_long < 0) {
        cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative objid (%ld).\n", temp_long);
        status = CL_EPARSE;
        goto done;
    }
    objid = (unsigned long)temp_long;

    /* Find the obj offset that appears just after the objid.
     * The cast keeps isdigit() defined for bytes >= 0x80 in untrusted data. */
    while ((index < stream_end) && isdigit((unsigned char)*index)) {
        index++;
    }
    index           = findNextNonWS(index, objstm->streambuf + objstm->first);
    bytes_remaining = (size_t)(stream_end - index);

    if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
        /* Failed to find obj offset */
        cli_dbgmsg("pdf_findobj_in_objstm: Failed to find obj offset for obj in object stream\n");
        status = CL_EPARSE;
        goto done;
    } else if (temp_long < 0) {
        cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative obj offset (%ld).\n", temp_long);
        status = CL_EPARSE;
        goto done;
    }
    objoff = (unsigned long)temp_long;

    if ((size_t)objstm->first + (size_t)objoff > objstm->streambuf_len) {
        /* Alleged obj location is further than the length of the stream */
        cli_dbgmsg("pdf_findobj_in_objstm: obj offset found is greater than the length of the stream.\n");
        status = CL_EPARSE;
        goto done;
    }

    objstm->current = objstm->first + objoff;

    /* The generation number part of the id is fixed at 0 here. */
    obj->id    = (objid << 8) | (0 & 0xff);
    obj->start = objstm->current;
    obj->flags = 0;

    objstm->nobjs_found++;

    /* Step over the offset digits to reach the next pair. */
    while ((index < stream_end) && isdigit((unsigned char)*index)) {
        index++;
    }
    objstm->current_pair = (uint32_t)(findNextNonWS(index, objstm->streambuf + objstm->first) - objstm->streambuf);

    /* Update current_pair, if there are more */
    if ((objstm->nobjs_found < objstm->n) &&
        (index < stream_end)) {
        unsigned long next_objoff = 0;

        /*
         * While we're at it,
         *   lets record the size as running up to the next object offset.
         *
         * To do so, we will need to parse the next obj pair.
         * The objid of that pair is not needed, so it is only skipped over.
         */
        /* objstm->current_pair points directly to the objid */
        index = objstm->streambuf + objstm->current_pair;

        /* Find the obj offset that appears just after the objid */
        while ((index < stream_end) && isdigit((unsigned char)*index)) {
            index++;
        }
        index           = findNextNonWS(index, objstm->streambuf + objstm->first);
        bytes_remaining = (size_t)(stream_end - index);

        if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
            /* Failed to find obj offset for next obj */
            cli_dbgmsg("pdf_findobj_in_objstm: Failed to find next obj offset for obj in object stream though there should be {%u} more.\n", objstm->n - objstm->nobjs_found);
            status = CL_EPARSE;
            goto done;
        } else if (temp_long < 0) {
            cli_dbgmsg("pdf_findobj_in_objstm: Encountered invalid negative obj offset (%ld).\n", temp_long);
            status = CL_EPARSE;
            goto done;
        }
        next_objoff = (unsigned long)temp_long;

        if (next_objoff <= objoff) {
            /* Offsets must be strictly increasing */
            cli_dbgmsg("pdf_findobj_in_objstm: Found next obj offset for obj in object stream but it's less than or equal to the current one!\n");
            status = CL_EPARSE;
            goto done;
        } else if (objstm->first + next_objoff > objstm->streambuf_len) {
            /* Next offset points past the end of the stream */
            cli_dbgmsg("pdf_findobj_in_objstm: Found next obj offset for obj in object stream but it's further out than the size of the stream!\n");
            status = CL_EPARSE;
            goto done;
        }

        obj->size = next_objoff - objoff;
    } else {
        /*
         * Should be no more objects. We should verify.
         *
         * Either way...
         *   obj->size should be the rest of the buffer.
         */
        if (objstm->nobjs_found < objstm->n) {
            cli_warnmsg("pdf_findobj_in_objstm: Fewer objects found in object stream than expected!\n");
        }

        obj->size = objstm->streambuf_len - obj->start;
    }

    /* Success! Add the object to the list of all objects found.
     *
     * Grow the list for one more entry BEFORE bumping pdf->nobjs, so that a
     * failed allocation cannot leave the count larger than the array.
     * CLI_REALLOC is expected to run the supplied statements and jump to
     * `done` when the allocation fails. */
    CLI_REALLOC(pdf->objs, sizeof(struct pdf_obj *) * (pdf->nobjs + 1),
                cli_warnmsg("pdf_findobj_in_objstm: out of memory finding objects in stream\n"),
                status = CL_EMEM);
    pdf->objs[pdf->nobjs] = obj;
    pdf->nobjs++;

    *obj_found = obj;

    status = CL_SUCCESS;

done:
    if (CL_SUCCESS != status) {
        /* The object never made it into pdf->objs, so it is ours to release. */
        free(obj);
    }
    return status;
}
/**
 * @brief Find the next *indirect* object.
 *
 * Indirect objects located outside of an object stream are prefaced with:
 *      <objid> <genid> obj
 *
 * Each of the above are separated by whitespace of some sort.
 *
 * Indirect objects are postfaced with:
 *      endobj
 *
 * The specification does not say if whitespace is required before or after "endobj".
 *
 * Identify truncated objects.
 *
 * If found, pdf->offset will be updated to just after the "endobj".
 * If truncated, pdf->offset will == pdf->size.
 * If an "obj" keyword was found but its ids could not be parsed,
 *   pdf->offset will be updated to just after that "obj".
 * If not found, pdf->offset will not be updated.
 *
 * On success the new object is the last entry of pdf->objs and pdf->nobjs has
 * been incremented. On any failure pdf->objs / pdf->nobjs are left describing
 * exactly the objects that were present before the call.
 *
 * @param pdf   Pdf context struct that keeps track of all information found in the PDF.
 *
 * @return CL_SUCCESS  if success
 * @return CL_BREAK    if no more objects (or the object limit was reached)
 * @return CL_EPARSE   if parsing error
 * @return CL_EMEM     if error allocating memory
 */
cl_error_t pdf_findobj(struct pdf_struct *pdf)
{
    cl_error_t status = CL_EPARSE;
    const char *start, *idx, *genid_search_index, *objid_search_index;
    const char *obj_begin = NULL, *obj_end = NULL;
    const char *endobj_begin = NULL, *endobj_end = NULL;
    struct pdf_obj *obj = NULL;
    /* True once `obj` occupies pdf->objs[pdf->nobjs - 1]; the cleanup code
     * must only touch the list when this is set. */
    bool slot_claimed = false;
    size_t bytesleft;
    unsigned long genid, objid;
    long temp_long;

    if (pdf->nobjs >= MAX_PDF_OBJECTS) {
        pdf->flags |= 1 << BAD_PDF_TOOMANYOBJS;

        cli_dbgmsg("pdf_findobj: reached object maximum\n");
        status = CL_BREAK;
        goto done;
    }

    /* Grow the list for one more entry before counting it, so that a failed
     * allocation leaves pdf->nobjs consistent with the array.
     * CLI_REALLOC is expected to run the supplied statement and jump to
     * `done` when the allocation fails. */
    CLI_REALLOC(pdf->objs, sizeof(struct pdf_obj *) * (pdf->nobjs + 1), status = CL_EMEM);

    obj = calloc(1, sizeof(*obj));
    if (!obj) {
        status = CL_EMEM;
        goto done;
    }

    pdf->objs[pdf->nobjs] = obj;
    pdf->nobjs++;
    slot_claimed = true;

    start     = pdf->map + pdf->offset;
    bytesleft = pdf->size - pdf->offset;

    /*
     * Start by searching for "obj"
     */
    idx = start + 1;
    while (bytesleft > 1 + strlen("obj")) {
        /* `- 1` accounts for size of white space before obj */
        idx = cli_memstr(idx, bytesleft - 1, "obj", strlen("obj"));
        if (NULL == idx) {
            status = CL_BREAK;
            goto done; /* No more objs. */
        }

        /* verify that the word has a whitespace before it, and is not the end of
         * a previous word */
        idx--;
        bytesleft = (pdf->size - pdf->offset) - (size_t)(idx - start);

        if (*idx != 0 && *idx != 9 && *idx != 0xa && *idx != 0xc && *idx != 0xd && *idx != 0x20) {
            /* This instance of "obj" appears to be part of a longer string.
             * Skip it, and keep searching for an object. */
            idx += 1 + strlen("obj");
            bytesleft -= 1 + strlen("obj");
            continue;
        }

        /* Found the beginning of the word */
        obj_begin = idx;
        obj_end   = idx + 1 + strlen("obj");
        break;
    }

    if ((NULL == obj_begin) || (NULL == obj_end)) {
        status = CL_BREAK;
        goto done; /* No more objs. */
    }

    if (obj_begin == start) {
        /* Nothing precedes the whitespace in front of "obj", so there is no
         * genid to read. Searching backwards from here would look at memory
         * before the start of the search space. */
        cli_dbgmsg("pdf_findobj: Failed to parse object genid (# objects found: %u)\n", pdf->nobjs);
        pdf->offset = obj_end - pdf->map;
        status      = CL_EPARSE;
        goto done;
    }

    /* Find the generation id (genid) that appears before the "obj".
     * The casts keep isdigit() defined for bytes >= 0x80 in untrusted data. */
    genid_search_index = findNextNonWSBack(obj_begin - 1, start);
    while (genid_search_index > start && isdigit((unsigned char)*genid_search_index))
        genid_search_index--;

    if (CL_SUCCESS != cli_strntol_wrap(genid_search_index, (size_t)((obj_begin)-genid_search_index), 0, 10, &temp_long)) {
        cli_dbgmsg("pdf_findobj: Failed to parse object genid (# objects found: %u)\n", pdf->nobjs);
        /* Failed to parse, probably not a real object.  Skip past the "obj" thing, and continue. */
        pdf->offset = obj_end - pdf->map;
        status      = CL_EPARSE;
        goto done;
    } else if (temp_long < 0) {
        cli_dbgmsg("pdf_findobj: Encountered invalid negative obj genid (%ld).\n", temp_long);
        pdf->offset = obj_end - pdf->map;
        status      = CL_EPARSE;
        goto done;
    }
    genid = (unsigned long)temp_long;

    if (genid_search_index == start) {
        /* The genid begins at the very start of the search space, so no
         * objid can precede it. Stop here rather than reading before `start`. */
        cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
        pdf->offset = obj_end - pdf->map;
        status      = CL_EPARSE;
        goto done;
    }

    /* Find the object id (objid) that appears before the genid */
    objid_search_index = findNextNonWSBack(genid_search_index - 1, start);
    while (objid_search_index > start && isdigit((unsigned char)*objid_search_index))
        objid_search_index--;

    if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)((genid_search_index)-objid_search_index), 0, 10, &temp_long)) {
        /*
         * Edge case:
         *
         * PDFs with multiple revisions will have %%EOF before the end of the file,
         * followed by the next revision of the PDF, which will probably be an immediate objid.
         *
         * Example:
         *   %%EOF1 1 obj <blah> endobj
         *
         * If this is the case, we can detect it and continue parsing after the %%EOF.
         */
        /* Compare distances rather than forming `objid_search_index - 4`,
         * which could point before the buffer. */
        if ((size_t)(objid_search_index - start) > strlen("%%EO")) {
            const char *lastfile = objid_search_index - strlen("%%EO");
            if (0 != strncmp(lastfile, "%%EOF", 5)) {
                /* Nope, wasn't %%EOF */
                cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
                /* Skip past the "obj" thing, and continue. */
                pdf->offset = obj_end - pdf->map;
                status      = CL_EPARSE;
                goto done;
            }
            /* Yup, Looks, like the file continues after %%EOF.
             * Probably another revision.  Keep parsing... */
            objid_search_index++;
            cli_dbgmsg("pdf_findobj: %%EOF detected before end of file, at offset: %zu\n", (size_t)(objid_search_index - pdf->map));
        } else {
            /* Failed parsing at the very beginning */
            cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
            /* Probably not a real object.  Skip past the "obj" thing, and continue. */
            pdf->offset = obj_end - pdf->map;
            status      = CL_EPARSE;
            goto done;
        }

        /* Try again, now starting at the first digit after the 'F'.
         * The span runs up to genid_search_index (the separator in front of
         * the genid); shortening it any further would cut off the last digit
         * of the objid. */
        if (CL_SUCCESS != cli_strntol_wrap(objid_search_index, (size_t)(genid_search_index - objid_search_index), 0, 10, &temp_long)) {
            cli_dbgmsg("pdf_findobj: Failed to parse object objid (# objects found: %u)\n", pdf->nobjs);
            /* Still failed... Probably not a real object.  Skip past the "obj" thing, and continue. */
            pdf->offset = obj_end - pdf->map;
            status      = CL_EPARSE;
            goto done;
        } else if (temp_long < 0) {
            cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long);
            pdf->offset = obj_end - pdf->map;
            status      = CL_EPARSE;
            goto done;
        }

        cli_dbgmsg("pdf_findobj: There appears to be an additional revision. Continuing to parse...\n");
    } else if (temp_long < 0) {
        cli_dbgmsg("pdf_findobj: Encountered invalid negative objid (%ld).\n", temp_long);
        pdf->offset = obj_end - pdf->map;
        status      = CL_EPARSE;
        goto done;
    }
    objid = (unsigned long)temp_long;

    /* The id packs the objid above the low 8 bits of the genid. */
    obj->id    = (objid << 8) | (genid & 0xff);
    obj->start = obj_end - pdf->map; /* obj start begins just after the "obj" string */
    obj->flags = 0;

    /*
     * We now have the objid, genid, and object start.
     * Find the object end ("endobj").
     */
    endobj_begin = cli_memstr(obj_end, pdf->map + pdf->size - obj_end, "endobj", strlen("endobj"));
    if (NULL == endobj_begin) {
        /* No end to object.
         * PDF appears to be malformed or truncated.
         * Will record the object size as going to the end of the file.
         * Will record that the object is truncated.
         * Will position the pdf offset to the end of the PDF.
         * The next iteration of this function will find no more objects. */
        obj->flags |= 1 << OBJ_TRUNCATED;
        obj->size   = (pdf->map + pdf->size) - obj_end;
        pdf->offset = pdf->size;

        /* Truncated "object" found! */
        status = CL_SUCCESS;
        goto done;
    }

    endobj_end = endobj_begin + strlen("endobj");

    /* Size of the object goes from "obj" <-> "endobj". */
    obj->size   = endobj_begin - obj_end;
    pdf->offset = endobj_end - pdf->map;

    /*
     * Object found!
     */
    status = CL_SUCCESS;

done:
    if (status == CL_SUCCESS) {
        cli_dbgmsg("pdf_findobj: found %d %d obj @%lld, size: %zu bytes.\n", obj->id >> 8, obj->id & 0xff, (long long)(obj->start + pdf->startoff), obj->size);
    } else {
        if (slot_claimed) {
            /* Remove the unused obj reference from our list of objects found */
            /* No need to realloc pdf->objs back down.  It won't leak. */
            pdf->objs[pdf->nobjs - 1] = NULL;
            pdf->nobjs--;
        }

        /* Free up the obj struct (a no-op when none was allocated). */
        free(obj);

        if (status == CL_BREAK) {
            cli_dbgmsg("pdf_findobj: No more objects (# objects found: %u)\n", pdf->nobjs);
        } else if (status == CL_EMEM) {
            cli_warnmsg("pdf_findobj: Error allocating memory (# objects found: %u)\n", pdf->nobjs);
        } else {
            cli_dbgmsg("pdf_findobj: Unexpected status code %d.\n", status);
        }
    }

    return status;
}
/**
 * @brief Write a buffer to a file descriptor while tracking the running total.
 *
 * Before writing, the running total is passed to cli_checklimits(). If that
 * call reports a non-zero result the data is dropped, but `len` is still
 * returned so that the caller sees what looks like a complete write.
 *
 * @param pdf   Pdf context structure; pdf->ctx is handed to cli_checklimits().
 * @param obj   Unused.
 * @param fout  File descriptor to write to.
 * @param buf   Data to write.
 * @param len   Number of bytes in `buf`.
 * @param sum   Running total of bytes; increased by `len` when a write is attempted.
 *
 * @return size_t `len` if the limits check declined the write, otherwise
 *         whatever cli_writen() returned.
 */
static size_t filter_writen(struct pdf_struct *pdf, struct pdf_obj *obj, int fout, const char *buf, size_t len, size_t *sum)
{
    size_t result = len; /* pretend it was a successful write to suppress CL_EWRITE */

    UNUSEDPARAM(obj);

    /* TODO: May truncate for large values on 64-bit platforms */
    if (!cli_checklimits("pdf", pdf->ctx, (unsigned long)*sum, 0, 0)) {
        *sum += len;
        result = cli_writen(fout, buf, len);
    }

    return result;
}
/**
 * @brief Map a pdf flag to the short description used in debug output.
 *
 * @param flag Flag to describe.
 *
 * @return const char* Description text, or an empty string for flags that
 *         have no description.
 */
static const char *pdfobj_flag_description(enum pdf_flag flag)
{
    switch (flag) {
        case UNTERMINATED_OBJ_DICT:
            return "dictionary not terminated";
        case ESCAPED_COMMON_PDFNAME:
            /* like /JavaScript */
            return "escaped common pdfname";
        case BAD_STREAM_FILTERS:
            return "duplicate stream filters";
        case BAD_PDF_VERSION:
            return "bad pdf version";
        case BAD_PDF_HEADERPOS:
            return "bad pdf header position";
        case BAD_PDF_TRAILER:
            return "bad pdf trailer";
        case BAD_PDF_TOOMANYOBJS:
            return "too many pdf objs";
        case BAD_FLATE:
            return "bad deflate stream";
        case BAD_FLATESTART:
            return "bad deflate stream start";
        case BAD_STREAMSTART:
            return "bad stream start";
        case UNKNOWN_FILTER:
            return "unknown filter used";
        case BAD_ASCIIDECODE:
            return "bad ASCII decode";
        case HEX_JAVASCRIPT:
            return "hex javascript";
        case BAD_INDOBJ:
            return "referencing nonexistent obj";
        case HAS_OPENACTION:
            return "has /OpenAction";
        case HAS_LAUNCHACTION:
            return "has /LaunchAction";
        case BAD_STREAMLEN:
            return "bad /Length, too small";
        case ENCRYPTED_PDF:
            return "PDF is encrypted";
        case LINEARIZED_PDF:
            return "linearized PDF";
        case MANY_FILTERS:
            return "more than 2 filters per obj";
        case DECRYPTABLE_PDF:
            return "decryptable PDF";
        default:
            return "";
    }
}

/**
 * @brief Record a flag on the PDF and, in debug mode, log which object raised it.
 *
 * The bit `1 << flag` is always set in pdf->flags. The debug message is only
 * produced when cli_debug_flag is set.
 *
 * @param pdf   Pdf context structure whose flags are updated.
 * @param obj   Object the flag was raised for; only its id is used, for logging.
 * @param flag  Flag to record.
 */
void pdfobj_flag(struct pdf_struct *pdf, struct pdf_obj *obj, enum pdf_flag flag)
{
    pdf->flags |= 1 << flag;

    if (cli_debug_flag) {
        const char *s = pdfobj_flag_description(flag);

        cli_dbgmsg("pdfobj_flag: %s flagged in object %u %u\n", s, obj->id >> 8, obj->id & 0xff);
    }
}
/**
 * @brief Look up an object by id in the list of objects found so far.
 *
 * The scan starts at the position of `obj` in pdf->objs, runs to the end of
 * the list, then wraps around and covers the entries before that position.
 * If `obj` is not in the list, the whole list is scanned from the beginning.
 *
 * @param pdf    Pdf context structure holding the object list.
 * @param obj    Object whose position is used as the starting point of the scan.
 * @param objid  Id to look for; compared against the `id` field of each object.
 *
 * @return struct pdf_obj* The first matching object in scan order, or NULL if
 *         no object has that id.
 */
struct pdf_obj *find_obj(struct pdf_struct *pdf, struct pdf_obj *obj, uint32_t objid)
{
    const uint32_t total = pdf->nobjs;
    uint32_t origin      = 0;
    uint32_t step;

    /* search starting at previous obj (if exists) */
    while (origin < total && pdf->objs[origin] != obj)
        origin++;

    for (step = 0; step < total; step++) {
        struct pdf_obj *candidate;
        uint32_t slot = origin + step;

        /* restart search from beginning once the end of the list is passed */
        if (slot >= total)
            slot -= total;

        candidate = pdf->objs[slot];
        if (candidate->id == objid)
            return candidate;
    }

    return NULL;
}
/**
* @brief Find and interpret the "/Length" dictionary key value.
*
* The value may be:
* - a direct object (i.e. just a number)
* - an indirect object, where the value is somewhere else in the document and we have to look it up.
* indirect objects are referenced using an object id (objid), generation id (genid), and the letter 'R'.
*
* Example dictionary with a single key "/Length" that relies on a direct object for the value.
*
* 1 0 obj
* << /Length 534
* /Filter [ /ASCII85Decode /LZWDecode ]
* >>
* stream
* J..)6T`?p&<!J9%_[umg"B7/Z7KNXbN'S+,*Q/&"OLT'FLIDK#!n`$"<Atdi`\Vn%b%)&'cA*VnK\CJY(sF>c!Jnl@
* RM]WM;jjH6Gnc75idkL5]+cPZKEBPWdR>FF(kj1_R%W_d&/jS!;iuad7h?[L-F$+]]0A3Ck*$I0KZ?;<)CJtqi65Xb
* Vc3\n5ua:Q/=0$W<#N3U;H,MQKqfg1?:lUpR;6oN[C2E4ZNr8Udn.'p+?#X+1>0Kuk$bCDF/(3fL5]Oq)^kJZ!C2H1
* 'TO]Rl?Q:&'<5&iP!$Rq;BXRecDN[IJB`,)o8XJOSJ9sDS]hQ;Rj@!ND)bD_q&C\g:inYC%)&u#:u,M6Bm%IY!Kb1+
* ":aAa'S`ViJglLb8<W9k6Yl\\0McJQkDeLWdPN?9A'jX*al>iG1p&i;eVoK&juJHs9%;Xomop"5KatWRT"JQ#qYuL,
* JD?M$0QP)lKn06l1apKDC@\qJ4B!!(5m+j.7F790m(Vj88l8Q:_CZ(Gm1%X\N1&u!FKHMB~>
* endstream
* endobj
*
* Example dictionary with a single key "/Length" that relies on an indirect object for the value.
*
* 7 0 obj
* << /Length 8 0 R >> % An indirect reference to object 8, with generation id 0.
* stream
* BT
* /F1 12 Tf
* 72 712 Td
* ( A stream with an indirect length ) Tj
* ET
* endstream
* endobj
*
* 8 0 obj
* 77 % The length of the preceding stream
* endobj
*
* @param pdf Pdf context structure.
* @param obj Pdf object context structure.
* @param dict_start Pointer to the start of the dictionary string.
* @param dict_len Remaining length of the dictionary string in bytes.
* @return size_t Unsigned integer value of the "/Length" key
*/
static size_t find_length(struct pdf_struct *pdf, struct pdf_obj *obj, const char *dict_start, size_t dict_len)
{
size_t length = 0;
const char *obj_start = dict_start;
size_t bytes_remaining = dict_len;
long temp_long = 0;
const char *index;
if (bytes_remaining < 8) {
return 0;
}
/*
* Find the "/Length" dictionary key
*/
index = cli_memstr(obj_start, bytes_remaining, "/Length", 7);
if (!index)
return 0;
bytes_remaining -= index - obj_start;
if (bytes_remaining < 1) {
return 0;
}
/* Step the index into the "/Length" string. */
index++;
bytes_remaining--;
/* Find the start of the next direct or indirect object.
* pdf_nextobject() assumes we started searching from within a previous object */
obj_start = pdf_nextobject(index, bytes_remaining);
if (!obj_start)
return 0;
if (bytes_remaining < (size_t)(obj_start - index)) {
return 0;
}
bytes_remaining -= obj_start - index;
index = obj_start;
/* Read the value. This could either be the direct length value,
or the object id of the indirect object that has the length */
if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
cli_dbgmsg("find_length: failed to parse object length or objid\n");
return 0;
} else if (temp_long < 0) {
cli_dbgmsg("find_length: Encountered invalid negative object length or objid (%ld).\n", temp_long);
return 0;
}
length = (size_t)temp_long; /* length or maybe object id */
/*
* Keep parsing, skipping past the first integer that might have been what we wanted.
* If it's an indirect object, we'll find a Generation ID followed by the letter 'R'
* I.e. something like " 0 R"
*/
while ((bytes_remaining > 0) && isdigit(*index)) {
index++;
bytes_remaining--;
}
if ((bytes_remaining > 0) && (*index == ' ')) {
unsigned long genid;
index++;
bytes_remaining--;
if (CL_SUCCESS != cli_strntol_wrap(index, bytes_remaining, 0, 10, &temp_long)) {
cli_dbgmsg("find_length: failed to parse object genid\n");
return 0;
} else if (temp_long < 0) {
cli_dbgmsg("find_length: Encountered invalid negative object genid (%ld).\n", temp_long);
return 0;
}
genid = (unsigned long)temp_long;
while ((bytes_remaining > 0) && isdigit(*index)) {
index++;
bytes_remaining--;
}
if (bytes_remaining < 2) {
return 0;
}
if (index[0] == ' ' && index[1] == 'R') {
/*
* Ok so we found a genid and that 'R'. Which means that first value
* was actually the objid.
* We can look up the indirect object using this information.
*/
unsigned long objid = length;
const char *indirect_obj_start = NULL;
cli_dbgmsg("find_length: length is in indirect object %lu %lu\n", objid, genid);