forked from gabriel-vasile/mimetype
/
ms_office.go
225 lines (200 loc) · 6.14 KB
/
ms_office.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
package magic
import (
"bytes"
"encoding/binary"
)
var (
xlsxSigFiles = []string{
"xl/worksheets/",
"xl/drawings/",
"xl/theme/",
"xl/_rels/",
"xl/styles.xml",
"xl/workbook.xml",
"xl/sharedStrings.xml",
}
docxSigFiles = []string{
"word/media/",
"word/_rels/document.xml.rels",
"word/document.xml",
"word/styles.xml",
"word/fontTable.xml",
"word/settings.xml",
"word/numbering.xml",
"word/header",
"word/footer",
}
pptxSigFiles = []string{
"ppt/slides/",
"ppt/media/",
"ppt/slideLayouts/",
"ppt/theme/",
"ppt/slideMasters/",
"ppt/tags/",
"ppt/notesMasters/",
"ppt/_rels/",
"ppt/handoutMasters/",
"ppt/notesSlides/",
"ppt/presentation.xml",
"ppt/tableStyles.xml",
"ppt/presProps.xml",
"ppt/viewProps.xml",
}
)
// Xlsx matches a Microsoft Excel 2007 file.
func Xlsx(raw []byte, limit uint32) bool {
return zipContains(raw, xlsxSigFiles...)
}
// Docx matches a Microsoft Word 2007 file.
func Docx(raw []byte, limit uint32) bool {
return zipContains(raw, docxSigFiles...)
}
// Pptx matches a Microsoft PowerPoint 2007 file.
func Pptx(raw []byte, limit uint32) bool {
return zipContains(raw, pptxSigFiles...)
}
// Ole matches an Open Linking and Embedding file.
//
// https://en.wikipedia.org/wiki/Object_Linking_and_Embedding
func Ole(raw []byte, limit uint32) bool {
return bytes.HasPrefix(raw, []byte{0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1})
}
// Aaf matches an Advanced Authoring Format file.
// See: https://pyaaf.readthedocs.io/en/latest/about.html
// See: https://en.wikipedia.org/wiki/Advanced_Authoring_Format
func Aaf(raw []byte, limit uint32) bool {
if len(raw) < 31 {
return false
}
return bytes.HasPrefix(raw[8:], []byte{0x41, 0x41, 0x46, 0x42, 0x0D, 0x00, 0x4F, 0x4D}) &&
(raw[30] == 0x09 || raw[30] == 0x0C)
}
// Doc matches a Microsoft Word 97-2003 file.
// See: https://github.com/decalage2/oletools/blob/412ee36ae45e70f42123e835871bac956d958461/oletools/common/clsid.py
func Doc(raw []byte, _ uint32) bool {
clsids := [][]byte{
// Microsoft Word 97-2003 Document (Word.Document.8)
{0x06, 0x09, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46},
// Microsoft Word 6.0-7.0 Document (Word.Document.6)
{0x00, 0x09, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46},
// Microsoft Word Picture (Word.Picture.8)
{0x07, 0x09, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46},
}
for _, clsid := range clsids {
if matchOleClsid(raw, clsid) {
return true
}
}
return false
}
// Ppt matches a Microsoft PowerPoint 97-2003 file or a PowerPoint 95 presentation.
func Ppt(raw []byte, limit uint32) bool {
// Root CLSID test is the safest way to detect identify OLE, however, the format
// often places the root CLSID at the end of the file.
if matchOleClsid(raw, []byte{
0x10, 0x8d, 0x81, 0x64, 0x9b, 0x4f, 0xcf, 0x11,
0x86, 0xea, 0x00, 0xaa, 0x00, 0xb9, 0x29, 0xe8,
}) || matchOleClsid(raw, []byte{
0x70, 0xae, 0x7b, 0xea, 0x3b, 0xfb, 0xcd, 0x11,
0xa9, 0x03, 0x00, 0xaa, 0x00, 0x51, 0x0e, 0xa3,
}) {
return true
}
lin := len(raw)
if lin < 520 {
return false
}
pptSubHeaders := [][]byte{
{0xA0, 0x46, 0x1D, 0xF0},
{0x00, 0x6E, 0x1E, 0xF0},
{0x0F, 0x00, 0xE8, 0x03},
}
for _, h := range pptSubHeaders {
if bytes.HasPrefix(raw[512:], h) {
return true
}
}
if bytes.HasPrefix(raw[512:], []byte{0xFD, 0xFF, 0xFF, 0xFF}) &&
raw[518] == 0x00 && raw[519] == 0x00 {
return true
}
return lin > 1152 && bytes.Contains(raw[1152:min(4096, lin)],
[]byte("P\x00o\x00w\x00e\x00r\x00P\x00o\x00i\x00n\x00t\x00 D\x00o\x00c\x00u\x00m\x00e\x00n\x00t"))
}
// Xls matches a Microsoft Excel 97-2003 file.
func Xls(raw []byte, limit uint32) bool {
// Root CLSID test is the safest way to detect identify OLE, however, the format
// often places the root CLSID at the end of the file.
if matchOleClsid(raw, []byte{
0x10, 0x08, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00,
}) || matchOleClsid(raw, []byte{
0x20, 0x08, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00,
}) {
return true
}
lin := len(raw)
if lin < 520 {
return false
}
xlsSubHeaders := [][]byte{
{0x09, 0x08, 0x10, 0x00, 0x00, 0x06, 0x05, 0x00},
{0xFD, 0xFF, 0xFF, 0xFF, 0x10},
{0xFD, 0xFF, 0xFF, 0xFF, 0x1F},
{0xFD, 0xFF, 0xFF, 0xFF, 0x22},
{0xFD, 0xFF, 0xFF, 0xFF, 0x23},
{0xFD, 0xFF, 0xFF, 0xFF, 0x28},
{0xFD, 0xFF, 0xFF, 0xFF, 0x29},
}
for _, h := range xlsSubHeaders {
if bytes.HasPrefix(raw[512:], h) {
return true
}
}
return lin > 1152 && bytes.Contains(raw[1152:min(4096, lin)],
[]byte("W\x00k\x00s\x00S\x00S\x00W\x00o\x00r\x00k\x00B\x00o\x00o\x00k"))
}
// Pub matches a Microsoft Publisher file.
func Pub(raw []byte, limit uint32) bool {
return matchOleClsid(raw, []byte{
0x01, 0x12, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46,
})
}
// Msg matches a Microsoft Outlook email file.
func Msg(raw []byte, limit uint32) bool {
return matchOleClsid(raw, []byte{
0x0B, 0x0D, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00,
0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46,
})
}
// Msi matches a Microsoft Windows Installer file.
// http://fileformats.archiveteam.org/wiki/Microsoft_Compound_File
func Msi(raw []byte, limit uint32) bool {
return matchOleClsid(raw, []byte{
0x84, 0x10, 0x0C, 0x00, 0x00, 0x00, 0x00, 0x00,
0xC0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x46,
})
}
// Helper to match by a specific CLSID of a compound file.
//
// http://fileformats.archiveteam.org/wiki/Microsoft_Compound_File
func matchOleClsid(in []byte, clsid []byte) bool {
// Microsoft Compound files v3 have a sector length of 512, while v4 has 4096.
// Change sector offset depending on file version.
// https://www.loc.gov/preservation/digital/formats/fdd/fdd000392.shtml
sectorLength := 512
if len(in) < sectorLength {
return false
}
if in[26] == 0x04 && in[27] == 0x00 {
sectorLength = 4096
}
// SecID of first sector of the directory stream.
firstSecID := int(binary.LittleEndian.Uint32(in[48:52]))
// Expected offset of CLSID for root storage object.
clsidOffset := sectorLength*(1+firstSecID) + 80
if len(in) <= clsidOffset+16 {
return false
}
return bytes.HasPrefix(in[clsidOffset:], clsid)
}