public
Description: A lightweight MVC framework on top of Grok-PHP
Homepage: http://chippino.googlecode.com
Clone URL: git://github.com/Jakobo/chippino.git
commit  d3a79fdb286807a98533ba4eb4e7eff7ca9cfe7f
tree    7c6b63bb63bc9b7e97e345e1be19a82d040b9f0a
parent  cb83fceff9a2a9c0fe1c1545735896384ea84800
chippino / library / chippino / util / request / _xss.php
100644 241 lines (199 sloc) 7.595 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
<?php
 
/**
* XSS-cleans a single value.
*
* This file is a script fragment, not a function: the input is read from
* $this->value and the cleaned string is handed back by the `return`
* statement at the bottom of the file.
**/

// working copy of the input; every later step rewrites $value in place
$value = $this->value;

// charset handed to html_entity_decode() further down
// NOTE(review): chipi(...)->with() presumably yields the framework config object - confirm
$xss_config = chipi('Chippino/Util/Config')->with();
$charset = $xss_config->core['charset'];
 
// Literal strings that get swapped out wherever they occur.
// The replacement is a visible marker (or an escaped form) instead of an
// empty string, so that deleting one match cannot splice the surrounding
// text together into another blacklisted word.
$blacklist_strings = array(
    'fscommand'       => '[removed]',
    'seeksegmenttime' => '[removed]',
    'document.cookie' => '[removed]',
    'document.write'  => '[removed]',
    '.parentNode'     => '[removed]',
    '.innerHTML'      => '[removed]',
    'window.location' => '[removed]',
    '-moz-binding'    => '[removed]',
    '<!--'            => '&lt;!--',
    '-->'             => '--&gt;',
    '<![CDATA['       => '&lt;![CDATA[',
);

// Cheap substring probes: the regex rules below are only run against an
// attribute value once one of these words has been spotted in it.
$blacklist_regex_triggers = array('javascript', 'expression', 'redirect');

// Regex body (delimiters and flags are added at the call site) => replacement.
$blacklist_regex = array(
    'javascript\s*:'  => '[removed]',
    'expression\s*\(' => '[removed]', // CSS and IE
    'Redirect\s+302'  => '[removed]',
);

// Words that browsers still honour when whitespace is sprinkled between
// their letters, e.g.
//   java
//   script
// Spaced-out spellings are collapsed back to the plain word.
$blacklist_whitespace = array(
    'javascript',
    'expression',
    'script',
    'vbscript',
    'alert',
    'document',
    'write',
    'cookie',
    'window',
);

// Attributes that are dropped outright. Anything starting with "on" is
// handled by its own filter.
// http://www.w3schools.com/jsref/jsref_events.asp
$xpath_check_attributes = array('xmlns');

// Attributes that stay, but whose values are scanned against the blacklists.
$xpath_check_special_attributes = array(
    'href',
    'style',
    'src',
    'dynsrc',
    'lowsrc',
    'datasrc',
);

// Elements that are removed from the document wholesale.
$xpath_remove_nodes = array(
    'vbscript',
    'script',
    'applet',
    'object',
    'embed',
    'xss',
    'bgsound',
    'style',
    'link',
    'meta',
);
 
// First normalisation pass.
// Spaces and newlines are expected to have been normalised before this
// file runs; what remains is normalising the characters themselves.

// Throw-away placeholder tokens, one md5 each:
//   '&'       stands in for ampersands that must survive the entity handling
//   'wrapper' becomes the class of the <body> the value is wrapped in
$hashes = array();
$hashes['&'] = md5(time() + mt_rand(0, 1000000000));
$hashes['wrapper'] = md5(time() + mt_rand(0, 1000000000));
 
 
// Character normalisation rules, applied one after another in the order
// they are listed (pattern => replacement).
$normalise_rules = array(
    // anything shaped like a query-string pair (&phrase=value) gets its &
    // swapped for the placeholder hash so the entity rules leave it alone
    '#\&([a-z\_0-9]+)\=([a-z\_0-9]+)#i' => $hashes['&']."\\1=\\2",

    // every character entity must end in a ;  (e.g. &#x0f;)
    '#(&\#?[0-9a-z]+)[\x00-\x20]*;?#i' => "\\1;",

    // same for numeric entities such as &#123282 - append the semicolon
    '#(&\#x?)([0-9A-F]+);?#i' => "\\1\\2;",

    // drop control bytes 0x00-0x08 and 0x0e-0x1f (0x09-0x0d are kept)
    '#[\x00-\x08\x0e-\x1f]#' => '',

    // an entity-encoded space becomes a real space
    '#&\#(?:x0*20|0*32);?#i' => ' ',
);
foreach ($normalise_rules as $rule_pattern => $rule_replacement) {
    $value = preg_replace($rule_pattern, $rule_replacement, $value);
}

// put the protected ampersands back, then undo any %XX url-encoding
$value = rawurldecode(str_replace($hashes['&'], '&', $value));
 
// Decode HTML entities, but only when the value could contain a numeric
// one at all: that takes a &, a # and a ; somewhere in the string.
if (strpos($value, '&') !== FALSE && strpos($value, '#') !== FALSE && strpos($value, ';') !== FALSE) {
    $value = html_entity_decode($value, ENT_COMPAT, $charset);

    // Mop up numeric entities that html_entity_decode() left behind.
    //
    // These two passes used to rely on the /e (eval) pattern modifier.
    // That modifier was deprecated in PHP 5.5 and removed in PHP 7.0, where
    // preg_replace() emits a warning and returns NULL - which wiped $value.
    // The eval'd form also read zero-padded decimals as octal literals
    // (chr(0106) is "F", not "j"). Callbacks avoid both problems.
    //
    // "& 0xFF" spells out what chr() does with numbers above 255, so the
    // single-byte result is the same as before for every other input.
    $value = preg_replace_callback('~&#x([0-9a-f]{2,5})~i', function ($entity) {
        return chr(hexdec($entity[1]) & 0xFF);
    }, $value);
    $value = preg_replace_callback('~&#([0-9]{2,4})~', function ($entity) {
        return chr(((int) $entity[1]) & 0xFF);
    }, $value);
}
 
// Entities are decoded at this point, so the literal blacklist can be
// applied to the plain text.
$blacklist_search = array_keys($blacklist_strings);
$blacklist_replace = array_values($blacklist_strings);
$value = str_replace($blacklist_search, $blacklist_replace, $value);

// Escape every form of PHP opening tag plus the closing tag.
// The bare "<?" entry catches XML declarations as well.
// (the closing tag is written as two pieces so it cannot end this script)
$php_tag_escapes = array(
    '<?php' => '&lt;?php',
    '<?PHP' => '&lt;?PHP',
    '<?'    => '&lt;?',
    '?'.'>' => '?&gt;',
);
$value = str_replace(array_keys($php_tag_escapes), array_values($php_tag_escapes), $value);

// Collapse spaced-out spellings of the whitespace-tolerant words: allow
// optional whitespace between each letter, require a non-word character
// right after, and write the plain word back followed by that character.
foreach ($blacklist_whitespace as $term) {
    $letters = str_split($term);
    $spaced_pattern = '#(' . implode('\s*', $letters) . ')(\W)#is';
    $value = preg_replace($spaced_pattern, $term."\\2", $value);
}
 
// Parse the value as HTML. It is wrapped in <html><body class="..."> with
// the wrapper hash as class so the cleaned fragment can be located again
// after the document has been serialised.

// Error handler that is active while the (possibly broken) markup is parsed.
// NOTE(review): loading _XSSErrorHandler presumably defines this function - confirm
$loadhtml_handler = 'xss_loadhtml_error_handler';
if (function_exists($loadhtml_handler) === false) {
    chip('Chippino/Util/Request/_XSSErrorHandler')->with();
}

set_error_handler($loadhtml_handler);
$dom = new DOMDocument();
$dom->recover = true;
$dom->strictErrorChecking = false;
$wrapped_markup = '<html><body class="'.$hashes['wrapper'].'">'.$value.'</body></html>';
$dom->loadHTML($wrapped_markup);
$xpath = new DOMXPath($dom);
restore_error_handler();
 
 
// Remove every element whose tag name is on the removal list.
// The query is repeated until a pass comes back empty.
$remove_query = '//' . implode(' | //', $xpath_remove_nodes);
do {
    $doomed = $xpath->query($remove_query);
    // decided before anything is detached, exactly one extra (empty) pass ends the loop
    $found_any = $doomed->length > 0;
    for ($n = 0; $n < $doomed->length; $n++) {
        $target = $doomed->item($n);
        $target->parentNode->removeChild($target);
    }
} while ($found_any);
 
 
// Drop every attribute whose local name starts with "on" (event handlers)
// or with "xmlns", on whatever element carries it.
$prefixed_attrs = $xpath->query("//@*[starts-with(local-name(), 'on')] | //@*[starts-with(local-name(), 'xmlns')]");
for ($n = 0; $n < $prefixed_attrs->length; $n++) {
    $attribute = $prefixed_attrs->item($n);
    $attribute->ownerElement->removeAttribute($attribute->name);
}

// Next, the attributes that are banned by exact name (the ones the
// prefix query above is not the right tool for).
$banned_query = '//*[@' . implode('] | //*[@', $xpath_check_attributes) . ']';
$carriers = $xpath->query($banned_query);
for ($n = 0; $n < $carriers->length; $n++) {
    $element = $carriers->item($n);
    foreach ($xpath_check_attributes as $banned) {
        if (!$element->hasAttribute($banned)) {
            continue;
        }
        $element->removeAttribute($banned);
    }
}
 
// Special attributes (href, style, src, ...) have to stay, but their values
// are scanned: first against the literal blacklist, then - if a trigger
// word shows up - against the regex rules.
$special_search = array_keys($blacklist_strings);
$special_replace = array_values($blacklist_strings);

$nodes = $xpath->query('//*[@' . implode('] | //*[@', $xpath_check_special_attributes) . ']');
for ($i = 0; $i < $nodes->length; $i++) {
    $node = $nodes->item($i);
    foreach ($xpath_check_special_attributes as $attr) {
        if (!$node->hasAttribute($attr)) {
            continue;
        }

        $repl = str_replace($special_search, $special_replace, $node->getAttribute($attr));

        // Only pay for the regex pass when a trigger word is present.
        // The probe is case-insensitive because the rules themselves run
        // with the i flag; with a case-sensitive probe a value such as
        // "Redirect 302" never reached the rule written to catch it.
        foreach ($blacklist_regex_triggers as $trigger) {
            if (stripos($repl, $trigger) !== FALSE) {
                foreach ($blacklist_regex as $regex => $replace_with) {
                    $repl = preg_replace('#'.$regex.'#i', $replace_with, $repl);
                }
                // every rule has been applied once; no need to test more triggers
                break;
            }
        }

        $node->setAttribute($attr, $repl);
    }
}
 
 
// Special case: nested <html> elements and foreign <body> elements.
// The document should hold exactly one <html> and one <body>, and that
// body carries the wrapper md5 as its class. This is deliberately
// over-aggressive: tack on a body tag of your own and it simply eats
// your content.
$stray_query = '//html/html | //html/body[@class != "'.$hashes['wrapper'].'"]';
$strays = $xpath->query($stray_query);
for ($n = 0; $n < $strays->length; $n++) {
    $stray = $strays->item($n);
    $stray->parentNode->removeChild($stray);
}
 
 
// Serialise the cleaned document back into markup.
$serialised = $dom->saveHTML();

// Keep only what sits between the wrapper <body class="..."> and </body>.
$body_pattern = '#\A.*?<body class="'.$hashes['wrapper'].'">(.*)</body>.*\Z#is';
$value = preg_replace($body_pattern, '\\1', $serialised);

// One last literal-blacklist pass over the serialised markup, then hand
// the cleaned string back to whoever included this file.
return str_replace(array_keys($blacklist_strings), array_values($blacklist_strings), $value);