<?php
/**
* Performs an XSS Clean of the content
* @param $value the value to clean
**/
// Multi-pass XSS cleaner. Reads the raw input from $this->value and returns
// the sanitised string. The passes are, in order:
//   1. normalise and decode character entities / URL encoding
//   2. neutralise blacklisted strings and PHP/XML open tags
//   3. collapse keywords that were split up with whitespace
//   4. parse as HTML and strip disallowed nodes and attributes
//   5. serialise the wrapper body back out and run a last blacklist sweep
$value = $this->value;
$charset = chipi('Chippino/Util/Config')->with()->core['charset'];

// these strings are replaced when encountered.
// we don't reduce them to '' to avoid reintroducing a blacklisted word
// (e.g. "docudocument.cookiement.cookie" collapsing back into a match).
// Comment / CDATA openers are entity-escaped so they render as inert text.
$blacklist_strings = array(
    'fscommand'       => '[removed]',
    'seeksegmenttime' => '[removed]',
    'document.cookie' => '[removed]',
    'document.write'  => '[removed]',
    '.parentNode'     => '[removed]',
    '.innerHTML'      => '[removed]',
    'window.location' => '[removed]',
    '-moz-binding'    => '[removed]',
    '<!--'            => '&lt;!--',
    '-->'             => '--&gt;',
    '<![CDATA['       => '&lt;![CDATA[',
);

// these words, if they are found (case-insensitively), will trigger an
// advanced regex search
$blacklist_regex_triggers = array(
    'javascript',
    'expression',
    'redirect',
);

// if a trigger is found, these regexes will be run (with the /i flag)
$blacklist_regex = array(
    "javascript\s*:"  => '[removed]',
    "expression\s*\(" => '[removed]', // CSS and IE
    "Redirect\s+302"  => '[removed]',
);

// this is a list of words which browsers will render with whitespace
// in them, for example
// java
// script
// these will be collapsed down to their non-spaced version.
$blacklist_whitespace = array(
    'javascript',
    'expression',
    'script',
    'vbscript',
    'alert',
    'document',
    'write',
    'cookie',
    'window',
);

// a list of attributes that are not allowed. Anything starting with
// "on"* is caught in a separate filter
// http://www.w3schools.com/jsref/jsref_events.asp
$xpath_check_attributes = array(
    'xmlns',
);

// a list of attributes that require special scrutiny
$xpath_check_special_attributes = array(
    'href',
    'style',
    'src',
    'dynsrc',
    'lowsrc',
    'datasrc',
);

// a list of known nodes to remove
$xpath_remove_nodes = array(
    'vbscript',
    'script',
    'applet',
    'object',
    'embed',
    'xss',
    'bgsound',
    'style',
    'link',
    'meta',
);

// first pass normalization
// we already have normalized our spaces and our newlines
// now we need to normalize our characters
// let's make hashes for placeholders. They must not be guessable by the
// submitter, otherwise the placeholder itself could be injected, so prefer
// the CSPRNG when the runtime provides one. Both branches yield 32 hex chars.
$make_placeholder = function () {
    if (function_exists('random_bytes')) {
        return bin2hex(random_bytes(16));
    }
    return md5(uniqid((string) mt_rand(), true));
};
$hashes = array(
    '&'       => $make_placeholder(),
    'wrapper' => $make_placeholder(),
);

// anything that looks like it belongs in a query string, we'll hash the & part
// so that we don't accidentally encode it
// &phrase=value
$value = preg_replace('#\&([a-z\_0-9]+)\=([a-z\_0-9]+)#i', $hashes['&']."\\1=\\2", $value);

// ensure all character entities end in a ;
// (an entity followed by optional whitespace/control bytes gets exactly one ';')
$value = preg_replace('#(&\#?[0-9a-z]+)[\x00-\x20]*;?#i', "\\1;", $value);

// handle numeric (decimal / hex) references the same way: add a semicolon.
$value = preg_replace('#(&\#x?)([0-9A-F]+);?#i', "\\1\\2;", $value);

// strip control characters
// any sort of hex < 1f, except tab / LF / VT / FF / CR (\x09-\x0d)
$value = preg_replace('#[\x00-\x08\x0e-\x1f]#', '', $value);

// change all space-like things to spaces
$value = preg_replace('#&\#(?:x0*20|0*32);?#i', ' ', $value);

// restore our ampersands
$value = str_replace($hashes['&'], '&', $value);

// and now we can decode everything
$value = rawurldecode($value);

// decode all html ents if we have a chance of one
// we need a &, a #, and a ;
if (strpos($value, '&') !== FALSE && strpos($value, '#') !== FALSE && strpos($value, ';') !== FALSE) {
    $value = html_entity_decode($value, ENT_COMPAT, $charset);
    // Mop up numeric references html_entity_decode() left behind.
    // Callbacks replace the old /e (eval) modifier, which was removed in
    // PHP 7 and also mis-read zero-padded decimals as octal literals.
    // chr() keeps only the low byte, as the eval'd chr() call did.
    $value = preg_replace_callback(
        '~&#x([0-9a-f]{2,5})~i',
        function ($m) {
            return chr((int) hexdec($m[1]));
        },
        $value
    );
    $value = preg_replace_callback(
        '~&#([0-9]{2,4})~',
        function ($m) {
            return chr((int) $m[1]);
        },
        $value
    );
}

// alright, now we can clean up our blacklists since all entities are sane
$value = str_replace(array_keys($blacklist_strings), array_values($blacklist_strings), $value);

// neutralise all possible opening php tags by entity-escaping the brackets
// this catches XML tags as well
$value = str_replace(
    array('<?php', '<?PHP', '<?', '?'.'>'),
    array('&lt;?php', '&lt;?PHP', '&lt;?', '?&gt;'),
    $value
);

// collapse our expandable whitespace items
// builds e.g. #(s\s*c\s*r\s*i\s*p\s*t)(\W)#is and rewrites the match to the
// plain lowercase word followed by the same trailing non-word character
foreach ($blacklist_whitespace as $word) {
    $regex = '#(' . implode('\s*', str_split($word)) . ')(\W)#is';
    $value = preg_replace($regex, $word."\\2", $value);
}

// load into a DOM document
// wrap the content in a body element tagged with the wrapper hash so we can
// extract it again afterwards
if (!function_exists('xss_loadhtml_error_handler')) {
    // loading this chip is expected to define xss_loadhtml_error_handler()
    chip('Chippino/Util/Request/_XSSErrorHandler')->with();
}
set_error_handler('xss_loadhtml_error_handler');
$dom = new DOMDocument();
$dom->recover = TRUE;
$dom->strictErrorChecking = FALSE;
$dom->loadHTML('<html><body class="'.$hashes['wrapper'].'">'.$value.'</body></html>');
$xpath = new DOMXPath($dom);
restore_error_handler();

// strip all nodes that are not allowed
// keep going until a query comes back empty
$nodes_exist = TRUE;
while ($nodes_exist) {
    $nodes = $xpath->query('//' . implode(' | //', $xpath_remove_nodes));
    if (!$nodes->length) {
        $nodes_exist = FALSE;
    }
    for ($i = 0; $i < $nodes->length; $i++) {
        $nodes->item($i)->parentNode->removeChild($nodes->item($i));
    }
}

// drop every on* event handler attribute and every xmlns* attribute
$nodes = $xpath->query("//@*[starts-with(local-name(), 'on')] | //@*[starts-with(local-name(), 'xmlns')]");
for ($i = 0; $i < $nodes->length; $i++) {
    $nodes->item($i)->ownerElement->removeAttribute($nodes->item($i)->name);
}

// and now, we can start doing some uber smart checking on attributes
// first, remove attr that are not allowed (and not xpathable quite like on*)
$nodes = $xpath->query('//*[@' . implode('] | //*[@', $xpath_check_attributes) . ']');
for ($i = 0; $i < $nodes->length; $i++) {
    foreach ($xpath_check_attributes as $attr) {
        if ($nodes->item($i)->hasAttribute($attr)) {
            $nodes->item($i)->removeAttribute($attr);
        }
    }
}

// now, special attributes have to be allowed, but scanned for blacklist strings
$nodes = $xpath->query('//*[@' . implode('] | //*[@', $xpath_check_special_attributes) . ']');
for ($i = 0; $i < $nodes->length; $i++) {
    $node = $nodes->item($i);
    foreach ($xpath_check_special_attributes as $attr) {
        if ($node->hasAttribute($attr)) {
            $repl = $node->getAttribute($attr);
            $repl = str_replace(array_keys($blacklist_strings), array_values($blacklist_strings), $repl);
            // only preg clean if a trigger is found. The regexes run with /i,
            // so the trigger test must be case-insensitive too; otherwise
            // "Redirect 302" would never reach its own pattern.
            foreach ($blacklist_regex_triggers as $trigger) {
                if (stripos($repl, $trigger) !== FALSE) {
                    foreach ($blacklist_regex as $regex => $replace_with) {
                        $repl = preg_replace('#'.$regex.'#i', $replace_with, $repl);
                    }
                    // one trigger is enough; all regexes have been applied
                    break;
                }
            }
            $node->setAttribute($attr, $repl);
        }
    }
}

// special case. Remove sub HTML and body nodes. There should only be one HTML document
// and there should only be one body element. And it has a class of the wrapper hash
// we are going to be over-aggressive on this one. You tack in a body tag, it will
// simply eat your content.
$nodes = $xpath->query('//html/html | //html/body[@class != "'.$hashes['wrapper'].'"]');
for ($i = 0; $i < $nodes->length; $i++) {
    $nodes->item($i)->parentNode->removeChild($nodes->item($i));
}

// export, capture our content
$value = $dom->saveHTML();

// extract everything within our wrapper body tags
$value = preg_replace('#\A.*?<body class="'.$hashes['wrapper'].'">(.*)</body>.*\Z#is', '\\1', $value);

// do one final cleanup
$value = str_replace(array_keys($blacklist_strings), array_values($blacklist_strings), $value);

//done
return $value;