moved to platform core to avoid inclusion multiple times

thatcher · Apr 27, 2010 · aa60e10 · aa60e10
1 parent 09ce067
commit aa60e10
Showing 1 changed file with 329 additions and 0 deletions.
diff --git a/src/platform/core/urlparse.js b/src/platform/core/urlparse.js
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2010 Nick Galbreath
+ * http://code.google.com/p/stringencoders/source/browse/#svn/trunk/javascript
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * url processing in the spirit of python's urlparse module
+ * see `pydoc urlparse` or
+ * http://docs.python.org/library/urlparse.html
+ *
+ *  urlsplit: break apart a URL into components
+ *  urlunsplit:  reconstitute a URL from components
+ *  urljoin: join an absolute and another URL
+ *  urldefrag: remove the fragment from a URL
+ *
+ * Take a look at the tests in urlparse-test.html
+ *
+ * On URL Normalization:
+ *
+ * urlsplit only does minor normalization of the components: only the
+ * scheme and hostname are lowercased.  urljoin does a bit more,
+ * normalizing paths with "." and "..".
+
+ * urlnormalize adds additional normalization
+ *
+ *   * removes default port numbers
+ *     http://abc.com:80/ -> http://abc.com/, etc
+ *   * normalizes path
+ *     http://abc.com -> http://abc.com/
+ *     and other "." and ".." cleanups
+ *   * if file, remove query and fragment
+ *
+ * It does not do:
+ *   * normalize escaped hex values
+ *     http://abc.com/%7efoo -> http://abc.com/%7Efoo
+ *   * normalize '+' <--> '%20'
+ *
+ * Differences with Python
+ *
+ * The javascript urlsplit returns a normal object with the following
+ * properties: scheme, netloc, hostname, port, path, query, fragment.
+ * All properties are read-write.
+ *
+ * In python, the resulting object is not a dict, but a specialized,
+ * read-only, and has alternative tuple interface (e.g. obj[0] ==
+ * obj.scheme).  It's not clear why such a simple function requires
+ * a unique datastructure.
+ *
+ * urlunsplit in javascript takes a duck-typed object,
+ *  { scheme: 'http', netloc: 'abc.com', ...}
+ *  while in python it takes a list-like object.
+ *  ['http', 'abc.com'... ]
+ *
+ * For all functions, the javascript version uses
+ * hostname+port if netloc is missing.  In python
+ * hostname+port were always ignored.
+ *
+ * Similar functionality in different languages:
+ *
+ *   http://php.net/manual/en/function.parse-url.php
+ *   returns an associative array but cannot handle relative URLs
+ *
+ * TODO: test allowfragments more
+ * TODO: test netloc missing, but hostname present
+ */
+
+// Namespace object; the URL helpers below are attached to it as properties.
+var urlparse = {};
+
+// Unlikely to be useful standalone
+//
+// NORMALIZE PATH with "../" and "./"
+//   http://en.wikipedia.org/wiki/URL_normalization
+//   http://tools.ietf.org/html/rfc3986#section-5.2.3
+//
+// path:    path component of a URL (string); empty/missing means "/"
+// returns: the path with "." segments removed and "name/.." pairs
+//          collapsed.  The result always starts with "/".
+//
+// Notes:
+//   * ".." segments that would climb above the root are kept verbatim
+//     ("/../a" stays "/../a"); a later ".." never cancels a kept one.
+//   * a trailing "." or ".." names a directory, so the result keeps a
+//     trailing "/" ("/a/b/." -> "/a/b/", "/a/b/.." -> "/a/").
+//   * empty segments ("//") are preserved.
+//
+urlparse.normalizepath = function(path)
+{
+    if (!path || path === '/') {
+        return '/';
+    }
+
+    var parts = path.split('/');
+
+    var newparts = [];
+    // make sure path always starts with '/'
+    if (parts[0]) {
+        newparts.push('');
+    }
+
+    for (var i = 0; i < parts.length; ++i) {
+        var part = parts[i];
+        if (part === '..') {
+            var top = newparts[newparts.length - 1];
+            // only a real segment may be removed: never the leading ''
+            // (the root) and never a '..' that was kept earlier
+            if (newparts.length > 1 && top !== '..') {
+                newparts.pop();
+            } else {
+                newparts.push(part);
+            }
+        } else if (part !== '.') {
+            newparts.push(part);
+        }
+    }
+
+    // "/a/b/." and "/a/b/.." refer to directories: keep the trailing
+    // slash that dropping the dot segment would otherwise lose
+    var last = parts[parts.length - 1];
+    var tail = newparts[newparts.length - 1];
+    if ((last === '.' || last === '..') && tail !== '' && tail !== '..') {
+        newparts.push('');
+    }
+
+    path = newparts.join('/');
+    if (!path) {
+        path = '/';
+    }
+    return path;
+};
+
+//
+// Does many of the normalizations that the stock
+//  python urlsplit/urlunsplit/urljoin neglects
+//
+// For file, http and https URLs:
+//   * the host part of the netloc is lowercased
+//   * the path is cleaned with urlparse.normalizepath
+//   * http/https: a default port (80 / 443) is removed
+//   * file: query and fragment are dropped
+// URLs with any other scheme are returned exactly as given.
+//
+// Doesn't do hex-escape normalization on path or query
+//   %7e -> %7E
+// Nor, '+' <--> %20 translation
+//
+// url:     string
+// returns: normalized URL string
+//
+urlparse.urlnormalize = function(url)
+{
+    var parts = urlparse.urlsplit(url);
+    var drop_port = false;
+
+    switch (parts.scheme) {
+    case 'file':
+        // files can't have query strings
+        //  and we don't bother with fragments
+        parts.query = '';
+        parts.fragment = '';
+        break;
+    case 'http':
+    case 'https':
+        // remove default port
+        drop_port = (parts.scheme === 'http' && parts.port == 80) ||
+            (parts.scheme === 'https' && parts.port == 443);
+        break;
+    default:
+        // if we don't have specific normalizations for this
+        // scheme, return the original url unmolested
+        return url;
+    }
+
+    // urlunsplit prefers netloc over hostname/port, so the netloc text
+    // itself has to be normalized.  Anything up to the last '@' is
+    // user info and is case sensitive: leave it alone.
+    var netloc = parts.netloc || '';
+    var at = netloc.lastIndexOf('@');
+    var userinfo = netloc.substring(0, at + 1);
+    var hostport = netloc.substring(at + 1).toLowerCase();
+    if (drop_port) {
+        hostport = hostport.replace(/:\d*$/, '');
+        parts.port = '';
+    }
+    parts.netloc = userinfo + hostport;
+
+    // for [file|http|https].  Not sure about other schemes
+    parts.path = urlparse.normalizepath(parts.path);
+
+    return urlparse.urlunsplit(parts);
+};
+
+// Split a URL at its first '#'.
+//
+// url:     string
+// returns: two-element array [ url-without-fragment, fragment ].
+//          The '#' itself is dropped; fragment is '' when the URL
+//          contains no '#'.
+//
+urlparse.urldefrag = function(url)
+{
+    var hash = url.indexOf('#');
+    var has_fragment = (hash >= 0);
+
+    return [
+        has_fragment ? url.substring(0, hash) : url,
+        has_fragment ? url.substring(hash + 1) : ''
+    ];
+};
+
+// Break a URL apart into its components.
+//
+// url:             string to parse
+// default_scheme:  scheme to report when the URL has an authority
+//                  ("//host...") but no scheme of its own; also used
+//                  when the URL has no authority at all.  Default ''.
+// allow_fragments: when false the fragment is discarded and reported
+//                  as ''.  Default true.
+//
+// returns: plain object with string properties
+//   scheme   - lowercased, '' if none
+//   netloc   - authority exactly as written ("user@Host:80"), '' if none
+//   hostname - lowercased host taken from netloc (an IPv6 literal
+//              keeps its brackets), '' if none
+//   port     - number, or '' if absent or zero
+//   path, query, fragment - '' if absent
+//
+// Only "scheme://authority..." and "//authority..." forms are treated
+// as having a scheme/authority; anything else (including "mailto:x")
+// is parsed as path/query/fragment only.
+//
+urlparse.urlsplit = function(url, default_scheme, allow_fragments)
+{
+    var leftover;
+
+    if (typeof allow_fragments === 'undefined') {
+        allow_fragments = true;
+    }
+
+    // scheme (optional, RFC 3986 charset, must be followed by ':'),
+    // then '//' and the authority up to the next '/', '?' or '#'
+    var fullurl = /^(?:([A-Za-z][A-Za-z0-9+.\-]*):)?\/\/([^\/?#]*)(.*)$/;
+    // authority: optional "userinfo@", host (bracketed IPv6 literal or
+    // anything up to ':'), optional ":port"
+    var hostport = /^(?:.*@)?(\[[^\]]*\]|[^:]*)(?::(\d*))?/;
+    // path, query, fragment
+    var parse_leftovers = /([^?#]*)?(?:\?([^#]*))?(?:#(.*))?$/;
+
+    var o = {};
+
+    var parts = url.match(fullurl);
+    if (parts) {
+        o.scheme = parts[1] || default_scheme || '';
+        o.netloc = parts[2];
+
+        // every group in hostport is optional, so this always matches
+        var authority = o.netloc.match(hostport);
+        o.hostname = authority[1].toLowerCase();
+        o.port = parseInt(authority[2], 10) || '';
+
+        leftover = parts[3];
+    } else {
+        o.scheme = default_scheme || '';
+        o.netloc = '';
+        o.hostname = '';
+        o.port = '';
+        leftover = url;
+    }
+    o.scheme = o.scheme.toLowerCase();
+
+    parts = leftover.match(parse_leftovers);
+
+    o.path =  parts[1] || '';
+    o.query = parts[2] || '';
+
+    if (allow_fragments) {
+        o.fragment = parts[3] || '';
+    } else {
+        o.fragment = '';
+    }
+
+    return o;
+};
+
+// Reassemble a URL string from its components.
+//
+// o: duck-typed object; the properties read are scheme, netloc,
+//    hostname, port, path, query and fragment.  Empty/missing
+//    properties are skipped.
+//
+//    The authority is o.netloc when set, otherwise it is built from
+//    o.hostname and o.port (extension; Python only uses netloc).
+//
+// returns: URL string.  A scheme is always followed by '://'; an
+// authority without a scheme is prefixed with '//'.
+//
+urlparse.urlunsplit = function(o) {
+    var s = '';
+    if (o.scheme) {
+        s += o.scheme + '://';
+    }
+
+    var authority = '';
+    if (o.netloc) {
+        authority = o.netloc;
+    } else if (o.hostname) {
+        // extension.  Python only uses netloc
+        authority = o.hostname;
+        if (o.port) {
+            authority += ':' + o.port;
+        }
+    }
+
+    if (authority) {
+        if (s == '') {
+            s += '//';
+        }
+        s += authority;
+    }
+
+    if (o.path) {
+        // keep the authority and a path written without a leading '/'
+        // apart, otherwise "abc.com" + "x" would read as host "abc.comx"
+        if (authority && o.path.charAt(0) !== '/') {
+            s += '/';
+        }
+        s += o.path;
+    }
+
+    if (o.query) {
+        s += '?' + o.query;
+    }
+    if (o.fragment) {
+        s += '#' + o.fragment;
+    }
+    return s;
+};
+
+// Resolve a (possibly relative) URL against a base URL.
+//
+// base:            URL the reference is relative to
+// url:             reference to resolve
+// allow_fragments: when false the result never carries a fragment.
+//                  Default true.
+//
+// returns: URL string
+//   * url has a scheme    -> url itself (minus fragment if not allowed)
+//   * url is "//host/..." -> base scheme + url authority/path/query
+//   * url has a path      -> path merged with the base path, url query
+//   * url has only query  -> base path, url query
+//   * otherwise           -> base path and base query
+//   The fragment always comes from url, never from base.
+//   The resulting path is cleaned with urlparse.normalizepath, except
+//   in the first case where url is returned without normalization.
+//
+urlparse.urljoin = function(base, url, allow_fragments)
+{
+    if (typeof allow_fragments === 'undefined') {
+        allow_fragments = true;
+    }
+
+    var url_parts = urlparse.urlsplit(url);
+
+    // if url parts has a scheme (i.e. absolute)
+    // then nothing to do except honouring allow_fragments
+    if (url_parts.scheme) {
+        if (allow_fragments) {
+            return url;
+        } else {
+            return urlparse.urldefrag(url)[0];
+        }
+    }
+    var base_parts = urlparse.urlsplit(base);
+
+    if (url_parts.netloc) {
+        // network-path reference ("//host/path"): only the scheme is
+        // inherited from base
+        base_parts.netloc = url_parts.netloc;
+        base_parts.hostname = url_parts.hostname;
+        base_parts.port = url_parts.port;
+        base_parts.path = url_parts.path;
+        base_parts.query = url_parts.query;
+    } else if (url_parts.path.length > 0) {
+        if (url_parts.path.charAt(0) == '/') {
+            base_parts.path = url_parts.path;
+        } else {
+            // relative path.. get rid of "current filename" and
+            //   replace.  Same as var parts =
+            //   base_parts.path.split('/'); parts[parts.length-1] =
+            //   url_parts.path; base_parts.path = parts.join('/');
+            var idx = base_parts.path.lastIndexOf('/');
+            if (idx == -1) {
+                base_parts.path = url_parts.path;
+            } else {
+                base_parts.path = base_parts.path.substr(0,idx) + '/' +
+                    url_parts.path;
+            }
+        }
+        base_parts.query = url_parts.query;
+    } else if (url_parts.query) {
+        // "?query" reference: same document, different query
+        base_parts.query = url_parts.query;
+    }
+    // else: empty or fragment-only reference, base path and query stay
+
+    // clean up path
+    base_parts.path = urlparse.normalizepath(base_parts.path);
+
+    // copy fragments
+    if (allow_fragments) {
+        base_parts.fragment = url_parts.fragment;
+    } else {
+        base_parts.fragment = '';
+    }
+
+    return urlparse.urlunsplit(base_parts);
+};