numerodix / spiderfetch

A modular web spider

This URL has Read+Write access

spiderfetch / urlrewrite.py
100644 97 lines (74 sloc) 2.851 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
 
import os
import re
import urlparse
 
 
# URL schemes this module recognises.
SCHEMES = ["ftp", "http", "https"]

# Pattern matching one of SCHEMES at the END of a string (it is anchored with
# "$" only), exposing the match as the named group "scheme".  Because there is
# no anchor at the start, a known scheme preceded by junk still matches.
_scheme = "(?P<scheme>%s)$" % "|".join(SCHEMES)
scheme_regex = re.compile(_scheme)
 
class InvalidUrl(Exception):
    """Error type for urls that cannot be handled.

    Nothing in this file raises it; it is only defined here.
    """
 
def rewrite_scheme(scheme):
    """Return the known scheme that `scheme` ends with, else `scheme` itself.

    scheme_regex is anchored at the end of the string only, so a known
    scheme preceded by other characters is reduced to the known scheme.
    Anything that does not end in a known scheme is returned unchanged.
    """
    found = scheme_regex.search(scheme)
    if not found or not found.groups():
        return scheme
    return found.group('scheme')
 
def assemble_netloc(username, password, hostname, port):
    """Build a netloc string of the form [user[:password]@]host[:port].

    username -- user name, or a false value for no credentials
    password -- password; only used when username is also given
    hostname -- host name or address
    port     -- port, or a false value for none

    An IPv6 address given without brackets (which is how urlsplit reports
    it in .hostname) is wrapped in [] so that its colons cannot be confused
    with the port separator.
    """
    # a colon in a bare hostname can only come from an IPv6 literal
    if hostname and ":" in hostname and not hostname.startswith("["):
        hostname = "[%s]" % hostname

    netloc = hostname
    if username:
        credentials = username
        if password:
            credentials = "%s:%s" % (username, password)
        netloc = "%s@%s" % (credentials, hostname)
    if port:
        netloc = "%s:%s" % (netloc, port)
    return netloc
 
def get_hostname(url):
    """Return the hostname attribute of `url` as split by urlparse.urlsplit."""
    return urlparse.urlsplit(url).hostname
 
def get_scheme(url):
    """Return the scheme attribute of `url` as split by urlparse.urlsplit."""
    return urlparse.urlsplit(url).scheme
 
def rewrite_urls(origin_url, urls):
    """Yield rewritten forms of `urls`, the links found on page `origin_url`.

    origin_url -- url of the page the links came from
    urls       -- iterable of raw link strings; None/empty entries are skipped

    For every link:
      * tab, CR and LF characters are removed
      * the scheme is passed through rewrite_scheme()
      * a link with neither scheme nor netloc is resolved against origin_url
      * a scheme-relative link (//host/path) inherits origin_url's scheme
      * a link to origin_url's host gets origin_url's credentials
      * the fragment is dropped and spaces are quoted as %20
    Links that end up empty (eg: #chapter2) are not yielded.
    """
    origin_pack = urlparse.urlsplit(origin_url)
    for u in urls:
        # drop null urls on the floor before urlsplit ever sees them
        if not u:
            continue

        # kill breaks
        u = re.sub("[\r\n\t]", "", u)

        pack = urlparse.urlsplit(u)
        (scheme, netloc, path, query, _) = pack

        # try to rewrite scheme
        scheme = rewrite_scheme(scheme)

        if not scheme and not netloc:
            # no scheme or netloc, it's a path on-site
            if not (path or query):
                # nothing left but a fragment, eg: #chapter2
                continue
            path_query = urlparse.urlunsplit(("", "", path, query, ""))
            new_u = urlparse.urljoin(origin_url, path_query)
        else:
            # scheme-relative url (//host/path): inherit the origin's scheme,
            # otherwise the result has no scheme to fetch it with
            if not scheme:
                scheme = origin_pack.scheme

            # rewrite netloc to include credentials
            if origin_pack.username and pack.hostname == origin_pack.hostname:
                netloc = assemble_netloc(origin_pack.username,
                                         origin_pack.password,
                                         pack.hostname, pack.port)

            # reassemble into url, minus the fragment
            new_u = urlparse.urlunsplit((scheme, netloc, path, query, ""))

        # quote spaces
        new_u = new_u.replace(" ", "%20")
        if new_u:
            yield new_u
 
def url_to_filename(url):
    """Derive a local filename from `url`.

    When the ORIG_FILENAMES environment variable is set to a non-empty
    value and the url path has a final component, that component is
    returned as is.  Otherwise the name is built from scheme, netloc, path
    (without its extension) and query, with every run of characters outside
    a-z, A-Z, 0-9 turned into a single "_", a trailing "_" removed, and the
    path's extension appended.
    """
    (scheme, netloc, path, query, _) = urlparse.urlsplit(url)

    # keep the name the server uses, if asked to and if there is one
    basename = os.path.basename(path)
    if basename and os.environ.get("ORIG_FILENAMES"):
        return basename

    (stem, ext) = os.path.splitext(path)
    pieces = [scheme, netloc, stem, query]
    filename = "_".join([piece for piece in pieces if piece])

    # order matters: sanitize, then collapse runs, then trim the tail
    substitutions = (
        ("[^a-zA-Z0-9]", "_"),
        ("_{2,}", "_"),
        ("_$", ""),
    )
    for (pattern, replacement) in substitutions:
        filename = re.sub(pattern, replacement, filename)

    return filename + ext
 
def hostname_to_filename(url):
    """Return `url` with each character outside a-z, A-Z, 0-9 replaced by "_"."""
    unsafe = re.compile("[^a-zA-Z0-9]")
    return unsafe.sub("_", url)
 
 
 
if __name__ == "__main__":
    # Small demo: rewrite one relative link and one absolute link on the same
    # host against a base url that carries credentials, and print the results.
    base = "http://user:pass@www.juventuz.com/forum/search.php?searchid=1186852"
    urls = ["../index.php?name=jack&act=whatever",
            "http://www.juventuz.com/matches"]
    for u in rewrite_urls(base, urls):
        # parenthesized single-argument form prints the same on Python 2
        print(u)