numerodix / spiderfetch

A modular web spider

This URL has Read+Write access

numerodix (author)
Fri Jun 27 11:56:29 -0700 2008
commit  3556b280d07ca9b019b5c41ca6690998c6dd5a07
tree    111577e764718537c1801ef4fa09f8afd1827bb2
parent  5fcb044cf5b4c43bde858e59d3acc6de87b8092d
spiderfetch / spider.py
100755 160 lines (129 sloc) 4.604 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python
 
import itertools
import optparse
import re
import urllib
 
import io
import shcolor
import urlrewrite
 
 
# Sample markup fed to the extractor when the script is run with --test.
# Each entry is an anchor-like tag with a different quoting or formatting
# quirk around the href attribute (missing '=', missing tag name, backtick
# quotes, embedded quotes, a URL broken across two lines, ...).
# NOTE(review): this is not a raw string, so the \" in case 8 reaches the
# regexes as a plain double quote -- confirm that is what the case intends.
# NOTE(review): cases 11 and 12 read identically here; presumably they were
# meant to differ (e.g. in line terminator) -- verify against the repository.
testcases = """\
<a href="http://1host/path">
<a href="http://2host/path" >
<a href='http://3host/path' >
<a href'http://4host/path' >
< href"http://5host/path" >
< href=http://6host/path >
<a href=`http://7host/path`>
<a href="http://8host/p\"ath">
<a href="http://9host/path"att">
<a href="http://10host/p'ath">
<a href="http://11
host/path">
<a href="http://12
host/path">
"""
 
# All patterns expose the extracted address as the named group 'url'.
# They are written as raw strings so that regex escapes such as \s, \/ and
# \[ are passed to the re module verbatim instead of relying on Python
# leaving unrecognized string escapes untouched.

# <a ...href...>: the href value must be wrapped in ", ' or ` (group 'quot')
# and runs lazily up to the next occurrence of that same quote character.
# The '=' after href is optional, so <a href'...'> is matched as well.
_link = r"""(?ims)<\s*a[^>]+href[ ]*=?[ ]*(?P<quot>["'`])(?P<url>.*?)(?P=quot)[^>]*?>"""
LINK = re.compile(_link)

# <frame ...src...> and <iframe ...src...>, same quoting rules as LINK.
_frame = r"""(?ims)<\s*i?frame[^>]+src[ ]*=?[ ]*(?P<quot>["'`])(?P<url>.*?)(?P=quot)[^>]*?>"""
FRAME = re.compile(_frame)

# <img ...src...>, same quoting rules as LINK.
_img = r"""(?ims)<\s*img[^>]+src[ ]*=?[ ]*(?P<quot>["'`])(?P<url>.*?)(?P=quot)[^>]*?>"""
IMG = re.compile(_img)

# Free-standing "scheme://..." text anywhere in the input, independent of
# any markup. The optional trailing group additionally admits spaces.
_uri_match = r"""(?ims)(?P<url>[a-z][a-z0-9+.-]{1,120}:\/\/(([a-z0-9$_.+!*,;\/?:@&~(){}\[\]=-])|%[a-f0-9]{2}){1,333}([a-z0-9][a-z0-9 $_.+!*,;\/?:@&~(){}\[\]=%-]{0,1000})?)"""
URI_MATCH = re.compile(_uri_match)

# One line of an "ls -l" style FTP directory listing: a 10-character mode
# field, seven further whitespace-separated fields, then the entry name
# (captured as 'url') running to the end of the line. Example line:
#-rw-r--r-- 1 1042 1042 28620269 Apr 19 2007 stage1-x86-2007.0.tar.bz2
_ftp_listing = r""".[^ ]{9}(?:\s+[^ ]+){7}\s+(?P<url>.*)$"""
FTP_LISTING = re.compile(_ftp_listing)
 
def find_with_r(r, s):
    """Return an iterator over the non-overlapping matches of `r` in `s`.

    `r` is handed to re.finditer as-is, so it may be a compiled pattern or
    a pattern string.
    """
    matches = re.finditer(r, s)
    return matches
 
def spider_ftp(s):
    """Yield a match for every line of `s` that fits FTP_LISTING.

    Lines are searched one at a time, but always inside `s` itself (using
    the pos/endpos bounds of finditer), so the spans of the yielded matches
    are exact offsets into `s` whatever line terminator the input uses.
    The entry name is available as match.group('url').
    """
    offset = 0
    # Walk the lines twice in lockstep: once without terminators (to know
    # where the searchable text of the line ends) and once with them (to
    # know how far the next line starts).
    for (line, raw) in zip(s.splitlines(), s.splitlines(True)):
        # endpos makes the pattern's trailing '$' anchor at the end of this
        # line rather than at the end of the whole input.
        for match in FTP_LISTING.finditer(s, offset, offset + len(line)):
            yield match
        offset += len(raw)
 
def spider(s):
    """Yield the markup-based matches in `s`.

    All LINK matches come first, then all FRAME matches, then all IMG
    matches.
    """
    scanners = [find_with_r(pattern, s) for pattern in (LINK, FRAME, IMG)]
    for match in itertools.chain(*scanners):
        yield match
 
def harvest(s):
    """Return an iterator over every URI_MATCH hit in `s`, markup or not."""
    return re.finditer(URI_MATCH, s)
 
def findall(s, url):
    """Yield every match found in `s`, the content retrieved from `url`.

    The results of spider() come first, followed by those of harvest().
    When the scheme of `url` is "ftp", the results of spider_ftp() are
    appended as well.
    """
    sources = [spider(s), harvest(s)]
    scheme = urlrewrite.get_scheme(url)
    if scheme == "ftp":
        sources.append(spider_ftp(s))
    for match in itertools.chain(*sources):
        yield match
 
def unbox_it_to_ss(it):
    """Turn an iterable of match objects into a stream of url strings.

    For each match the text of its named group 'url' is yielded.
    """
    for found in it:
        address = found.group('url')
        yield address
 
def group_by_regex(s):
    """Yield (group_index, match) pairs for every match found in `s`.

    group_index tells which extractor produced the match: 0 for spider(),
    1 for harvest() and 2 for spider_ftp(). Groups are emitted in that
    order, one after the other.
    """
    sources = (spider(s), harvest(s), spider_ftp(s))
    for group_index, source in enumerate(sources):
        for match in source:
            yield group_index, match
 
def unique(it):
    """Return the distinct items of `it` as a list, in first-seen order.

    Items must be hashable.
    """
    seen = set()
    ordered = []
    for item in it:
        if item in seen:
            continue
        seen.add(item)
        ordered.append(item)
    return ordered
 
def colorize_shell(str):
    """Return `str` with every extracted URL wrapped in shell color codes.

    Every match reported by group_by_regex() contributes two markers, one
    where its 'url' span opens and one where it closes. The input is then
    copied piece by piece, and after each marker the code for the innermost
    color still open is emitted (bold when spans are layered on top of each
    other). The complete input, including its last character, is preserved
    in the result.

    NOTE: the parameter shadows the builtin `str`; the name is kept so that
    existing callers are unaffected.
    """
    # (match_obj, regex_serial_id, color_id)
    tuples = [(m, i, shcolor.map(i)) for (i, m) in group_by_regex(str)]

    # Earlier start first; for equal starts the longer span first, so that
    # an enclosing span opens before the spans nested inside it; remaining
    # ties are broken by descending regex serial id.
    def span_order(t):
        (match, serial, _) = t
        (s, e) = match.span('url')
        return (s, -e, -serial)
    tuples.sort(key=span_order)

    # (string_pos, pos_in_tuple_list, is_opening, color_id)
    # The explicit is_opening flag keeps opening and closing markers apart
    # even if a color id happens to be falsy (0, "" ...).
    markers = []
    for (idx, (match, serial, color)) in enumerate(tuples):
        (s, e) = match.span('url')
        markers.append((s, idx, True, color))
        markers.append((e, idx, False, None))
    # The sort is stable, so the opening marker of a zero-length span stays
    # ahead of its own closing marker.
    markers.sort(key=lambda marker: (marker[0], marker[1]))

    # piecewise add chunks of content followed by new color, using markers
    chunks = []
    stack = []
    cursor = 0
    for (pos, _, is_opening, color) in markers:
        if is_opening:  # starting new color
            stack.append(color)
        else:  # ending color
            stack.pop()

        if stack:
            col = stack[-1]  # innermost color that is still open
        else:
            col = None  # nothing open any more
        col_bold = len(stack) > 1  # more than one layer of color

        # NOTE(review): shcolor.code(None, ...) presumably emits a reset
        # sequence -- confirm against shcolor.
        chunks.append(str[cursor:pos])
        chunks.append(shcolor.code(col, bold=col_bold))
        cursor = pos
    # Copy the whole remainder; slicing to -1 here would silently drop the
    # final character of the input.
    chunks.append(str[cursor:])

    return "".join(chunks)
 
 
 
if __name__ == "__main__":
    # Command-line entry point: fetch <url> (or use the built-in testcases
    # with --test) and print either the colorized content or, with --dump,
    # the unique urls found in it.
    parser = optparse.OptionParser(add_help_option=None)
    a = parser.add_option
    parser.usage = "[ <url> [options] | --test ]"
    a("--dump", action="store_true", help="Dump urls")
    a("-h", action="callback", callback=io.opts_help, help="Display this message")
    a("--test", action="store_true", help="Run spider testsuite")
    (opts, args) = parser.parse_args()

    if not opts.test and not args:
        # No url given: show the usage text. Checking for this up front,
        # instead of catching IndexError around all the processing, keeps an
        # IndexError raised further down from being reported as a usage
        # problem.
        io.opts_help(None, None, None, parser)
    else:
        if opts.test:
            data = testcases
            # findall() consults the url only to decide whether the FTP
            # listing pattern applies; the built-in cases are HTML, so a
            # placeholder http url is supplied to keep --test --dump working.
            url = "http://localhost/"
        else:
            url = args[0]
            handle = urllib.urlopen(url)
            try:
                data = handle.read()
            finally:
                # release the connection even when reading fails
                handle.close()

        if opts.dump:
            for u in unique(unbox_it_to_ss(findall(data, url))):
                print(u)
        else:
            print(colorize_shell(data))