numerodix / spiderfetch

A modular web spider

This URL has Read+Write access

spiderfetch / io.py
100644 174 lines (135 sloc) 5.27 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/env python
 
import gzip
import cPickle as pickle # cPickle is supposed to be faster
import optparse
import os
import tempfile
import sys
 
import shcolor
 
 
# Banner that opts_help(), help_tools() and help_vars() print before their
# own text.
_help_header = "spiderfetch tool suite\n\n"
 
_help_tools="""\
== spiderfetch ==
 
Spiders recursively for urls, starting from <url>. Driven either by <pattern>
or <recipe>. Spidering can be paused/canceled at any time with Ctrl+C, which
will attempt to save the current state in $host.{session,web}. Spidering can
resume provided these two files are found. Terminates either by reaching the
end of the recipe, or reaching the end of the spider queue (no more urls
found). At this point the web is saved to $host.web.
 
During execution, successful fetches are written to log_urls, failed fetches
to error_urls, and outright errors (that shouldn't happen) to error_log.
 
== web ==
 
A query tool for webs that operates on .web files produced by spiderfetch.
 
== fetch ==
 
A general purpose fetcher for ftp/http/https, used by spiderfetch. Displays
one url per line and error codes for common fetch errors.
 
== spider ==
 
A spider module for spidering urls in documents. Can be used standalone with a
single url to test spidering capabilities and can also highlight matches in the
document.
 
== dumpstream ==
 
An automation module for use with mplayer to record media streams. Reads urls
from a file and records with mplayer.
"""
 
_help_vars="""\
SOCKET_TIMEOUT Seconds to wait before calling a socket timeout.
TRIES Number of tries on 503 Service Unavailable.
 
ORIG_FILENAMES Save files with their original filenames on the host (1) or
use filenames generated from the full url to avoid name collisions (0).
TMPDIR Temp directory for downloads.
LOGDIR Directory to use for logfiles.
 
TERM When set and not 'dumb' gives color output.
DEBUG_FETCH Write newlines after every update to see the full output.
"""
 
# Directory that savelog() writes into: the value of the LOGDIR environment
# variable, or the current directory when that is unset or empty. The
# commented-out line below is a previous default ("logs") kept for reference.
#LOGDIR = os.environ.get("LOGDIR") or "logs"
LOGDIR = os.environ.get("LOGDIR") or "."
 
def write_out(s):
    """Write the string `s` to standard output, adding nothing to it."""
    stream = sys.stdout
    stream.write(s)
 
def write_err(s):
    """Write the string `s` to standard error and flush the stream."""
    stream = sys.stderr
    stream.write(s)
    # Flush right away so the message is not held back in a buffer.
    stream.flush()
 
def write_abort():
    """Print a red "User aborted" notice on standard error.

    The notice is surrounded by newlines so it starts on a fresh line.
    """
    notice = shcolor.color(shcolor.RED, "User aborted")
    write_err("\n%s\n" % notice)
 
def get_tempfile():
    """Create a temporary file whose name is derived from the running script.

    The file name starts with ".<script name>." where the script name is the
    basename of sys.argv[0].

    Returns:
        The (fd, path) tuple produced by tempfile.mkstemp(): an open OS-level
        file descriptor and the path of the file. The caller is responsible
        for closing the descriptor and removing the file.
    """
    script_name = os.path.basename(sys.argv[0])
    return tempfile.mkstemp(prefix="." + script_name + ".")
 
def safe_filename(filename, dir=None):
    """Return a file name that does not collide with an existing file.

    `filename` is looked up inside `dir` when one is given. While the
    candidate path exists, a serial suffix is inserted before the extension
    ("name-2.ext", "name-3.ext", ...) until an unused name is found.

    Returns:
        Only the base name of the chosen path, without any directory part.
    """
    candidate = filename
    if dir:
        candidate = os.path.join(dir, candidate)

    parent, leaf = os.path.split(candidate)
    stem, suffix = os.path.splitext(leaf)

    # The counter is bumped before first use, so the first alternative tried
    # is "<stem>-2<suffix>".
    counter = 1
    while os.path.exists(candidate):
        counter += 1
        candidate = os.path.join(parent, "%s-%d%s" % (stem, counter, suffix))

    return os.path.basename(candidate)
 
def create_dir(dir):
    """Make sure the path `dir` exists, creating missing directories.

    Intermediate directories are created as needed. Nothing happens when the
    path already exists.

    Raises:
        OSError: if the directory could not be created and the path still
            does not exist afterwards.
    """
    # Try first and inspect afterwards: checking for existence before calling
    # makedirs() leaves a window in which another process can create the
    # directory, which would make makedirs() fail for no good reason.
    try:
        os.makedirs(dir)
    except OSError:
        if not os.path.exists(dir):
            raise
 
def file_exists(filename, dir=None):
    """Tell whether `filename` exists, looked up inside `dir` if one is given.

    Returns:
        The result of os.path.exists() for the resulting path.
    """
    if not dir:
        return os.path.exists(filename)
    return os.path.exists(os.path.join(dir, filename))
 
def delete(filename, dir=None):
    """Remove the file `filename`, looked up inside `dir` if one is given.

    Returns:
        Whatever os.unlink() returns for the resulting path.
    """
    target = os.path.join(dir, filename) if dir else filename
    return os.unlink(target)
 
def savelog(s, filename, mode=None):
    """Write the string `s` to the file `filename` inside LOGDIR.

    LOGDIR is created first if necessary.

    Args:
        s: text to write.
        filename: name of the log file, relative to LOGDIR.
        mode: mode passed to open(); defaults to 'w' (overwrite) when not
            given or empty.
    """
    create_dir(LOGDIR)
    fp = open(os.path.join(LOGDIR, filename), mode or 'w')
    # Close the file explicitly so the data is flushed to disk on return
    # rather than whenever the file object happens to be collected.
    # try/finally is used instead of `with` to stay valid on old Python 2.
    try:
        fp.write(s)
    finally:
        fp.close()
 
def serialize(o, filename, dir=None):
    """Pickle the object `o` into the file `filename`.

    If `o` has a `_to_pickle` method it is called first, so the object can
    prepare itself for pickling.

    Args:
        o: object to pickle.
        filename: destination file name.
        dir: optional directory for the file; created if missing.
    """
    if dir:
        create_dir(dir)
        filename = os.path.join(dir, filename)

    # Only the lookup of the hook is optional. Errors raised inside the hook,
    # including AttributeError, propagate instead of being silently dropped.
    prepare = getattr(o, "_to_pickle", None)
    if prepare is not None:
        prepare()

    # NOTE: a gzip-compressed variant of this write was left commented out in
    # an earlier revision; the output is a plain, uncompressed pickle.
    # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
    # binary mode, and it is closed explicitly so the data is on disk when
    # this function returns.
    fp = open(filename, 'wb')
    try:
        pickle.dump(o, fp, pickle.HIGHEST_PROTOCOL)
    finally:
        fp.close()
 
def deserialize(filename, dir=None):
    """Load and return the pickled object stored in `filename`.

    If the loaded object has a `_from_pickle` method it is called before the
    object is returned, so the object can restore its state after unpickling.

    Args:
        filename: file to read.
        dir: optional directory the file lives in.

    Returns:
        The unpickled object.
    """
    if dir:
        filename = os.path.join(dir, filename)

    # SECURITY: unpickling can execute arbitrary code. Only use this on files
    # this program wrote itself, never on data from an untrusted source.
    # Pickles are binary data, so read in binary mode; close the file
    # explicitly instead of leaving it to the garbage collector.
    fp = open(filename, 'rb')
    try:
        o = pickle.load(fp)
    finally:
        fp.close()

    # Only the lookup of the hook is optional. Errors raised inside the hook,
    # including AttributeError, propagate instead of being silently dropped.
    restore = getattr(o, "_from_pickle", None)
    if restore is not None:
        restore()
    return o
 
def init_opts(usage):
    """Create an option parser that has no automatic help option.

    Args:
        usage: usage string stored on the parser as-is.

    Returns:
        A (parser, add_option) pair, where add_option is the parser's bound
        method for registering options.
    """
    option_parser = optparse.OptionParser(add_help_option=None)
    # Assigned directly instead of passed to the constructor, so the string
    # is stored exactly as given.
    option_parser.usage = usage
    register = option_parser.add_option
    return option_parser, register
 
def opts_help(option, opt_str, value, parser):
    """optparse callback: print usage plus every option, then exit.

    Output goes to standard error. Each option is shown as its first short
    flag, first long flag and metavar, padded to 25 columns and followed by
    its help text. The process then exits with status 2.
    """
    program = os.path.basename(sys.argv[0])
    write_err(_help_header + "Usage: %s %s\n\n" % (program, parser.usage))

    for entry in parser.option_list:
        metavar = entry.metavar or ""

        short_flag = ""
        if entry._short_opts:
            short_flag = entry._short_opts[0]

        long_flag = ""
        if entry._long_opts:
            long_flag = entry._long_opts[0]

        # Missing parts leave stray spaces at the ends; strip() removes them
        # before the label is padded.
        label = ("%s %s %s" % (short_flag, long_flag, metavar)).strip()
        write_err(" %s %s\n" % (label.ljust(25), entry.help))

    sys.exit(2)
 
def help_tools(option, opt_str, value, parser):
    """optparse callback: print the tool descriptions and exit with status 2.

    The text is written to standard error.
    """
    text = _help_header + _help_tools
    write_err(text)
    sys.exit(2)
 
def help_vars(option, opt_str, value, parser):
    """optparse callback: print the environment variable help and exit with
    status 2.

    The text is written to standard error.
    """
    text = _help_header + _help_vars
    write_err(text)
    sys.exit(2)
 
def parse_args(parser):
    """Register the common help options on `parser` and parse the arguments.

    Adds -h, --tools and --vars, each wired to its help callback.

    Returns:
        The result of parser.parse_args().
    """
    common_options = (
        ("-h", opts_help, "Display this message"),
        ("--tools", help_tools, "Descriptions of the tools"),
        ("--vars", help_vars, "Environmental variables"),
    )
    for flag, handler, description in common_options:
        parser.add_option(flag, action="callback", callback=handler,
                          help=description)
    return parser.parse_args()
 
 
 
if __name__ == "__main__":
    # Self-test: pickle a string to a temp file, load it back and report
    # whether the round trip preserved it.
    s = "dvorak"
    # Create the temp file before entering the try block. If creation fails
    # there is nothing to clean up, and the finally clause must not touch
    # names that were never bound (that would mask the real error).
    (fp, filename) = get_tempfile()
    try:
        serialize(s, filename)
        write_out("Serialization sanity check: %s\n"
                  % (s == deserialize(filename)))
    finally:
        os.close(fp)
        os.unlink(filename)