numerodix / spiderfetch

A modular web spider

This URL has Read+Write access

numerodix (author)
Fri Jun 27 11:56:29 -0700 2008
commit  3556b280d07ca9b019b5c41ca6690998c6dd5a07
tree    111577e764718537c1801ef4fa09f8afd1827bb2
parent  5fcb044cf5b4c43bde858e59d3acc6de87b8092d
spiderfetch / io.py
100644 104 lines (84 sloc) 2.835 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python
 
import gzip
import cPickle as pickle # cPickle is supposed to be faster
import os
import tempfile
import sys
 
import shcolor
 
 
# Directory used for log output: the value of the LOGDIR environment variable
# when it is set to a non-empty string, otherwise the current directory.
LOGDIR = os.getenv("LOGDIR") or "."
 
def write_out(s):
    """Write the string *s* to standard output exactly as given."""
    stream = sys.stdout
    stream.write(s)
 
def write_err(s):
    """Write the string *s* to standard error and flush the stream at once."""
    stream = sys.stderr
    stream.write(s)
    stream.flush()
 
def write_abort():
    """Tell the user on stderr that the run was aborted.

    The text "User aborted" is passed through shcolor.color with shcolor.RED
    and written surrounded by newlines.
    """
    notice = shcolor.color(shcolor.RED, "User aborted")
    write_err("\n%s\n" % notice)
 
def get_tempfile():
    """Create a temporary file named after the running script.

    The file name starts with ".<script>." where <script> is the base name
    of sys.argv[0]. The dot is prepended after taking the base name so that
    it is kept even when sys.argv[0] contains a directory component.

    Returns the (fd, path) tuple produced by tempfile.mkstemp. The caller is
    responsible for closing the descriptor and deleting the file.
    """
    script = os.path.basename(sys.argv[0])
    return tempfile.mkstemp(prefix="." + script + ".")
 
def safe_filename(filename, dir=None):
    """Return a base name derived from *filename* that does not exist yet.

    When *dir* is given, *filename* is looked up inside it. If the path is
    already taken, "-2", "-3", ... is inserted before the extension until an
    unused path is found. Only the base name of that path is returned, not
    the directory part.
    """
    candidate = filename
    if dir:
        candidate = os.path.join(dir, candidate)

    parent, name = os.path.split(candidate)
    stem, suffix = os.path.splitext(name)

    # Numbering starts at 2: the unsuffixed name counts as the first copy.
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(parent, "%s-%s%s" % (stem, counter, suffix))
        counter += 1
    return os.path.basename(candidate)
 
def create_dir(dir):
    """Create the directory *dir*, including missing parents.

    Nothing is done when the path already exists.
    """
    if os.path.exists(dir):
        return
    os.makedirs(dir)
 
def file_exists(filename, dir=None):
    """Return True if *filename* exists, looked up inside *dir* when given."""
    target = os.path.join(dir, filename) if dir else filename
    return os.path.exists(target)
 
def delete(filename, dir=None):
    """Remove the file *filename*, looked up inside *dir* when given.

    The result of os.unlink is passed back to the caller; any error raised
    by os.unlink propagates unchanged.
    """
    target = filename
    if dir:
        target = os.path.join(dir, target)
    outcome = os.unlink(target)
    return outcome
 
def savelog(s, filename, mode=None):
    """Write the string *s* to the file *filename* inside LOGDIR.

    LOGDIR is created first (via create_dir) when it does not exist.

    mode -- mode string handed to open(); any false value, including the
            default None, selects 'w'.
    """
    create_dir(LOGDIR)
    mode = mode or 'w'
    # Close the handle explicitly rather than leaving it to garbage
    # collection, so the data is flushed to disk before returning.
    with open(os.path.join(LOGDIR, filename), mode) as fp:
        fp.write(s)
 
def serialize(o, filename, dir=None):
    """Pickle the object *o* into the file *filename*.

    When *dir* is given it is created if missing (via create_dir) and
    *filename* is resolved inside it. Before pickling, o._to_pickle() is
    called if the object has such an attribute.

    The data is written with pickle.HIGHEST_PROTOCOL and is not compressed.
    """
    if dir:
        create_dir(dir)
        filename = os.path.join(dir, filename)
    try:
        getattr(o, "_to_pickle")()
    except AttributeError:
        # The hook is optional. NOTE: this also hides an AttributeError
        # raised from inside the hook itself.
        pass
    # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
    # binary mode; the with-block guarantees the data is flushed and closed.
    with open(filename, 'wb') as fp:
        pickle.dump(o, fp, pickle.HIGHEST_PROTOCOL)
 
def deserialize(filename, dir=None):
    """Load and return the pickled object stored in the file *filename*.

    When *dir* is given, *filename* is resolved inside it. After loading,
    o._from_pickle() is called if the object has such an attribute.

    SECURITY: unpickling can execute arbitrary code, so only call this on
    files from a trusted source.
    """
    if dir:
        filename = os.path.join(dir, filename)
    # Pickle data is binary: read it in binary mode and close the handle as
    # soon as loading is done.
    with open(filename, 'rb') as fp:
        o = pickle.load(fp)
    try:
        getattr(o, "_from_pickle")()
    except AttributeError:
        # The hook is optional. NOTE: this also hides an AttributeError
        # raised from inside the hook itself.
        pass
    return o
 
def opts_help(option, opt_str, value, parser):
    """Print a usage summary for *parser* on stderr and exit with status 2.

    The output is a header with the script name and parser.usage, followed
    by one line per entry in parser.option_list showing its first short
    flag, first long flag, metavar and help text. The *option*, *opt_str*
    and *value* arguments are accepted but not used.
    """
    def first_or_blank(names):
        # First entry of *names* if there is a non-empty one, else "".
        if names and names[0]:
            return names[0]
        return ""

    prog = os.path.basename(sys.argv[0])
    banner = "spiderfetch tool suite\n"
    write_err(banner + "Usage: %s %s\n\n" % (prog, parser.usage))

    for opt in parser.option_list:
        pieces = (
            first_or_blank(opt._short_opts),
            first_or_blank(opt._long_opts),
            opt.metavar or "",
        )
        # Pad the flag column to 25 characters so the help texts line up.
        column = ("%s %s %s" % pieces).strip().ljust(25)
        write_err(" %s %s\n" % (column, opt.help))
    sys.exit(2)
 
 
 
if __name__ == "__main__":
    # Self-test: round-trip a string through serialize() and deserialize().
    s = "dvorak"
    # Create the temp file before entering the try block. If creation fails,
    # the cleanup below must not run against unbound names and mask the
    # original error with a NameError.
    (fp, filename) = get_tempfile()
    try:
        serialize(s, filename)
        # A single formatted argument prints the same text whether print is
        # a statement or a function.
        print("Serialization sanity check: %s" % (s == deserialize(filename)))
    finally:
        os.close(fp)
        os.unlink(filename)