|
1 | 1 | #!/olsystem/bin/olenv python
|
2 | 2 | """
|
3 |
| -Temporary script to store unique IPs in a single day by parsing the |
4 |
| -lighttpd log files directly. |
| 3 | +Store count of unique IPs per day to infobase by parsing the nginx log files directly. |
| 4 | +
|
| 5 | +This file is currently (17 July 2018) run on production using cron. |
5 | 6 | """
|
6 |
| -import _init_path |
| 7 | +from datetime import datetime, timedelta |
7 | 8 | import os
|
8 |
| -import datetime |
9 | 9 | import subprocess
|
10 |
| -from openlibrary.config import load_config |
11 |
| -import web |
12 |
| -import infogami |
13 | 10 |
|
14 |
| -import couchdb |
15 |
| -import yaml |
| 11 | +import _init_path |
| 12 | +import infogami |
| 13 | +import web |
16 | 14 |
|
17 |
| -def connect_to_couch(config_file): |
18 |
| - "Connects to the couch databases" |
19 |
| - load_config(config_file) |
20 |
| - infogami._setup() |
21 | 15 |
|
22 |
| - f = open(config_file) |
23 |
| - config = yaml.load(f) |
24 |
| - f.close() |
25 |
| - admin_db = config["admin"]["counts_db"] |
26 |
| - return couchdb.Database(admin_db) |
def run_piped(cmds, stdin=None):
    """
    Run the commands as a pipeline, feeding each one's stdout to the next one's stdin.

    The processes are started but not waited for; the caller is expected to read
    the returned stream to EOF (and close it) to drain the pipeline.
    :param list[list[str]] cmds: argv lists, in pipeline order
    :param file stdin: The stdin to supply the first command of the pipe
    :return: the stdout of the last command (or `stdin` itself if `cmds` is empty)
    :rtype: file
    """
    prev_proc = None
    prev_stdout = stdin
    for cmd in cmds:
        print(" " + ' '.join(cmd))
        proc = subprocess.Popen(cmd, stdin=prev_stdout, stdout=subprocess.PIPE)
        if prev_proc is not None:
            # The child now owns its end of the pipe. Drop our copy so the upstream
            # process gets SIGPIPE (instead of blocking forever) if this one exits
            # early. Only pipes created here are closed, never the caller's `stdin`.
            prev_proc.stdout.close()
        prev_proc = proc
        prev_stdout = proc.stdout
    return prev_stdout
27 | 30 |
|
28 |
| -def store_data(db, data, date): |
29 |
| - uid = date.strftime("counts-%Y-%m-%d") |
30 | 31 |
|
31 |
| - # start storing data in store as well, so that we can phase out couch |
def store_data(data, day):
    """
    Merge the given stats into the site store document for the provided date.

    The document is keyed `counts-YYYY-MM-DD`; an existing document is updated
    in place (new keys win), otherwise a fresh one is created. The document is
    always tagged with type `admin-stats` before being written back.
    :param dict data: stats to merge into the day's document
    :param datetime day: the day the stats belong to
    :return: None
    """
    key = day.strftime("counts-%Y-%m-%d")
    store = web.ctx.site.store

    record = store.get(key)
    if not record:
        # Nothing stored for this day yet (or an empty doc): start fresh.
        record = {}
    record.update(data)
    record['type'] = 'admin-stats'

    store[key] = record
|
36 | 44 |
|
37 |
| - try: |
38 |
| - try: |
39 |
| - vals = db[uid] |
40 |
| - vals.update(data) |
41 |
| - except couchdb.http.ResourceNotFound: |
42 |
| - vals = data |
43 |
| - db[uid] = vals |
44 |
| - print "saving %s"%vals |
45 |
| - db.save(vals) |
46 |
| - except IOError, e: |
47 |
| - print >> sys.stderr, "unable to save to couchdb:", str(e) |
48 |
| - |
49 |
| - |
50 |
| -def run_for_day(d): |
51 |
| - basedir = d.strftime("/var/log/nginx/") |
52 |
| - awk = ["awk", '$2 == "openlibrary.org" { print $1 }'] |
53 |
| - sort = ["sort", "-u"] |
54 |
| - count = ["wc", "-l"] |
55 |
| - print " ", basedir |
56 |
| - zipfile = d.strftime("access.log-%Y%m%d.gz") |
57 |
| - if os.path.exists(basedir + zipfile): |
58 |
| - print " Using ", basedir + zipfile |
59 |
| - cmd = subprocess.Popen(["zcat", basedir + zipfile], stdout = subprocess.PIPE) |
60 |
| - elif os.path.exists(basedir + "access.log"): |
61 |
| - cmd = subprocess.Popen(["cat", "%s/access.log"%basedir], stdout = subprocess.PIPE) |
62 |
| - print " Using ", basedir + "access.log" |
63 |
| - print " ", awk |
64 |
| - cmd = subprocess.Popen(awk, stdin = cmd.stdout, stdout = subprocess.PIPE) |
65 |
| - print " ", sort |
66 |
| - cmd = subprocess.Popen(sort, stdin = cmd.stdout, stdout = subprocess.PIPE) |
67 |
| - print " ", count |
68 |
| - cmd = subprocess.Popen(count, stdin = cmd.stdout, stdout = subprocess.PIPE) |
69 |
| - val = cmd.stdout.read() |
70 |
| - return dict (visitors = int(val)) |
71 |
| - |
72 |
| - |
73 |
| -def main(config): |
74 |
| - admin_db = connect_to_couch(config) |
75 |
| - current = datetime.datetime.now() |
76 |
| - for i in range(2): |
| 45 | + |
def count_unique_ips_for_day(day):
    """
    Get the number of unique visitors for the given day.

    Uses the rotated, gzipped nginx access log for `day` when it exists. Otherwise,
    if `day` falls within the last 5 days, falls back to the live `access.log`.
    The count is the number of distinct values of the first log field (the client
    IP) over lines whose second field is `openlibrary.org`.
    :param datetime day:
    :return: the number of unique IPs
    :rtype: int
    :raises IndexError: if no log file can be found for the given day
    """
    basedir = "/var/log/nginx/"

    # Cat the logs we'll be processing
    print(" " + basedir)
    log_file = basedir + "access.log"
    # Rotated logs are named like access.log-20180717.gz
    zipped_log_file = log_file + day.strftime("-%Y%m%d.gz")

    if os.path.exists(zipped_log_file):
        cat_log_cmd = ["zcat", zipped_log_file]
    elif day > (datetime.today() - timedelta(days=5)):
        # if recent day, then they haven't been compressed yet
        # NOTE(review): this reads the whole live log, which presumably can hold
        # entries for more than one day -- confirm against the rotation schedule.
        cat_log_cmd = ["cat", log_file]
    else:
        raise IndexError("Cannot find log file for " + day.strftime("%Y-%m-%d"))

    out = run_piped([
        cat_log_cmd,  # cat the server logs
        ["awk", '$2 == "openlibrary.org" { print $1 }'],  # get all the IPs
        ["sort", "-u"],  # get unique only
        ["wc", "-l"],  # count number of lines
    ])

    try:
        return int(out.read())
    finally:
        # Release the pipe handle once the count has been read.
        out.close()
| 77 | + |
| 78 | + |
def main(start, end):
    """
    Get the unique visitors per day between the 2 dates (inclusive) and store them
    in the infogami database.

    Days whose log file cannot be found (signalled by IndexError) are reported on
    stdout and skipped; any other error propagates.
    :param datetime start: first day to process
    :param datetime end: last day to process
    :return: None
    """
    infogami._setup()

    current = start
    while current <= end:
        print(current)
        try:
            count = count_unique_ips_for_day(current)
            store_data(dict(visitors=count), current)
        except IndexError as e:
            # `as` syntax and str(e) work on both Python 2.6+ and Python 3,
            # unlike `except IndexError, e` / `e.message`.
            print(" " + str(e))
        current += timedelta(days=1)
| 98 | + |
81 | 99 |
|
if __name__ == "__main__":
    from argparse import ArgumentParser
    import sys

    # Command line: either a number of days to look back from now (the default),
    # or an explicit inclusive date range; the range wins when both are given.
    cli = ArgumentParser(
        description="Store count of unique IPs per day from the past K days (including today) in infobase.")
    cli.add_argument(
        '--days',
        type=int,
        default=1,
        help="how many days to go back")
    cli.add_argument(
        '--range',
        nargs=2,
        type=lambda d: datetime.strptime(d, "%Y-%m-%d"),
        help="alternatively, provide a range of dates to visit (like `--range 2018-06-25 2018-07-14`)")
    options = cli.parse_args()

    if not options.range:
        end = datetime.today()
        start = end - timedelta(days=options.days)
    else:
        start, end = options.range

    sys.exit(main(start, end))
0 commit comments