
Commit fc54756

Merge pull request #1031 from cdrini/1028/refactor/ipstats
1028/refactor/ipstats
2 parents 76033c0 + 9114415 commit fc54756

3 files changed (+100, -70 lines)
conf/openlibrary.yml

Lines changed: 0 additions & 1 deletion
@@ -125,7 +125,6 @@ lists:
     editions_view: http://127.0.0.1:5984/editions/_fti/_design/seeds/by_seed
 
 admin:
-    #counts_db: http://127.0.0.1:5984/admin
     olsystem_root: /home/noufal/projects/OL/olsystem/
     nagios_url: http://monitor.us.archive.org/cgi-bin/nagios3/status.cgi?hostgroup=24.openlibrary&style=detail
     statsd_server: localhost:9090

openlibrary/plugins/openlibrary/home.py

Lines changed: 1 addition & 4 deletions
@@ -36,10 +36,7 @@ def is_enabled(self):
 
     def GET(self):
         try:
-            if 'counts_db' in config.admin:
-                stats = admin.get_stats()
-            else:
-                stats = None
+            stats = admin.get_stats()
         except Exception:
             logger.error("Error in getting stats", exc_info=True)
             stats = None

scripts/ipstats.py

Lines changed: 99 additions & 65 deletions
@@ -1,84 +1,118 @@
 #!/olsystem/bin/olenv python
 """
-Temporary script to store unique IPs in a single day by parsing the
-lighttpd log files directly.
+Store count of unique IPs per day to infobase by parsing the nginx log files directly.
+
+This file is currently (17 July 2018) run on production using cron.
 """
-import _init_path
+from datetime import datetime, timedelta
 import os
-import datetime
 import subprocess
-from openlibrary.config import load_config
-import web
-import infogami
 
-import couchdb
-import yaml
+import _init_path
+import infogami
+import web
 
-def connect_to_couch(config_file):
-    "Connects to the couch databases"
-    load_config(config_file)
-    infogami._setup()
 
-    f = open(config_file)
-    config = yaml.load(f)
-    f.close()
-    admin_db = config["admin"]["counts_db"]
-    return couchdb.Database(admin_db)
+def run_piped(cmds, stdin=None):
+    """
+    Run the commands piping one's output to the next's input.
+    :param list[list[str]] cmds:
+    :param file stdin: The stdin to supply the first command of the pipe
+    :return: the stdout of the last command
+    :rtype: file
+    """
+    prev_stdout = stdin
+    for cmd in cmds:
+        print(" " + ' '.join(cmd))
+        p = subprocess.Popen(cmd, stdin=prev_stdout, stdout=subprocess.PIPE)
+        prev_stdout = p.stdout
+    return prev_stdout
 
-def store_data(db, data, date):
-    uid = date.strftime("counts-%Y-%m-%d")
 
-    # start storing data in store as well, so that we can phase out couch
+def store_data(data, day):
+    """
+    Store stats data about the provided date.
+    :param dict data:
+    :param datetime day:
+    :return:
+    """
+    uid = day.strftime("counts-%Y-%m-%d")
     doc = web.ctx.site.store.get(uid) or {}
     doc.update(data)
     doc['type'] = 'admin-stats'
     web.ctx.site.store[uid] = doc
 
-    try:
-        try:
-            vals = db[uid]
-            vals.update(data)
-        except couchdb.http.ResourceNotFound:
-            vals = data
-            db[uid] = vals
-        print "saving %s"%vals
-        db.save(vals)
-    except IOError, e:
-        print >> sys.stderr, "unable to save to couchdb:", str(e)
-
-
-def run_for_day(d):
-    basedir = d.strftime("/var/log/nginx/")
-    awk = ["awk", '$2 == "openlibrary.org" { print $1 }']
-    sort = ["sort", "-u"]
-    count = ["wc", "-l"]
-    print " ", basedir
-    zipfile = d.strftime("access.log-%Y%m%d.gz")
-    if os.path.exists(basedir + zipfile):
-        print " Using ", basedir + zipfile
-        cmd = subprocess.Popen(["zcat", basedir + zipfile], stdout = subprocess.PIPE)
-    elif os.path.exists(basedir + "access.log"):
-        cmd = subprocess.Popen(["cat", "%s/access.log"%basedir], stdout = subprocess.PIPE)
-        print " Using ", basedir + "access.log"
-    print " ", awk
-    cmd = subprocess.Popen(awk, stdin = cmd.stdout, stdout = subprocess.PIPE)
-    print " ", sort
-    cmd = subprocess.Popen(sort, stdin = cmd.stdout, stdout = subprocess.PIPE)
-    print " ", count
-    cmd = subprocess.Popen(count, stdin = cmd.stdout, stdout = subprocess.PIPE)
-    val = cmd.stdout.read()
-    return dict (visitors = int(val))
-
-
-def main(config):
-    admin_db = connect_to_couch(config)
-    current = datetime.datetime.now()
-    for i in range(2):
+
+def count_unique_ips_for_day(day):
+    """
+    Get the number of unique visitors for the given day.
+    Throws an IndexError if missing log for the given day.
+    :param datetime day:
+    :return: the number of unique IPs seen that day
+    :rtype: int
+    """
+    basedir = "/var/log/nginx/"
+
+    # Cat the logs we'll be processing
+    print(" " + basedir)
+    log_file = basedir + "access.log"
+    zipped_log_file = log_file + day.strftime("-%Y%m%d.gz")
+
+    if os.path.exists(zipped_log_file):
+        cat_log_cmd = ["zcat", zipped_log_file]
+    elif day > (datetime.today() - timedelta(days=5)):
+        # if recent day, then they haven't been compressed yet
+        cat_log_cmd = ["cat", log_file]
+    else:
+        raise IndexError("Cannot find log file for " + day.strftime("%Y-%m-%d"))
+
+    out = run_piped([
+        cat_log_cmd,  # cat the server logs
+        ["awk", '$2 == "openlibrary.org" { print $1 }'],  # get all the IPs
+        ["sort", "-u"],  # get unique only
+        ["wc", "-l"],  # count number of lines
+    ])
+
+    return int(out.read())
+
+
+def main(start, end):
+    """
+    Get the unique visitors per day between the 2 dates (inclusive) and store them
+    in the infogami database. Ignores days whose log files are missing.
+    :param datetime start:
+    :param datetime end:
+    :return:
+    """
+    infogami._setup()
+
+    current = start
+    while current <= end:
         print current
-        d = run_for_day(current)
-        store_data(admin_db, d, current)
-        current = current - datetime.timedelta(days = 1)
+        try:
+            count = count_unique_ips_for_day(current)
+            store_data(dict(visitors=count), current)
+        except IndexError, e:
+            print(" " + e.message)
+        current += timedelta(days=1)
+
 
 if __name__ == "__main__":
+    from argparse import ArgumentParser
     import sys
-    sys.exit(main(sys.argv[1]))
+
+    parser = ArgumentParser(
+        description="Store count of unique IPs per day from the past K days (including today) in infobase.")
+    parser.add_argument('--days', type=int, default=1,
+                        help="how many days to go back")
+    parser.add_argument('--range', nargs=2, type=lambda d: datetime.strptime(d, "%Y-%m-%d"),
+                        help="alternatively, provide a range of dates to visit (like `--range 2018-06-25 2018-07-14`)")
+    args = parser.parse_args()
+
+    if args.range:
+        start, end = args.range
+    else:
+        end = datetime.today()
+        start = end - timedelta(days=args.days)
+
+    sys.exit(main(start, end))
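For reference, here is a minimal standalone sketch of the pipeline that the new `run_piped()` and `count_unique_ips_for_day()` build: read a log, keep the client-IP field of `openlibrary.org` requests, de-duplicate, and count. The awk/sort/wc stages mirror the diff above; the LOG_FILE path is hypothetical (production reads /var/log/nginx/access.log*).

# Standalone sketch of the unique-IP pipeline from count_unique_ips_for_day().
# LOG_FILE is a hypothetical path; the awk/sort/wc stages mirror the diff above.
import subprocess

LOG_FILE = "/tmp/sample-access.log"  # hypothetical sample log


def run_piped(cmds, stdin=None):
    """Chain the commands, piping each one's stdout into the next's stdin."""
    prev_stdout = stdin
    for cmd in cmds:
        p = subprocess.Popen(cmd, stdin=prev_stdout, stdout=subprocess.PIPE)
        prev_stdout = p.stdout
    return prev_stdout


if __name__ == "__main__":
    out = run_piped([
        ["cat", LOG_FILE],                                # read the (uncompressed) log
        ["awk", '$2 == "openlibrary.org" { print $1 }'],  # keep the client IP column
        ["sort", "-u"],                                   # one line per unique IP
        ["wc", "-l"],                                     # count those lines
    ])
    print("unique IPs: %d" % int(out.read()))

Because each stage only hands its stdout pipe to the next Popen, the log is streamed through awk, sort, and wc rather than buffered in Python.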

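The per-day document that `store_data()` writes is small; below is a sketch of its shape, using a plain dict in place of infogami's `web.ctx.site.store` (the date and visitor count are made up).

from datetime import datetime

# Illustration only: a plain dict stands in for web.ctx.site.store.
store = {}


def store_data(data, day):
    """Same key and field layout as the refactored store_data() above."""
    uid = day.strftime("counts-%Y-%m-%d")
    doc = store.get(uid) or {}
    doc.update(data)
    doc['type'] = 'admin-stats'
    store[uid] = doc


store_data(dict(visitors=1234), datetime(2018, 7, 17))  # hypothetical count
print(store)  # -> {'counts-2018-07-17': {'visitors': 1234, 'type': 'admin-stats'}}

On the cron host, the new argparse interface would be invoked as something like `python scripts/ipstats.py --days 7` to backfill the last week, or `python scripts/ipstats.py --range 2018-06-25 2018-07-14` for an explicit window, per the `--days` and `--range` arguments added above.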