Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1031 from cdrini/1028/refactor/ipstats
1028/refactor/ipstats
- Loading branch information
Showing 3 changed files with 100 additions and 70 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,84 +1,118 @@ | ||
#!/olsystem/bin/olenv python | ||
""" | ||
Temporary script to store unique IPs in a single day by parsing the | ||
lighttpd log files directly. | ||
Store count of unique IPs per day to infobase by parsing the nginx log files directly. | ||
This file is currently (17 July 2018) run on production using cron. | ||
""" | ||
import _init_path | ||
from datetime import datetime, timedelta | ||
import os | ||
import datetime | ||
import subprocess | ||
from openlibrary.config import load_config | ||
import web | ||
import infogami | ||
|
||
import couchdb | ||
import yaml | ||
import _init_path | ||
import infogami | ||
import web | ||
|
||
def connect_to_couch(config_file): | ||
"Connects to the couch databases" | ||
load_config(config_file) | ||
infogami._setup() | ||
|
||
f = open(config_file) | ||
config = yaml.load(f) | ||
f.close() | ||
admin_db = config["admin"]["counts_db"] | ||
return couchdb.Database(admin_db) | ||
def run_piped(cmds, stdin=None):
    """
    Run the commands piping one's output to the next's input.

    :param list[list[str]] cmds: commands to run, e.g. [["cat", "f"], ["wc", "-l"]]
    :param file stdin: The stdin to supply the first command of the pipe
    :return: the stdout of the last command
    :rtype: file
    """
    prev_stdout = stdin
    for cmd in cmds:
        print(" " + ' '.join(cmd))
        p = subprocess.Popen(cmd, stdin=prev_stdout, stdout=subprocess.PIPE)
        # Close the parent's copy of the intermediate pipe: otherwise the
        # upstream process never receives SIGPIPE if a downstream command
        # exits early, and the fd leaks. Don't close the caller's stdin.
        if prev_stdout is not None and prev_stdout is not stdin:
            prev_stdout.close()
        prev_stdout = p.stdout
    return prev_stdout
|
||
def store_data(db, data, date): | ||
uid = date.strftime("counts-%Y-%m-%d") | ||
|
||
# start storing data in store as well, so that we can phase out couch | ||
def store_data(data, day):
    """
    Store stats data about the provided date.

    :param dict data: stat fields to merge into that day's record
    :param datetime day: the day the stats belong to
    :return:
    """
    key = day.strftime("counts-%Y-%m-%d")
    # Merge into any existing record rather than clobbering it.
    record = web.ctx.site.store.get(key) or {}
    record.update(data)
    record['type'] = 'admin-stats'
    web.ctx.site.store[key] = record
|
||
try: | ||
try: | ||
vals = db[uid] | ||
vals.update(data) | ||
except couchdb.http.ResourceNotFound: | ||
vals = data | ||
db[uid] = vals | ||
print "saving %s"%vals | ||
db.save(vals) | ||
except IOError, e: | ||
print >> sys.stderr, "unable to save to couchdb:", str(e) | ||
|
||
|
||
def run_for_day(d): | ||
basedir = d.strftime("/var/log/nginx/") | ||
awk = ["awk", '$2 == "openlibrary.org" { print $1 }'] | ||
sort = ["sort", "-u"] | ||
count = ["wc", "-l"] | ||
print " ", basedir | ||
zipfile = d.strftime("access.log-%Y%m%d.gz") | ||
if os.path.exists(basedir + zipfile): | ||
print " Using ", basedir + zipfile | ||
cmd = subprocess.Popen(["zcat", basedir + zipfile], stdout = subprocess.PIPE) | ||
elif os.path.exists(basedir + "access.log"): | ||
cmd = subprocess.Popen(["cat", "%s/access.log"%basedir], stdout = subprocess.PIPE) | ||
print " Using ", basedir + "access.log" | ||
print " ", awk | ||
cmd = subprocess.Popen(awk, stdin = cmd.stdout, stdout = subprocess.PIPE) | ||
print " ", sort | ||
cmd = subprocess.Popen(sort, stdin = cmd.stdout, stdout = subprocess.PIPE) | ||
print " ", count | ||
cmd = subprocess.Popen(count, stdin = cmd.stdout, stdout = subprocess.PIPE) | ||
val = cmd.stdout.read() | ||
return dict (visitors = int(val)) | ||
|
||
|
||
def main(config): | ||
admin_db = connect_to_couch(config) | ||
current = datetime.datetime.now() | ||
for i in range(2): | ||
|
||
def count_unique_ips_for_day(day):
    """
    Get the number of unique visitors for the given day.
    Throws an IndexError if missing log for the given day.

    :param datetime day:
    :return: the number of unique visitor IPs for that day
    :rtype: int
    """
    basedir = "/var/log/nginx/"

    # Cat the logs we'll be processing
    print(" " + basedir)
    log_file = basedir + "access.log"
    # BUGFIX: was strftime("-%Y%m%day.gz") — strftime expands %d and leaves a
    # literal "ay", producing e.g. "access.log-20180717ay.gz", so the rotated
    # log was never found.
    zipped_log_file = log_file + day.strftime("-%Y%m%d.gz")

    if os.path.exists(zipped_log_file):
        cat_log_cmd = ["zcat", zipped_log_file]
    elif day > (datetime.today() - timedelta(days=5)):
        # if recent day, then they haven't been compressed yet
        cat_log_cmd = ["cat", log_file]
    else:
        # BUGFIX: same "%day" typo here rendered the date as "2018-07-17ay".
        raise IndexError("Cannot find log file for " + day.strftime("%Y-%m-%d"))

    out = run_piped([
        cat_log_cmd,  # cat the server logs
        ["awk", '$2 == "openlibrary.org" { print $1 }'],  # get all the IPs
        ["sort", "-u"],  # get unique only
        ["wc", "-l"],  # count number of lines
    ])

    return int(out.read())
|
||
|
||
def main(start, end):
    """
    Get the unique visitors per day between the 2 dates (inclusive) and store them
    in the infogami database. Ignores errors from missing log files.

    :param datetime start:
    :param datetime end:
    :return:
    """
    infogami._setup()

    current = start
    while current <= end:
        print(current)
        try:
            count = count_unique_ips_for_day(current)
            store_data(dict(visitors=count), current)
        # BUGFIX: `except IndexError, e` is Python-2-only syntax; `as` works
        # everywhere. Likewise e.message does not exist in Python 3 — str(e)
        # yields the same text here.
        except IndexError as e:
            print(" " + str(e))
        current += timedelta(days=1)
|
||
|
||
if __name__ == "__main__":
    from argparse import ArgumentParser
    import sys

    parser = ArgumentParser(
        description="Store count of unique IPs per day from the past K days (including today) in infobase.")
    parser.add_argument('--days', type=int, default=1,
                        help="how many days to go back")
    parser.add_argument('--range', nargs=2, type=lambda d: datetime.strptime(d, "%Y-%m-%d"),
                        help="alternatively, provide a range of dates to visit (like `--range 2018-06-25 2018-07-14`)")
    args = parser.parse_args()

    if args.range:
        start, end = args.range
    else:
        end = datetime.today()
        # NOTE(review): main() treats [start, end] as inclusive, so --days=1
        # processes both yesterday and today — confirm that matches the
        # "past K days (including today)" description.
        start = end - timedelta(days=args.days)

    sys.exit(main(start, end))