Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SFR-1900_ParseDownloadRequests #296

Merged
merged 3 commits into from Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,11 @@
# CHANGELOG

## unreleased version -- v0.12.4
## unreleased version -- v0.13.1
## Added
- New script to parse download requests from S3 log files for UMP books
## Fixed

## 2024-03-21 -- v0.13.0
## Added
- New script to add nypl_login flag to Links objects
- Added nypl_login flags to NYPL and University of Michigan mapping and process
Expand Down
3 changes: 2 additions & 1 deletion scripts/__init__.py
Expand Up @@ -13,4 +13,5 @@
from .updatePubLocationAndLinks import main as updateLocationAndLinks
from .countCABooks import main as countCA
from .nyplLoginFlags import main as nyplFlags
from .deleteUMPManifestLinks import main as deleteUMPManifests
from .deleteUMPManifestLinks import main as deleteUMPManifests
from .parseDownloadRequests import main as parseDownloads
88 changes: 88 additions & 0 deletions scripts/parseDownloadRequests.py
@@ -0,0 +1,88 @@
import os
import boto3
import re
import numpy

from model import Edition, Item, Link
from model.postgres.item import ITEM_LINKS
from managers import DBManager

# Module-level S3 client, created once at import time and shared by
# main() (get_object) and load_batch() (list_objects_v2 paginator).
s3_client = boto3.client("s3")

# Bucket and key prefix holding the S3 server access logs to scan.
bucketName = 'ump-pdf-repository-logs'
logPrefix = 'logs/946183545209/us-east-1/ump-pdf-repository/2024/03/13/'

# Operation field that marks an object download in an access log line.
requestRegex = r'REST\.GET\.OBJECT '

# File ID includes the file name for the pdf object.
# The match is non-greedy so the capture stops at the first "pdf" followed by
# whitespace (the object key) rather than running on into the quoted request
# URI, which normally repeats the same ".pdf " text later in the line.
fileIDRegex = r'REST\.GET\.OBJECT (.+?pdf\s)'

# Non-greedy so only the first bracketed field is captured even when a later
# field (e.g. the user agent) also contains a "]".
timeStampRegex = r'\[.+?\]'

# Dots are escaped so that only this exact host matches.
referrerRegex = r'https://drb-qa\.nypl\.org/'

# Accumulated output rows; the first row is the header.
umpDownloadArray = [['title', 'timeStamp', 'identifier']]

def main():
    '''
    The edition title, identifier, and timestamp are parsed out of the
    S3 server access log files for UMP download requests

    Every log object listed by load_batch() is fetched and read line by
    line; each line for which parseInfo() returns a row is appended to the
    module-level umpDownloadArray, and the accumulated rows (header first)
    are then written to data3.csv in the current working directory.
    '''
    # Imported locally so this function does not depend on a change to the
    # module's import block.
    import csv

    for batch in load_batch():
        # A listing page carries no 'Contents' key when no object matches
        # the prefix, so fall back to an empty list instead of raising.
        for logFile in batch.get('Contents', []):
            currKey = str(logFile['Key'])
            # get_object returns a dict whose 'Body' can be streamed by line
            logObject = s3_client.get_object(Bucket=bucketName, Key=currKey)
            for rawLine in logObject['Body'].iter_lines():
                logLine = rawLine.decode('utf8')
                parseTuple = parseInfo(logLine)
                if parseTuple:
                    umpDownloadArray.append(parseTuple)

    # Write real CSV rows. str() of a numpy array is a display repr (and is
    # abbreviated with "..." for large arrays), not comma-separated data.
    with open('data3.csv', 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(umpDownloadArray)

def load_batch():
    '''
    Return the page iterator produced by the list_objects_v2 paginator for
    the objects in bucketName whose keys start with logPrefix.
    '''
    return s3_client.get_paginator('list_objects_v2').paginate(
        Bucket=bucketName,
        Prefix=logPrefix,
    )

def parseInfo(logObject):
    '''
    Parse a single S3 server access log line for a UMP PDF download.

    logObject is the decoded text of one log line. The line is only
    processed when it matches requestRegex and referrerRegex and does not
    contain '403 AccessDenied'; the timestamp and file ID are then extracted
    with timeStampRegex and fileIDRegex, and the database is searched for a
    'UofM' item with an 'application/pdf' link whose URL contains the file
    ID, in order to look up that item's edition.

    Returns [title, timestamp, edition id] for a qualifying line, where
    title is '' and edition id is None if no matching link was found.
    Returns None for every line that does not qualify, including lines
    where no timestamp or PDF file ID can be extracted.
    '''
    matchRequest = re.search(requestRegex, logObject)
    matchReferrer = re.search(referrerRegex, logObject)

    if not (matchRequest and matchReferrer) or '403 AccessDenied' in logObject:
        return None

    matchTime = re.search(timeStampRegex, logObject)
    matchFileID = re.search(fileIDRegex, logObject)
    # A GET for a non-PDF key (or a line without a bracketed timestamp) has
    # nothing to report; skip it rather than failing on a None match.
    if matchTime is None or matchFileID is None:
        return None

    linkGroup = matchFileID.group(1).strip()
    titleParse = ''
    idParse = None

    dbManager = DBManager(
        user=os.environ.get('POSTGRES_USER', None),
        pswd=os.environ.get('POSTGRES_PSWD', None),
        host=os.environ.get('POSTGRES_HOST', None),
        port=os.environ.get('POSTGRES_PORT', None),
        db=os.environ.get('POSTGRES_NAME', None)
    )
    dbManager.generateEngine()

    dbManager.createSession()

    # try/finally so closeConnection() still runs if a query raises.
    try:
        for item in dbManager.session.query(Item) \
                .filter(Item.source == 'UofM'):
            for link in dbManager.session.query(Link) \
                    .join(ITEM_LINKS) \
                    .filter(ITEM_LINKS.c.item_id == item.id) \
                    .filter(Link.media_type == 'application/pdf') \
                    .filter(Link.url.contains(linkGroup)).all():
                itemEditID = item.edition_id
                # If several editions match, the last one visited wins.
                for edit in dbManager.session.query(Edition) \
                        .filter(Edition.id == itemEditID):
                    titleParse = edit.title
                    idParse = edit.id
    finally:
        dbManager.closeConnection()

    return [titleParse, matchTime.group(0), idParse]


# Run the log parser only when this file is executed directly as a script.
if __name__ == '__main__':
    main()