-
Notifications
You must be signed in to change notification settings - Fork 16
/
journals_last_manual_update_between.py
97 lines (85 loc) · 3.09 KB
/
journals_last_manual_update_between.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
""" Create a CSV of journals last updated before a given date. Headings are:
ID
Journal Name
Journal URL
E-ISSN (online version)
P-ISSN (print version)
Created Date
Owner
Owner's email address
Country
Publisher
"""
import csv
from portality.lib.dates import DEFAULT_TIMESTAMP_VAL
from portality.models import Journal, Account
from portality.core import es_connection
from portality.lib import dates
# Query template: journals flagged as in DOAJ whose last_manual_update lies
# within an inclusive range, ordered by creation date (oldest first).
# The "x" / "y" range bounds are placeholders; the script entry point
# overwrites them with the requested start and end dates before running.
LAST_MANUAL_UPDATE_BETWEEN = {
    "query": {
        "bool": {
            # Only records with admin.in_doaj set to "true"
            "filter": {
                "term": {"admin.in_doaj": "true"}
            },
            # Inclusive bounds on the last manual update timestamp
            "must": {
                "range": {
                    "last_manual_update": {"gte": "x", "lte": "y"}
                }
            }
        }
    },
    "sort": [
        {"created_date": "asc"}
    ]
}
if __name__ == "__main__":
    # Script entry point: write a CSV of journals matched by
    # LAST_MANUAL_UPDATE_BETWEEN, with the range bounds taken from the
    # command line.
    #
    # Options:
    #   -o / --out         output CSV path (required)
    #   -s / --start_date  lower bound ("gte") for last_manual_update
    #   -e / --end_date    upper bound ("lte") for last_manual_update
    import argparse
    import copy
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--out", help="output file path")
    parser.add_argument('-s', '--start_date', help=f'Last updated after threshold, default is {DEFAULT_TIMESTAMP_VAL}',
                        default=DEFAULT_TIMESTAMP_VAL)
    parser.add_argument('-e', '--end_date', help='Last updated before threshold, default is now',
                        default=dates.now_str_with_microseconds())
    args = parser.parse_args()

    if not args.out:
        print("Please specify an output file path with the -o option")
        parser.print_help()
        # Missing required option is a failure: report it with a non-zero status.
        sys.exit(1)

    # Fill in the date bounds on a private copy so the module-level template
    # keeps its placeholder values.
    query = copy.deepcopy(LAST_MANUAL_UPDATE_BETWEEN)
    date_range = query['query']['bool']['must']['range']['last_manual_update']
    date_range["gte"] = args.start_date
    date_range["lte"] = args.end_date

    # newline="" lets the csv writer control line endings itself, as the csv
    # module documentation requires for file objects passed to csv.writer.
    with open(args.out, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["ID",
                         "Journal Name",
                         "Journal URL",
                         "E-ISSN",
                         "P-ISSN",
                         "Created Date",
                         "Owner",
                         "Owner's email address",
                         "Country",
                         "Publisher"])

        for journal in Journal.iterate(q=query, keepalive='5m', wrap=True):
            bibjson = journal.bibjson()
            index = journal.data["index"]
            owner = journal.owner
            # The owner's account may not be found; the email column then
            # reads "Not Found".
            account = Account.pull(owner)

            writer.writerow([journal.id,
                             bibjson.title,
                             bibjson.get_single_url(urltype="homepage"),
                             bibjson.get_one_identifier(bibjson.E_ISSN),
                             bibjson.get_one_identifier(bibjson.P_ISSN),
                             journal.created_date,
                             owner,
                             account.email if account else "Not Found",
                             # Leave the cell empty rather than abort the whole
                             # export if a record has no indexed country.
                             # NOTE(review): assumes index is a plain dict - confirm.
                             index.get("country", ""),
                             bibjson.publisher
                             ])