-
Notifications
You must be signed in to change notification settings - Fork 16
/
cross_journal_article_issns.py
73 lines (56 loc) · 2.1 KB
/
cross_journal_article_issns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
Script which finds in-DOAJ articles whose ISSNs match more than one distinct in-DOAJ Journal,
and writes one CSV row per (article, matching journal) pair to the file given with -o/--out.
"""
from portality import models
from portality.core import app
from datetime import datetime
from portality.lib import dates
import esprit
import codecs
import csv
# Query body used when scrolling over articles: it matches only records
# whose ``admin.in_doaj`` field is True.
IN_DOAJ = {
    "query": {
        "bool": {
            "must": [{"term": {"admin.in_doaj": True}}],
        },
    },
}
if __name__ == "__main__":
    # Command-line entry point: scroll over every in-DOAJ article, look up the
    # in-DOAJ journals matching its ISSNs, and report each article that matches
    # more than one journal as CSV rows in the file given by -o/--out.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--out", help="output file path")
    args = parser.parse_args()

    if not args.out:
        print("Please specify an output file path with the -o option")
        parser.print_help()
        # Signal failure to the calling shell; the bare exit() helper is meant
        # for interactive sessions and would have reported success (status 0).
        raise SystemExit(1)

    conn = esprit.raw.make_connection(None, app.config["ELASTIC_SEARCH_HOST"], None, app.config["ELASTIC_SEARCH_DB"])

    # NOTE(review): this is the count of all articles, while the scroll below
    # only visits those matching IN_DOAJ, so the ETA may be pessimistic - confirm.
    total = models.Article.count()

    # The csv module expects a text-mode file opened with newline="" so that it
    # controls the line terminators itself.
    with open(args.out, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Count", "Article ID", "Article ISSNs", "Match On", "Journal ID", "Journal ISSNs", "Journal In DOAJ?"])

        counter = 1   # value written to the "Count" column
        sofar = 0     # number of articles scrolled so far, for progress output
        start = dates.now()

        for a in esprit.tasks.scroll(conn, models.Article.__type__, IN_DOAJ, page_size=1000, keepalive='5m'):
            sofar += 1
            if sofar % 1000 == 0:
                # Periodic progress report with an estimated completion time.
                eta = dates.eta(start, sofar, total)
                print("{now} : {sofar}/{total} | ETA {eta}".format(now=dates.now_str(), sofar=sofar, total=total, eta=eta))

            article = models.Article(**a)
            issns = article.bibjson().issns()

            # Articles with one ISSN are skipped as before; articles with none
            # are now skipped too, since they have nothing to look up.
            if len(issns) < 2:
                continue

            js = models.Journal.find_by_issn(issns, in_doaj=True)
            if len(js) <= 1:
                # Zero or one matching journal: no cross-journal conflict.
                continue

            article_issns = set(issns)
            for j in js:
                jissns = j.bibjson().issns()
                # Sorted so that the "Match On" column is stable between runs.
                match_on = sorted(article_issns.intersection(jissns))
                row = [counter, article.id, ", ".join(issns), ", ".join(match_on), j.id, ", ".join(jissns), j.is_in_doaj()]
                writer.writerow(row)
                print(row)

            # NOTE(review): incremented once per reported article, so all rows
            # for the same article share one "Count" value - confirm this
            # matches the intended numbering (per article vs. per row).
            counter += 1