-
Notifications
You must be signed in to change notification settings - Fork 16
/
orphan_issns.py
97 lines (82 loc) · 2.39 KB
/
orphan_issns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import esprit
from copy import deepcopy
# Connection object handed to every esprit search below (host doaj.org,
# second argument "query", port 80).
doaj = esprit.raw.Connection("http://doaj.org", "query", port=80)

# Terms facet over index.issn.exact, restricted to records whose _type is
# "journal".
# NOTE(review): the facet asks for at most 30000 terms; if there are more
# distinct ISSNs than that the list is presumably truncated - confirm.
journal_issn_facet = {
    "terms": {
        "field": "index.issn.exact",
        "size": 30000,
    },
}
journal_issn_query = {
    "query": {"term": {"_type": "journal"}},
    "facets": {"issns": journal_issn_facet},
}

resp = esprit.raw.search(
    doaj,
    "journal,article",
    journal_issn_query,
    method="GET",
)
j = resp.json()

# Every step of the path into the response falls back to an empty container,
# so a response without the facet simply yields an empty ISSN list.
journal_facet_terms = j.get("facets", {}).get("issns", {}).get("terms", [])
jissns = [entry.get("term") for entry in journal_facet_terms]
print("Journal ISSNs", len(jissns))
# Same terms facet on index.issn.exact as used for journals, but restricted
# to records whose _type is "article".
article_issn_query = {
    "query": {
        "term": {"_type": "article"},
    },
    "facets": {
        "issns": {
            "terms": {"field": "index.issn.exact", "size": 30000},
        },
    },
}

resp = esprit.raw.search(
    doaj, "journal,article", article_issn_query, method="GET"
)
j = resp.json()

# Collect the "term" value of each facet entry; a response lacking the facet
# produces an empty list because every lookup has an empty default.
aissns = []
for facet_entry in j.get("facets", {}).get("issns", {}).get("terms", []):
    aissns.append(facet_entry.get("term"))
print("Article ISSNs", len(aissns))
# ISSNs that appear on article records but on no journal record.
# Membership is checked against a set built once, so the scan is linear in
# the number of ISSNs instead of rescanning the journal list for every
# article ISSN. The order of aissns is preserved in the result.
known_journal_issns = set(jissns)
missing = [issn for issn in aissns if issn not in known_journal_issns]
print("Orphaned", len(missing))
print(missing)
# Template for fetching a single article (size 1) that carries a given ISSN.
# The "<issn>" placeholder is overwritten on a fresh copy for each lookup, so
# the template itself is never modified.
get_query = {
    "query": {
        "bool": {
            "must": [
                {"term": {"_type": "article"}},
                {"term": {"index.issn.exact": "<issn>"}},
            ],
        },
    },
    "size": 1,
}

# (orphaned ISSN, journal title recorded on the sample article) pairs.
info = []
for m in missing:
    lookup = deepcopy(get_query)
    issn_clause = lookup["query"]["bool"]["must"][1]["term"]
    issn_clause["index.issn.exact"] = m
    resp = esprit.raw.search(doaj, "journal,article", lookup, method="GET")
    for article in esprit.raw.unpack_result(resp):
        # The title is None when the article has no bibjson.journal.title.
        journal_meta = article.get("bibjson", {}).get("journal", {})
        info.append((m, journal_meta.get("title")))
print(info)
# Template for finding a single journal record (size 1) by exact title. The
# "<title>" placeholder is replaced on a copy for each lookup.
name_query = {
    "query": {
        "bool": {
            "must": [
                {"term": {"_type": "journal"}},
                {"term": {"index.title.exact": "<title>"}},
            ],
        },
    },
    "size": 1,
}

new_info = []  # (orphaned ISSN, title, ISSNs indexed on the matching journal)
unfound = []   # (orphaned ISSN, title) pairs with no journal of that title
for orphan_issn, journal_title in info:
    lookup = deepcopy(name_query)
    title_clause = lookup["query"]["bool"]["must"][1]["term"]
    title_clause["index.title.exact"] = journal_title
    resp = esprit.raw.search(doaj, "journal,article", lookup, method="GET")
    matches = esprit.raw.unpack_result(resp)

    if not matches:
        unfound.append((orphan_issn, journal_title))
        continue

    for journal in matches:
        journal_issns = journal.get("index", {}).get("issn", [])
        new_info.append((orphan_issn, journal_title, journal_issns))
        print(orphan_issn, journal_title, "=>", journal_issns)
print(unfound)