""" Test the duplicate reporting and deletion script """
from portality.core import app
from doajtest.helpers import DoajTestCase
from doajtest.fixtures import ArticleFixtureFactory
from portality.tasks import article_duplicate_report
from portality.lib import paths
from portality import models
from portality.lib import dates
from collections import OrderedDict
import time
import os
import shutil
import csv
class TestArticleMatch(DoajTestCase):

    def test_01_duplicates_report(self):
        """Check duplication reporting across all articles in the index"""
        tmp_dir = paths.create_tmp_path(is_auto_mkdir=True).as_posix()

        # Create 2 identical articles, a duplicate pair
        article1 = models.Article(**ArticleFixtureFactory.make_article_source(
            eissn='1111-1111',
            pissn='2222-2222',
            with_id=False,
            in_doaj=True,
            with_journal_info=True
        ))
        a1_doi = article1.bibjson().get_identifiers('doi')
        assert a1_doi is not None
        article1.save(blocking=True)
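
        # Pause so that article2 is saved with a strictly later created_date
        # than article1; the report lists the newest article of a set first.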
        time.sleep(1)
        article2 = models.Article(**ArticleFixtureFactory.make_article_source(
            eissn='1111-1111',
            pissn='2222-2222',
            with_id=False,
            in_doaj=True,
            with_journal_info=True
        ))
        a2_doi = article2.bibjson().get_identifiers('doi')
        assert a2_doi == a1_doi
        article2.save(blocking=True)

        # Run the reporting task
        user = app.config.get("SYSTEM_USERNAME")
        job = article_duplicate_report.ArticleDuplicateReportBackgroundTask.prepare(user, outdir=tmp_dir)
        task = article_duplicate_report.ArticleDuplicateReportBackgroundTask(job)
        task.run()
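
        # The task writes its CSV report into the outdir passed to prepare()
        # and records its progress as audit messages on the background job.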
        # The audit log should show we saved the reports to the tmp_dir created above
        audit_1 = job.audit.pop(0)
        assert audit_1.get('message', '').endswith(tmp_dir)
        report_path = tmp_dir + '/duplicate_articles_global_' + dates.today() + '.csv'
        assert os.path.exists(report_path)

        # It should also clean up its interim article csv
        assert not os.path.exists(paths.rel2abs(__file__, 'tmp_article_duplicate_report'))

        # The duplicates should be detected and appear in the report and audit summary count
        with open(report_path) as f:
            csvlines = f.readlines()

        # We expect one result line + headings: our newest article has 1 duplicate
        res = csvlines.pop()
        assert res.startswith(article2.id)  # The newest comes first, so article1 is article2's duplicate.
        assert article1.id in res
        assert 'doi+fulltext' in res

        audit_2 = job.audit.pop(0)
        assert audit_2.get('message', '') == '2 articles processed for duplicates. 1 global duplicate sets found.'

        shutil.rmtree(tmp_dir, ignore_errors=True)

    def test_02_duplicates_global_criteria(self):
        """Check we match only the actual duplicates, amongst other articles in the index."""
        tmp_dir = paths.create_tmp_path(is_auto_mkdir=True).as_posix()
        dup_doi = '10.xxx/xxx/duplicate'
        dup_fulltext = 'http://fulltext.url/article/duplicate'

        # Create 6 duplicate articles with varying creation times and duplication criteria
        for i in range(1, 7):
            src_minus_identifiers = ArticleFixtureFactory.make_article_source(
                with_id=False,
                in_doaj=True,
                with_journal_info=True
            )
            del src_minus_identifiers['bibjson']['identifier']
            del src_minus_identifiers['bibjson']['link']
            article = models.Article(**src_minus_identifiers)

            # some overlapping duplication criteria
            if i % 2:
                article.bibjson().add_identifier('doi', dup_doi)
            else:
                article.bibjson().add_identifier('doi', '10.1234/' + str(i))
            if i % 3:
                article.bibjson().add_url(url=dup_fulltext, urltype='fulltext', content_type='html')
            else:
                article.bibjson().add_url('http://not_duplicate/fulltext/' + str(i), 'fulltext', 'html')
            article.save(blocking=True)
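            # blocking=True waits until each article is confirmed in the index
            # before the next one is created, keeping the created_date order
            # (1 oldest ... 6 newest) reliable for the expectations below.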

        # So we have the following fixtures:
        # +---------------------------------------------------------+
        # |                   Generated Articles                    |
        # +---------------------------------------------------------+
        # |   | DOI match | Fulltext match | Expected Report Result |
        # +---+-----------+----------------+------------------------+
        # | 1 |     X     |       X        |      doi+fulltext      |
        # +---+-----------+----------------+------------------------+
        # | 2 |     0     |       X        |        fulltext        |
        # +---+-----------+----------------+------------------------+
        # | 3 |     X     |       0        |          doi           |
        # +---+-----------+----------------+------------------------+
        # | 4 |     0     |       X        |        fulltext        |
        # +---+-----------+----------------+------------------------+
        # | 5 |     X     |       X        |      doi+fulltext      |
        # +---+-----------+----------------+------------------------+
        # | 6 |     0     |       0        |          none          |
        # +---+-----------+----------------+------------------------+

        # If a criterion is hit, the article is matched against every other
        # article sharing that criterion, not just pairwise. Going newest to
        # oldest, we expect: 5 is duplicated to 4, 3, 2, 1
        #                    4 is duplicated to 5, 2, 1
        #                    3 is duplicated to 5, 1
        #                    2 is duplicated to 5, 4, 1
        #                and 1 is duplicated to 5, 4, 3, 2
        #
        # So the task will report that there are 16 duplicates.
        #
        # However, once a pair has been detected it will only be reported once. Therefore, we expect:
        #                    5 is duplicated to 4, 3, 2, 1
        #                    4 is duplicated to 5, 2, 1
        #                    3 is duplicated to 5, 1
        #
        # So the report will have 9 match pairs, totalling 10 lines including the headings.
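
        # A quick sanity check of the arithmetic above, derived directly from
        # the i % 2 / i % 3 rules used to build the fixtures (illustrative
        # only; the task itself does not compute matches this way):
        from itertools import combinations
        doi_set = {i for i in range(1, 7) if i % 2}  # share dup_doi: {1, 3, 5}
        ft_set = {i for i in range(1, 7) if i % 3}   # share dup_fulltext: {1, 2, 4, 5}
        unique_pairs = ({frozenset(p) for p in combinations(doi_set, 2)}
                        | {frozenset(p) for p in combinations(ft_set, 2)})
        assert len(unique_pairs) * 2 == 16  # 16 directed matches, as described above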

        # Run the reporting task
        user = app.config.get("SYSTEM_USERNAME")
        job = article_duplicate_report.ArticleDuplicateReportBackgroundTask.prepare(user, outdir=tmp_dir)
        task = article_duplicate_report.ArticleDuplicateReportBackgroundTask(job)
        task.run()

        audit = job.audit
        assert next((msg for msg in audit if msg["message"] == '6 articles processed for duplicates. 3 global duplicate sets found.'), None) is not None

        table = []
        with open(tmp_dir + '/duplicate_articles_global_' + dates.today() + '.csv') as f:
            reader = csv.reader(f)
            for row in reader:
                table.append(row)

        # We expect there to be 10 rows.
        assert len(table) == 10, "expected: 10, received: {}".format(len(table))
        headings = table.pop(0)

        # We expect there to be one ID with 4 duplicates, one with 3, and one with 2 (in that order)
        article_ids = [row[0] for row in table]
        [a, b, c] = list(OrderedDict.fromkeys(article_ids))  # Dedupe keeping order
        expected = {a: 4, b: 3, c: 2}
        assert article_ids.count(a) == expected[a], "received: {}, expected: {}".format(article_ids.count(a), expected[a])
        assert article_ids.count(b) == expected[b], "received: {}, expected: {}".format(article_ids.count(b), expected[b])
        assert article_ids.count(c) == expected[c], "received: {}, expected: {}".format(article_ids.count(c), expected[c])

        # These counts should equal the number counted in the report itself
        for r in table:
            assert int(r[headings.index('n_matches')]) == expected[r[0]], "received: {}, expected: {}".format(int(r[headings.index('n_matches')]), expected[r[0]])

        # Article a should have one doi+fulltext match, one doi match, and 2 fulltext matches.
        a_duplicates = [row for row in table if row[0] == a]
        a_match_types = [row[headings.index('match_type')] for row in a_duplicates]
        assert a_match_types.count('doi+fulltext') == 1, "received: {}, expected 1".format(a_match_types.count('doi+fulltext'))
        assert a_match_types.count('doi') == 1, "received: {}, expected 1".format(a_match_types.count('doi'))
        assert a_match_types.count('fulltext') == 2, "received: {}, expected 2".format(a_match_types.count('fulltext'))

        shutil.rmtree(tmp_dir, ignore_errors=True)