/
duplicate_articles.py
88 lines (68 loc) · 3.36 KB
/
duplicate_articles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import csv
# Input report: one row per reported duplicate pair; opened for reading below.
IN = (
    "/home/richard/tmp/article_duplicates_2018-05-22/"
    "duplicate_articles_global_2018-05-22.csv"
)

# Output: rows whose titles, owners and ISSN sets all match.
GENUINE = (
    "/home/richard/tmp/article_duplicates_2018-05-22/"
    "genuine_duplicates.csv"
)

# Output: every other row (at least one of the three checks failed).
BAD_DATA = (
    "/home/richard/tmp/article_duplicates_2018-05-22/"
    "bad_data.csv"
)
def _to_dict(header, row):
d = {}
for i, h in enumerate(header):
d[h] = row[i]
return d
# Split the duplicate-pair report (IN) into two CSVs and print summary counts.
#
# Each input row describes one pair: "article_id" and the "match_id" it was
# matched against.  A pair is written to GENUINE when its "titles_match" and
# "owners_match" cells are both the string "True" and the comma-separated
# "article_issns" / "match_issns" cells hold the same values in any order;
# every other pair is written to BAD_DATA.
#
# The id collections are sets: they are only used for membership tests and
# len(), and list membership made the loop quadratic in the number of rows.
#   *_unique_ids          - every id seen on either side of a pair
#   *_unique_deduplicated - ids seen on the "article_id" side only
unique_ids = set()
unique_deduplicated = set()

genuine_unique_ids = set()
genuine_unique_deduplicated = set()

bad_data_unique_ids = set()
bad_data_unique_deduplicated = set()

genuine_count = 0
bad_data_count = 0

# newline="" is what the csv module documents for file objects handed to
# csv.reader / csv.writer: it keeps newlines embedded in quoted fields intact
# on read and avoids doubled line endings on write.
with open(GENUINE, "w", encoding="utf-8", newline="") as a, \
        open(BAD_DATA, "w", encoding="utf-8", newline="") as b, \
        open(IN, "r", encoding="utf-8", newline="") as f:
    awriter = csv.writer(a)
    bwriter = csv.writer(b)
    # This must be a csv.reader: a csv.writer is not an iterator, so calling
    # next() on it (or looping over it) raises TypeError.
    reader = csv.reader(f)

    # Both output files carry the same header row as the input.
    headers = next(reader)
    awriter.writerow(headers)
    bwriter.writerow(headers)

    for i, row in enumerate(reader):
        print(i)  # progress: zero-based index of the data row being processed

        data = _to_dict(headers, row)
        aid = data["article_id"]
        mid = data["match_id"]

        unique_ids.add(aid)
        unique_deduplicated.add(aid)
        unique_ids.add(mid)

        titles_match = data["titles_match"] == "True"
        owners_match = data["owners_match"] == "True"
        # Order-insensitive comparison of the two comma-separated ISSN lists.
        issns_match = (
            sorted(data["match_issns"].split(","))
            == sorted(data["article_issns"].split(","))
        )

        if titles_match and owners_match and issns_match:
            genuine_count += 1
            awriter.writerow(row)
            genuine_unique_ids.add(aid)
            genuine_unique_deduplicated.add(aid)
            genuine_unique_ids.add(mid)
        else:
            bad_data_count += 1
            bwriter.writerow(row)
            bad_data_unique_ids.add(aid)
            bad_data_unique_deduplicated.add(aid)
            bad_data_unique_ids.add(mid)

print("Total articles engaged in duplication: " + str(len(unique_ids)))
print("Total articles that would remain after de-duplication: " + str(len(unique_deduplicated)))

print("Total estimated genuine duplication pairs: " + str(genuine_count))
print("Total estimated articles engaged in genuine duplication: " + str(len(genuine_unique_ids)))
print("Total estimated articles that would remain from genuine duplication after de-duplication: " + str(len(genuine_unique_deduplicated)))

print("Total estimated bad data duplication pairs: " + str(bad_data_count))
print("Total estimated articles engaged in 'bad data' duplication: " + str(len(bad_data_unique_ids)))
print("Total estimated articles that would remain from 'bad data' duplication after de-duplication: " + str(len(bad_data_unique_deduplicated)))