-
Notifications
You must be signed in to change notification settings - Fork 16
/
accounts_with_same_email.py
92 lines (76 loc) · 2.94 KB
/
accounts_with_same_email.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
This script generates a CSV file listing accounts which share the same email address.
By default only accounts owning at least one journal in DOAJ are checked; pass -a to check all accounts.
```
python accounts_with_same_email.py -o accounts_email.csv [-a]
```
"""
import csv
import esprit
from portality.core import es_connection
from portality.util import ipt_prefix
from portality import models
# Query body matching every document in which the "email" field exists.
# The condition sits in the "filter" clause of a bool query.
HAS_EMAIL = {
    "query": {
        "bool": {
            "filter": {
                "exists": {"field": "email"},
            },
        },
    },
}
def users_with_emails(conn=None):
    """Yield an Account model for every account record that has an email field.

    :param conn: connection handed to ``esprit.tasks.scroll``. When omitted,
        the ``es_connection`` imported from ``portality.core`` is used, so the
        function does not depend on a ``conn`` global that is only assigned
        when this file is executed as a script.
    :return: generator of ``models.Account`` objects
    """
    if conn is None:
        conn = es_connection
    for record in esprit.tasks.scroll(conn, ipt_prefix('account'), q=HAS_EMAIL, page_size=100, keepalive='1m'):
        yield models.Account(**record)
def users_with_journals_and_emails(conn=None):
    """Yield accounts that have an email and own at least one journal in the DOAJ.

    Each account is yielded at most once: the journal loop stops at the first
    journal that is found and reports ``is_in_doaj()``.

    :param conn: connection handed to ``esprit.tasks.scroll``. When omitted,
        the ``es_connection`` imported from ``portality.core`` is used, so the
        function does not depend on a ``conn`` global that is only assigned
        when this file is executed as a script.
    :return: generator of ``models.Account`` objects
    """
    if conn is None:
        conn = es_connection
    for record in esprit.tasks.scroll(conn, ipt_prefix('account'), q=HAS_EMAIL, page_size=100, keepalive='1m'):
        acct = models.Account(**record)
        # An account's journal list may be None; treat that as "no journals".
        for journal_id in acct.journal or ():
            journal = models.Journal.pull(journal_id)
            # pull() may give None for an id that cannot be loaded; skip those.
            if journal is not None and journal.is_in_doaj():
                yield acct
                break
def _account_row(account):
    """Return the CSV row for one account, in the same order as the header row."""
    return [
        account.id,
        account.name,
        account.email,
        account.created_date,
        account.last_updated,
    ]


if __name__ == "__main__":
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--out", help="output file path")
    parser.add_argument("-a", "--all", help="all users (defaults to users with journals in doaj)", action="store_true")
    args = parser.parse_args()

    if not args.out:
        print("Please specify an output file path with the -o option")
        parser.print_help()
        # A missing output path is an error, so report a non-zero status.
        # sys.exit is used because the bare exit() helper comes from the
        # site module and is intended for interactive sessions.
        sys.exit(1)

    # Kept as a module-level name: the account generators in this file may
    # look up `conn` as a global.
    conn = es_connection

    # newline="" lets the csv module control line endings itself; without it
    # rows can be written with an extra carriage return on some platforms.
    with open(args.out, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["ID", "Name", "Email", "Created", "Last Updated"])

        # A dict of email: id for the first account seen with each email
        emails_seen = {}

        # To minimise how many accounts we need to initialise a second time,
        # keep a set of ids of first-seen accounts whose email turned out to
        # be duplicated; they are written to the csv after the main pass.
        duplicated_ids = set()

        users = users_with_emails() if args.all else users_with_journals_and_emails()
        for account in users:
            # For simplicity we just write out each later account with a
            # duplicated email as it is found; sort later via spreadsheet.
            if account.email in emails_seen:
                writer.writerow(_account_row(account))
                duplicated_ids.add(emails_seen[account.email])
            else:
                emails_seen[account.email] = account.id

        # Write additional rows containing those records found first that
        # have duplicated emails.
        for _id in duplicated_ids:
            first_seen = models.Account.pull(_id)
            # The record may no longer be retrievable (the journal lookup in
            # this file treats a None result from pull() the same way).
            if first_seen is None:
                continue
            writer.writerow(_account_row(first_seen))