"""Tests for the data module."""
from collections import OrderedDict
import csv
from datetime import datetime
import io
import os
import requests
import tempfile
import uuid
import shutil
from zipfile import ZipFile
import pandas as pd
import psycopg2
import pytest
import dallinger
from dallinger.compat import open_for_csv
from dallinger.utils import generate_random_id
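
# Shared fixture: a small, known-good experiment export archive used by the
# ingest tests below.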
@pytest.fixture
def zip_path():
    return os.path.join("tests", "datasets", "test_export.zip")

@pytest.mark.slow
class TestDataS3Integration(object):
    """Tests that interact with the network and S3, and are slow as a result."""

    def test_connection_to_s3(self):
        s3 = dallinger.data._s3_resource()
        assert s3

    def test_user_s3_bucket_first_time(self):
        bucket = dallinger.data.user_s3_bucket(canonical_user_id=generate_random_id())
        assert bucket
        bucket.delete()

    def test_user_s3_bucket_thrice(self):
        user_id = generate_random_id()
        for i in range(3):
            bucket = dallinger.data.user_s3_bucket(canonical_user_id=user_id)
            assert bucket
        bucket.delete()

    def test_user_s3_bucket_no_id_provided(self):
        bucket = dallinger.data.user_s3_bucket()
        assert bucket

    def test_data_loading(self):
        data = dallinger.data.load("3b9c2aeb-0eb7-4432-803e-bc437e17b3bb")
        assert data
        assert data.networks.csv

    def test_register_id(self):
        new_uuid = "12345-12345-12345-12345"
        url = dallinger.data.register(new_uuid, "http://original-url.com/value")

        # The registration creates a new file in the dallinger-registrations bucket.
        assert url.startswith("https://dallinger-registrations.")
        assert new_uuid in url

        # These files should be inaccessible, to make it impossible to use the
        # bucket as a file repository.
        res = requests.get(url)
        assert res.status_code == 403

        # We should be able to check that the UUID is registered.
        assert dallinger.data.is_registered(new_uuid) is True
        assert dallinger.data.is_registered("bogus-uuid-value") is False


class TestDataLocally(object):
    """Tests that interact with local data only, and are relatively fast to
    execute.
    """

    @pytest.fixture
    def cleanup(self):
        yield
        shutil.rmtree("data")

    @pytest.fixture
    def export(self, cleanup):
        path = dallinger.data.export("12345-12345-12345-12345", local=True)
        return path

    data_path = os.path.join(
        "tests", "datasets", "12eee6c6-f37f-4963-b684-da585acd77f1-data.zip"
    )

    bartlett_export = os.path.join("tests", "datasets", "bartlett_bots.zip")
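
    # data_path and bartlett_export are canned export archives checked into
    # tests/datasets, used as inputs throughout this class.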

    def test_dataset_creation(self):
        """Load a dataset."""
        dallinger.data.Data(self.data_path)

    def test_conversions(self):
        data = dallinger.data.Data(self.data_path)
        assert data.networks.csv
        assert data.networks.dict
        assert data.networks.df.shape
        assert data.networks.html
        assert data.networks.latex
        assert data.networks.list
        assert data.networks.ods
        assert data.networks.tsv
        assert data.networks.xls
        assert data.networks.xlsx
        assert data.networks.yaml

    def test_dataframe_conversion(self):
        data = dallinger.data.Data(self.data_path)
        assert data.networks.df.shape == (1, 13)

    def test_csv_conversion(self):
        data = dallinger.data.Data(self.data_path)
        assert data.networks.csv[0:3] == "id,"

    def test_tsv_conversion(self):
        data = dallinger.data.Data(self.data_path)
        assert data.networks.tsv[0:3] == "id\t"

    def test_list_conversion(self):
        data = dallinger.data.Data(self.data_path)
        assert type(data.networks.list) is list

    def test_dict_conversion(self):
        data = dallinger.data.Data(self.data_path)
        assert type(data.networks.dict) is OrderedDict

    def test_df_conversion(self):
        data = dallinger.data.Data(self.data_path)
        assert type(data.networks.df) is pd.DataFrame

    def test_local_data_loading(self):
        local_data_id = "77777-77777-77777-77777"
        dallinger.data.export(local_data_id, local=True)
        data = dallinger.data.load(local_data_id)
        assert data
        assert data.networks.csv

    def test_export_of_nonexistent_database(self):
        nonexistent_local_db = str(uuid.uuid4())
        with pytest.raises(psycopg2.OperationalError):
            dallinger.data.copy_db_to_csv(nonexistent_local_db, "")

    def test_export_of_dallinger_database(self):
        export_dir = tempfile.mkdtemp()
        dallinger.data.copy_db_to_csv("dallinger", export_dir)
        assert os.path.isfile(os.path.join(export_dir, "network.csv"))

    def test_exported_database_includes_headers(self):
        export_dir = tempfile.mkdtemp()
        dallinger.data.copy_db_to_csv("dallinger", export_dir)
        network_table_path = os.path.join(export_dir, "network.csv")
        assert os.path.isfile(network_table_path)

        with open_for_csv(network_table_path, "r") as f:
            reader = csv.reader(f, delimiter=",")
            header = next(reader)
            assert "creation_time" in header

    def test_export(self, export):
        assert os.path.isfile("data/12345-12345-12345-12345-data.zip")

    def test_export_directory_format(self, export):
        archive = ZipFile(export)
        assert "data/info.csv" in archive.namelist()

    def test_export_compatible_with_data(self, export):
        assert dallinger.data.Data(export)
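
    # scrub_pii removes personally identifying values from participant.csv;
    # judging by the assertions below, real worker IDs come out replaced with
    # sequential placeholder values.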

    def test_scrub_pii(self):
        path_to_data = os.path.join("tests", "datasets", "pii")
        dallinger.data._scrub_participant_table(path_to_data)
        with open_for_csv(os.path.join(path_to_data, "participant.csv"), "r") as f:
            reader = csv.reader(f, delimiter=",")
            next(reader)  # Skip the header.
            for row in reader:
                assert "PII" not in row

    def test_scrub_pii_preserves_participants(self, db_session, zip_path, cleanup):
        dallinger.data.ingest_zip(zip_path)
        assert len(dallinger.models.Participant.query.all()) == 4

        path = dallinger.data.export("test_export", local=True, scrub_pii=True)
        p_file = ZipFile(path).open("data/participant.csv")
        p_file = io.TextIOWrapper(p_file, encoding="utf8", newline="")
        assert len(p_file.readlines()) == 5  # 4 participants + header row.

    def test_copy_db_to_csv_includes_participant_data(self, db_session):
        dallinger.data.ingest_zip(self.bartlett_export)
        export_dir = tempfile.mkdtemp()
        dallinger.data.copy_db_to_csv("dallinger", export_dir, scrub_pii=False)
        participant_table_path = os.path.join(export_dir, "participant.csv")
        assert os.path.isfile(participant_table_path)

        with open_for_csv(participant_table_path, "r") as f:
            reader = csv.reader(f, delimiter=",")
            header = next(reader)
            row1 = next(reader)
            assert row1[header.index("worker_id")] == "SM6DMD"

    def test_copy_db_to_csv_includes_scrubbed_participant_data(self, db_session):
        dallinger.data.ingest_zip(self.bartlett_export)
        export_dir = tempfile.mkdtemp()
        dallinger.data.copy_db_to_csv("dallinger", export_dir, scrub_pii=True)
        participant_table_path = os.path.join(export_dir, "participant.csv")
        assert os.path.isfile(participant_table_path)

        with open_for_csv(participant_table_path, "r") as f:
            reader = csv.reader(f, delimiter=",")
            header = next(reader)
            row1 = next(reader)
            assert row1[header.index("worker_id")] == "1"


class TestImport(object):
    @pytest.fixture
    def network_file(self):
        data = u"""id,creation_time,property1,property2,property3,property4,property5,failed,time_of_death,type,max_size,full,role
1,2001-01-01 09:46:40.133536,,,,,,f,,fully-connected,4,f,experiment"""
        f = io.StringIO(initial_value=data)
        return f

    @pytest.fixture
    def missing_column_required(self):
        """Participant data whose required worker_id column is empty."""
        data = u"""id,creation_time,property1,property2,property3,property4,property5,failed,time_of_death,type,worker_id,\
assignment_id,unique_id,hit_id,mode,end_time,base_pay,bonus,status
1,2001-01-01 09:46:40.133536,,,,,,f,,participant,,8,8:36V4Q8R5ZLTJWMX0SFF0G6R67PCQMI,\
3EHVO81VN5E60KEEQ146ZGFI3FH1H6,live,2017-03-30 20:06:44.618385,,,returned"""
        f = io.StringIO(initial_value=data)
        return f

    @pytest.fixture
    def missing_column_not_required(self):
        """Participant data without the optional fingerprint_hash column."""
        data = u"""id,creation_time,property1,property2,property3,property4,property5,failed,time_of_death,type,worker_id,\
assignment_id,unique_id,hit_id,mode,end_time,base_pay,bonus,status
1,2001-01-01 09:46:40.133536,,,,,,f,,participant,8,36V4Q8R5ZLTJWMX0SFF0G6R67PCQMI,8:36V4Q8R5ZLTJWM\
X0SFF0G6R67PCQMI,3EHVO81VN5E60KEEQ146ZGFI3FH1H6,live,2017-03-30 20:06:44.618385,,,returned"""
        f = io.StringIO(initial_value=data)
        return f
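
    # ingest_to_model loads rows from a CSV file-like object into the table
    # behind the given model class; the tests below check that values and
    # timestamps survive the round trip.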

    def test_ingest_to_model(self, db_session, network_file):
        dallinger.data.ingest_to_model(network_file, dallinger.models.Network)

        networks = dallinger.models.Network.query.all()
        assert len(networks) == 1
        network = networks[0]
        assert network.type == "fully-connected"
        assert network.creation_time == datetime(2001, 1, 1, 9, 46, 40, 133536)
        assert network.role == "experiment"
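
    # A bulk ingest writes ids directly, which can leave the table's id
    # sequence behind; this test checks that a subsequent ORM insert still
    # gets the next free id (2) rather than colliding with the ingested row.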

    def test_ingest_to_model_allows_subsequent_insert(self, db_session, network_file):
        dallinger.data.ingest_to_model(network_file, dallinger.models.Network)

        db_session.add(dallinger.models.Network())
        db_session.flush()
        db_session.commit()

        networks = dallinger.models.Network.query.all()
        assert networks[1].id == 2

    def test_missing_column_required(self, db_session, missing_column_required):
        with pytest.raises(psycopg2.IntegrityError):
            dallinger.data.ingest_to_model(
                missing_column_required, dallinger.models.Participant
            )

    def test_missing_column_not_required(self, db_session, missing_column_not_required):
        dallinger.data.ingest_to_model(
            missing_column_not_required, dallinger.models.Participant
        )

        participants = dallinger.models.Participant.query.all()
        assert len(participants) == 1
        participant = participants[0]
        assert participant.creation_time == datetime(2001, 1, 1, 9, 46, 40, 133536)

    def test_ingest_zip_recreates_network(self, db_session, zip_path):
        dallinger.data.ingest_zip(zip_path)

        networks = dallinger.models.Network.query.all()
        assert len(networks) == 1
        assert networks[0].type == "chain"

    def test_ingest_zip_recreates_participants(self, db_session, zip_path):
        dallinger.data.ingest_zip(zip_path)

        participants = dallinger.models.Participant.query.all()
        assert len(participants) == 4
        for p in participants:
            assert p.status == "approved"

    def test_ingest_zip_recreates_nodes(self, db_session, zip_path):
        dallinger.data.ingest_zip(zip_path)
        assert len(dallinger.models.Node.query.all()) == 5

    def test_ingest_zip_recreates_infos(self, db_session, zip_path):
        dallinger.data.ingest_zip(zip_path)

        infos = dallinger.models.Info.query.all()
        assert len(infos) == 5
        for info in infos:
            assert info.contents.startswith(u"One night two young men")

    def test_ingest_zip_recreates_notifications(self, db_session, zip_path):
        dallinger.data.ingest_zip(zip_path)
        assert len(dallinger.models.Notification.query.all()) == 8

    def test_ingest_zip_recreates_questions(self, db_session, zip_path):
        dallinger.data.ingest_zip(zip_path)

        model = dallinger.models.Question
        p1_questions = model.query.filter_by(participant_id=1).all()
        for q in p1_questions:
            if q.response:
                assert q.response == u"5"

    def test_ingest_zip_recreates_vectors(self, db_session, zip_path):
        dallinger.data.ingest_zip(zip_path)
        assert len(dallinger.models.Vector.query.all()) == 4

    def test_ingest_zip_recreates_transmissions(self, db_session, zip_path):
        dallinger.data.ingest_zip(zip_path)
        assert len(dallinger.models.Transmission.query.all()) == 4