-
Notifications
You must be signed in to change notification settings - Fork 17
/
test_cache_models.py
626 lines (421 loc) · 20.8 KB
/
test_cache_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
"""Test reference cache models."""
from pathlib import Path
from typing import Dict, Any, List
import pytest
from _pytest.logging import LogCaptureFixture
from pydantic import ValidationError
from BALSAMIC.constants.cache import (
GRCHVersion,
DockerContainers,
GenomeVersion,
)
from BALSAMIC.constants.constants import FileType, BwaIndexFileType
from BALSAMIC.models.cache import (
AnalysisReferences,
AnalysisReferencesCanFam,
AnalysisReferencesHg,
ReferenceUrl,
References,
ReferencesCanFam,
ReferencesHg,
CacheAnalysis,
CacheConfig,
)
from BALSAMIC.utils.exc import BalsamicError
def test_analysis_references(analysis_references_data: Dict[str, Path]):
    """Check that the common analysis references model round-trips its input."""
    # GIVEN input data for the common analysis references model

    # WHEN building the model from that data
    analysis_references = AnalysisReferences(**analysis_references_data)

    # THEN dumping the model should give back the input data
    dumped: Dict[str, Any] = analysis_references.model_dump()
    assert dumped == analysis_references_data
def test_analysis_references_empty():
    """Check that the common analysis references model rejects an empty input."""
    # GIVEN an empty set of keyword arguments
    empty_input: Dict[str, Path] = {}

    # WHEN building the model without any field
    # THEN a ValidationError should be raised
    with pytest.raises(ValidationError):
        AnalysisReferences(**empty_input)
def test_analysis_references_canfam(analysis_references_data: Dict[str, Path]):
    """Check that the canine analysis references model round-trips its input."""
    # GIVEN input data for the canine analysis references model

    # WHEN building the model from that data
    canfam_references = AnalysisReferencesCanFam(**analysis_references_data)

    # THEN the dumped model should equal the input data
    assert canfam_references.model_dump() == analysis_references_data
def test_analysis_references_canfam_empty():
    """Check that the canine analysis references model rejects an empty input."""
    # GIVEN an empty set of keyword arguments
    empty_input: Dict[str, Path] = {}

    # WHEN building the canine model without any field
    # THEN a ValidationError should be raised
    with pytest.raises(ValidationError):
        AnalysisReferencesCanFam(**empty_input)
def test_analysis_references_hg(analysis_references_hg_data: Dict[str, Path]):
    """Check that the human genome analysis references model round-trips its input."""
    # GIVEN input data for the human genome analysis references model

    # WHEN building the model from that data
    hg_references = AnalysisReferencesHg(**analysis_references_hg_data)

    # THEN the dumped model should equal the input data
    dumped: Dict[str, Any] = hg_references.model_dump()
    assert dumped == analysis_references_hg_data
def test_analysis_references_hg_empty():
    """Check that the human genome analysis references model rejects an empty input."""
    # GIVEN an empty set of keyword arguments
    empty_input: Dict[str, Path] = {}

    # WHEN building the human genome model without any field
    # THEN a ValidationError should be raised
    with pytest.raises(ValidationError):
        AnalysisReferencesHg(**empty_input)
def test_reference_url(reference_url_data: Dict[str, Any]):
    """Check that the reference URL model round-trips its input."""
    # GIVEN input data for the reference URL model

    # WHEN building the model from that data
    reference_url = ReferenceUrl(**reference_url_data)

    # THEN the dumped model should equal the input data
    assert reference_url.model_dump() == reference_url_data
def test_reference_url_empty():
    """Check that the reference URL model rejects an empty input."""
    # GIVEN an empty set of keyword arguments
    empty_input: Dict[str, Any] = {}

    # WHEN building the reference URL model without any field
    # THEN a ValidationError should be raised
    with pytest.raises(ValidationError):
        ReferenceUrl(**empty_input)
def test_references(references_data: Dict[str, dict], references: References):
    """Check that the references model built from raw data equals the fixture model."""
    # GIVEN input data for the references model and the expected model

    # WHEN building the model from the raw data
    built_references = References(**references_data)

    # THEN the built model should compare equal to the expected one
    assert built_references == references
def test_references_empty():
    """Check that the references model rejects an empty input."""
    # GIVEN an empty set of keyword arguments
    empty_input: Dict[str, dict] = {}

    # WHEN building the references model without any field
    # THEN a ValidationError should be raised
    with pytest.raises(ValidationError):
        References(**empty_input)
def test_get_reference_genome_file_paths(references: References):
    """Check the file types returned for the reference genome files."""
    # GIVEN a references model

    # GIVEN the expected file types: FASTA, FAI, DICT and every BWA index type
    expected_file_types: set = {
        FileType.FASTA,
        FileType.FAI,
        FileType.DICT,
        *BwaIndexFileType,
    }

    # WHEN getting the reference genome file paths
    genome_paths: List[str] = references.get_reference_genome_file_paths()

    # THEN there should be one path per expected file type
    assert len(genome_paths) == len(expected_file_types)

    # THEN every expected file type should appear as the last extension of some path
    extensions: List[str] = [path.split(".")[-1] for path in genome_paths]
    for expected_type in expected_file_types:
        assert expected_type in extensions
def test_get_reference_genome_bwa_index_file_paths(references: References):
    """Check the file types returned for the reference genome BWA index files."""
    # GIVEN a references model

    # GIVEN every BWA index file type as the expected set
    expected_index_types: set = set(BwaIndexFileType)

    # WHEN getting the reference genome BWA index file paths
    index_paths: List[str] = references.get_reference_genome_bwa_index_file_paths()

    # THEN there should be one path per BWA index file type
    assert len(index_paths) == len(expected_index_types)

    # THEN every index type should appear as the last extension of some path
    extensions: List[str] = [path.rsplit(".", 1)[-1] for path in index_paths]
    for index_type in expected_index_types:
        assert index_type in extensions
def test_get_refgene_file_paths(
    references: References, refgene_bed_file: Path, refgene_flat_file: Path
):
    """Check that the RefSeq gene TXT, BED and FLAT files are all returned."""
    # GIVEN a references model and mocked RefSeq gene BED and FLAT files

    # WHEN getting the RefSeq gene file paths
    refgene_paths: List[str] = references.get_refgene_file_paths()

    # THEN exactly three paths should be returned
    assert len(refgene_paths) == 3

    # THEN the TXT, BED and FLAT paths should all be present
    expected_paths = (
        references.refgene_txt.file_path,
        refgene_bed_file.as_posix(),
        refgene_flat_file.as_posix(),
    )
    for expected_path in expected_paths:
        assert expected_path in refgene_paths
def test_get_refgene_flat_file_path(references: References, refgene_flat_file: Path):
    """Check the path returned for the RefSeq gene FLAT file."""
    # GIVEN a references model and a mocked RefSeq gene FLAT file
    expected_path: str = refgene_flat_file.as_posix()

    # WHEN getting the RefSeq gene FLAT file path
    flat_file_path: str = references.get_refgene_flat_file_path()

    # THEN the returned path should match the mocked FLAT file
    assert flat_file_path == expected_path
def test_get_refgene_bed_file_path(references: References, refgene_bed_file: Path):
    """Check the path returned for the RefSeq gene BED file."""
    # GIVEN a references model and a mocked RefSeq gene BED file
    expected_path: str = refgene_bed_file.as_posix()

    # WHEN getting the RefSeq gene BED file path
    bed_file_path: str = references.get_refgene_bed_file_path()

    # THEN the returned path should match the mocked BED file
    assert bed_file_path == expected_path
def test_references_canfam(references_data: Dict[str, dict], references: References):
    """Check that the canine references model dumps to the same data as the fixture model."""
    # GIVEN input data for the canine references model and a references model

    # WHEN building the canine model from the raw data and dumping it
    canfam_dump: Dict[str, Any] = ReferencesCanFam(**references_data).model_dump()

    # THEN both models should dump to the same content
    assert canfam_dump == references.model_dump()
def test_references_canfam_empty():
    """Check that the canine references model rejects an empty input."""
    # GIVEN an empty set of keyword arguments
    empty_input: Dict[str, dict] = {}

    # WHEN building the canine references model without any field
    # THEN a ValidationError should be raised
    with pytest.raises(ValidationError):
        ReferencesCanFam(**empty_input)
def test_references_hg(
    references_hg_data: Dict[str, dict], references_hg: ReferencesHg
):
    """Check that the human genome references model built from raw data equals the fixture model."""
    # GIVEN input data for the human genome references model and the expected model

    # WHEN building the model from the raw data
    built_references = ReferencesHg(**references_hg_data)

    # THEN the built model should compare equal to the expected one
    assert built_references == references_hg
def test_references_hg_empty():
    """Check that the human genome references model rejects an empty input."""
    # GIVEN an empty set of keyword arguments
    empty_input: Dict[str, dict] = {}

    # WHEN building the human genome references model without any field
    # THEN a ValidationError should be raised
    with pytest.raises(ValidationError):
        ReferencesHg(**empty_input)
def test_get_cadd_snv_file_paths(
    references_hg: ReferencesHg, cadd_snv_indexed_file: Path
):
    """Check that the CADD SNV file and its indexed file are both returned."""
    # GIVEN a human genome references model and a mocked CADD SNV indexed file

    # WHEN getting the CADD SNV file paths
    cadd_paths: List[str] = references_hg.get_cadd_snv_file_paths()

    # THEN exactly two paths should be returned
    assert len(cadd_paths) == 2

    # THEN the CADD SNV reference and the indexed file should both be present
    expected_paths = (
        references_hg.cadd_snv.file_path,
        cadd_snv_indexed_file.as_posix(),
    )
    for expected_path in expected_paths:
        assert expected_path in cadd_paths
def test_get_delly_file_paths(
    references_hg: ReferencesHg, delly_exclusion_converted_file: Path
):
    """Check that every Delly specific reference file is returned."""
    # GIVEN a human genome references model and a mocked Delly exclusion converted file

    # WHEN getting the Delly file paths
    delly_paths: List[str] = references_hg.get_delly_file_paths()

    # THEN exactly five paths should be returned
    assert len(delly_paths) == 5

    # THEN the exclusion, converted exclusion and mappability files should be present
    expected_paths = (
        references_hg.delly_exclusion.file_path,
        delly_exclusion_converted_file.as_posix(),
        references_hg.delly_mappability.file_path,
        references_hg.delly_mappability_findex.file_path,
        references_hg.delly_mappability_gindex.file_path,
    )
    for expected_path in expected_paths:
        assert expected_path in delly_paths
def test_get_delly_exclusion_converted_file_path(
    references_hg: ReferencesHg, delly_exclusion_converted_file: Path
):
    """Check the path returned for the Delly exclusion converted file."""
    # GIVEN a human genome references model and a Delly exclusion converted file
    expected_path: str = delly_exclusion_converted_file.as_posix()

    # WHEN getting the Delly exclusion converted file path
    converted_path: str = references_hg.get_delly_exclusion_converted_file_path()

    # THEN the returned path should match the expected converted file
    assert converted_path == expected_path
def test_get_gnomad_file_paths(references_hg: ReferencesHg):
    """Check that the gnomad variant file and its index are both returned."""
    # GIVEN a human genome references model

    # WHEN getting the gnomad file paths
    gnomad_paths: List[str] = references_hg.get_gnomad_file_paths()

    # THEN exactly two paths should be returned
    assert len(gnomad_paths) == 2

    # THEN the variant file and its index should both be present
    for gnomad_reference in (
        references_hg.gnomad_variant,
        references_hg.gnomad_variant_index,
    ):
        assert gnomad_reference.file_path in gnomad_paths
def test_get_1k_genome_file_paths(references_hg: ReferencesHg):
    """Check that the gzipped 1000 Genome related files are all returned."""
    # GIVEN a human genome references model

    # WHEN getting the 1k genome file paths
    genome_paths: List[str] = references_hg.get_1k_genome_file_paths()

    # THEN exactly four paths should be returned
    assert len(genome_paths) == 4

    # THEN each 1k genome reference path, with a GZ extension appended, should be present
    for genome_reference in (
        references_hg.known_indel_1kg,
        references_hg.mills_1kg,
        references_hg.hc_vcf_1kg,
        references_hg.vcf_1kg,
    ):
        assert f"{genome_reference.file_path}.{FileType.GZ}" in genome_paths
def test_cache_analysis(cache_analysis_data: Dict[str, str]):
    """Check that the cache analysis model round-trips its input."""
    # GIVEN input data for the cache analysis model

    # WHEN building the model from that data
    cache_analysis = CacheAnalysis(**cache_analysis_data)

    # THEN the dumped model should equal the input data
    dumped: Dict[str, Any] = cache_analysis.model_dump()
    assert dumped == cache_analysis_data
def test_cache_analysis_empty():
    """Check that the cache analysis model rejects an empty input."""
    # GIVEN an empty set of keyword arguments
    empty_input: Dict[str, str] = {}

    # WHEN building the cache analysis model without any field
    # THEN a ValidationError should be raised
    with pytest.raises(ValidationError):
        CacheAnalysis(**empty_input)
def test_cache_config(cache_config_data: Dict[str, Any], cache_config: CacheConfig):
    """Check that the cache config model built from raw data equals the fixture model."""
    # GIVEN input data for the cache config model and the expected model

    # WHEN building the model from the raw data
    built_config = CacheConfig(**cache_config_data)

    # THEN the built model should compare equal to the expected one
    assert built_config == cache_config
def test_cache_config_empty():
    """Check that the cache config model rejects an empty input."""
    # GIVEN an empty set of keyword arguments
    empty_input: Dict[str, Any] = {}

    # WHEN building the cache config model without any field
    # THEN a ValidationError should be raised
    with pytest.raises(ValidationError):
        CacheConfig(**empty_input)
def test_cache_config_empty_file_path(cache_config_data: Dict[str, dict]):
    """Check that every reference of a built cache config has a file path set."""
    # GIVEN cache config model data with empty file paths

    # WHEN building the model from that data
    cache_config = CacheConfig(**cache_config_data)

    # THEN every reference should carry a non-empty file path
    # Iterating the references model yields two-item entries; the second item
    # is the reference object itself.
    for _, reference in cache_config.references:
        assert reference.file_path
def test_cache_config_empty_cosmic_key(
    cache_config_data: Dict[str, dict], cosmic_key: str
):
    """Check that only the cosmic reference of a built cache config gets the cosmic key."""
    # GIVEN cache config model data with empty cosmic keys

    # WHEN building the model from that data
    cache_config = CacheConfig(**cache_config_data)

    # THEN the secret should be the cosmic key for the "cosmic" entry and None elsewhere
    for reference_name, reference in cache_config.references:
        if reference_name != "cosmic":
            assert reference.secret is None
        else:
            assert reference.secret == cosmic_key
def test_get_grch_version(cache_config: CacheConfig):
    """Check the GRCH version reported by the cache config fixture."""
    # GIVEN a cache config model
    expected_version: GRCHVersion = GRCHVersion.GRCH37

    # WHEN getting the GRCH version
    # THEN the GRCH37 version should be returned
    assert cache_config.get_grch_version() == expected_version
def test_get_reference_file_paths(cache_config: CacheConfig):
    """Check that the file path of every reference is returned, in model order."""
    # GIVEN a cache config model

    # WHEN extracting the list of reference file paths
    returned_paths: List[str] = cache_config.get_reference_file_paths()

    # THEN the list should equal the file paths read from the references model
    expected_paths: List[str] = [
        reference.file_path for _, reference in cache_config.references
    ]
    assert returned_paths == expected_paths
def test_get_reference_by_path(cache_config: CacheConfig):
    """Check that a reference can be looked up by its file path."""
    # GIVEN a cache config model and the file path of its reference genome
    genome_path: str = cache_config.references.reference_genome.file_path

    # WHEN looking the reference up by that path
    found_reference: ReferenceUrl = cache_config.get_reference_by_path(
        reference_path=genome_path
    )

    # THEN the reference genome entry should be returned
    assert found_reference == cache_config.references.reference_genome
def test_get_reference_by_path_error(
    cache_config: CacheConfig, invalid_json_file: Path, caplog: LogCaptureFixture
):
    """Check that looking up a reference by an unknown path fails and is logged."""
    # GIVEN a cache config model and a path that matches none of its references
    unknown_path: str = invalid_json_file.as_posix()

    # WHEN looking a reference up by that path
    # THEN a Balsamic error should be raised
    with pytest.raises(BalsamicError):
        cache_config.get_reference_by_path(reference_path=unknown_path)

    # THEN the failure should be reported in the captured log, including the path
    expected_message: str = (
        f"No reference with the provided reference path {invalid_json_file.as_posix()}"
    )
    assert expected_message in caplog.text
def test_get_reference_file_paths_by_file_type_and_compression(
    cache_config: CacheConfig,
):
    """Check the paths returned when filtering by FASTA type and compression."""
    # GIVEN a cache config model

    # WHEN extracting the compressed FASTA reference paths
    fasta_paths: List[str] = (
        cache_config.get_reference_file_paths_by_file_type_and_compression(
            file_type=FileType.FASTA, compression=True
        )
    )

    # THEN only the reference genome path should be returned
    expected_paths: List[str] = [cache_config.references.reference_genome.file_path]
    assert fasta_paths == expected_paths
def test_get_reference_file_paths_by_file_type(cache_config: CacheConfig):
    """Check the paths returned when filtering references by the FASTA file type."""
    # GIVEN a cache config model

    # WHEN extracting the reference paths of FASTA type
    fasta_paths: List[str] = cache_config.get_reference_file_paths_by_file_type(
        file_type=FileType.FASTA
    )

    # THEN only the reference genome (FASTA) path should be returned
    expected_paths: List[str] = [cache_config.references.reference_genome.file_path]
    assert fasta_paths == expected_paths
def test_get_reference_file_paths_by_compression(cache_config: CacheConfig):
    """Check the paths returned when filtering references by compression."""
    # GIVEN a cache config model
    references = cache_config.references

    # WHEN extracting the reference paths flagged as compressed
    compressed_paths: List[str] = (
        cache_config.get_reference_file_paths_by_compression(compression=True)
    )

    # THEN exactly eleven paths should be returned
    assert len(compressed_paths) == 11

    # THEN each of the listed references should be among them
    expected_references = (
        references.ascat_gc_correction,
        references.clinvar,
        references.cosmic,
        references.dbsnp,
        references.hc_vcf_1kg,
        references.known_indel_1kg,
        references.mills_1kg,
        references.reference_genome,
        references.refgene_txt,
        references.somalier_sites,
        references.vcf_1kg,
    )
    for expected_reference in expected_references:
        assert expected_reference.file_path in compressed_paths
def test_get_compressed_indexed_vcf_paths(cache_config: CacheConfig):
    """Check the compressed and indexed VCF paths returned by the cache config."""
    # GIVEN a cache config model
    references = cache_config.references

    # WHEN retrieving the compressed and indexed VCF paths
    indexed_vcf_paths: List[str] = cache_config.get_compressed_indexed_vcf_paths()

    # THEN exactly eight paths should be returned
    assert len(indexed_vcf_paths) == 8

    # THEN each listed VCF path, with GZ and TBI extensions appended, should be present
    vcf_references = (
        references.dbsnp,
        references.vcf_1kg,
        references.known_indel_1kg,
        references.mills_1kg,
        references.clinvar,
        references.somalier_sites,
        references.hc_vcf_1kg,
        references.cosmic,
    )
    for vcf_reference in vcf_references:
        assert (
            f"{vcf_reference.file_path}.{FileType.GZ}.{FileType.TBI}"
            in indexed_vcf_paths
        )
def test_get_container_output_paths(cache_config: CacheConfig, tmp_path: Path):
    """Test retrieval of the containers output paths.

    The check compares image file names only, so it does not depend on which
    directory the cache config writes its container images to. ``tmp_path`` is
    kept in the signature for compatibility but is not needed for that check.
    """
    # GIVEN a cache config model

    # GIVEN the image file name expected for each Docker container
    expected_image_names: set = {
        f"{container}.{FileType.SIF}" for container in DockerContainers
    }

    # WHEN getting the list of container paths
    container_paths: List[str] = cache_config.get_container_output_paths()

    # THEN one path per container should be returned
    assert len(container_paths) == len(set(DockerContainers))

    # THEN every container should have an image path named after it.
    # Compare file names rather than asserting on a Path object, which is
    # always truthy and would pass whatever paths were returned.
    # NOTE(review): assumes each returned path ends in "<container>.<SIF extension>" - confirm.
    returned_image_names: set = {Path(path).name for path in container_paths}
    assert returned_image_names == expected_image_names
def test_get_reference_output_paths(cache_config: CacheConfig):
    """Check the number of reference output paths returned by the cache config."""
    # GIVEN a cache config model
    expected_number_of_paths: int = 44

    # WHEN retrieving the reference output paths
    output_paths: List[str] = cache_config.get_reference_output_paths()

    # THEN the expected number of reference paths should be returned
    assert len(output_paths) == expected_number_of_paths
def test_get_analysis_references_hg(
    cache_config: CacheConfig,
    analysis_references_hg_data: Dict[str, Path],
    analysis_references_hg: AnalysisReferences,
):
    """Test analysis references retrieval to be used for Balsamic human genome analyses.

    The genome version of the ``cache_config`` fixture is overridden for the
    duration of the test and restored afterwards, so the change cannot leak
    into other tests that receive the same fixture object.
    """
    # GIVEN a human genome cache config model
    original_genome_version = cache_config.genome_version
    cache_config.genome_version = GenomeVersion.HG19

    try:
        # WHEN getting the analysis references
        analysis_references: AnalysisReferencesHg = (
            cache_config.get_analysis_references()
        )

        # THEN the retrieved analysis references should match the mocked one.
        # The exact type is checked on purpose, not just an isinstance match.
        assert type(analysis_references) is AnalysisReferencesHg
        assert analysis_references == analysis_references_hg
    finally:
        # Put the fixture back the way it was found, even if an assertion failed
        cache_config.genome_version = original_genome_version
def test_get_analysis_references_canfam(
    cache_config: CacheConfig, analysis_references_data: Dict[str, Path]
):
    """Test analysis references retrieval to be used for Balsamic canine analyses.

    The genome version of the ``cache_config`` fixture is overridden for the
    duration of the test and restored afterwards, so the change cannot leak
    into other tests that receive the same fixture object.
    """
    # GIVEN a canine cache config model
    original_genome_version = cache_config.genome_version
    cache_config.genome_version = GenomeVersion.CanFam3

    try:
        # WHEN getting the analysis references
        analysis_references: AnalysisReferencesCanFam = (
            cache_config.get_analysis_references()
        )

        # THEN the retrieved analysis references should match the mocked data.
        # The exact type is checked on purpose, not just an isinstance match.
        assert type(analysis_references) is AnalysisReferencesCanFam
        assert analysis_references.model_dump() == analysis_references_data
    finally:
        # Put the fixture back the way it was found, even if an assertion failed
        cache_config.genome_version = original_genome_version