Skip to content

Commit

Permalink
Remove duplicate counts when aggregating per taxonomy or assembly
Browse files Browse the repository at this point in the history
  • Loading branch information
tcezard committed Apr 12, 2024
1 parent 7ac2345 commit ebe2281
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ def _write_per_taxonomy_counts(self, session):
self.info(f"Create persistence for aggregate per taxonomy {taxonomy_id}")
taxonomy_row = RSCountPerTaxonomy(
taxonomy_id=taxonomy_id,
assembly_accessions=species_annotation.get(taxonomy_id).get('assemblies'),
assembly_accessions=list(species_annotation.get(taxonomy_id).get('assemblies')),
release_folder=species_annotation.get(taxonomy_id).get('release_folder'),
release_version=self.release_version,
)
Expand Down Expand Up @@ -375,6 +375,11 @@ def get_assembly_counts_from_database(self):
return results

def parse_count_script_logs(self, all_logs):
'''
Create a list of grouped count
:param all_logs:
:return:
'''
for log_file in all_logs:
with open(log_file) as open_file:
for line in open_file:
Expand All @@ -394,31 +399,40 @@ def generate_per_taxonomy_counts(self):
species_counts = defaultdict(Counter)
species_annotations = defaultdict(dict)
for count_groups in self.all_counts_grouped:
for count_dict in count_groups:
species_counts[count_dict['taxonomy']][count_dict['idtype']] += count_dict['count']
if 'assemblies' not in species_annotations.get(count_dict['taxonomy'], {}):
species_annotations[count_dict['taxonomy']] = {
'assemblies': set(),
'release_folder': None
}

species_annotations[count_dict['taxonomy']]['assemblies'].add(count_dict['assembly'])
species_annotations[count_dict['taxonomy']]['release_folder'] = count_dict['release_folder']
taxonomy_and_types = set([(count_dict['taxonomy'], count_dict['idtype']) for count_dict in count_groups])
for taxonomy, rstype in taxonomy_and_types:
if taxonomy not in species_annotations:
species_annotations[taxonomy] = {'assemblies': set(), 'release_folder': None}
# All count_dict have the same count in a group
species_counts[taxonomy][rstype] += count_groups[0]['count']
species_annotations[taxonomy]['assemblies'].update(
set([
count_dict['assembly']
for count_dict in count_groups
if count_dict['taxonomy'] is taxonomy and count_dict['idtype'] is rstype
])
)
species_annotations[taxonomy]['release_folder'] = count_groups[0]['release_folder']
return species_counts, species_annotations

def generate_per_assembly_counts(self):
assembly_counts = defaultdict(Counter)
assembly_annotations = {}
for count_groups in self.all_counts_grouped:
for count_dict in count_groups:
assembly_counts[count_dict['assembly']][count_dict['idtype']] += count_dict['count']
if 'taxonomies' not in assembly_annotations.get(count_dict['assembly'], {}):
assembly_annotations[count_dict['assembly']] = {
'taxonomies': set(),
'release_folder': None
}
assembly_annotations[count_dict['assembly']]['taxonomies'].add(count_dict['taxonomy'])
assembly_annotations[count_dict['assembly']]['release_folder'] = count_dict['assembly']
assembly_and_types = set([(count_dict['assembly'], count_dict['idtype']) for count_dict in count_groups])
for assembly_accession, rstype in assembly_and_types:
if assembly_accession not in assembly_annotations:
assembly_annotations[assembly_accession] = {'taxonomies': set(), 'release_folder': None}
# All count_dict have the same count in a group
assembly_counts[assembly_accession][rstype] += count_groups[0]['count']
assembly_annotations[assembly_accession]['taxonomies'].update(
set([
count_dict['taxonomy']
for count_dict in count_groups
if count_dict['assembly'] is assembly_accession and count_dict['idtype'] is rstype
]))

assembly_annotations[assembly_accession]['release_folder'] = assembly_accession
return assembly_counts, assembly_annotations

# def generate_per_species_assembly_counts(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
18746871 GCA_000188235.2-oreochromis_niloticus-current,GCA_000188235.2-haplochromini-current,
4882192 GCA_001858045.3-oreochromis_niloticus-current,
237 GCA_000188235.2-oreochromis_niloticus-current,GCA_001858045.3-oreochromis_niloticus-current,GCA_000188235.2-haplochromini-current,
71 GCA_000188235.2-oreochromis_niloticus-deprecated,GCA_000188235.2-haplochromini-deprecated,
17 Unmapped-oreochromis_niloticus-unmapped,
14 GCA_000188235.2-haplochromini-multimap,GCA_000188235.2-oreochromis_niloticus-multimap,
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from itertools import cycle
from unittest import TestCase
from unittest.mock import patch

Expand Down Expand Up @@ -52,8 +53,11 @@ def test_write_counts_to_db(self):
log_files_release1 = [os.path.join(self.resource_folder, 'count_for_release1.log')]
log_files_release2 = [os.path.join(self.resource_folder, 'count_for_release2.log')]
list_cow_assemblies = ['GCA_000003055.3', 'GCA_000003055.5', 'GCA_000003205.1', 'GCA_000003205.4', 'GCA_000003205.6', 'Unmapped']
with patch.object(ReleaseCounter, 'get_taxonomy_and_scientific_name') as ptaxonomy:
ptaxonomy.return_value = (9913, 'Bos taurus')
folder_to_taxonomy = {'bos_taurus': 9913}

with patch.object(ReleaseCounter, 'get_taxonomy') as ptaxonomy:
# ptaxonomy.side_effect = lambda x: folder_to_taxonomy.get(x)
ptaxonomy.return_value = 9913
counter = ReleaseCounter(self.private_config_xml_file, config_profile=self.config_profile,
release_version=1, logs=log_files_release1)
counter.write_counts_to_db()
Expand All @@ -67,7 +71,7 @@ def test_write_counts_to_db(self):
result = session.execute(query).fetchone()
rs_taxonomy_count = result.RSCountPerTaxonomy
assert sorted(rs_taxonomy_count.assembly_accessions) == list_cow_assemblies
assert rs_taxonomy_count.current_rs == 169904286
assert rs_taxonomy_count.current_rs == 102813585
assert rs_taxonomy_count.new_current_rs == 0
assert rs_taxonomy_count.release_folder == 'Cow_9913'

Expand All @@ -76,8 +80,8 @@ def test_write_counts_to_db(self):
result = session.execute(query).fetchone()
rs_taxonomy_count = result.RSCountPerTaxonomy
assert sorted(rs_taxonomy_count.assembly_accessions) == list_cow_assemblies
assert rs_taxonomy_count.current_rs == 169101573
assert rs_taxonomy_count.new_current_rs == -802713
assert rs_taxonomy_count.current_rs == 102605893
assert rs_taxonomy_count.new_current_rs == -207692
assert rs_taxonomy_count.release_folder == 'bos_taurus'

query = select(RSCountPerAssembly).where(RSCountPerAssembly.assembly_accession == 'GCA_000003205.6',
Expand All @@ -89,3 +93,21 @@ def test_write_counts_to_db(self):
assert rs_assembly_count.new_current_rs == 0
assert rs_assembly_count.release_folder == 'GCA_000003205.6'

def test_write_counts_to_db2(self):
log_files_release = [os.path.join(self.resource_folder, 'count_for_haplochromini_oreochromis_niloticus.log')]
folder_to_taxonomy = {'oreochromis_niloticus': 8128, 'haplochromini': 319058}

with patch.object(ReleaseCounter, 'get_taxonomy') as ptaxonomy:
ptaxonomy.side_effect = lambda x: folder_to_taxonomy.get(x)
counter = ReleaseCounter(self.private_config_xml_file, config_profile=self.config_profile,
release_version=4, logs=log_files_release)
counter.write_counts_to_db()
session = Session(counter.sqlalchemy_engine)

query = select(RSCountPerAssembly).where(RSCountPerAssembly.assembly_accession == 'GCA_000188235.2',
RSCountPerAssembly.release_version == 4)
result = session.execute(query).fetchone()
rs_assembly_count = result.RSCountPerAssembly
assert rs_assembly_count.current_rs == 18747108 # 18746871 + 237
assert rs_assembly_count.release_folder == 'GCA_000188235.2'

0 comments on commit ebe2281

Please sign in to comment.