Script to grab a small standalone sample of Athena files to use for validation steps.

In [None]:
import pandas as pd
from omop_alchemy import configure_logging, get_engine_name, load_environment, TEST_PATH, ROOT_PATH
from pathlib import Path

# Directory the Athena files are read from.
source_path = Path('path/to/your/athena_source')  # Update this path as needed

# Directory the sampled files are written to, located under TEST_PATH.
fixtures_dir = TEST_PATH / "fixtures"
base_path = fixtures_dir / "athena_source"

In [None]:
# Load the source tables in full. The files are tab-separated despite the
# .csv extension.
vocabulary = pd.read_csv(source_path / 'VOCABULARY.csv', sep='\t')
domain = pd.read_csv(source_path / 'DOMAIN.csv', sep='\t')
relationship = pd.read_csv(source_path / 'RELATIONSHIP.csv', sep='\t')
concept_class = pd.read_csv(source_path / 'CONCEPT_CLASS.csv', sep='\t')

# low_memory=False stops pandas from inferring column types chunk by chunk,
# which avoids mixed-type columns in this table.
concept = pd.read_csv(source_path / 'CONCEPT.csv', sep='\t', low_memory=False)

In [None]:
# Concept ids referenced by the concept-class, relationship, domain and
# vocabulary tables; these must be present in the sampled CONCEPT file.
required_concepts = set().union(
    concept_class.concept_class_concept_id,
    relationship.relationship_concept_id,
    domain.domain_concept_id,
    vocabulary.vocabulary_concept_id,
)

In [None]:
# Concept ids that are always included in the sample, in addition to the
# random per-domain selection.
additional_test_concepts = [
    443388,
    40492938,
    4273629,
    4088217,
    1397599,
    4134858,
]

In [None]:
# Sample up to 10 standard concepts (standard_concept == 'S') from each domain.
#
# Domains are visited in DOMAIN.csv row order. Iterating a set of strings has
# no stable order between interpreter runs, which would reorder the rows of
# the generated files even though random_state is fixed.
selected = []
for d in domain.domain_id.unique():
    c = concept[(concept.domain_id == d) & (concept.standard_concept == 'S')]
    if len(c) < 10:
        # The sample size is capped at len(c), so sample() never raises for a
        # short domain; report it explicitly instead of via an exception.
        print(f"Not enough standard concepts in domain {d}")
    if c.empty:
        # Nothing to sample; an empty frame would add no rows to the result.
        continue
    selected.append(c.sample(min(10, len(c)), random_state=1))

In [None]:
# Final concept sample: the per-domain samples plus every concept referenced by
# the reference tables and the explicitly requested test concepts.
extra_concept_ids = set(required_concepts) | set(additional_test_concepts)
extra_concepts = concept[concept.concept_id.isin(extra_concept_ids)]

# A randomly sampled concept can also be one of the required/additional ones.
# Keep a single row per concept_id so the sample has no duplicated rows (the
# duplicate-row check further down would otherwise fail).
selected_concept_df = pd.concat(selected + [extra_concepts]).drop_duplicates(subset='concept_id')

In [None]:
# Stream CONCEPT_RELATIONSHIP in chunks of 100000 rows and keep only the rows
# whose two concept ids are both part of the selected concept sample.
selected_relationships = []

relationship_chunks = pd.read_csv(
    source_path / 'CONCEPT_RELATIONSHIP.csv',
    delimiter='\t',
    low_memory=False,
    chunksize=100000,
)
for concept_rel in relationship_chunks:
    left_selected = concept_rel.concept_id_1.isin(selected_concept_df.concept_id)
    right_selected = concept_rel.concept_id_2.isin(selected_concept_df.concept_id)
    filtered = concept_rel[left_selected & right_selected]
    if filtered.empty:
        # No matching rows in this chunk.
        continue
    selected_relationships.append(filtered)

In [None]:
# Stream CONCEPT_ANCESTOR in chunks of 100000 rows and keep only the rows
# whose ancestor and descendant are both part of the selected concept sample.
selected_ancestry = []

ancestor_chunks = pd.read_csv(
    source_path / 'CONCEPT_ANCESTOR.csv',
    delimiter='\t',
    low_memory=False,
    chunksize=100000,
)
for concept_anc in ancestor_chunks:
    ancestor_selected = concept_anc.ancestor_concept_id.isin(selected_concept_df.concept_id)
    descendant_selected = concept_anc.descendant_concept_id.isin(selected_concept_df.concept_id)
    filtered = concept_anc[ancestor_selected & descendant_selected]
    if filtered.empty:
        # No matching rows in this chunk.
        continue
    selected_ancestry.append(filtered)

In [None]:
# Stitch the filtered chunks back together into one frame per table.
# Original row indices from the source files are kept.
selected_relationship_df = pd.concat(objs=selected_relationships)
selected_ancestry_df = pd.concat(objs=selected_ancestry)

In [None]:
# Referential check: every value in any column whose name contains
# 'concept_id' must be present in the selected concept sample.
for f in (domain, vocabulary, relationship, concept_class, selected_relationship_df, selected_ancestry_df):
    for col in f.columns:
        if 'concept_id' not in col:
            continue
        is_selected = f[col].isin(selected_concept_df.concept_id)
        if not is_selected.all():
            raise ValueError(f"Found concept_id in {col} not in selected concepts")

In [None]:
# Every relationship_id used by the sampled relationships must exist in the
# RELATIONSHIP table.
assert selected_relationship_df.relationship_id.isin(
    relationship.relationship_id.unique()
).all(), "Found relationship_id not in selected relationships"

# Every class, domain and vocabulary id used in the full CONCEPT table must
# exist in its reference table.
assert concept.concept_class_id.isin(
    concept_class.concept_class_id.unique()
).all(), "Found concept_class_id not in selected concepts"
assert concept.domain_id.isin(
    domain.domain_id.unique()
).all(), "Found domain_id not in selected domains"
assert concept.vocabulary_id.isin(
    vocabulary.vocabulary_id.unique()
).all(), "Found vocabulary_id not in selected vocabularies"

In [None]:
# None of the tables that are about to be written may contain fully
# duplicated rows. Tables are labelled with their output file name so a
# failure says which one is affected instead of dumping the whole DataFrame
# into the assertion message.
tables_to_check = {
    'CONCEPT': selected_concept_df,
    'DOMAIN': domain,
    'VOCABULARY': vocabulary,
    'RELATIONSHIP': relationship,
    'CONCEPT_CLASS': concept_class,
    'CONCEPT_RELATIONSHIP': selected_relationship_df,
    'CONCEPT_ANCESTOR': selected_ancestry_df,
}
for table_name, f in tables_to_check.items():
    duplicate_count = int(f.duplicated().sum())
    assert duplicate_count == 0, f"Found {duplicate_count} duplicated rows in {table_name}"

In [None]:
# This is the import issue.
# TODO: add primary-key null normalisation on load.
# NOTE(review): presumably a vocabulary_id in the source file is parsed as a
# missing value by read_csv -- confirm against VOCABULARY.csv.
# Rows with a missing vocabulary_id are given a placeholder id before writing.
missing_vocabulary_id = vocabulary['vocabulary_id'].isna()
vocabulary.loc[missing_vocabulary_id, 'vocabulary_id'] = 'Unknown_Vocabulary'

In [None]:
# Write the sampled tables as tab-separated files without the pandas index.
# to_csv does not create missing parent directories, so make sure the output
# directory exists first.
Path(base_path).mkdir(parents=True, exist_ok=True)

selected_relationship_df.to_csv(base_path / 'CONCEPT_RELATIONSHIP.csv', sep='\t', index=False)
selected_ancestry_df.to_csv(base_path / 'CONCEPT_ANCESTOR.csv', sep='\t', index=False)
selected_concept_df.to_csv(base_path / 'CONCEPT.csv', sep='\t', index=False)

In [None]:
# Write the reference tables unsampled, as tab-separated files without the
# pandas index, in the same order as before.
reference_outputs = (
    ('DOMAIN.csv', domain),
    ('VOCABULARY.csv', vocabulary),
    ('RELATIONSHIP.csv', relationship),
    ('CONCEPT_CLASS.csv', concept_class),
)
for file_name, table in reference_outputs:
    table.to_csv(base_path / file_name, sep='\t', index=False)