Script to grab a small standalone sample of Athena files to use for validation steps

In [None]:
import pandas as pd
from omop_alchemy import configure_logging, get_engine_name, load_environment, TEST_PATH, ROOT_PATH
from pathlib import Path
from dotenv import load_dotenv
import os

# Destination folder for the sampled fixture files written further down.
base_path = TEST_PATH / "fixtures" / "athena_source"
# Load .env before reading SOURCE_PATH so the variable can be supplied from that file.
load_dotenv()
# Folder holding the full Athena export; the default is a placeholder that must be replaced.
source_path = Path(os.getenv('SOURCE_PATH', 'update/path/to/athena/source/as/required'))

In [10]:
# The source files are tab-separated despite the .csv suffix.
# CONCEPT is parsed with low_memory=False so dtypes are inferred from the whole file at once.
concept = pd.read_csv(source_path / 'CONCEPT.csv', delimiter='\t', low_memory=False)
# The four small metadata tables share identical read options, so load them in one pass.
concept_class, relationship, domain, vocabulary = (
    pd.read_csv(source_path / table_file, delimiter='\t')
    for table_file in ('CONCEPT_CLASS.csv', 'RELATIONSHIP.csv', 'DOMAIN.csv', 'VOCABULARY.csv')
)

In [11]:
# Concept ids that the metadata tables themselves point at; these must be present in the
# sampled CONCEPT table for the concept_id coverage check to pass.
required_concepts = set().union(
    concept_class.concept_class_concept_id,
    relationship.relationship_concept_id,
    domain.domain_concept_id,
    vocabulary.vocabulary_concept_id,
)

In [12]:
# Extra concept ids that are always included in the sample, independent of the random draw.
# NOTE(review): presumably ids that individual tests look up directly — confirm against the test suite.
additional_test_concepts = [443388, 40492938, 4273629, 4088217, 1397599, 4134858]

In [13]:
# Draw up to 10 standard ('S') concepts per domain using a fixed seed.
# Domains are visited in sorted order: iterating a set of strings has no stable order
# between interpreter runs, which would shuffle the row order of the generated CONCEPT.csv.
selected = []
for domain_name in sorted(domain.domain_id.dropna().unique()):
    standard_in_domain = concept[(concept.domain_id == domain_name) & (concept.standard_concept == 'S')]
    if standard_in_domain.empty:
        # No standard concepts in this domain, so there is nothing to sample.
        continue
    # min() caps the request at the pool size, so sample() is never asked for more rows than exist.
    selected.append(standard_in_domain.sample(min(10, len(standard_in_domain)), random_state=1))

In [14]:
# Combine the per-domain samples with the explicitly required concepts.
# A concept can be both randomly sampled and explicitly required; drop_duplicates() keeps a
# single copy so the combined table does not contain repeated rows.
selected_concept_df = pd.concat(
    selected
    + [concept[concept.concept_id.isin(required_concepts) | concept.concept_id.isin(additional_test_concepts)]]
).drop_duplicates()

In [15]:
# CONCEPT_RELATIONSHIP is read in 100000-row chunks; only rows whose two endpoints are
# both in the sampled concept set are kept.
selected_relationships = []

relationship_chunks = pd.read_csv(source_path / 'CONCEPT_RELATIONSHIP.csv', delimiter='\t', low_memory=False, chunksize=100000)
for chunk in relationship_chunks:
    both_ends_selected = (
        chunk.concept_id_1.isin(selected_concept_df.concept_id)
        & chunk.concept_id_2.isin(selected_concept_df.concept_id)
    )
    # Skip chunks that contribute no rows.
    if both_ends_selected.any():
        selected_relationships.append(chunk[both_ends_selected])

In [16]:
# CONCEPT_ANCESTOR is read in 100000-row chunks; only rows whose ancestor and descendant
# are both in the sampled concept set are kept.
selected_ancestry = []

ancestor_chunks = pd.read_csv(source_path / 'CONCEPT_ANCESTOR.csv', delimiter='\t', low_memory=False, chunksize=100000)
for chunk in ancestor_chunks:
    both_ends_selected = (
        chunk.ancestor_concept_id.isin(selected_concept_df.concept_id)
        & chunk.descendant_concept_id.isin(selected_concept_df.concept_id)
    )
    # Skip chunks that contribute no rows.
    if both_ends_selected.any():
        selected_ancestry.append(chunk[both_ends_selected])

In [17]:
# Stitch the kept chunks back into one frame per table.
# NOTE(review): pd.concat raises ValueError on an empty list, so this cell fails if no
# relationship or ancestor row survived the filtering above.
selected_relationship_df = pd.concat(selected_relationships)
selected_ancestry_df = pd.concat(selected_ancestry)

In [18]:
# Referential check: every value in any column whose name contains 'concept_id' must be a
# concept_id present in the sampled CONCEPT table.
for table in [domain, vocabulary, relationship, concept_class, selected_relationship_df, selected_ancestry_df]:
    for col in table.columns:
        if 'concept_id' not in col:
            continue
        if not table[col].isin(selected_concept_df.concept_id).all():
            raise ValueError(f"Found concept_id in {col} not in selected concepts")

In [19]:
# Every relationship/class/domain/vocabulary id in use must be defined in its metadata table.
# The concept checks run against the full CONCEPT table, of which the sample is a subset.
assert selected_relationship_df.relationship_id.isin(relationship.relationship_id.unique()).all(), "Found relationship_id not in selected relationships"
assert concept.concept_class_id.isin(concept_class.concept_class_id.unique()).all(), "Found concept_class_id not in selected concepts"
assert concept.domain_id.isin(domain.domain_id.unique()).all(), "Found domain_id not in selected domains"
assert concept.vocabulary_id.isin(vocabulary.vocabulary_id.unique()).all(), "Found vocabulary_id not in selected vocabularies"

In [20]:
# No table that ends up in the fixture set may contain fully duplicated rows.
# Tables are keyed by name so a failure says which table is affected, rather than
# interpolating the whole DataFrame into the message.
tables_to_check = {
    'selected_concept_df': selected_concept_df,
    'domain': domain,
    'vocabulary': vocabulary,
    'relationship': relationship,
    'concept_class': concept_class,
    'selected_relationship_df': selected_relationship_df,
    'selected_ancestry_df': selected_ancestry_df,
}
for table_name, frame in tables_to_check.items():
    duplicate_count = int(frame.duplicated().sum())
    assert duplicate_count == 0, f"Found {duplicate_count} duplicated rows in {table_name}"

In [21]:
# Workaround for the import issue: any row whose vocabulary_id was read as null gets a
# placeholder id, so the primary-key column written to the fixture contains no nulls.
# NOTE(review): presumably a literal id such as 'None' is turned into NaN by pd.read_csv's
# default NA handling — confirm.
# TODO: add pk null normalisation on load
vocabulary.loc[vocabulary.vocabulary_id.isna(), 'vocabulary_id'] = 'Unknown_Vocabulary'

In [22]:
# to_csv does not create missing directories, so make sure the fixture folder exists first.
base_path.mkdir(parents=True, exist_ok=True)
# Write the sampled tables as tab-separated files without the pandas index column.
selected_relationship_df.to_csv(base_path / 'CONCEPT_RELATIONSHIP.csv', sep='\t', index=False)
selected_ancestry_df.to_csv(base_path / 'CONCEPT_ANCESTOR.csv', sep='\t', index=False)
selected_concept_df.to_csv(base_path / 'CONCEPT.csv', sep='\t', index=False)

In [23]:
# The metadata tables are written in full, tab-separated and without the pandas index column.
for reference_table, target_name in (
    (domain, 'DOMAIN.csv'),
    (vocabulary, 'VOCABULARY.csv'),
    (relationship, 'RELATIONSHIP.csv'),
    (concept_class, 'CONCEPT_CLASS.csv'),
):
    reference_table.to_csv(base_path / target_name, sep='\t', index=False)

In [49]:
from random import randint, choice
from sqlalchemy.orm import Session
from omop_alchemy.model.health_system import Location, Care_Site, Provider, Visit_Detail, Visit_Occurrence
from omop_alchemy.model.clinical import Person, Condition_Occurrence
from datetime import date

In [52]:
from omop_alchemy import configure_logging, get_engine_name, load_environment, TEST_PATH, ROOT_PATH
from omop_alchemy.cdm.base import bootstrap
import sqlalchemy as sa
from sqlalchemy.orm import sessionmaker

# Configure logging and the environment, then build the engine from the configured name.
configure_logging()
load_environment()
engine_string = get_engine_name()
engine = sa.create_engine(engine_string, future=True, echo=False)
# The log output of this cell reports "Schema creation enabled" for create=True.
bootstrap(engine, create=True)

# NOTE(review): this rebinds `Session`, shadowing the sqlalchemy.orm.Session class imported in an earlier cell.
Session = sessionmaker(bind=engine, future=True)
# NOTE(review): this session is neither used nor closed in the visible cells; later cells open their own via `with Session()`.
session = Session()

2025-12-31 17:00:43,695 | INFO     | omop_alchemy.omop_alchemy.config | Environment variables loaded from .env file
2025-12-31 17:00:43,696 | INFO     | omop_alchemy.omop_alchemy.config | Database engine configured
2025-12-31 17:00:43,698 | INFO     | omop_alchemy.omop_alchemy.cdm.base.declarative | Bootstrapping OMOP schema (create=True)
2025-12-31 17:00:43,698 | INFO     | omop_alchemy.omop_alchemy.cdm.base.declarative | Schema creation enabled


In [43]:
# Candidate concept ids, by domain, for the synthetic rows generated below.
# .tolist() yields built-in Python ints; list(series) would yield numpy.int64 scalars, which
# some database drivers cannot bind as integer parameters.
avail_gender = selected_concept_df.loc[selected_concept_df.domain_id == 'Gender', 'concept_id'].tolist()
avail_ethnicity = selected_concept_df.loc[selected_concept_df.domain_id == 'Ethnicity', 'concept_id'].tolist()
avail_race = selected_concept_df.loc[selected_concept_df.domain_id == 'Race', 'concept_id'].tolist()
avail_place_of_service = selected_concept_df.loc[selected_concept_df.domain_id == 'Visit', 'concept_id'].tolist()
# NOTE(review): hard-coded country concept id; nothing here checks that it is part of the sampled concepts.
avail_country = 80500


In [None]:
def populate_reference_data(session):
    """Insert synthetic locations, care sites and providers, committing after each tier.

    Creates 10 locations, 3 care sites per location and 2 providers per care
    site, all with sequential zero-based ids. Concept ids come from the
    module-level ``avail_country``, ``avail_place_of_service`` and
    ``avail_gender`` values.

    Args:
        session: Session used to ``add`` the new objects and ``commit`` them.

    Returns:
        Tuple ``(locations, care_sites, providers)`` holding the created objects.
    """
    locations, care_sites, providers = [], [], []

    # Tier 1: locations.
    for loc_id in range(10):
        location = Location(
            location_id=loc_id,
            city=f"City {loc_id}",
            country_concept_id=avail_country,
        )
        session.add(location)
        locations.append(location)
    session.commit()

    # Tier 2: three care sites per location, numbered consecutively across locations.
    for i, location in enumerate(locations):
        for cs_id in range(3):
            care_site = Care_Site(
                care_site_id=i*3 + cs_id,
                care_site_name=f"Care Site {i*3 + cs_id}",
                location_id=location.location_id,
                place_of_service_concept_id=choice(avail_place_of_service),
            )
            session.add(care_site)
            care_sites.append(care_site)
    session.commit()

    # Tier 3: two providers per care site. Each care site is yielded twice in a row, so
    # enumerate() hands out consecutive provider ids.
    provider_slots = (site for site in care_sites for _ in range(2))
    for pid, care_site in enumerate(provider_slots):
        provider = Provider(
            provider_id=pid,
            provider_name=f"Provider {pid}",
            care_site_id=care_site.care_site_id,
            gender_concept_id=choice(avail_gender),
        )
        session.add(provider)
        providers.append(provider)
    session.commit()

    return locations, care_sites, providers


In [47]:
# Load the reference rows inside a dedicated session opened just for this call.
with Session() as sess:
    populate_reference_data(sess)

In [53]:
# Fetch every care site so visits can be attached to them later.
# NOTE(review): the objects are used after this session block ends — presumably only
# already-loaded attributes are read afterwards; confirm.
with Session() as sess:
    care_sites = sess.query(Care_Site).all()

In [58]:
def populate_people_and_visits(session, care_sites):
    """Insert 100 synthetic persons and one single-day visit for each of them.

    Persons get sequential zero-based ids, a random birth year/month and random
    gender, race and ethnicity concepts drawn from the module-level ``avail_*``
    lists. Each person then gets one visit at a randomly chosen care site.

    Args:
        session: Session used to ``add`` the new objects and ``commit`` them.
        care_sites: Non-empty sequence of objects exposing ``care_site_id``.

    Raises:
        IndexError: If ``care_sites`` or one of the ``avail_*`` lists is empty
            (raised by ``random.choice``).
    """
    # Every visit starts and ends on this day.
    visit_date = date(2020, 1, 1)

    persons = []
    for idx in range(100):
        person = Person(
            person_id=idx,
            # The latest birth year is the year before the visit, so no generated
            # visit can fall before the person's birth.
            year_of_birth=randint(1950, visit_date.year - 1),
            month_of_birth=randint(1, 12),
            gender_concept_id=choice(avail_gender),
            race_concept_id=choice(avail_race),
            ethnicity_concept_id=choice(avail_ethnicity),
        )
        session.add(person)
        persons.append(person)

    # Persons are committed first so the visits below reference stored rows.
    session.commit()

    for visit_id, person in enumerate(persons):
        cs = choice(care_sites)
        visit = Visit_Occurrence(
            visit_occurrence_id=visit_id,
            person_id=person.person_id,
            care_site_id=cs.care_site_id,
            visit_concept_id=choice(avail_place_of_service),
            visit_start_date=visit_date,
            visit_end_date=visit_date,
        )
        session.add(visit)

    session.commit()


In [60]:
# Generate persons and visits in a fresh session, reusing the care sites fetched earlier.
with Session() as sess:
    populate_people_and_visits(sess, care_sites)