In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.functions import max as spark_max
from pyspark.sql.window import Window
from datetime import datetime
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import sum as sql_sum, min as sql_min, max as sql_max

In [0]:


# OMOP CDM schema for the location table.
# Each column carries a "comment" metadata entry; location_id is the only
# column declared non-nullable.
location_schema = StructType([
    StructField("location_id", LongType(), False, 
                metadata={"comment": "A unique identifier for each geographic location."}),
    StructField("address_1", StringType(), True,
                metadata={"comment": "The first line of the address."}),
    StructField("address_2", StringType(), True,
                metadata={"comment": "The second line of the address"}),
    StructField("city", StringType(), True,
                metadata={"comment": "The city field is the text name of the city."}),
    StructField("state", StringType(), True,
                metadata={"comment": "The state field contains the state name. For addresses outside the US, this field can be used for provinces or other administrative regions."}),
    StructField("zip", StringType(), True,
                metadata={"comment": "The zip or postal code. For US addresses, valid formats are 3-digit, 5-digit or 9-digit ZIP codes. For non-US addresses, the postal code should be stored in the same field."}),
    StructField("county", StringType(), True,
                metadata={"comment": "The county, if available. The county field can also be used to store other regional information."}),
    StructField("location_source_value", StringType(), True,
                metadata={"comment": "The verbatim value for the location as it appears in the source data."}),
    StructField("country_concept_id", IntegerType(), True,
                metadata={"comment": "A foreign key to the predefined Concept table for the country concept id, representing the country portion of the address."}),
    StructField("country_source_value", StringType(), True,
                metadata={"comment": "The source code for the country as it appears in the source data."}),
    StructField("latitude", FloatType(), True,
                metadata={"comment": "The latitude of the location. Must be between -90 and 90."}),
    StructField("longitude", FloatType(), True,
                metadata={"comment": "The longitude of the location. Must be between -180 and 180."}),
    # LSOA stands for Lower Layer Super Output Area (a small-area statistical
    # geography); it is not a local authority district.
    StructField("LSOA", StringType(), True,
                metadata={"comment": "Lower Layer Super Output Area (LSOA) code"}),
    StructField("IMD_Quintile", IntegerType(), True,
                metadata={"comment": "Index of Multiple Deprivation (IMD) quintile"})
])

# Hard constraints: a row failing any of these is dropped.
mandatory_location_rules = dict(
    valid_location_id="location_id IS NOT NULL",
)

# Soft constraints: violations are tracked, but the row is kept.
advisory_location_rules = dict(
    # Maximum lengths of the free-text columns
    valid_zip_format="zip IS NULL OR LENGTH(zip) <= 9",
    valid_address_length="address_1 IS NULL OR LENGTH(address_1) <= 50",
    valid_address2_length="address_2 IS NULL OR LENGTH(address_2) <= 50",
    valid_city_length="city IS NULL OR LENGTH(city) <= 50",
    valid_state_length="state IS NULL OR LENGTH(state) <= 2",
    valid_county_length="county IS NULL OR LENGTH(county) <= 20",
    valid_location_source_length="location_source_value IS NULL OR LENGTH(location_source_value) <= 50",
    valid_country_source_length="country_source_value IS NULL OR LENGTH(country_source_value) <= 80",
    # Coordinates must lie in the valid latitude/longitude ranges
    valid_latitude="latitude IS NULL OR (latitude >= -90 AND latitude <= 90)",
    valid_longitude="longitude IS NULL OR (longitude >= -180 AND longitude <= 180)",
    # Concept ids may be absent but never negative
    valid_country_concept="country_concept_id IS NULL OR country_concept_id >= 0",
)


@dlt.table(
    schema=location_schema,
    name="location",
    comment="OMOP CDM Location table - Represents a generic way to capture physical location or address information",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_location_rules)
@dlt.expect_all(advisory_location_rules)
def create_omop_location():
    """Define the DLT table ``location`` from ``4_prod.dlt.omop_location``.

    The decorators attach ``location_schema``, apply the mandatory rules
    with drop semantics and the advisory rules as tracked expectations.

    Returns:
        The DataFrame produced by ``spark.table`` for the source table.
    """
    source_table = "4_prod.dlt.omop_location"
    return spark.table(source_table)


In [0]:
# OMOP CDM schema for the care_site table, built from
# (column name, Spark type, nullable, column comment) tuples.
care_site_schema = StructType([
    StructField(col_name, col_type, is_nullable, metadata={"comment": col_comment})
    for col_name, col_type, is_nullable, col_comment in (
        ("care_site_id", LongType(), False,
         "A unique identifier for each Care Site"),
        ("care_site_name", StringType(), True,
         "The name of the care_site as it appears in the source data"),
        ("place_of_service_concept_id", IntegerType(), True,
         "A foreign key to the predefined Concept table for the place of service concept id"),
        ("location_id", LongType(), True,
         "A foreign key to the Location table, where the detailed address information is stored"),
        ("care_site_source_value", StringType(), True,
         "The identifier of the care_site as it appears in the source data"),
        ("place_of_service_source_value", StringType(), True,
         "The source code for the place of service as it appears in the source data"),
    )
])

# Expectations for the care_site table.
# NOTE(review): the table function applies this whole dict with
# expect_all_or_drop, so the two format checks drop rows as well; the other
# tables in this notebook split such checks into an advisory dict.
care_site_rules = dict(
    # Primary key must be present
    valid_care_site_id="care_site_id IS NOT NULL",
    # Ids may be absent but never negative
    valid_concept_id_format="place_of_service_concept_id IS NULL OR place_of_service_concept_id >= 0",
    valid_location_id_format="location_id IS NULL OR location_id >= 0",
)

@dlt.table(
    schema=care_site_schema,
    name="care_site",
    comment="OMOP CDM Care Site table - Contains a list of institutional (physical or organizational) units where healthcare delivery is practiced (offices, wards, hospitals, clinics, etc.)",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(care_site_rules)
def create_omop_care_site():
    """Define the DLT table ``care_site`` from ``4_prod.dlt.omop_care_site``.

    Every rule in ``care_site_rules`` is applied with drop semantics.

    Returns:
        The DataFrame produced by ``spark.table`` for the source table.
    """
    source_table = "4_prod.dlt.omop_care_site"
    return spark.table(source_table)


In [0]:


# OMOP CDM schema for the provider table, built from
# (column name, Spark type, nullable, column comment) tuples.
provider_schema = StructType([
    StructField(col_name, col_type, is_nullable, metadata={"comment": col_comment})
    for col_name, col_type, is_nullable, col_comment in (
        ("provider_id", LongType(), False,
         "A unique identifier for each Provider."),
        ("provider_name", StringType(), True,
         "A description of the Provider, typically the name of the physician or facility."),
        ("npi", StringType(), True,
         "The National Provider Identifier (NPI) of the provider."),
        ("dea", StringType(), True,
         "The Drug Enforcement Administration (DEA) number of the provider."),
        ("specialty_concept_id", IntegerType(), True,
         "A foreign key to a Standard Specialty Concept ID in the Standardized Vocabularies."),
        ("care_site_id", LongType(), True,
         "A foreign key to the main Care Site where the provider is practicing."),
        ("year_of_birth", IntegerType(), True,
         "The year of birth of the Provider."),
        ("gender_concept_id", IntegerType(), True,
         "The gender of the Provider."),
        ("provider_source_value", StringType(), True,
         "The identifier used for the Provider in the source data."),
        ("specialty_source_value", StringType(), True,
         "The source code for the Provider specialty as it appears in the source data."),
        ("specialty_source_concept_id", IntegerType(), True,
         "A foreign key to a Concept that refers to the code used in the source."),
        ("gender_source_value", StringType(), True,
         "The source value for the Provider gender."),
        ("gender_source_concept_id", IntegerType(), True,
         "A foreign key to a Concept that refers to the code used in the source."),
    )
])

# Hard constraints: a row failing any of these is dropped.
mandatory_provider_rules = dict(
    valid_provider_id="provider_id IS NOT NULL",
)

# Soft constraints: violations are tracked, but the row is kept.
advisory_provider_rules = dict(
    valid_provider_name_length="provider_name IS NULL OR LENGTH(provider_name) <= 255",
    valid_npi_length="npi IS NULL OR LENGTH(npi) <= 20",
    valid_dea_length="dea IS NULL OR LENGTH(dea) <= 20",
    valid_specialty_concept="specialty_concept_id IS NULL OR specialty_concept_id >= 0",
    valid_gender_concept="gender_concept_id IS NULL OR gender_concept_id >= 0",
    valid_source_value_length="provider_source_value IS NULL OR LENGTH(provider_source_value) <= 50",
    valid_specialty_source_length="specialty_source_value IS NULL OR LENGTH(specialty_source_value) <= 50",
    valid_gender_source_length="gender_source_value IS NULL OR LENGTH(gender_source_value) <= 50",
)

    
@dlt.table(
    schema=provider_schema,
    name="provider",
    comment="OMOP CDM Provider table - Contains a list of uniquely identified healthcare providers",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_provider_rules)
@dlt.expect_all(advisory_provider_rules)
def create_omop_provider():
    """Define the DLT table ``provider`` from ``4_prod.dlt.omop_provider``.

    The decorators attach ``provider_schema``, apply the mandatory rules
    with drop semantics and the advisory rules as tracked expectations.

    Returns:
        The DataFrame produced by ``spark.table`` for the source table.
    """
    source_table = "4_prod.dlt.omop_provider"
    return spark.table(source_table)


In [0]:


# OMOP CDM schema for the person table, built from
# (column name, Spark type, nullable, column comment) tuples.
person_schema = StructType([
    StructField(col_name, col_type, is_nullable, metadata={"comment": col_comment})
    for col_name, col_type, is_nullable, col_comment in (
        ("person_id", LongType(), False,
         "A unique identifier for each person."),
        ("gender_concept_id", IntegerType(), True,
         "A foreign key that refers to a standard concept identifier in the Vocabulary for the gender of the person."),
        ("year_of_birth", IntegerType(), True,
         "The year of birth of the person."),
        ("month_of_birth", IntegerType(), True,
         "The month of birth of the person."),
        ("day_of_birth", IntegerType(), True,
         "The day of birth of the person."),
        ("birth_datetime", TimestampType(), True,
         "The date and time of birth of the person."),
        ("race_concept_id", IntegerType(), True,
         "A foreign key that refers to a standard concept identifier in the Vocabulary for the race of the person."),
        ("ethnicity_concept_id", IntegerType(), True,
         "A foreign key that refers to the standard concept identifier in the Vocabulary for the ethnicity of the person."),
        ("location_id", LongType(), True,
         "A foreign key to the location table that indicates where the person is located."),
        ("provider_id", LongType(), True,
         "A foreign key to the provider table that indicates the primary care provider of the person."),
        ("care_site_id", LongType(), True,
         "A foreign key to the care site table that indicates the primary care site of the person."),
        ("person_source_value", StringType(), True,
         "The source code for the person as it appears in the source data."),
        ("gender_source_value", StringType(), True,
         "The source code for the gender of the person as it appears in the source data."),
        ("gender_source_concept_id", IntegerType(), True,
         "A foreign key to a concept that refers to the code used in the source."),
        ("race_source_value", StringType(), True,
         "The source code for the race of the person as it appears in the source data."),
        ("race_source_concept_id", IntegerType(), True,
         "A foreign key to a concept that refers to the code used in the source."),
        ("ethnicity_source_value", StringType(), True,
         "The source code for the ethnicity of the person as it appears in the source data."),
        ("ethnicity_source_concept_id", IntegerType(), True,
         "A foreign key to a concept that refers to the code used in the source."),
    )
])

# Hard constraints: a row failing any of these is dropped.
mandatory_person_rules = dict(
    valid_person_id="person_id IS NOT NULL",
    valid_gender_concept="gender_concept_id IS NOT NULL",
    valid_year_of_birth="year_of_birth IS NOT NULL",
    valid_race_concept="race_concept_id IS NOT NULL",
    valid_ethnicity_concept="ethnicity_concept_id IS NOT NULL",
)

# Soft constraints: violations are tracked, but the row is kept.
advisory_person_rules = dict(
    reasonable_birth_year="year_of_birth >= 1901",
    valid_birth_month="month_of_birth IS NULL OR (month_of_birth >= 1 AND month_of_birth <= 12)",
    valid_birth_day="day_of_birth IS NULL OR (day_of_birth >= 1 AND day_of_birth <= 31)",
    valid_gender_value="gender_concept_id > 0",
    valid_race_value="race_concept_id >= 0",
    valid_ethnicity_value="ethnicity_concept_id >= 0",
    valid_source_values="""
        person_source_value IS NULL OR LENGTH(person_source_value) <= 50
    """,
)

@dlt.table(
    schema=person_schema,
    name="person",
    comment="OMOP CDM Person table - Contains records that uniquely identify each person in the database",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_person_rules)
@dlt.expect_all(advisory_person_rules)
def create_omop_person():
    """Define the DLT table ``person`` from ``4_prod.dlt.omop_person``.

    The decorators attach ``person_schema``, apply the mandatory rules
    with drop semantics and the advisory rules as tracked expectations.

    Returns:
        The DataFrame produced by ``spark.table`` for the source table.
    """
    source_table = "4_prod.dlt.omop_person"
    return spark.table(source_table)


In [0]:


# OMOP CDM schema for the visit_occurrence table, built from
# (column name, Spark type, nullable, column comment) tuples.
visit_schema = StructType([
    StructField(col_name, col_type, is_nullable, metadata={"comment": col_comment})
    for col_name, col_type, is_nullable, col_comment in (
        ("visit_occurrence_id", LongType(), False,
         "A unique identifier for each Person's visit or encounter at a healthcare provider."),
        ("person_id", LongType(), False,
         "A foreign key identifier to the Person who is having the visit."),
        ("visit_concept_id", IntegerType(), False,
         "A foreign key that refers to a visit concept identifier in the Standardized Vocabularies."),
        ("visit_start_date", DateType(), True,
         "The start date of the visit."),
        ("visit_start_datetime", TimestampType(), True,
         "The start date and time of the visit."),
        ("visit_end_date", DateType(), True,
         "The end date of the visit."),
        ("visit_end_datetime", TimestampType(), True,
         "The end date and time of the visit."),
        ("visit_type_concept_id", IntegerType(), True,
         "A foreign key to the predefined concept identifier in the Standardized Vocabularies reflecting the type of source data from which the visit record is derived."),
        ("provider_id", LongType(), True,
         "A foreign key to the provider in the provider table who was associated with the visit."),
        ("care_site_id", LongType(), True,
         "A foreign key to the care site in the care site table that was visited."),
        ("visit_source_value", StringType(), True,
         "The source code for the visit as it appears in the source data."),
        ("visit_source_concept_id", IntegerType(), True,
         "A foreign key to a concept that refers to the code used in the source."),
        ("admitted_from_concept_id", IntegerType(), True,
         "A foreign key to the predefined concept in the Place of Service vocabulary indicating where the person was admitted from."),
        ("admitted_from_source_value", StringType(), True,
         "The source code for the admitted from concept as it appears in the source data."),
        ("discharged_to_concept_id", IntegerType(), True,
         "A foreign key to the predefined concept in the Place of Service vocabulary indicating where the person was discharged to."),
        ("discharged_to_source_value", StringType(), True,
         "The source code for the discharged to concept as it appears in the source data."),
        ("preceding_visit_occurrence_id", LongType(), True,
         "A foreign key to the visit occurrence that immediately preceded this visit."),
    )
])

# Hard constraints: a row failing any of these is dropped.
mandatory_visit_rules = dict(
    valid_visit_id="visit_occurrence_id IS NOT NULL",
    valid_person_id="person_id IS NOT NULL",
    valid_visit_concept="visit_concept_id IS NOT NULL",
    valid_start_date="visit_start_date IS NOT NULL",
    valid_end_date="visit_end_date IS NOT NULL",
    valid_type_concept="visit_type_concept_id IS NOT NULL",
    # A visit may not end before it starts
    valid_dates="visit_start_date <= visit_end_date",
)

# Soft constraints: violations are tracked, but the row is kept.
advisory_visit_rules = dict(
    valid_visit_concept_value="visit_concept_id > 0",
    valid_type_concept_value="visit_type_concept_id > 0",
    valid_source_values="""
        visit_source_value IS NULL OR LENGTH(visit_source_value) <= 50
    """,
)


@dlt.table(
    schema=visit_schema,
    name="visit_occurrence",
    comment="OMOP CDM Visit Occurrence table - Contains records of Events where Persons engage with the healthcare system for a duration of time",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_visit_rules)
@dlt.expect_all(advisory_visit_rules)
def create_omop_visit_occurrence():
    """Define the DLT table ``visit_occurrence`` from ``4_prod.dlt.omop_visit_occurrence``.

    The decorators attach ``visit_schema``, apply the mandatory rules
    with drop semantics and the advisory rules as tracked expectations.

    Returns:
        The DataFrame produced by ``spark.table`` for the source table.
    """
    source_table = "4_prod.dlt.omop_visit_occurrence"
    return spark.table(source_table)


In [0]:


# OMOP CDM schema for the condition_occurrence table, built from
# (column name, Spark type, nullable, column comment) tuples.
# NOTE(review): condition_source_concept_id is LongType while the other
# *_concept_id columns here are IntegerType - confirm against the source table.
condition_schema = StructType([
    StructField(col_name, col_type, is_nullable, metadata={"comment": col_comment})
    for col_name, col_type, is_nullable, col_comment in (
        ("condition_occurrence_id", LongType(), False,
         "A unique identifier for each condition occurrence event."),
        ("person_id", LongType(), False,
         "A foreign key identifier to the person who is experiencing the condition."),
        ("condition_concept_id", IntegerType(), False,
         "A foreign key that refers to a standard condition concept identifier in the Vocabulary."),
        ("condition_start_date", DateType(), True,
         "The date when the instance of the condition is recorded."),
        ("condition_start_datetime", TimestampType(), True,
         "The date and time when the instance of the condition is recorded."),
        ("condition_end_date", DateType(), True,
         "The date when the instance of the condition is considered to have ended."),
        ("condition_end_datetime", TimestampType(), True,
         "The date and time when the instance of the condition is considered to have ended."),
        ("condition_type_concept_id", IntegerType(), True,
         "A foreign key to the predefined concept identifier in the Standardized Vocabularies reflecting the source data from which the condition was recorded."),
        ("condition_status_concept_id", IntegerType(), True,
         "A foreign key to the predefined concept identifier in the Standardized Vocabularies reflecting the status of the condition."),
        ("stop_reason", StringType(), True,
         "The reason that the condition was no longer present."),
        ("provider_id", LongType(), True,
         "A foreign key to the provider who was responsible for determining the condition."),
        ("visit_occurrence_id", LongType(), True,
         "A foreign key to the visit in the VISIT_OCCURRENCE table during which the condition was determined."),
        ("visit_detail_id", LongType(), True,
         "A foreign key to the visit detail record during which the condition was determined."),
        ("condition_source_value", StringType(), True,
         "The source value for the condition as it appears in the source data."),
        ("condition_source_concept_id", LongType(), True,
         "A foreign key to a condition concept that refers to the code used in the source."),
        ("condition_status_source_value", StringType(), True,
         "The source value for the condition status as it appears in the source data."),
    )
])

# Hard constraints: a row failing any of these is dropped.
mandatory_condition_rules = dict(
    valid_condition_id="condition_occurrence_id IS NOT NULL",
    valid_person="person_id IS NOT NULL",
    valid_concept="condition_concept_id IS NOT NULL",
    valid_start_date="condition_start_date IS NOT NULL",
    valid_type_concept="condition_type_concept_id IS NOT NULL",
)

# Soft constraints: violations are tracked, but the row is kept.
advisory_condition_rules = dict(
    valid_concept_value="condition_concept_id > 0",
    valid_type_concept_value="condition_type_concept_id > 0",
    valid_status_concept="condition_status_concept_id IS NULL OR condition_status_concept_id >= 0",
    valid_date_range="condition_end_date IS NULL OR condition_end_date >= condition_start_date",
    valid_stop_reason="stop_reason IS NULL OR LENGTH(stop_reason) <= 20",
)


@dlt.table(
    schema=condition_schema,
    name="condition_occurrence",
    comment="OMOP CDM Condition Occurrence table - Contains records of Events suggesting the presence of a disease or medical condition",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_condition_rules)
@dlt.expect_all(advisory_condition_rules)
def create_omop_condition_occurrence():
    """Define the DLT table ``condition_occurrence`` from ``4_prod.dlt.omop_condition_occurrence``.

    The decorators attach ``condition_schema``, apply the mandatory rules
    with drop semantics and the advisory rules as tracked expectations.

    Returns:
        The DataFrame produced by ``spark.table`` for the source table.
    """
    source_table = "4_prod.dlt.omop_condition_occurrence"
    return spark.table(source_table)


In [0]:
# OMOP CDM schema for the drug_exposure table, built from
# (column name, Spark type, nullable, column comment) tuples.
drug_schema = StructType([
    StructField(col_name, col_type, is_nullable, metadata={"comment": col_comment})
    for col_name, col_type, is_nullable, col_comment in (
        ("drug_exposure_id", LongType(), False,
         "A unique identifier for each drug exposure event."),
        ("person_id", LongType(), False,
         "A foreign key identifier to the person who is subjected to the drug."),
        ("drug_concept_id", IntegerType(), False,
         "A foreign key that refers to a standard drug concept identifier in the Vocabulary."),
        ("drug_exposure_start_date", DateType(), True,
         "The start date for the current instance of drug exposure."),
        ("drug_exposure_start_datetime", TimestampType(), True,
         "The start date and time for the current instance of drug exposure."),
        ("drug_exposure_end_date", DateType(), True,
         "The end date for the current instance of drug exposure."),
        ("drug_exposure_end_datetime", TimestampType(), True,
         "The end date and time for the current instance of drug exposure."),
        ("verbatim_end_date", DateType(), True,
         "The end date of the drug exposure as it appears in the source data."),
        ("drug_type_concept_id", IntegerType(), True,
         "A foreign key to the predefined concept identifier in the Standardized Vocabularies reflecting the type of drug exposure."),
        ("stop_reason", StringType(), True,
         "The reason the drug exposure was stopped."),
        ("refills", IntegerType(), True,
         "The number of refills after the initial prescription."),
        ("quantity", FloatType(), True,
         "The quantity of drug as recorded in the source data."),
        ("days_supply", IntegerType(), True,
         "The number of days of supply of the medication."),
        ("sig", StringType(), True,
         "The directions (signatur) on the drug prescription as recorded in the source."),
        ("route_concept_id", IntegerType(), True,
         "A foreign key to a predefined concept in the Standardized Vocabularies reflecting the route of administration."),
        ("lot_number", StringType(), True,
         "The identifier to determine where the product originated."),
        ("provider_id", LongType(), True,
         "A foreign key to the provider in the provider table who prescribed the drug."),
        ("visit_occurrence_id", LongType(), True,
         "A foreign key to the visit in the visit table during which the drug exposure initiated."),
        ("visit_detail_id", LongType(), True,
         "A foreign key to the visit detail record during which the drug exposure initiated."),
        ("drug_source_value", StringType(), True,
         "The source code for the drug as it appears in the source data."),
        ("drug_source_concept_id", IntegerType(), True,
         "A foreign key to a drug concept that refers to the code used in the source."),
        ("route_source_value", StringType(), True,
         "The source code for the route as it appears in the source data."),
        ("dose_unit_source_value", StringType(), True,
         "The information about the dose unit as recorded in the source data."),
    )
])

# Hard constraints: a row failing any of these is dropped.
mandatory_drug_rules = dict(
    valid_drug_id="drug_exposure_id IS NOT NULL",
    valid_person="person_id IS NOT NULL",
    valid_concept="drug_concept_id IS NOT NULL",
    valid_start_date="drug_exposure_start_date IS NOT NULL",
    valid_end_date="drug_exposure_end_date IS NOT NULL",
    valid_type_concept="drug_type_concept_id IS NOT NULL",
)

# Soft constraints: violations are tracked, but the row is kept.
advisory_drug_rules = dict(
    valid_concept_value="drug_concept_id > 0",
    valid_type_concept_value="drug_type_concept_id > 0",
    valid_route="route_concept_id IS NULL OR route_concept_id >= 0",
    valid_dates="drug_exposure_end_date >= drug_exposure_start_date",
    valid_quantity="quantity IS NULL OR quantity > 0",
    valid_days_supply="days_supply IS NULL OR days_supply > 0",
    valid_stop_reason="stop_reason IS NULL OR LENGTH(stop_reason) <= 20",
    valid_source_values="drug_source_value IS NOT NULL",
)


@dlt.table(
    schema=drug_schema,
    name="drug_exposure",
    comment="OMOP CDM Drug Exposure table - Contains records about the exposure to a Drug through prescriptions or administration",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_drug_rules)
@dlt.expect_all(advisory_drug_rules)
def create_omop_drug_exposure():
    """Define the DLT table ``drug_exposure`` from ``4_prod.dlt.omop_drug_exposure``.

    The decorators attach ``drug_schema``, apply the mandatory rules
    with drop semantics and the advisory rules as tracked expectations.

    Returns:
        The DataFrame produced by ``spark.table`` for the source table.
    """
    source_table = "4_prod.dlt.omop_drug_exposure"
    return spark.table(source_table)


In [0]:

# OMOP CDM schema for the procedure_occurrence table, built from
# (column name, Spark type, nullable, column comment) tuples.
procedure_schema = StructType([
    StructField(col_name, col_type, is_nullable, metadata={"comment": col_comment})
    for col_name, col_type, is_nullable, col_comment in (
        ("procedure_occurrence_id", LongType(), False,
         "A unique identifier for each Procedure Occurrence event."),
        ("person_id", LongType(), False,
         "A foreign key identifier to the Person who is subjected to the Procedure."),
        ("procedure_concept_id", IntegerType(), False,
         "A foreign key that refers to a standard procedure Concept identifier in the Vocabulary."),
        ("procedure_date", DateType(), False,
         "The date on which the Procedure was performed."),
        ("procedure_datetime", TimestampType(), True,
         "The date and time on which the Procedure was performed."),
        ("procedure_end_date", DateType(), True,
         "The end date on which the Procedure was performed."),
        ("procedure_end_datetime", TimestampType(), True,
         "The end date and time on which the Procedure was performed."),
        ("procedure_type_concept_id", IntegerType(), False,
         "A foreign key to the predefined Concept identifier in the Standardized Vocabularies reflecting the type of source data from which the procedure record is derived."),
        ("modifier_concept_id", IntegerType(), True,
         "A foreign key to a Standard Concept identifier for a modifier to the Procedure."),
        ("quantity", IntegerType(), True,
         "The quantity of procedures ordered or administered."),
        ("provider_id", LongType(), True,
         "A foreign key to the Provider in the PROVIDER table who was responsible for carrying out the procedure."),
        ("visit_occurrence_id", LongType(), True,
         "A foreign key to the Visit in the VISIT_OCCURRENCE table during which the Procedure was carried out."),
        ("visit_detail_id", LongType(), True,
         "A foreign key to the Visit Detail in the VISIT_DETAIL table during which the Procedure was carried out."),
        ("procedure_source_value", StringType(), True,
         "The procedure as it appears in the source data."),
        ("procedure_source_concept_id", IntegerType(), True,
         "A foreign key to a Procedure Concept that refers to the code used in the source."),
        ("modifier_source_value", StringType(), True,
         "The source code for the modifier as it appears in the source data."),
    )
])

# Hard constraints: a row failing any of these is dropped.
mandatory_procedure_rules = dict(
    valid_procedure_id="procedure_occurrence_id IS NOT NULL",
    valid_person="person_id IS NOT NULL",
    valid_procedure="procedure_concept_id IS NOT NULL",
    valid_date="procedure_date IS NOT NULL",
    valid_type_concept="procedure_type_concept_id IS NOT NULL",
)

# Soft constraints: violations are tracked, but the row is kept.
advisory_procedure_rules = dict(
    valid_concept_value="procedure_concept_id > 0",
    valid_type_concept_value="procedure_type_concept_id > 0",
    valid_modifier="modifier_concept_id IS NULL OR modifier_concept_id >= 0",
    valid_quantity="quantity IS NULL OR quantity > 0",
    valid_dates="procedure_end_date IS NULL OR procedure_end_date >= procedure_date",
    valid_source_values="""
        procedure_source_value IS NULL OR LENGTH(procedure_source_value) <= 50
    """,
)


@dlt.table(
    name="procedure_occurrence",
    schema=procedure_schema,
    comment="OMOP CDM Procedure Occurrence table - Contains records of activities or processes ordered by, or carried out by, a healthcare provider on the patient with a diagnostic or therapeutic purpose",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_procedure_rules)
@dlt.expect_all(advisory_procedure_rules)
def create_omop_procedure_occurrence():
    """Read ``4_prod.dlt.omop_procedure_occurrence`` for the ``procedure_occurrence`` table.

    The decorators register the result under the name ``procedure_occurrence``
    with ``procedure_schema``, hand ``mandatory_procedure_rules`` to
    ``dlt.expect_all_or_drop`` and ``advisory_procedure_rules`` to
    ``dlt.expect_all``.

    Returns:
        Whatever ``spark.table`` yields for the source table. ``spark`` is not
        defined in the visible part of this notebook - presumably the session
        supplied by the runtime; TODO confirm.
    """
    source_rows = spark.table("4_prod.dlt.omop_procedure_occurrence")
    return source_rows


In [0]:


# OMOP CDM column layout for device_exposure.
# Each entry is (column name, Spark type, nullable, column comment).
device_schema = StructType([
    StructField(col_name, col_type, col_nullable, metadata={"comment": col_comment})
    for col_name, col_type, col_nullable, col_comment in (
        ("device_exposure_id", LongType(), False,
         "A unique identifier for each Device exposure event."),
        ("person_id", LongType(), False,
         "A foreign key identifier to the Person who is subjected to the Device."),
        ("device_concept_id", IntegerType(), False,
         "A foreign key that refers to a Standard Device Concept identifier in the Vocabulary."),
        ("device_exposure_start_date", DateType(), False,
         "The start date for the Device exposure."),
        ("device_exposure_start_datetime", TimestampType(), True,
         "The start date and time for the Device exposure."),
        ("device_exposure_end_date", DateType(), True,
         "The end date for the Device exposure."),
        ("device_exposure_end_datetime", TimestampType(), True,
         "The end date and time for the Device exposure."),
        ("device_type_concept_id", IntegerType(), False,
         "A foreign key to the predefined Concept identifier in the Standardized Vocabularies reflecting the type of Device exposure."),
        ("unique_device_id", StringType(), True,
         "The Unique Device Identification (UDI-DI) number for devices regulated by the FDA."),
        ("production_id", StringType(), True,
         "The Production Identifier (UDI-PI) portion of the Unique Device Identification."),
        ("quantity", IntegerType(), True,
         "The number of individual Devices used."),
        ("provider_id", LongType(), True,
         "A foreign key to the Provider in the PROVIDER table who initiated the Device exposure."),
        ("visit_occurrence_id", LongType(), True,
         "A foreign key to the Visit in the VISIT_OCCURRENCE table during which the Device exposure initiated."),
        ("visit_detail_id", LongType(), True,
         "A foreign key to the Visit Detail in the VISIT_DETAIL table during which the Device exposure initiated."),
        ("device_source_value", StringType(), True,
         "The source code for the Device as it appears in the source data."),
        ("device_source_concept_id", IntegerType(), True,
         "A foreign key to a Device Concept that refers to the code used in the source."),
        ("unit_concept_id", IntegerType(), True,
         "A foreign key to a predefined Concept in the Standardized Vocabularies reflecting the unit the Device was administered."),
        ("unit_source_value", StringType(), True,
         "The source code for the unit as it appears in the source data."),
        ("unit_source_concept_id", IntegerType(), True,
         "A foreign key to a Unit Concept that refers to the code used in the source."),
    )
])

# Hard requirements for device_exposure: passed to dlt.expect_all_or_drop,
# so a row that violates any of them is dropped.
mandatory_device_rules = dict(
    valid_device_id="device_exposure_id IS NOT NULL",
    valid_person="person_id IS NOT NULL",
    valid_device="device_concept_id IS NOT NULL",
    valid_start_date="device_exposure_start_date IS NOT NULL",
    valid_type_concept="device_type_concept_id IS NOT NULL",
)

# Soft checks for device_exposure: passed to dlt.expect_all, so violations are
# tracked while the rows are kept.
advisory_device_rules = dict(
    valid_concept_value="device_concept_id > 0",
    valid_type_concept_value="device_type_concept_id > 0",
    valid_quantity="quantity IS NULL OR quantity > 0",
    valid_unit="unit_concept_id IS NULL OR unit_concept_id >= 0",
    valid_dates="device_exposure_end_date IS NULL OR device_exposure_end_date >= device_exposure_start_date",
    # The "_advis" suffix keeps this key distinct from the mandatory "valid_device_id" rule.
    valid_device_id_advis="unique_device_id IS NULL OR LENGTH(unique_device_id) <= 255",
    valid_production_id="production_id IS NULL OR LENGTH(production_id) <= 255",
)



@dlt.table(
    name="device_exposure",
    schema=device_schema,
    comment="OMOP CDM Device Exposure table - Contains records about exposure to a foreign physical object or instrument used for diagnostic or therapeutic purposes",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_device_rules)
@dlt.expect_all(advisory_device_rules)
def create_omop_device_exposure():
    """Read ``4_prod.dlt.omop_device_exposure`` for the ``device_exposure`` table.

    The decorators register the result under the name ``device_exposure`` with
    ``device_schema``, hand ``mandatory_device_rules`` to
    ``dlt.expect_all_or_drop`` and ``advisory_device_rules`` to
    ``dlt.expect_all``.

    Returns:
        Whatever ``spark.table`` yields for the source table. ``spark`` is not
        defined in the visible part of this notebook - presumably the session
        supplied by the runtime; TODO confirm.
    """
    source_rows = spark.table("4_prod.dlt.omop_device_exposure")
    return source_rows
  

In [0]:


# OMOP CDM column layout for measurement.
# Each entry is (column name, Spark type, nullable, column comment).
measurement_schema = StructType([
    StructField(col_name, col_type, col_nullable, metadata={"comment": col_comment})
    for col_name, col_type, col_nullable, col_comment in (
        ("measurement_id", LongType(), False,
         "A unique identifier for each Measurement."),
        ("person_id", LongType(), False,
         "A foreign key identifier to the Person about whom the measurement was recorded."),
        ("measurement_concept_id", IntegerType(), False,
         "A foreign key to the standard measurement concept identifier in the Vocabulary."),
        ("measurement_date", DateType(), False,
         "The date of the measurement."),
        ("measurement_datetime", TimestampType(), True,
         "The date and time of the measurement."),
        ("measurement_time", StringType(), True,
         "The time of the measurement (in the event that MEASUREMENT_DATETIME is not well defined)."),
        ("measurement_type_concept_id", IntegerType(), False,
         "A foreign key to the predefined concept identifier in the Standardized Vocabularies reflecting the type of the measurement."),
        ("operator_concept_id", IntegerType(), True,
         "A foreign key to a standard concept identifier for the mathematical operator applied to the value."),
        ("value_as_number", FloatType(), True,
         "The measurement result stored as a number. This is applicable to measurements where the result is expressed as a numeric value."),
        ("value_as_concept_id", IntegerType(), True,
         "A foreign key to a standard concept identifier for a categorical result."),
        ("unit_concept_id", IntegerType(), True,
         "A foreign key to a standard concept identifier for the unit used in the measurement."),
        ("range_low", FloatType(), True,
         "The lower limit of the normal range of the measurement."),
        ("range_high", FloatType(), True,
         "The upper limit of the normal range of the measurement."),
        ("provider_id", LongType(), True,
         "A foreign key to the provider in the PROVIDER table who was responsible for taking the measurement."),
        ("visit_occurrence_id", LongType(), True,
         "A foreign key to the visit in the VISIT_OCCURRENCE table during which the measurement was taken."),
        ("visit_detail_id", LongType(), True,
         "A foreign key to the visit detail in the VISIT_DETAIL table during which the measurement was taken."),
        ("measurement_source_value", StringType(), True,
         "The measurement name as it appears in the source data."),
        ("measurement_source_concept_id", IntegerType(), True,
         "A foreign key to a concept that refers to the code used in the source."),
        ("unit_source_value", StringType(), True,
         "The source code for the unit as it appears in the source data."),
        ("unit_source_concept_id", IntegerType(), True,
         "A foreign key to a concept that refers to the unit code used in the source."),
        ("value_source_value", StringType(), True,
         "The source value associated with the structured value stored as numeric or concept."),
        ("measurement_event_id", LongType(), True,
         "A foreign key to the MEASUREMENT_EVENT table."),
        ("meas_event_field_concept_id", IntegerType(), True,
         "A foreign key to a concept that refers to the field in the MEASUREMENT_EVENT table."),
    )
])

# Hard requirements for measurement: passed to dlt.expect_all_or_drop,
# so a row that violates any of them is dropped.
mandatory_measurement_rules = dict(
    valid_measurement_id="measurement_id IS NOT NULL",
    valid_person="person_id IS NOT NULL",
    valid_concept="measurement_concept_id IS NOT NULL",
    valid_date="measurement_date IS NOT NULL",
    valid_type_concept="measurement_type_concept_id IS NOT NULL",
)

# Soft checks for measurement: passed to dlt.expect_all, so violations are
# tracked while the rows are kept.
advisory_measurement_rules = dict(
    valid_concept_value="measurement_concept_id > 0",
    valid_type_concept_value="measurement_type_concept_id > 0",
    valid_operator="operator_concept_id IS NULL OR operator_concept_id >= 0",
    # Passes unless both value_as_number and value_as_concept_id are populated.
    valid_value="""
        (value_as_number IS NOT NULL AND value_as_concept_id IS NULL) OR
        (value_as_number IS NULL AND value_as_concept_id IS NOT NULL) OR
        (value_as_number IS NULL AND value_as_concept_id IS NULL)
    """,
    valid_unit="unit_concept_id IS NULL OR unit_concept_id >= 0",
    valid_range="range_high IS NULL OR range_low IS NULL OR range_high >= range_low",
)


@dlt.table(
    name="measurement",
    schema=measurement_schema,
    comment="OMOP CDM Measurement table",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_measurement_rules)
@dlt.expect_all(advisory_measurement_rules)
def create_omop_measurement():
    """Read ``4_prod.dlt.omop_measurement`` for the ``measurement`` table.

    The decorators register the result under the name ``measurement`` with
    ``measurement_schema``, hand ``mandatory_measurement_rules`` to
    ``dlt.expect_all_or_drop`` and ``advisory_measurement_rules`` to
    ``dlt.expect_all``.

    Returns:
        Whatever ``spark.table`` yields for the source table. ``spark`` is not
        defined in the visible part of this notebook - presumably the session
        supplied by the runtime; TODO confirm.
    """
    source_rows = spark.table("4_prod.dlt.omop_measurement")
    return source_rows
 

In [0]:


# OMOP CDM column layout for observation.
# Each entry is (column name, Spark type, nullable, column comment).
observation_schema = StructType([
    StructField(col_name, col_type, col_nullable, metadata={"comment": col_comment})
    for col_name, col_type, col_nullable, col_comment in (
        ("observation_id", LongType(), False,
         "A unique identifier for each observation."),
        ("person_id", LongType(), False,
         "A foreign key identifier to the Person about whom the observation was recorded."),
        ("observation_concept_id", IntegerType(), False,
         "A foreign key to the standard observation concept identifier in the Vocabulary."),
        ("observation_date", DateType(), False,
         "The date of the observation."),
        ("observation_datetime", TimestampType(), True,
         "The date and time of the observation."),
        ("observation_type_concept_id", IntegerType(), False,
         "A foreign key to the predefined concept identifier in the Standardized Vocabularies reflecting the type of the observation."),
        ("value_as_number", FloatType(), True,
         "The observation result stored as a number. This is applicable to observations where the result is expressed as a numeric value."),
        ("value_as_string", StringType(), True,
         "The observation result stored as a string."),
        ("value_as_concept_id", IntegerType(), True,
         "A foreign key to an observation result stored as a Concept ID."),
        ("qualifier_concept_id", IntegerType(), True,
         "A foreign key to a standard concept identifier for a qualifier."),
        ("unit_concept_id", IntegerType(), True,
         "A foreign key to a standard concept identifier for the unit."),
        ("provider_id", LongType(), True,
         "A foreign key to the provider in the PROVIDER table who was responsible for making the observation."),
        ("visit_occurrence_id", LongType(), True,
         "A foreign key to the visit in the VISIT_OCCURRENCE table during which the observation was recorded."),
        ("visit_detail_id", LongType(), True,
         "A foreign key to the visit detail in the VISIT_DETAIL table during which the observation was recorded."),
        ("observation_source_value", StringType(), True,
         "The observation code as it appears in the source data."),
        ("observation_source_concept_id", IntegerType(), True,
         "A foreign key to a concept that refers to the code used in the source."),
        ("unit_source_value", StringType(), True,
         "The source code for the unit as it appears in the source data."),
        ("qualifier_source_value", StringType(), True,
         "The source value associated with a qualifier to characterize the observation."),
        ("value_source_value", StringType(), True,
         "The source value associated with the structured value stored as numeric, string, or concept."),
        ("observation_event_id", LongType(), True,
         "A foreign key to the event that caused this observation to be made."),
        ("obs_event_field_concept_id", IntegerType(), True,
         "A foreign key to the predefined concept identifier in the Standardized Vocabularies reflecting how the event field is used."),
    )
])

# Hard requirements for observation: passed to dlt.expect_all_or_drop,
# so a row that violates any of them is dropped.
mandatory_observation_rules = dict(
    valid_observation_id="observation_id IS NOT NULL",
    valid_person="person_id IS NOT NULL",
    valid_concept="observation_concept_id IS NOT NULL",
    valid_date="observation_date IS NOT NULL",
    valid_type_concept="observation_type_concept_id IS NOT NULL",
)

# Advisory data quality rules - these are tracked but don't cause record drops
advisory_observation_rules = {
    "valid_concept_value": "observation_concept_id > 0",
    "valid_type_concept_value": "observation_type_concept_id > 0",
    "valid_qualifier": "qualifier_concept_id IS NULL OR qualifier_concept_id >= 0",
    "valid_unit": "unit_concept_id IS NULL OR unit_concept_id >= 0",
    # At most one of the three value columns may be populated.
    # The previous predicate OR-ed "some value column is set" with "no value
    # column is set", which holds for every row, so the expectation could never
    # report a violation.
    # NOTE(review): "at most one" mirrors the measurement valid_value rule in
    # this notebook - confirm this is the intended policy for observations.
    "valid_value": """
        (CASE WHEN value_as_number IS NOT NULL THEN 1 ELSE 0 END +
         CASE WHEN value_as_string IS NOT NULL THEN 1 ELSE 0 END +
         CASE WHEN value_as_concept_id IS NOT NULL THEN 1 ELSE 0 END) <= 1
    """,
    "valid_source_values": """
        observation_source_value IS NULL OR LENGTH(observation_source_value) <= 50
    """
}

@dlt.table(
    name="observation",
    schema=observation_schema,
    comment="OMOP CDM Observation table - Contains clinical facts about a Person obtained in the context of examination, questioning or a procedure",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_observation_rules)
@dlt.expect_all(advisory_observation_rules)
def create_omop_observation():
    """Read ``4_prod.dlt.omop_observation`` for the ``observation`` table.

    The decorators register the result under the name ``observation`` with
    ``observation_schema``, hand ``mandatory_observation_rules`` to
    ``dlt.expect_all_or_drop`` and ``advisory_observation_rules`` to
    ``dlt.expect_all``.

    Returns:
        Whatever ``spark.table`` yields for the source table. ``spark`` is not
        defined in the visible part of this notebook - presumably the session
        supplied by the runtime; TODO confirm.
    """
    source_rows = spark.table("4_prod.dlt.omop_observation")
    return source_rows
 

In [0]:


# OMOP CDM column layout for death.
# Each entry is (column name, Spark type, nullable, column comment).
death_schema = StructType([
    StructField(col_name, col_type, col_nullable, metadata={"comment": col_comment})
    for col_name, col_type, col_nullable, col_comment in (
        ("person_id", LongType(), False,
         "A foreign key identifier to the deceased Person."),
        ("death_date", DateType(), False,
         "The date the person was deceased."),
        ("death_datetime", TimestampType(), True,
         "The date and time the person was deceased."),
        ("death_type_concept_id", IntegerType(), False,
         "A foreign key to the predefined concept identifier in the Standardized Vocabularies reflecting how the death was represented in the source data."),
        ("cause_concept_id", IntegerType(), True,
         "A foreign key to the predefined concept identifier in the Standardized Vocabularies reflecting the cause of death"),
        ("cause_source_value", StringType(), True,
         "The source code for the cause of death as it appears in the source data."),
        ("cause_source_concept_id", IntegerType(), True,
         "A foreign key to a concept that refers to the code used in the source."),
    )
])

# Hard requirements for death: passed to dlt.expect_all_or_drop,
# so a row that violates any of them is dropped.
mandatory_death_rules = dict(
    valid_person="person_id IS NOT NULL",
    valid_death_date="death_date IS NOT NULL",
    valid_type_concept="death_type_concept_id IS NOT NULL",
)

# Soft checks for death: passed to dlt.expect_all, so violations are tracked
# while the rows are kept.
advisory_death_rules = dict(
    valid_cause_concept="cause_concept_id IS NULL OR cause_concept_id >= 0",
    valid_source_concept="cause_source_concept_id IS NULL OR cause_source_concept_id >= 0",
    # When a datetime is present its date part must equal death_date.
    valid_dates="death_datetime IS NULL OR CAST(death_date AS DATE) = CAST(death_datetime AS DATE)",
    valid_source_length="cause_source_value IS NULL OR LENGTH(cause_source_value) <= 50",
)

@dlt.table(
    name="death",
    schema=death_schema,
    comment="OMOP CDM Death table - Contains the clinical event for how and when a Person dies",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_death_rules)
@dlt.expect_all(advisory_death_rules)
def create_omop_death():
    """Read ``4_prod.dlt.omop_death`` for the ``death`` table.

    The decorators register the result under the name ``death`` with
    ``death_schema``, hand ``mandatory_death_rules`` to
    ``dlt.expect_all_or_drop`` and ``advisory_death_rules`` to
    ``dlt.expect_all``.

    Returns:
        Whatever ``spark.table`` yields for the source table. ``spark`` is not
        defined in the visible part of this notebook - presumably the session
        supplied by the runtime; TODO confirm.
    """
    source_rows = spark.table("4_prod.dlt.omop_death")
    return source_rows
 

In [0]:


# OMOP CDM column layout for drug_era.
# Each entry is (column name, Spark type, nullable, column comment).
# NOTE(review): drug_era_id and person_id are IntegerType here, whereas
# dose_era_schema in this notebook declares its id and person_id as LongType -
# confirm which width the source table actually uses.
drug_era_schema = StructType([
    StructField(col_name, col_type, col_nullable, metadata={"comment": col_comment})
    for col_name, col_type, col_nullable, col_comment in (
        ("drug_era_id", IntegerType(), False,
         "A unique identifier for each drug era."),
        ("person_id", IntegerType(), False,
         "A foreign key identifier to the person who is subjected to the drug during the drug era."),
        ("drug_concept_id", IntegerType(), False,
         "A foreign key that refers to a standard concept identifier in the Vocabulary for the drug concept."),
        ("drug_era_start_date", DateType(), False,
         "The start date for the drug era constructed from the individual instances of drug exposures. It is the start date of the very first chronologically recorded instance of utilization of a drug."),
        ("drug_era_end_date", DateType(), False,
         "The end date for the drug era constructed from the individual instance of drug exposures. It is the end date of the final continuously recorded instance of utilization of a drug."),
        ("drug_exposure_count", IntegerType(), True,
         "The number of individual drug exposure occurrences used to construct the drug era."),
        ("gap_days", IntegerType(), True,
         "The number of days that separates two drugs that are adjacent to each other, if there is a gap of more than 30 days between two drug eras, then they are considered two separate eras."),
    )
])

# Hard requirements for drug_era: passed to dlt.expect_all_or_drop,
# so a row that violates any of them is dropped.
mandatory_drug_era_rules = dict(
    valid_drug_era_id="drug_era_id IS NOT NULL",
    valid_person="person_id IS NOT NULL",
    valid_drug="drug_concept_id IS NOT NULL",
    valid_start_date="drug_era_start_date IS NOT NULL",
    valid_end_date="drug_era_end_date IS NOT NULL",
)

# Soft checks for drug_era: passed to dlt.expect_all, so violations are tracked
# while the rows are kept.
advisory_drug_era_rules = dict(
    valid_concept_value="drug_concept_id > 0",
    valid_dates="drug_era_end_date >= drug_era_start_date",
    valid_exposure_count="drug_exposure_count IS NULL OR drug_exposure_count > 0",
    valid_gap_days="gap_days IS NULL OR gap_days >= 0",
)


@dlt.table(
    name="drug_era",
    schema=drug_era_schema,
    comment="OMOP CDM Drug Era table - Contains records of the span of time when the Person is assumed to be exposed to a particular active ingredient",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_drug_era_rules)
@dlt.expect_all(advisory_drug_era_rules)
def create_omop_drug_era():
    """Read ``4_prod.dlt.omop_drug_era`` for the ``drug_era`` table.

    The decorators register the result under the name ``drug_era`` with
    ``drug_era_schema``, hand ``mandatory_drug_era_rules`` to
    ``dlt.expect_all_or_drop`` and ``advisory_drug_era_rules`` to
    ``dlt.expect_all``.

    Returns:
        Whatever ``spark.table`` yields for the source table. ``spark`` is not
        defined in the visible part of this notebook - presumably the session
        supplied by the runtime; TODO confirm.
    """
    source_rows = spark.table("4_prod.dlt.omop_drug_era")
    return source_rows


In [0]:

# OMOP CDM column layout for dose_era.
# Each entry is (column name, Spark type, nullable, column comment).
dose_era_schema = StructType([
    StructField(col_name, col_type, col_nullable, metadata={"comment": col_comment})
    for col_name, col_type, col_nullable, col_comment in (
        ("dose_era_id", LongType(), False,
         "A unique identifier for each Dose Era."),
        ("person_id", LongType(), False,
         "A foreign key identifier to the Person who is subjected to the drug during the drug era."),
        ("drug_concept_id", IntegerType(), False,
         "A foreign key that refers to a Standard Concept identifier for the active Ingredient Concept."),
        ("unit_concept_id", IntegerType(), False,
         "A foreign key that refers to a Standard Concept identifier for the unit concept."),
        ("dose_value", FloatType(), False,
         "The numeric value of the daily dose."),
        ("dose_era_start_date", DateType(), False,
         "The start date for the drug era constructed from the individual instances of drug exposures."),
        ("dose_era_end_date", DateType(), False,
         "The end date for the drug era constructed from the individual instance of drug exposures."),
    )
])

# Hard requirements for dose_era: passed to dlt.expect_all_or_drop,
# so a row that violates any of them is dropped.
mandatory_dose_era_rules = dict(
    valid_dose_era_id="dose_era_id IS NOT NULL",
    valid_person="person_id IS NOT NULL",
    valid_drug_concept="drug_concept_id IS NOT NULL",
    valid_unit_concept="unit_concept_id IS NOT NULL",
    valid_dose_value="dose_value IS NOT NULL",
    valid_start_date="dose_era_start_date IS NOT NULL",
    valid_end_date="dose_era_end_date IS NOT NULL",
)

# Soft checks for dose_era: passed to dlt.expect_all, so violations are tracked
# while the rows are kept.
advisory_dose_era_rules = dict(
    valid_dose_value_positive="dose_value > 0",
    valid_dates="dose_era_end_date >= dose_era_start_date",
    valid_concept_values="drug_concept_id > 0 AND unit_concept_id > 0",
)


@dlt.table(
    name="dose_era",
    schema=dose_era_schema,
    comment="OMOP CDM Dose Era table - Contains records of constant dose exposure to a specific ingredient",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_dose_era_rules)
@dlt.expect_all(advisory_dose_era_rules)
def create_omop_dose_era():
    """Read ``4_prod.dlt.omop_dose_era`` for the ``dose_era`` table.

    The decorators register the result under the name ``dose_era`` with
    ``dose_era_schema``, hand ``mandatory_dose_era_rules`` to
    ``dlt.expect_all_or_drop`` and ``advisory_dose_era_rules`` to
    ``dlt.expect_all``.

    Returns:
        Whatever ``spark.table`` yields for the source table. ``spark`` is not
        defined in the visible part of this notebook - presumably the session
        supplied by the runtime; TODO confirm.
    """
    source_rows = spark.table("4_prod.dlt.omop_dose_era")
    return source_rows


In [0]:


# OMOP CDM column layout for condition_era.
# Each entry is (column name, Spark type, nullable, column comment).
# NOTE(review): person_id is IntegerType here, whereas several other schemas in
# this notebook (e.g. dose_era_schema, death_schema) declare it as LongType -
# confirm which width the source table actually uses.
condition_era_schema = StructType([
    StructField(col_name, col_type, col_nullable, metadata={"comment": col_comment})
    for col_name, col_type, col_nullable, col_comment in (
        ("condition_era_id", IntegerType(), False,
         "A unique identifier for each Condition Era."),
        ("person_id", IntegerType(), False,
         "A foreign key identifier to the Person who is experiencing the condition during the condition era."),
        ("condition_concept_id", IntegerType(), False,
         "A foreign key that refers to a Standard Condition Concept identifier in the Standardized Vocabularies."),
        ("condition_era_start_date", DateType(), False,
         "The start date for the Condition Era constructed from the individual instances of Condition Occurrences. It is the start date of the very first chronologically recorded instance of the Condition."),
        ("condition_era_end_date", DateType(), False,
         "The end date for the Condition Era constructed from the individual instances of Condition Occurrences. It is the end date of the final continuously recorded instance of the Condition."),
        ("condition_occurrence_count", IntegerType(), True,
         "The number of individual Condition Occurrences used to construct the Condition Era."),
    )
])

# Hard requirements for condition_era: passed to dlt.expect_all_or_drop,
# so a row that violates any of them is dropped.
mandatory_condition_era_rules = dict(
    valid_condition_era_id="condition_era_id IS NOT NULL",
    valid_person="person_id IS NOT NULL",
    valid_condition="condition_concept_id IS NOT NULL",
    valid_start_date="condition_era_start_date IS NOT NULL",
    valid_end_date="condition_era_end_date IS NOT NULL",
)

# Soft checks for condition_era: passed to dlt.expect_all, so violations are
# tracked while the rows are kept.
advisory_condition_era_rules = dict(
    valid_concept_value="condition_concept_id > 0",
    valid_dates="condition_era_end_date >= condition_era_start_date",
    valid_count="condition_occurrence_count IS NULL OR condition_occurrence_count > 0",
)


@dlt.table(
    name="condition_era",
    schema=condition_era_schema,
    comment="OMOP CDM Condition Era table - Contains records that represent spans of time when a Person is assumed to have a given condition",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_condition_era_rules)
@dlt.expect_all(advisory_condition_era_rules)
def create_omop_condition_era():
    """Read ``4_prod.dlt.omop_condition_era`` for the ``condition_era`` table.

    The decorators register the result under the name ``condition_era`` with
    ``condition_era_schema``, hand ``mandatory_condition_era_rules`` to
    ``dlt.expect_all_or_drop`` and ``advisory_condition_era_rules`` to
    ``dlt.expect_all``.

    Returns:
        Whatever ``spark.table`` yields for the source table. ``spark`` is not
        defined in the visible part of this notebook - presumably the session
        supplied by the runtime; TODO confirm.
    """
    source_rows = spark.table("4_prod.dlt.omop_condition_era")
    return source_rows


In [0]:

# OMOP CDM column layout for observation_period.
# Each entry is (column name, Spark type, nullable, column comment).
observation_period_schema = StructType([
    StructField(col_name, col_type, col_nullable, metadata={"comment": col_comment})
    for col_name, col_type, col_nullable, col_comment in (
        ("observation_period_id", IntegerType(), False,
         "A unique identifier for each observation period."),
        ("person_id", LongType(), False,
         "A foreign key identifier to the Person for whom the observation period is defined."),
        ("observation_period_start_date", DateType(), False,
         "The start date of the observation period for which data are available from the data source."),
        ("observation_period_end_date", DateType(), False,
         "The end date of the observation period for which data are available from the data source."),
        ("period_type_concept_id", IntegerType(), False,
         "A foreign key to the predefined concept identifier in the Standardized Vocabularies reflecting the source of the observation period information."),
    )
])

# Hard requirements for observation_period: passed to dlt.expect_all_or_drop,
# so a row that violates any of them is dropped.
mandatory_observation_period_rules = dict(
    # Key says "observation_id" but the predicate checks observation_period_id.
    valid_observation_id="observation_period_id IS NOT NULL",
    valid_person="person_id IS NOT NULL",
    valid_start_date="observation_period_start_date IS NOT NULL",
    valid_end_date="observation_period_end_date IS NOT NULL",
    valid_type_concept="period_type_concept_id IS NOT NULL",
)

# Soft checks for observation_period: passed to dlt.expect_all, so violations
# are tracked while the rows are kept.
advisory_observation_period_rules = dict(
    valid_dates="observation_period_end_date >= observation_period_start_date",
    valid_type_concept_value="period_type_concept_id > 0",
)

@dlt.table(
    name="observation_period",
    schema=observation_period_schema,
    comment="OMOP CDM Observation Period table - Contains records which define spans of time during which clinical events are recorded for a Person",
    table_properties={"quality": "gold"},
)
@dlt.expect_all_or_drop(mandatory_observation_period_rules)
@dlt.expect_all(advisory_observation_period_rules)
def create_omop_observation_period():
    """Read ``4_prod.dlt.omop_observation_period`` for the ``observation_period`` table.

    The decorators register the result under the name ``observation_period``
    with ``observation_period_schema``, hand
    ``mandatory_observation_period_rules`` to ``dlt.expect_all_or_drop`` and
    ``advisory_observation_period_rules`` to ``dlt.expect_all``.

    Returns:
        Whatever ``spark.table`` yields for the source table. ``spark`` is not
        defined in the visible part of this notebook - presumably the session
        supplied by the runtime; TODO confirm.
    """
    source_rows = spark.table("4_prod.dlt.omop_observation_period")
    return source_rows
 