In [0]:
from pyspark.sql.functions import *
from pyspark.sql.functions import max as spark_max
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from datetime import datetime
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import *
from pyspark.sql import functions as F
from functools import reduce

In [0]:
# Catalog and schema that every table written by this notebook lives in.
TARGET_CATALOG = "4_prod"
TARGET_SCHEMA = "silver"


def get_target_table(table_name: str) -> str:
    """Return `<TARGET_CATALOG>.<TARGET_SCHEMA>.<table_name>` for the given table."""
    return "{}.{}.{}".format(TARGET_CATALOG, TARGET_SCHEMA, table_name)

In [0]:
def has_cdf_enabled(table_name: str) -> bool:
    """
    Report whether the `delta.enableChangeDataFeed` table property is "true".

    Best-effort: any failure while reading the table properties is treated
    as "not enabled" and False is returned.
    """
    try:
        properties = spark.sql(f"SHOW TBLPROPERTIES {table_name}").collect()
        return any(
            prop.key == "delta.enableChangeDataFeed" and prop.value.lower() == "true"
            for prop in properties
        )
    except Exception:
        return False
    
def get_max_timestamp(table_name: str,
                      ts_column: str = "ADC_UPDT",
                      default_date: datetime = datetime(1980, 1, 1)
                     ) -> datetime:
    """
    Return the high-water mark of `ts_column` in `table_name`.

    When Change Data Feed is enabled on the table, the timestamp of the
    latest Delta history entry is returned instead if it is earlier than
    max(ts_column); this guards against future-dated values in the column.

    `default_date` is returned when the table does not exist, when the
    column holds no value, or when anything goes wrong reading it.
    """
    try:
        if not table_exists(table_name):
            return default_date

        # Greatest value currently stored in the timestamp column
        aggregated = spark.table(table_name).select(F.max(ts_column).alias("max_ts"))
        column_max = aggregated.first().max_ts
        max_ts_value = column_max if column_max else default_date

        # Without CDF there is no safeguard to apply
        if not has_cdf_enabled(table_name):
            return max_ts_value

        try:
            # Most recent entry of the Delta transaction history
            latest_commits = spark.sql(f"DESCRIBE HISTORY {table_name} LIMIT 1").collect()

            if latest_commits:
                commit_time = latest_commits[0].timestamp

                # The history timestamp may come back as an ISO string
                if isinstance(commit_time, str):
                    from datetime import datetime as _datetime
                    commit_time = _datetime.fromisoformat(commit_time.replace('Z', '+00:00'))

                # Be conservative: prefer the earlier of the two timestamps
                if commit_time < max_ts_value:
                    print(f"[INFO] {table_name}: Using table update time {commit_time} instead of max {ts_column} {max_ts_value}")
                    return commit_time

        except Exception as cdf_error:
            print(f"[WARN] Could not check CDF history for {table_name}: {cdf_error}")
            print(f"[INFO] Falling back to max {ts_column} value")

        return max_ts_value

    except Exception as e:
        print(f"Warning: could not read {ts_column} from {table_name}: {e}")
        return default_date
    


def table_exists(table_name: str) -> bool:
    """
    Return True when `table_name` is known to the Spark catalog.

    Accepts fully-qualified names of the form <catalog>.<schema>.<table>.
    Relies on `spark.catalog.tableExists` (Spark 3.4+ / Databricks with
    Unity Catalog), so no AnalysisException has to be caught by the caller.
    """
    catalog = spark.catalog
    return catalog.tableExists(table_name)

In [0]:
def detect_schema_changes(target_table: str, target_schema: StructType = None, table_comment: str = None):
    """
    Work out which schema/metadata changes are needed to bring `target_table`
    in line with `target_schema` and `table_comment`.

    Args:
        target_table: Name of the existing table to inspect.
        target_schema: Desired schema. Column comments are read from each
            field's ``metadata["comment"]``. Skipped when falsy.
        table_comment: Desired table comment. Skipped when falsy.

    Returns:
        dict with the keys
            'has_changes'          - True when at least one change is needed
            'columns_to_update'    - dicts with name, type, comment,
                                     type_changed, comment_changed
            'columns_to_add'       - dicts with name, type, comment, nullable
            'table_comment_update' - new table comment, or None
        Columns that exist only in the current table are not reported.
    """
    changes = {
        'has_changes': False,
        'columns_to_update': [],
        'columns_to_add': [],
        'table_comment_update': None
    }

    if target_schema:
        current_schema = spark.table(target_table).schema
        current_fields = {f.name: f for f in current_schema.fields}

        for target_field in target_schema.fields:
            field_name = target_field.name

            if field_name in current_fields:
                current_field = current_fields[field_name]

                # Compare type and comment; a missing comment counts as ""
                current_comment = current_field.metadata.get("comment", "")
                target_comment = target_field.metadata.get("comment", "")
                type_changed = current_field.dataType != target_field.dataType
                comment_changed = current_comment != target_comment

                if type_changed or comment_changed:
                    changes['columns_to_update'].append({
                        'name': field_name,
                        'type': target_field.dataType.simpleString(),
                        'comment': target_comment,
                        'type_changed': type_changed,
                        'comment_changed': comment_changed
                    })
                    changes['has_changes'] = True
            else:
                # New column to add
                changes['columns_to_add'].append({
                    'name': field_name,
                    'type': target_field.dataType.simpleString(),
                    'comment': target_field.metadata.get("comment", ""),
                    'nullable': target_field.nullable
                })
                changes['has_changes'] = True

    if table_comment:
        # Check current table comment
        try:
            current_props = spark.sql(f"SHOW TBLPROPERTIES {target_table}").collect()
            current_comment = next((row.value for row in current_props if row.key == 'comment'), None)

            if current_comment != table_comment:
                changes['table_comment_update'] = table_comment
                changes['has_changes'] = True
        except Exception:
            # If we can't get properties, assume update is needed.
            # `Exception` (not a bare except) so that KeyboardInterrupt and
            # SystemExit still propagate and the notebook can be cancelled.
            changes['table_comment_update'] = table_comment
            changes['has_changes'] = True

    return changes

# ============================================================================
# Schema Application
# ============================================================================
def escape_comment(text: str) -> str:
    """
    Escape `text` for use inside a single-quoted SQL string literal.

    Backslashes are doubled first, then single quotes are doubled.
    Falsy input (None or "") yields "".
    """
    if text:
        backslashes_escaped = text.replace("\\", "\\\\")
        return backslashes_escaped.replace("'", "''")
    return ""
    
def apply_schema_changes(target_table: str, changes: dict):
    """
    Apply the changes reported by `detect_schema_changes` to `target_table`.

    Order of operations:
      1. ADD COLUMN for every entry in changes['columns_to_add'].
      2. If any existing column changes type: read the whole table, cast the
         affected columns, verify that no non-NULL value became NULL, and
         overwrite the table (schema included).
      3. ALTER COLUMN ... COMMENT for every column whose comment changed.
      4. Set the table comment when changes['table_comment_update'] is set.

    Raises:
        ValueError: when a cast would turn existing values into NULLs; the
            table is not overwritten in that case.
    """
    applied = []

    def _non_null_counts(frame, names):
        """Count the non-NULL values of each named column in one aggregation."""
        return frame.select([F.count(F.col(n)).alias(n) for n in names]).collect()[0].asDict()

    # 1. New columns
    for new_column in changes['columns_to_add']:
        statement = f"ALTER TABLE {target_table} ADD COLUMN `{new_column['name']}` {new_column['type']}"
        if new_column['comment']:
            statement += f" COMMENT '{escape_comment(new_column['comment'])}'"
        spark.sql(statement)
        applied.append(f"Added column {new_column['name']}")

    # 2. Type changes on existing columns
    existing_columns = changes['columns_to_update']
    retyped = [column for column in existing_columns if column['type_changed']]

    if retyped:
        print(f"Type change detected. Recreating table {target_table}...")

        # The FULL existing table is rewritten, not just incremental data
        table_df = spark.table(target_table)

        retyped_names = [column['name'] for column in retyped]
        counts_before = _non_null_counts(table_df, retyped_names)

        for column in retyped:
            table_df = table_df.withColumn(column['name'], table_df[column['name']].cast(column['type']))

        # A value that fails to cast becomes NULL, which lowers the count
        counts_after = _non_null_counts(table_df, retyped_names)
        losses = {}
        for name in retyped_names:
            dropped = counts_before[name] - counts_after[name]
            if dropped > 0:
                losses[name] = dropped

        if losses:
            raise ValueError(f"Data loss detected during cast: {losses}")
        print("✔ All type casts completed without introducing NULLs.")

        # Re-state the table properties while recreating the table
        (
            table_df.write.format("delta")
            .mode("overwrite")
            .option("overwriteSchema", "true")
            .option("delta.enableChangeDataFeed", "true")
            .option("delta.enableRowTracking", "true")
            .saveAsTable(target_table)
        )

        applied.append(f"Recreated {target_table} due to type change")

    # 3. Column comments
    if any(column['comment_changed'] for column in existing_columns):
        for column in existing_columns:
            if column['comment_changed']:
                spark.sql(f"""
                    ALTER TABLE {target_table}
                    ALTER COLUMN `{column['name']}` COMMENT '{escape_comment(column['comment'])}'
                """)
                applied.append(f"Updated {column['name']} comment")

    # 4. Table comment
    if changes['table_comment_update']:
        spark.sql(f"""
            ALTER TABLE {target_table}
            SET TBLPROPERTIES ('comment' = '{escape_comment(changes['table_comment_update'])}')
        """)
        applied.append("Updated table comment")

    # Summary: list at most five updates, then a count of the remainder
    if applied:
        print(f"[INFO] Applied {len(applied)} updates to {target_table}:")
        for update in applied[:5]:
            print(f"  - {update}")
        if len(applied) > 5:
            print(f"  ... and {len(applied) - 5} more")

# ============================================================================
# Table Creation
# ============================================================================

def create_table_with_schema(source_df, target_table: str, target_schema: StructType = None, table_comment: str = None):
    """
    Create `target_table` as a Delta table and load `source_df` into it.

    Without `target_schema` the DataFrame is written as-is (overwrite mode,
    Change Data Feed enabled). With a schema, an empty table is created from
    the schema first (CDF and row tracking enabled, optional table comment),
    then the data is projected onto the schema and appended, and finally the
    column comments from the schema metadata are applied.
    """
    if not target_schema:
        # Fallback for no schema
        writer = (
            source_df.write
            .format("delta")
            .option("delta.enableChangeDataFeed", "true")
            .mode("overwrite")
        )
        writer.saveAsTable(target_table)
        return

    # 1. Create the empty table first
    table_builder = DeltaTable.createIfNotExists(spark).tableName(target_table).addColumns(target_schema)
    if table_comment:
        table_builder = table_builder.comment(table_comment)
    table_builder = table_builder.property("delta.enableChangeDataFeed", "true")
    table_builder = table_builder.property("delta.enableRowTracking", "true")
    table_builder.execute()
    print(f"[INFO] Created table {target_table} with schema and metadata")

    # 2. Project the source onto the schema with select/cast (no RDD round
    #    trip); columns missing from the source are filled with typed NULLs
    available = source_df.columns
    projection = [
        F.col(field.name).cast(field.dataType)
        if field.name in available
        else F.lit(None).cast(field.dataType).alias(field.name)
        for field in target_schema.fields
    ]
    aligned_df = source_df.select(*projection)

    # 3. Append data
    aligned_df.write.mode("append").saveAsTable(target_table)

    apply_column_comments(target_table, target_schema)

def apply_column_comments(target_table: str, schema: StructType):
    """Set the column comment of every schema field that carries a non-empty "comment" in its metadata."""
    commented = []
    for field in schema.fields:
        comment_text = field.metadata.get("comment")
        if comment_text:
            spark.sql(f"""
                ALTER TABLE {target_table}
                ALTER COLUMN `{field.name}` 
                COMMENT '{escape_comment(comment_text)}'
            """)
            commented.append(field.name)

    if commented:
        print(f"[INFO] Applied {len(commented)} column comments to {target_table}")

# ============================================================================
# Main Update Function
# ============================================================================
def update_table(source_df, target_table: str, index_columns, 
                 target_schema: StructType = None, table_comment: str = None):
    """
    Upsert `source_df` into `target_table`, creating the table when needed.

    For an existing table, pending schema/comment changes are applied first;
    an empty source then ends the call, otherwise a single Delta MERGE
    updates matching rows and inserts the rest.

    Args:
        source_df: DataFrame with updates to merge
        target_table: Fully qualified target table name
        index_columns: Single column name (str) or list of column names for merge key
        target_schema: Optional target schema for schema evolution
        table_comment: Optional table comment
    """
    # Accept either one key column or a collection of them
    index_keys = [index_columns] if isinstance(index_columns, str) else list(index_columns)

    if not table_exists(target_table):
        print(f"[INFO] Table {target_table} does not exist. Creating new table...")
        create_table_with_schema(source_df, target_table, target_schema, table_comment)
        print(f"[INFO] Successfully created and populated {target_table}")
        return

    print(f"[INFO] Table {target_table} exists. Checking for schema updates...")

    if target_schema or table_comment:
        pending = detect_schema_changes(target_table, target_schema, table_comment)
        if pending['has_changes']:
            print(f"[INFO] Schema changes detected for {target_table}")
            apply_schema_changes(target_table, pending)

    # take(1) is enough to find out whether there is anything to merge
    if not source_df.take(1):
        print(f"[INFO] Source DataFrame is empty. Skipping update for {target_table}")
        return

    print(f"[INFO] Performing merge on {target_table} using column(s): {index_keys}")

    # `<=>` is the null-safe equality, so NULL keys match NULL keys
    key_predicates = [f"t.`{key}` <=> s.`{key}`" for key in index_keys]
    merge_condition = " AND ".join(key_predicates)

    delta_target = DeltaTable.forName(spark, target_table)
    merge_builder = delta_target.alias("t").merge(source_df.alias("s"), merge_condition)
    merge_builder.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

    print(f"[INFO] Successfully merged data into {target_table}")


In [0]:
# Table comment published to the catalog for pharos_person.
pharos_person_comment = "The table contains demographic information about patients, including identifiers such as person ID, gender, birth year, and ethnicity, etc."

# Target schema of pharos_person; each field's metadata["comment"] is
# published as the column comment.
# NOTE(review): ADC_UPDT is not declared here, although
# create_pharos_person_incr() selects it and reads max(ADC_UPDT) from the
# target table as its incremental watermark, and date_checked is declared
# here but never produced by that function - confirm this is intended.
schema_pharos_person = StructType([
    StructField(
        name="person_id",
        dataType=LongType(),
        nullable=True,
        metadata={"comment": "Assigned unique ID for each participant (TBC)."}
    ),
    StructField(
        name="cohort",
        dataType=StringType(),
        nullable=True,
        metadata={
            "comment": (
                "Pharos cohort groups\n"
                "Cohort 1a: Breast - matching QMUL/Barts OWKIN series\n"
                "Cohort 1b: Breast - matching KCL OWKIN series\n"
                "Cohort 1c: Breast - 100K Genome Participants\n"
                "Cohort 2a: Pancreatic - dataset 1 (TBC)\n"
                "Cohort 2b: Pancreatic - dataset 2 (TBC)\n"
                "Cohort 3a: Lung - dataset 1 (TBC)\n"
                "Cohort 3b: Lung - dataset 2 (TBC)"
            )
        }
    ),
    StructField(
        name="tumour_group",
        dataType=StringType(),
        nullable=True,
        metadata={"comment": "Which tumour group the participant belongs to; if more than one for the same participant, create a new row for each tumour group."}
    ),
    StructField(
        name="site",
        dataType=StringType(),
        nullable=True,
        metadata={"comment": "Center identifier."}
    ),
    StructField(
        name="yob",
        dataType=IntegerType(),
        nullable=True,
        metadata={"comment": "Year of birth."}
    ),
    StructField(
        name="ethnicity",
        dataType=StringType(),
        nullable=True,
        metadata={"comment": "Ethnicity."}
    ),
    StructField(
        name="ethnic_group",
        dataType=StringType(),
        nullable=True,
        metadata={"comment": "Ethnicity."}
    ),
    StructField(
        name="sex",
        dataType=StringType(),
        nullable=True,
        metadata={"comment": "Sex at birth."}
    ),
    StructField(
        name="date_checked",
        dataType=TimestampType(),
        nullable=True,
        metadata={"comment": "Date medical record was checked."}
    )
])

def create_pharos_person_incr():
    """
    Build the incremental extract for the `pharos_person` table.

    Reads rows of `4_prod.bronze.map_person` whose ADC_UPDT is later than the
    watermark returned by `get_max_timestamp` for the target table, derives
    `sex`, `ethnicity` and `ethnic_group` from the source codes, and attaches
    the tumour group derived from `4_prod.bronze.map_diagnosis`.

    Returns:
        DataFrame with one row per person_id and the columns person_id,
        cohort, tumour_group, site, yob, ethnicity, ethnic_group, sex,
        ADC_UPDT (cohort and site are NULL placeholders).
    """
    target_table = get_target_table("pharos_person")
    max_adc_updt = get_max_timestamp(target_table, ts_column="ADC_UPDT")

    map_person = (
        spark.table("4_prod.bronze.map_person")
        .filter(col("ADC_UPDT") > max_adc_updt)
        .withColumn(
            "sex",
            when(col("gender_cd") == 362.0, lit("F"))
            .when(col("gender_cd") == 363.0, lit("M"))
            .otherwise(lit("Unknown"))
        )
        .withColumn(
            "ethnicity",
            when(col("ethnicity_cd").isin(3767643,3767645,3767650), lit("1 White"))
            .when(col("ethnicity_cd").isin(3767649,3767652,3767653,3767654), lit("2 Mixed"))
            .when(col("ethnicity_cd").isin(3767640,3767644,3767651,3767647), lit("3 Asian"))
            .when(col("ethnicity_cd").isin(3767638,3767641,3767648), lit("4 Black"))
            .when(col("ethnicity_cd").isin(312508, 3767639, 3767642), lit("5 Other ethnic group"))
            .when(col("ethnicity_cd").isin(0, 3767646), "6 Unknown")
            .otherwise(lit("6 Unknown")) 
        )
        .withColumn(
            "ethnic_group", 
            # Each code appears exactly once: `when` chains are first-match,
            # so a repeated code would make its later branch unreachable.
            # 3767642 maps to "51 Other - Chinese", in line with the
            # "5 Other ethnic group" bucket used for `ethnicity` above.
            when(col("ethnicity_cd") == 3767643, "11 White - British")
            .when(col("ethnicity_cd") == 3767645, "12 White - Irish")
            .when(col("ethnicity_cd") == 3767650, "13 White - White-Other")
            .when(col("ethnicity_cd") == 3767654, "21 Mixed - White and Black Caribbean")
            .when(col("ethnicity_cd") == 3767653, "22 Mixed - White and Black African")
            .when(col("ethnicity_cd") == 3767652, "23 Mixed - White Asian")
            .when(col("ethnicity_cd") == 3767649, "24 Mixed - Mixed-Other")
            .when(col("ethnicity_cd") == 3767644, "31 Asian - Indian")
            .when(col("ethnicity_cd") == 3767651, "32 Asian - Pakistani")
            .when(col("ethnicity_cd") == 3767640, "33 Asian - Bangladeshi")
            .when(col("ethnicity_cd") == 3767647, "34 Asian - Asian-Other")
            .when(col("ethnicity_cd") == 3767641, "41 Black - Caribbean")
            .when(col("ethnicity_cd") == 3767638, "42 Black - African")
            .when(col("ethnicity_cd") == 3767648, "43 Black - Black-Other")
            .when(col("ethnicity_cd") == 3767642, "51 Other - Chinese")
            .when(col("ethnicity_cd").isin(3767639, 312508), "54 Other - Other")
            .when(col("ethnicity_cd").isin(0, 3767646), "99 Unknown")
            .otherwise("99 Unknown")
        )
        .dropDuplicates()
    )

    map_diagnosis = (
        spark.table("4_prod.bronze.map_diagnosis")
        .select("PERSON_ID","OMOP_CONCEPT_ID","ICD10_CODE")
    )

    # Reduce the diagnoses to distinct (person, tumour group) pairs.
    # Diagnoses without a recognised tumour group are dropped here: keeping
    # them would give a person several joined rows (one per diagnosis) and
    # the final dropDuplicates(["person_id"]) keeps an arbitrary one, so
    # tumour_group could randomly come out NULL for a breast-cancer patient.
    site = (
        map_diagnosis
        .withColumn(
            "tumour_group",
            when(col("ICD10_CODE").like("C50%") | col("OMOP_CONCEPT_ID").isin(45768522, 35624616, 602331), "breast")
            .otherwise(None)
        )
        .filter(col("tumour_group").isNotNull())
        .select("PERSON_ID", "tumour_group")
        .dropDuplicates()
    )

    final_df = (
        map_person.alias("p")
        # Left join: persons without a recognised tumour group keep NULL
        .join(site.alias("s"), col("p.person_id") == col("s.PERSON_ID"), "left")
        .select(
            col("p.person_id").cast(LongType()),
            lit(None).cast(StringType()).alias("cohort"),
            col("s.tumour_group").cast(StringType()),
            lit(None).cast(StringType()).alias("site"),
            col("p.birth_year").cast(IntegerType()).alias("yob"),
            col("p.ethnicity").cast(StringType()),
            col("p.ethnic_group").cast(StringType()),
            col("p.sex").cast(StringType()),
            col("p.ADC_UPDT").cast(TimestampType()).alias("ADC_UPDT")
        )
        # person_id is the merge key, so it must be unique in the extract
        .dropDuplicates(["person_id"])
    )

    return final_df

# Build the incremental person extract and upsert it into the target table,
# keyed on person_id.
updates_df = create_pharos_person_incr()
update_table(
    source_df=updates_df,
    target_table=get_target_table("pharos_person"),
    index_columns="person_id",
    target_schema=schema_pharos_person,
    table_comment=pharos_person_comment,
)

    

In [0]:
# Table comment published to the catalog for pharos_medical_history.
pharos_medical_history_comment = "The table contains the medical history of participants."

# (column name, column comment) for every nullable string column of
# pharos_medical_history, in table order. person_id (the only non-string
# column) is declared separately below.
_MEDICAL_HISTORY_STRING_COLUMNS = [
    ("personal_cancer_history", "Personal history of any cancer other than the cancer of interest, each cancer comma separated."),
    ("familyhistory_bca", "Family history of breast cancer - assessed at time of first diagnosis."),
    ("familyhistory_relation", "Where there is a family history of breast or ovarian cancer, the highest degree relation."),
    ("familyhistory_cancer", "Family history of any cancer - assessed at time of first diagnosis."),
    ("genetic_testing", "Whether patient underwent genetic testing."),
    ("any_germline_mutation", "The presence of any pathological mutation found through genetic testing Mandatory only where genetic_testing =Yes."),
    ("brca1", "Results from BRCA1 testing."),
    ("brca2", "Results from BRCA2 testing."),
    ("tp53", "Results from TP53 testing."),
    ("palb2", "Results from PALB2 testing."),
    ("chek2", "Results from CHEK2 testing."),
    ("atm", "Results from ATM testing."),
    ("rad51c", "Results from RAD51C testing."),
    ("rad51d", "Results from RAD51D testing."),
    ("menopausal_status", "Recorded menopausal status at time of first diagnosis."),
    ("menopause_age", "Age in years at menopause - leave blank if unknown."),
    ("inferred_menopausal_status", "Inferred menopausal status based on age and treatment patterns:Postmenopausal if menstrual period had stopped naturally or surgically by bilateral oophorectomy. Those with unknown menopausal age, who reported irregular menses, hysterectomy, or MHT use, considered postmenopausal at age 53.Those taking aromatase inhibitors considered postmenopausal."),
    ("hrt", "If patient is currently taking HRT or has in the past  - assessed at time of first diagnosis."),
    ("hrt_years", "Time in years of HRT use."),
    ("contraception_use", "Use of contraception."),
    ("contraception_details", "Details of contraception."),
    ("presentation", "Method of presentation to oncology."),
    ("height_diagnosis", "Height at primary cancer diagnosis (cm)."),
    ("weight_diagnosis", "Weight at primary cancer diagnosis (kg)."),
    ("bmi_diagnosis", "BMI at primary cancer diagnosis - closest available BMI to diagnosis date within 6 months, with priority given to BMI prior to diagnosis."),
    ("diabetes", "Whether patient is diabetic according to medical record."),
    ("medications", "List of medications taken by the patient, separated by semi colon (;)."),
    ("smoking", "Smoking status at time of diagnosis."),
    ("alcohol", "Alcohol use at time of diagnosis."),
    ("performance_diagnosis", "Performance status (ECOG) closest to diagnosis, within 30 days."),
    ("parous", "Patient has had pregnancies carried to viable gestational age."),
    ("parity_no", "Number of live births."),
    ("age_first_pregnancy", "Age of patient (years) at first pregnancy."),
    ("pabc", "Pregnancy associated breast cancer, defined as breast cancer diagnosed during pregnancy, or within 12 months after giving birth."),
    ("time_pregnancytobc", "Time calculated between last live birth and breast cancer diagnosis (in years)"),
]

# Target schema of pharos_medical_history; each field's metadata["comment"]
# is published as the column comment.
# NOTE(review): no ADC_UPDT column is declared here although
# create_medical_history_incr() reads max(ADC_UPDT) from the target table as
# its incremental watermark - confirm this is intended.
schema_pharos_medical_history = StructType(
    [
        StructField(
            name="person_id",
            dataType=LongType(),
            nullable=True,
            metadata={"comment": "Assigned unique ID for each participant (TBC)."}
        )
    ]
    + [
        StructField(
            name=column_name,
            dataType=StringType(),
            nullable=True,
            metadata={"comment": column_comment}
        )
        for column_name, column_comment in _MEDICAL_HISTORY_STRING_COLUMNS
    ]
)

def create_medical_history_incr():
    """
    Build the incremental update set for the ``pharos_medical_history`` table.

    The breast cancer cohort is every row of ``4_prod.bronze.map_diagnosis`` whose
    ``ADC_UPDT`` is newer than the greatest ``ADC_UPDT`` already in the target table
    (as returned by ``get_max_timestamp``) and whose ``ICD10_CODE`` starts with
    ``C50`` or whose ``OMOP_CONCEPT_ID`` is one of the listed breast cancer concepts.

    For each person in that cohort the function derives:

    * ``personal_cancer_history`` - distinct ``SOURCE_STRING`` values of matching
      diagnoses, joined with ``;``.
    * ``familyhistory_bca`` / ``familyhistory_relation`` / ``familyhistory_cancer`` -
      from problems and diagnoses dated between 365 days before and 7 days after
      ``earliest_diagnosis_date``, combined with ``map_family_history``.
    * ``smoking`` / ``alcohol`` / ``diabetes`` - the lowest rank observed in the same
      date window, mapped to a coded label.
    * ``height_diagnosis`` / ``weight_diagnosis`` / ``bmi_diagnosis`` - the height and
      weight measurement closest to the diagnosis date inside the same window.
    * ``parous`` / ``parity_no`` - from the "Parity" event with the highest
      ``NUMERIC_RESULT``.

    All remaining output columns are emitted as NULL placeholders.

    Returns:
        DataFrame with one row per ``person_id`` and the per-person maximum cohort
        ``ADC_UPDT``.
    """
    target_table = get_target_table("pharos_medical_history")
    # Watermark: only diagnosis rows updated after this timestamp are processed.
    max_adc_updt = get_max_timestamp(target_table, ts_column="ADC_UPDT")

    map_diagnosis = spark.table("4_prod.bronze.map_diagnosis")
    map_problem = spark.table("4_prod.bronze.map_problem")
    map_numeric_events = spark.table("4_prod.bronze.map_numeric_events")
    map_family_history = spark.table("4_prod.bronze.map_family_history")

    # Get the breast cancer cohort
    breast_cancer_cohort = (
        map_diagnosis
        .filter(col("ADC_UPDT") > max_adc_updt)
        .filter(
            col("ICD10_CODE").like("C50%") |
            col("OMOP_CONCEPT_ID").isin(45768522, 35624616, 602331)
        )
        .select("PERSON_ID", "earliest_diagnosis_date", "ADC_UPDT")
    )

    # Narrow down the cohort to only those who have a diagnosis of breast cancer
    cohort_persons = breast_cancer_cohort.select("PERSON_ID").distinct()

    # Semi joins keep only source rows of cohort members without adding columns.
    map_diagnosis_filtered = map_diagnosis.join(cohort_persons, "PERSON_ID", "semi")
    map_problem_filtered = map_problem.join(cohort_persons, "PERSON_ID", "semi")
    map_numeric_events_filtered = map_numeric_events.join(cohort_persons, "PERSON_ID", "semi")
    map_family_history_filtered = map_family_history.join(cohort_persons, "PERSON_ID", "semi")

    # Get the range of dates to check for medical history:
    # 365 days before to 7 days after the earliest diagnosis date.
    # NOTE(review): the BMI column comment in the schema says "within 6 months";
    # the window applied here is 365 days - confirm which is intended.
    brc_diagdate_range = (
        breast_cancer_cohort
        .withColumn("check_onset_date", date_sub(col("earliest_diagnosis_date"), 365))
        .withColumn("check_offset_date", date_add(col("earliest_diagnosis_date"), 7))
    )

    # Problems and diagnoses stacked into one condition list, restricted to the
    # date window. The filter on the window columns drops rows without a match,
    # so the left join behaves like an inner join here.
    comb_prob_diag = (
        map_problem_filtered
        .select("PERSON_ID", "SOURCE_STRING", "SOURCE_IDENTIFIER", "OMOP_CONCEPT_ID", "SNOMED_CODE", "ICD10_CODE", col("ONSET_DT_TM").alias("condition_date"))
        .unionByName(
            map_diagnosis_filtered
            .select("PERSON_ID", "SOURCE_STRING", "SOURCE_IDENTIFIER", "OMOP_CONCEPT_ID", "SNOMED_CODE", "ICD10_CODE", col("DIAG_DT_TM").alias("condition_date")))
        .join(brc_diagdate_range.select("PERSON_ID", "check_onset_date", "check_offset_date"), ["PERSON_ID"], "left")
        .filter(col("condition_date").between(col("check_onset_date"), col("check_offset_date")))
    ).alias("c")

    #-------------PERSON_CANCER_HISTORY
    personal_history_ids = [
        4058705,4077068,3180053,4323212,3176981,3175032,3190039,3184244,3189194,
        3181707,46273659,46273500,45763611,37159719,3189077,44782983,4323367,
        37016142,4180113,4324190,4212564,4178782,35610791,4197758,4180131,4187203,
        4190633,4333345,4187205,4190635,4177071,4333465,4327872,46273417,4178640,
        4180749,46273501,4179242,43021271,4179069,4187206,4325868,4216132,4323346,
        4179084,4325186,4181024,1246982,4332932,46270077,4325206,44782997,46273380,
        3176052,3184017,46270545,3184515,3174258,3169330,46273376,46273480,46269969,
        4327107,46270611,4324189
    ]

    # Uses every diagnosis of the cohort member (not limited to the date window).
    person_history = (
        map_diagnosis_filtered
        .withColumn(
            "has_personal_cancer",
            when(
                (col("OMOP_CONCEPT_ID").isin(personal_history_ids)) | (col("ICD10_CODE").like("Z85%")),
                col("SOURCE_STRING")
            ).otherwise(None)
        )
        .filter(col("has_personal_cancer").isNotNull())
        .groupBy("PERSON_ID")
        .agg(
            concat_ws(";", collect_set("has_personal_cancer"))
            .alias("personal_cancer_history")
        )
    )

    #-------------FAMILY_CANCER_HISTORY
    # Relation labels carry a numeric prefix so that F.min() below picks the
    # closest degree of relation recorded for the person.
    processed_family_history = (
        map_family_history_filtered
        .withColumn(
            "familyhistory_relation",
            when(col("RELATION_CD").isin(81849783,81849776,81849760,81849757,81849790,81849795,153,160), "1 1st degree")
            .when(col("RELATION_CD").isin(81849796,81849770,81849784,81849774,81849785,81849791,81849786), "2 2nd degree")
            .when(col("RELATION_CD") == 81849761, "3 3rd degree")
            .when(col("RELATION_CD").isin(634771,81849794,81849793,81849762,81849777,81849797),"7 None")
            .otherwise(lit("9 Unknown"))
        )
        .select("PERSON_ID","familyhistory_relation","CONDITION_DESC")
    ).alias("f")

    family_history_add_ids = [4175994, 43530673, 4329111, 4160695, 4195970, 37117109, 42535500, 3078338013, 45884753, 46273481, 1243977, 4326336, 37311977, 4176765, 4334494, 4179232, 46273150, 4328801, 4324202, 46273151, 42535054, 4334339, 4323762, 4322902, 4177058, 4328583, 35624517, 46274041, 4179082, 4324203, 764948, 4327415]

    # Driven by comb_prob_diag: only persons with at least one problem/diagnosis
    # inside the date window get a family-history row.
    family_history = (
        comb_prob_diag
        .join(processed_family_history, ["PERSON_ID"], "left")
        .withColumn(
            "breast_cancer",
            when(
                (col("c.ICD10_CODE").like("Z803")) |
                (col("c.OMOP_CONCEPT_ID").isin(4179963, 4329111, 4160695, 42535500, 46270135, 4210263, 4176765, 4328583, 35624517, 46270155, 46270130)) |
                (col("f.CONDITION_DESC") == "Breast cancer"),
                1
            ).otherwise(0)
        )
        .withColumn(
            "ovarian_cancer",
            when(col("c.OMOP_CONCEPT_ID").isin(4326681,37117109,37109210), 1).otherwise(0)
        )
        .groupBy("PERSON_ID")
        .agg(
            F.max("breast_cancer").alias("has_fh_breast"),
            F.max("ovarian_cancer").alias("has_fh_ovarian"),
            F.max(
                when(
                    (col("breast_cancer") == 1) |
                    (col("ovarian_cancer") == 1) |
                    (col("c.OMOP_CONCEPT_ID").isin(family_history_add_ids)) |
                    (col("c.ICD10_CODE").like("Z80%")),
                    1
                ).otherwise(0)
            ).alias("has_any_fh_cancer"),
            F.min("f.familyhistory_relation").alias("familyhistory_relation")
        )
        .withColumn(
            "familyhistory_bca",
            when((col("has_fh_breast")==1) & (col("has_fh_ovarian")==1), "3 Breast and Ovarian")
            .when(col("has_fh_breast")==1, "1 Breast")
            .when(col("has_fh_ovarian")==1, "2 Ovarian")
            .otherwise("9 Unknown")
        )
        .withColumn(
            "familyhistory_cancer",
            when(col("has_any_fh_cancer")==1, "Yes").otherwise("9 Unknown")
        )
        .select("PERSON_ID","familyhistory_bca","familyhistory_relation","familyhistory_cancer")
    )

    #-------------SMOKING, ALCOHOL & DIABETES
    # Each condition row gets a rank (99 = not relevant); the lowest rank seen for
    # a person wins, then the winning rank is translated into its coded label.
    smoke_alcohol_diabetes = (
        comb_prob_diag
        .withColumn(
            "smoking_rank",
            when(col("OMOP_CONCEPT_ID") == 4144272, 0)  # Never smoked
            .when(col("OMOP_CONCEPT_ID").isin(4019979, 4233486, 4222303), 1)  # Non-smoker
            .when(col("OMOP_CONCEPT_ID").isin(4298794,42709996,4276526,4042037,4044776,4041511,4044777,4044778,4044775,37395605,4218917,4246415,44784248), 3)  # Current smoker
            .when(col("OMOP_CONCEPT_ID").isin(4310250,4052032,4092281,4145798,4141782,4141783,4141784,42536346,46270534,35610339,4148416,4052465,4052949,45538046), 2)  # Past smoker
            # NOTE(review): labelled "Unknown usage" but shares rank 2 with past
            # smokers, so it is reported as "3 Yes, past smoker" - confirm intended.
            .when(col("OMOP_CONCEPT_ID").isin(40278848, 4269997), 2)  # Unknown usage
            .otherwise(99)
        )
        .withColumn(
            "alcohol_rank",
            when(col("OMOP_CONCEPT_ID").isin(4022664, 4052945, 4022703), 0)  # No alcohol
            .when(col("OMOP_CONCEPT_ID").isin(4074035,45552103,45766930,37208405,4238768,432456,4330794,4052460,4035931,4027638,44788725,36712761,4042860,4177989,4052028), 1)  # Drinks alcohol
            .when(col("OMOP_CONCEPT_ID") == 4027639, 2)  # Regular drinker
            .otherwise(99)
        )
        .withColumn(
            "diabetes_rank",
            when(col("ICD10_CODE").like("E10%"), 1)  # Type I
            .when((col("ICD10_CODE").like("E11%")) | (col("OMOP_CONCEPT_ID") == 45757508), 2)  # Type II
            .when((col("ICD10_CODE").like("R73%")) | (col("OMOP_CONCEPT_ID").isin(44808385, 37018196)), 3)  # Pre-diabetic
            .when(col("ICD10_CODE").rlike("^(E14|O24|E13|K869|E12|E881)"), 4)  # Other
            .otherwise(99)
        )
        .groupBy("PERSON_ID")
        .agg(
            F.min("smoking_rank").alias("smoking_rank"),
            F.min("alcohol_rank").alias("alcohol_rank"),
            F.min("diabetes_rank").alias("diabetes_rank")
        )
        .withColumn(
            "smoking",
            when(col("smoking_rank") == 0, "0 Never smoked")
            .when(col("smoking_rank") == 1, "1 Non-smoker")
            .when(col("smoking_rank") == 2, "3 Yes, past smoker")
            .when(col("smoking_rank") == 3, "2 Yes, current smoker")
            .otherwise("9 Unknown")
        )
        .withColumn(
            "alcohol",
            when(col("alcohol_rank") == 0, "0 No alcohol use")
            .when(col("alcohol_rank") == 1, "1 Drinks alcohol")
            .when(col("alcohol_rank") == 2, "2 Drinks alcohol - Regularly")
            .otherwise("9 Unknown")
        )
        .withColumn(
            "diabetes",
            when(col("diabetes_rank") == 1, "1 Type I diabetes")
            .when(col("diabetes_rank") == 2, "2 Type II diabetes")
            .when(col("diabetes_rank") == 3, "3 Pre-diabetic/borderline")
            .when(col("diabetes_rank") == 4, "4 Other diabetes types")
            .otherwise("9 Unknown")
        )
        .select("PERSON_ID", "smoking", "alcohol", "diabetes")
    )

    #-------------HEIGHT, WEIGHT, BMI
    processed_numeric_events = (
        map_numeric_events_filtered
        .join(brc_diagdate_range.select("PERSON_ID", "check_onset_date", "check_offset_date", "earliest_diagnosis_date"), ["PERSON_ID"], "left")
        .filter(
            col("PERFORMED_DT_TM").between(col("check_onset_date"), col("check_offset_date"))
        )
        .filter(
            col("OMOP_MANUAL_CONCEPT_NAME").isin("Body height measure","Body weight measure")
        )
        .withColumn(
            "abs_diff",
            abs(datediff(col("PERFORMED_DT_TM"),col("earliest_diagnosis_date")))
        )
    )

    # Closest measurement per person and measure. The extra sort keys only matter
    # when several measurements are equally far from the diagnosis date; without
    # them row_number() would pick one of the tied rows arbitrarily and the result
    # could differ between runs.
    closest_measurement_window = (
        Window
        .partitionBy("PERSON_ID", "OMOP_MANUAL_CONCEPT_NAME")
        .orderBy(
            col("abs_diff").asc(),
            # Prefer measurements taken on or before the diagnosis date.
            when(datediff(col("PERFORMED_DT_TM"), col("earliest_diagnosis_date")) > 0, 1).otherwise(0).asc(),
            # Then the most recent timestamp, then the value, for a stable pick.
            col("PERFORMED_DT_TM").desc(),
            col("NUMERIC_RESULT").desc()
        )
    )

    closest_height_weight = (
        processed_numeric_events
        .withColumn("rn", row_number().over(closest_measurement_window))
        .filter(col("rn") == 1)
    )

    # One row per person with height and weight side by side.
    # Height is divided by 100 before the BMI formula weight / height^2
    # (presumably centimetres to metres - confirm source units).
    height_weight = (
        closest_height_weight
        .groupBy("PERSON_ID")
        .pivot("OMOP_MANUAL_CONCEPT_NAME",["Body height measure", "Body weight measure"])
        .agg(first("NUMERIC_RESULT"))
        .withColumn("height", col("Body height measure") / 100)
        .withColumnRenamed("Body weight measure", "weight")
        .withColumn("bmi", col("weight") / (col("height") * col("height")))
    )

    #-------------PREGNANCY
    # Keep the "Parity" event with the highest NUMERIC_RESULT per person.
    window_parity = Window.partitionBy("PERSON_ID").orderBy(col("NUMERIC_RESULT").desc())
    pregnancy = (
        map_numeric_events_filtered
        .filter((col("EVENT_CD_DISPLAY") == "Parity"))
        .withColumn("rn", row_number().over(window_parity))
        .filter(col("rn") == 1)
        .withColumn("parous",
                    when((col("EVENT_CD_DISPLAY") == "Parity") & (col("NUMERIC_RESULT")==0), "2 No")
                    .when((col("EVENT_CD_DISPLAY") == "Parity") & (col("NUMERIC_RESULT") > 0), "1 Yes")
                    .otherwise(lit("9 Unknown"))
                    )
        .select("PERSON_ID", "parous", col("NUMERIC_RESULT").alias("parity_no"))
    )

    # Get max ADC_UPDT per person from the cohort
    cohort_with_updt = (
        breast_cancer_cohort
        .groupBy("PERSON_ID")
        .agg(F.max("ADC_UPDT").alias("ADC_UPDT"))
    )

    # Left joins keep every cohort member even when a section has no data.
    processed_df = (
        cohort_with_updt
        .join(person_history, ["PERSON_ID"], "left")
        .join(family_history, ["PERSON_ID"], "left")
        .join(smoke_alcohol_diabetes, ["PERSON_ID"], "left")
        .join(height_weight, ["PERSON_ID"], "left")
        .join(pregnancy, ["PERSON_ID"], "left")
    )

    # Project into the output column order; lit(None) columns are placeholders
    # for fields that are not derived here.
    final_df = (
        processed_df
        .select(
            col("PERSON_ID").cast(LongType()).alias("person_id"),
            col("personal_cancer_history"),
            col("familyhistory_bca"),
            col("familyhistory_relation"),
            col("familyhistory_cancer"),
            lit(None).alias("genetic_testing"),
            lit(None).alias("any_germline_mutation"),
            lit(None).alias("brca1"),
            lit(None).alias("brca2"),
            lit(None).alias("tp53"),
            lit(None).alias("palb2"),
            lit(None).alias("chek2"),
            lit(None).alias("atm"),
            lit(None).alias("rad51c"),
            lit(None).alias("rad51d"),
            lit(None).alias("menopausal_status"),
            lit(None).alias("menopause_age"),
            lit(None).alias("inferred_menopausal_status"),
            lit(None).alias("hrt"),
            lit(None).alias("hrt_years"),
            lit(None).alias("contraception_use"),
            lit(None).alias("contraception_details"),
            lit(None).alias("presentation"),
            col("height").cast(StringType()).alias("height_diagnosis"),
            col("weight").cast(StringType()).alias("weight_diagnosis"),
            col("bmi").cast(StringType()).alias("bmi_diagnosis"),
            col("diabetes"),
            lit(None).alias("medications"),
            col("smoking"),
            col("alcohol"),
            lit(None).alias("performance_diagnosis"),
            col("parous"),
            col("parity_no").cast(StringType()),
            lit(None).alias("age_first_pregnancy"),
            lit(None).alias("pabc"),
            lit(None).alias("time_pregnancytobc"),
            col("ADC_UPDT").cast(TimestampType())
        )
        .dropDuplicates(["person_id"])
    )

    return final_df
    
# Build the incremental medical-history rows, then pass them to update_table with
# the target table name, the key column "person_id", the declared schema and the
# table comment. update_table is defined elsewhere in this file; presumably it
# upserts on the key column - verify against its definition.
updates_df = create_medical_history_incr()
update_table(updates_df, get_target_table("pharos_medical_history"), "person_id", schema_pharos_medical_history, pharos_medical_history_comment)



In [0]:
# Table-level comment for the pharos_tumour table.
pharos_tumour_comment = "Clinical characteristics at breast cancer diagnosis for participants in the PHAROS cohort."

# Schema of pharos_tumour. Each row below is (column name, Spark type, column
# comment); every column is nullable and carries its comment as field metadata.
schema_pharos_tumour = StructType([
    StructField(field_name, field_type, True, {"comment": field_comment})
    for field_name, field_type, field_comment in [
        ("person_id", LongType(), "Assigned unique ID for each participant (TBC)"),
        ("date_of_diagnosis", DateType(), "Patient's initial cancer diagnosis date defined as the date of first biopsy confirming cancer"),
        ("year_of_diagnosis", IntegerType(), "Patient's initial cancer diagnosis year derived from first biopsy confirming cancer"),
        ("age_at_diagnosis", IntegerType(), "Age of the patient at date of cancer diagnosis"),
        ("biopsy_type", StringType(), "Type of diagnostic biopsy"),
        ("imaging_type", StringType(), "Type of diagnostic imaging; multiple values separated by semicolon"),
        ("breast_density", StringType(), "Either clinician-assessed or machine-assessed breast density"),
        ("disease_status", StringType(), "Type of disease at diagnosis"),
        ("laterality", StringType(), "Laterality of the cancer; if bilateral, a separate row should be created for each side"),
        ("bilateral", BooleanType(), "Indicates whether the cancer is bilateral"),
        ("invasive_size_clinical", DoubleType(), "Size of invasive tumour in mm from radiology; if multifocal, size of the largest focus"),
        ("total_size_clinical", DoubleType(), "Total tumour size in mm from radiology, including in-situ disease"),
        ("ajcc_edition", StringType(), "TNM staging edition used (AJCC edition)"),
        ("clinical_t_stage", StringType(), "Clinical/pre-treatment T stage from TNM"),
        ("clinical_n_stage", StringType(), "Clinical/pre-treatment N stage from TNM"),
        ("clinical_m_stage", StringType(), "Clinical/pre-treatment M stage from TNM"),
        ("clinical_stage", StringType(), "Clinical/pre-treatment stage I–IV derived from TNM"),
        ("neoadjuvant_indication", StringType(), "Indicates whether the patient received neo-adjuvant treatment"),
        ("er_status", StringType(), "Estrogen receptor (ER) status"),
        ("er_score", IntegerType(), "ER Allred score"),
        ("pr_status", StringType(), "Progesterone receptor (PR) status"),
        ("pr_score", IntegerType(), "PR Allred score"),
        ("her2_status", StringType(), "HER2 status"),
        ("her2_score", StringType(), "HER2 immunohistochemistry score"),
        ("her2_fish", BooleanType(), "Indicates whether FISH testing was undertaken to determine HER2 status"),
        ("grade", IntegerType(), "Histological grade of cancer"),
        ("oncotype_dx_score", IntegerType(), "Score from Oncotype DX test"),
        ("clin_response", StringType(), "Response to treatment reported in end-of-treatment (EOT) imaging"),
    ]
])

def create_pharos_tumour_incr():
    """
    Build the incremental update set for the ``pharos_tumour`` table.

    The breast cancer cohort is every row of ``4_prod.bronze.map_diagnosis`` whose
    ``ADC_UPDT`` is newer than the greatest ``ADC_UPDT`` already in the target table
    (as returned by ``get_max_timestamp``) and whose ``ICD10_CODE`` starts with
    ``C50`` or whose ``OMOP_CONCEPT_ID`` is one of the listed breast cancer concepts,
    reduced to one row per person.

    Derived columns:

    * ``date_of_diagnosis`` / ``year_of_diagnosis`` - from ``earliest_diagnosis_date``.
    * ``age_at_diagnosis`` - ``year_of_diagnosis`` minus ``birth_year`` from
      ``map_person``.
    * ``biopsy_type`` - lowest-numbered biopsy category found in ``map_procedure``.
    * ``imaging_type`` - distinct ``ExamName`` values from ``rde_radiology``, joined
      with ``;``.

    All remaining columns are typed NULL placeholders matching
    ``schema_pharos_tumour``.

    Returns:
        DataFrame with one row per ``person_id`` plus the cohort ``ADC_UPDT``.
    """
    target_table = get_target_table("pharos_tumour")
    # Watermark: only diagnosis rows updated after this timestamp are processed.
    max_adc_updt = get_max_timestamp(target_table, ts_column="ADC_UPDT")

    map_diagnosis = spark.table("4_prod.bronze.map_diagnosis")
    map_person = spark.table("4_prod.bronze.map_person").select(col("person_id").alias("PERSON_ID"),"birth_year")
    map_procedure = spark.table("4_prod.bronze.map_procedure")
    radiology = spark.table("4_prod.rde.rde_radiology")

    # Get the breast cancer cohort
    # NOTE(review): dropDuplicates keeps an arbitrary row per person, so when a
    # person has several qualifying diagnosis rows the retained ADC_UPDT is not
    # necessarily the latest one.
    breast_cancer_cohort = (
        map_diagnosis
        .filter(col("ADC_UPDT") > max_adc_updt)
        .filter(
            col("ICD10_CODE").like("C50%") |
            col("OMOP_CONCEPT_ID").isin(45768522, 35624616, 602331)
        )
        .withColumn("year_of_diagnosis", year(col("earliest_diagnosis_date")))
        .join(map_person, ["PERSON_ID"], "left")
        .withColumn("age_at_diagnosis", col("year_of_diagnosis") - col("birth_year"))
        .select("PERSON_ID", col("earliest_diagnosis_date").alias("date_of_diagnosis"), "year_of_diagnosis", "age_at_diagnosis", "ADC_UPDT")
        .dropDuplicates(["PERSON_ID"])
    )

    cohort_persons = breast_cancer_cohort.select("PERSON_ID").distinct()

    # BIOPSY: classify each procedure of a cohort member; labels carry a numeric
    # prefix so F.min() selects the lowest-numbered category per person.
    biopsy = (
        map_procedure
        .join(cohort_persons, ["PERSON_ID"], "semi")
        .withColumn(
            "biopsy_type",
            when(col("OMOP_CONCEPT_ID").isin(44508921,4140864,44508920,4195312,4196880,44508941), "1 Core Needle Biopsy")
            .when(col("OMOP_CONCEPT_ID").isin(4333768,4306207), "2 Fine Needle Aspiration (FNA) Biopsy")
            .when(col("OMOP_CONCEPT_ID").isin(44508922,4178943,37109415,42535639,37109414,37109988), "3 Image-guided Biopsy")
            .when(col("OMOP_CONCEPT_ID").isin(911787,36675261), "5 Vacuum-assisted Biopsy")
            .when(col("OMOP_CONCEPT_ID").isin(44808115,44508923,4129191,4047494,44508924,46232205,4164491,4172166,4028789,4066550,4022433), "7 Other types of Biopsy")
            .otherwise(None)
        )
        .filter(col("biopsy_type").isNotNull())
        .groupBy("PERSON_ID")
        .agg(
            # Take the most specific (lowest numbered) biopsy type
            F.min("biopsy_type").alias("biopsy_type")
        )
    )

    # IMAGING: Modality aggregation within 6-month diagnostic window
    # NOTE(review): only the lower bound (180 days before diagnosis) is applied;
    # every later exam is included as well - confirm whether an upper bound is
    # intended.
    imaging = (
        radiology
        .join(breast_cancer_cohort.select("PERSON_ID", "date_of_diagnosis"), ["PERSON_ID"], "inner")
        .filter(col("ExamStart").cast("timestamp") > date_sub(col("date_of_diagnosis"), 180))
        .groupBy("PERSON_ID")
        .agg(
            concat_ws(";", collect_set("ExamName"))
            .alias("imaging_type")
        )
    )

    # Project into the schema_pharos_tumour column order. Placeholder columns are
    # cast to the type declared for them in schema_pharos_tumour.
    final_df = (
        breast_cancer_cohort
        .join(biopsy, ["PERSON_ID"], "left")
        .join(imaging, ["PERSON_ID"], "left")
        .select(
            col("PERSON_ID").cast(LongType()).alias("person_id"),
            col("date_of_diagnosis").cast(DateType()),
            col("year_of_diagnosis").cast(IntegerType()),
            col("age_at_diagnosis").cast(IntegerType()),
            col("biopsy_type").cast(StringType()),
            col("imaging_type").cast(StringType()),
            lit(None).cast(StringType()).alias("breast_density"),
            lit(None).cast(StringType()).alias("disease_status"),
            lit(None).cast(StringType()).alias("laterality"),
            # Declared as BooleanType in schema_pharos_tumour.
            lit(None).cast(BooleanType()).alias("bilateral"),
            lit(None).cast(DoubleType()).alias("invasive_size_clinical"),
            lit(None).cast(DoubleType()).alias("total_size_clinical"),
            lit(None).cast(StringType()).alias("ajcc_edition"),
            lit(None).cast(StringType()).alias("clinical_t_stage"),
            lit(None).cast(StringType()).alias("clinical_n_stage"),
            lit(None).cast(StringType()).alias("clinical_m_stage"),
            lit(None).cast(StringType()).alias("clinical_stage"),
            lit(None).cast(StringType()).alias("neoadjuvant_indication"),
            lit(None).cast(StringType()).alias("er_status"),
            lit(None).cast(IntegerType()).alias("er_score"),
            lit(None).cast(StringType()).alias("pr_status"),
            lit(None).cast(IntegerType()).alias("pr_score"),
            lit(None).cast(StringType()).alias("her2_status"),
            lit(None).cast(StringType()).alias("her2_score"),
            # Declared as BooleanType in schema_pharos_tumour.
            lit(None).cast(BooleanType()).alias("her2_fish"),
            lit(None).cast(IntegerType()).alias("grade"),
            lit(None).cast(IntegerType()).alias("oncotype_dx_score"),
            lit(None).cast(StringType()).alias("clin_response"),
            col("ADC_UPDT").cast(TimestampType())
        )
        .dropDuplicates(["person_id"])
    )

    return final_df


# Build the incremental tumour rows, then pass them to update_table with the
# target table name, the key column "person_id", the declared schema and the
# table comment. update_table is defined elsewhere in this file; presumably it
# upserts on the key column - verify against its definition.
tumour_df = create_pharos_tumour_incr()
update_table(tumour_df, get_target_table("pharos_tumour"), "person_id", schema_pharos_tumour, pharos_tumour_comment)



In [0]:
# Table-level comment for the pharos_pathology table.
pharos_pathology_comment = "This table contains detailed pathology and surgical information related to breast cancer samples collected from participants."

# Schema of pharos_pathology. Each row below is (column name, Spark type, column
# comment); every column is nullable and carries its comment as field metadata.
schema_pharos_pathology = StructType([
    StructField(field_name, field_type, True, {"comment": field_comment})
    for field_name, field_type, field_comment in [
        ("pharosid", StringType(), "Assigned unique ID for each participant (TBC)"),
        ("sample_collection_date", StringType(), "Date of sample collection DD-MM-YYYY"),
        ("sample_type", StringType(), "Sample type stored in the Biobank"),
        ("tissue_type", StringType(), "Anatomical site where sample was taken from"),
        ("sample_laterality", StringType(), "Side that the sample was taken from"),
        ("laterality_of_surgery", StringType(), "Side that the surgery was performed on"),
        ("surgery_date", StringType(), "Date of breast surgery DD-MM-YYYY"),
        ("days_diagnosis_surgery", IntegerType(), "Difference in days between primary diagnosis and breast surgery"),
        ("breast_surgery_type", StringType(), "Type of breast surgery"),
        ("nodal_surgery_type", StringType(), "Type of axillary surgery"),
        ("invasive_present", StringType(), "Is invasive disease present?"),
        ("morphology", StringType(), "Morphological subtype of invasive cancer"),
        ("invasive_size_path", DoubleType(), "Size of invasive tumour in mm from breast surgery pathology report. If multifocal, size of largest foci"),
        ("total_size_path", DoubleType(), "Total size of tumour in mm from pathology report including in-situ disease"),
        ("pathological_t_stage", StringType(), "Pathological/post surgery T stage from TNM"),
        ("pathological_n_stage", StringType(), "Pathological/post surgery N stage from TNM"),
        ("pathological_m_stage", StringType(), "Pathological/post surgery M stage from TNM"),
        ("pathological_stage", StringType(), "Pathological/post surgery stage I-IV"),
        ("total_nodes_removed", IntegerType(), "Total number of nodes removed during surgery"),
        ("total_positive_nodes", IntegerType(), "Total number of positive nodes removed during surgery"),
        ("path_response", StringType(), "Response to treatment reported in histology report from surgery"),
        ("rcb_group", StringType(), "RCB group reported after neo-adjuvant therapy"),
        ("rcb_volume", DoubleType(), "RCB volume reported after neo-adjuvant therapy"),
        ("nodes_showing_prev_involvement", IntegerType(), "Number of nodes removed that showed response to treatment after neoadjuvant therapy"),
        ("er_score_sample", IntegerType(), "ER score reported per sample"),
        ("pr_score_sample", IntegerType(), "PR score reported per sample"),
        ("her2_score_sample", StringType(), "HER2 score reported per sample"),
        ("lvi", StringType(), "Lymphovascular invasion"),
        ("margin", StringType(), "Whether surgical margins are involved by tumour on excisional surgery"),
        ("multifocal", StringType(), "Whether tumour was multifocal or not"),
        ("insitu", StringType(), "Whether there is any in-situ disease present in the sample"),
        ("insitu_type", StringType(), "Morphological type of in-situ disease reported in the sample"),
        ("dcis_subtype", StringType(), "Subtype of DCIS"),
        ("lcis_subtype", StringType(), "Subtype of LCIS"),
        ("insitu_grade", IntegerType(), "Grade of in-situ disease (where applicable)"),
    ]
])





In [0]:
# Table-level comment for the pharos_treatment table.
pharos_treatment_comment = "This table captures comprehensive treatment for breast cancer patients. It includes details on systemic therapies (type, drugs, intent, cycles, start/end dates, and reasons for stopping), as well as radiotherapy administration (sites, dose, fractions, and boost)."

# Schema of pharos_treatment. Each row below is (column name, Spark type, column
# comment); every column is nullable and carries its comment as field metadata.
schema_pharos_treatment = StructType([
    StructField(field_name, field_type, nullable=True, metadata={"comment": field_comment})
    for field_name, field_type, field_comment in [
        ("person_id", LongType(), "Assigned unique ID for each participant (TBC)"),
        ("treatment_type", StringType(), "Type of systemic treatment given (e.g. Endocrine, Chemotherapy, Antibody, CDK4/6 Inhibitor, PARP Inhibitor, Other, Unknown)"),
        ("treatment_name", StringType(), "Name of drug or treatment given"),
        ("treatment_intent", StringType(), "Treatment intent (e.g. Neoadjuvant, Adjuvant, Advanced)"),
        ("treatment_start_date", TimestampType(), "Start date of treatment"),
        ("days_diagnosis_treatmentstart", IntegerType(), "Difference in days between primary diagnosis and start of treatment"),
        ("therapy_ongoing", BooleanType(), "Is the therapy ongoing?"),
        ("therapy_length", IntegerType(), "Planned duration of therapy in days"),
        ("treatment_end_date", TimestampType(), "End date of treatment"),
        ("days_diagnosis_treatmentend", IntegerType(), "Difference in days between primary diagnosis and end of treatment"),
        ("treatment_end_reason", StringType(), "Reason why the treatment was stopped (e.g. Toxicity, Progression, Protocol end, Patient choice, Death, Other, Unknown)"),
        ("date_treatment_last_given", TimestampType(), "Date of last record of treatment administration where end date is missing"),
        ("days_treatmentlastgiven", IntegerType(), "Difference in days between primary diagnosis and date of last given treatment"),
        ("treatment_duration", IntegerType(), "Time in days between treatment start and end dates"),
        ("treatment_cycles", IntegerType(), "For systemic therapy given in cycles; total number of cycles given"),
        ("clinical_trial", StringType(), "Whether patient was enrolled in an interventional trial"),
        ("radiotherapy_site", StringType(), "Anatomical site(s) where radiotherapy was given"),
        ("radiotherapy_dose", StringType(), "Dose of radiotherapy administered"),
        ("radiotherapy_fraction", StringType(), "Number of radiotherapy fractions delivered"),
        ("radiotherapy_boost", StringType(), "Indicates if a boost dose of radiotherapy was given"),
        ("ADC_UPDT", TimestampType(), "Last update timestamp."),
    ]
])


def create_pharos_treatment_incr():
    """
    Build the incremental batch of treatment rows for the PHAROS treatment table.

    Steps:
      1. Read the current ADC_UPDT watermark from the target table.
      2. Select breast-cancer diagnosis rows (ICD-10 C50* or one of three OMOP
         concept ids) newer than the watermark and collapse them to ONE row per
         person (earliest diagnosis date, latest ADC_UPDT).
      3. Collect chemotherapy records (rde_iqemo) and administered medications
         (map_med_admin, classified by RXNORM_CUI) for those persons.
      4. Keep treatments that start after the earliest diagnosis date and derive
         the day-difference columns.

    Returns:
        DataFrame: treatment rows with the columns selected (and cast) at the end
        of this function. Columns with no source yet are emitted as typed NULLs.

    Note:
        The cohort must be unique per person. Joining against a cohort with
        several rows per person multiplies every treatment row, and the copies
        differ only in ADC_UPDT / day differences, so dropDuplicates() cannot
        collapse them and the key handed to update_table
        (person_id, treatment_name, treatment_start_date) stops being unique.
    """
    target_table = get_target_table("pharos_treatment")
    max_adc_updt = get_max_timestamp(target_table, ts_column="ADC_UPDT")

    # Load Tables
    diagnosis = spark.table("4_prod.bronze.map_diagnosis")
    drug = spark.table("4_prod.bronze.map_med_admin")
    chemotherapy = spark.table("4_prod.rde.rde_iqemo")

    # One row per person: the earliest diagnosis date and the most recent
    # diagnosis update among the rows that passed the watermark filter.
    breast_cancer_cohort = (
        diagnosis
        .filter(col("ADC_UPDT") > max_adc_updt)
        .filter(
            col("ICD10_CODE").like("C50%") |
            col("OMOP_CONCEPT_ID").isin(45768522, 35624616, 602331)
        )
        .groupBy("PERSON_ID")
        .agg(
            F.min("earliest_diagnosis_date").alias("earliest_diagnosis_date"),
            F.max("ADC_UPDT").alias("cohort_ADC_UPDT")
        )
    )

    # Already unique per person because of the groupBy above.
    cohort_persons = breast_cancer_cohort.select("PERSON_ID")

    # Process Chemotherapy data
    # treatment_cycles = highest TreatmentCycleID seen for the person/regimen pair.
    window = Window.partitionBy("PERSON_ID", "Name")
    treatment_chemo = (
        chemotherapy.alias("chemo")
        .join(breast_cancer_cohort.alias("cohort"), "PERSON_ID", "inner")
        .withColumn("treatment_type", lit("chemotherapy"))
        .withColumn("treatment_cycles", F.max(col("chemo.TreatmentCycleID")).over(window))
        .select(
            col("PERSON_ID"),
            col("treatment_type"),
            col("chemo.Name").alias("treatment_name"),
            col("chemo.StartDate").alias("treatment_start_date"),
            col("chemo.CourseFinished").alias("therapy_ongoing"),
            col("chemo.EndDate").alias("treatment_end_date"),
            col("chemo.FinalTreatmentDate").alias("date_treatment_last_given"),
            col("treatment_cycles"),
            # Take the greater of the two ADC_UPDT values
            F.greatest(col("chemo.ADC_UPDT"), col("cohort.cohort_ADC_UPDT")).alias("ADC_UPDT")
        )
        .dropDuplicates()
    )

    # Drug Lists for classification (matched against RXNORM_CUI below)
    endocrine_drugs = [72965, 10324, 258494, 50610, 72143, 282357]
    antibody_drugs = [224905, 1298944, 253337, 1597876, 993449]
    cdk4_6_inhibitors = [1601374, 1873916, 1740938]
    parp_inhibitors = [1597582, 1918231]
    other_drugs = [141704, 3264, 68442, 358255, 26225, 77655, 73056, 11473, 32915, 1894, 39786, 25480, 6313, 8638, 5492, 6902]

    treatment_other = (
        drug.alias("med")
        .join(cohort_persons, col("med.PERSON_ID") == cohort_persons.PERSON_ID, "inner")
        .filter(col("EVENT_TYPE_DISPLAY") == "Administered")
        .withColumn(
            "treatment_type",
            when(col("RXNORM_CUI").isin(endocrine_drugs), "Endocrine")
            .when(col("RXNORM_CUI").isin(antibody_drugs), "Antibody")
            .when(col("RXNORM_CUI").isin(cdk4_6_inhibitors), "CDK4/6 Inhibitor")
            .when(col("RXNORM_CUI").isin(parp_inhibitors), "PARP Inhibitor")
            .when(col("RXNORM_CUI").isin(other_drugs), "Other")
            .otherwise(lit(None))
        )
        # Drugs outside every list are not treatments of interest.
        .filter(col("treatment_type").isNotNull())
        .select(
            col("med.PERSON_ID").alias("PERSON_ID"),
            col("treatment_type"),
            col("med.RXNORM_STR").alias("treatment_name"),
            col("med.ADMIN_START_DT_TM").alias("treatment_start_date"),
            col("med.ADMIN_END_DT_TM").alias("treatment_end_date"),
            col("med.ADMIN_END_DT_TM").alias("date_treatment_last_given"),
            lit(None).cast(BooleanType()).alias("therapy_ongoing"),
            lit(None).cast(IntegerType()).alias("treatment_cycles"),
            col("med.ADC_UPDT").alias("ADC_UPDT")
        )
    )

    # Combine the treatment and calculate date differences
    # Join back to cohort only for earliest_diagnosis_date (ADC_UPDT already in union).
    # The cohort is unique per person, so this join cannot multiply rows.
    treatment = (
        treatment_chemo
        .unionByName(treatment_other)
        .join(
            breast_cancer_cohort.select("PERSON_ID", "earliest_diagnosis_date"),
            "PERSON_ID",
            "inner"
        )
        .filter(col("treatment_start_date") > col("earliest_diagnosis_date"))
        .withColumn("days_diagnosis_treatmentstart", datediff(col("treatment_start_date"), col("earliest_diagnosis_date")))
        .withColumn("days_diagnosis_treatmentend", datediff(col("treatment_end_date"), col("earliest_diagnosis_date")))
        .withColumn("days_treatmentlastgiven", datediff(col("date_treatment_last_given"), col("earliest_diagnosis_date")))
        .withColumn("treatment_duration", datediff(col("treatment_end_date"), col("treatment_start_date")))
        .dropDuplicates()
    )

    # Project to the target layout; unsourced columns are typed NULL placeholders.
    final_df = (
        treatment
        .select(
            col("PERSON_ID").cast(LongType()).alias("person_id"),
            col("treatment_type").cast(StringType()),
            col("treatment_name").cast(StringType()),
            lit(None).alias("treatment_intent").cast(StringType()),
            col("treatment_start_date").cast(TimestampType()),
            col("days_diagnosis_treatmentstart").cast(IntegerType()),
            col("therapy_ongoing").cast(BooleanType()),
            lit(None).alias("therapy_length").cast(IntegerType()),
            col("treatment_end_date").cast(TimestampType()),
            col("days_diagnosis_treatmentend").cast(IntegerType()),
            lit(None).alias("treatment_end_reason").cast(StringType()),
            col("date_treatment_last_given").cast(TimestampType()),
            col("days_treatmentlastgiven").cast(IntegerType()),
            col("treatment_duration").cast(IntegerType()),
            col("treatment_cycles").cast(IntegerType()),
            lit(None).alias("clinical_trial").cast(StringType()),
            lit(None).alias("radiotherapy_site").cast(StringType()),
            lit(None).alias("radiotherapy_dose").cast(StringType()),
            lit(None).alias("radiotherapy_fraction").cast(StringType()),
            lit(None).alias("radiotherapy_boost").cast(StringType()),
            col("ADC_UPDT").cast(TimestampType())
        )
    )

    return final_df

# Build the incremental treatment DataFrame and hand it to update_table together
# with the three-column key, the target schema and the table comment.
treatment_df = create_pharos_treatment_incr()
update_table(treatment_df, get_target_table("pharos_treatment"),
             ["person_id", "treatment_name", "treatment_start_date"],
             schema_pharos_treatment, pharos_treatment_comment)


In [0]:
# Table-level description stored with the follow-up table.
pharos_followup_comment = (
    "This table tracks longitudinal outcomes for breast cancer patients, "
    "including detailed recurrence data (local and distant metastasis), "
    "survival metrics, and vital status. "
    "It captures the chronology of disease progression through event-specific "
    "dates and site locations, alongside calculated time-to-event intervals "
    "(years from diagnosis)."
)

# Follow-up schema, built from (column name, Spark type, column comment) triples.
# Every column is nullable; the comment is carried in the field metadata.
schema_pharos_followup = StructType([
    StructField(field_name, field_type, nullable=True, metadata={"comment": field_comment})
    for field_name, field_type, field_comment in (
        ("person_id", LongType(),
         "Assigned unique ID for each participant (TBC)"),
        ("local_recurrence", StringType(),
         "Whether or not patient had local recurrence"),
        ("localrec_dates", StringType(),
         "Dates of each local recurrence separated by a semi-colon (;)"),
        ("localrec_sites", StringType(),
         "Sites of each local recurrence, corresponding to dates in the same order, separated by a semi-colon (;)"),
        ("distant_metastasis", StringType(),
         "Whether or not patient had a distant recurrence"),
        ("metastasis_dates", StringType(),
         "Dates of each distant recurrence separated by a semi-colon (;)"),
        ("metastasis_sites", StringType(),
         "Sites of each distant recurrence, corresponding to dates in the same order, separated by a semi-colon (;)"),
        ("years_diagnosis_metastasis", IntegerType(),
         "Time (in years) between first diagnosis date and first date of distant metastasis"),
        ("years_diagnosis_localrec", IntegerType(),
         "Time (in years) between first diagnosis date and first date of local recurrence"),
        ("denovo_metastasis", StringType(),
         "Indication of distant metastasis at time of cancer diagnosis or within 6 months"),
        ("vital_status", StringType(),
         "Whether patient is alive at date of last follow-up"),
        ("date_of_death", TimestampType(),
         "Date of death recorded in the medical record (DD-MM-YYYY)"),
        ("years_diagnosistodeath", IntegerType(),
         "Time (in years) between first diagnosis date and date of death"),
        ("cause_of_death", StringType(),
         "Cause of death as recorded on death certificate or medical records"),
        ("date_of_last_followup", TimestampType(),
         "Last date patient was recorded to have contact with hospital staff (DD-MM-YYYY)"),
        ("followup_status", StringType(),
         "Follow up status at date last seen"),
        ("years_lastfollowup", IntegerType(),
         "Time (in years) between first diagnosis and last follow-up (alive) or death (deceased)"),
        ("lost_to_followup", StringType(),
         "Indication of whether patient has been lost to follow-up"),
        ("ADC_UPDT", TimestampType(),
         "Last update timestamp."),
    )
])

def create_pharos_followup_incr():
    """
    Build the incremental batch of follow-up / survival rows for the PHAROS
    follow-up table, one row per person.

    Steps:
      1. Read the current ADC_UPDT watermark from the target table.
      2. Reduce map_encounter to one row per person holding the latest
         departure (or, when missing, arrival) timestamp.
      3. Select breast-cancer diagnosis rows (ICD-10 C50* or one of three OMOP
         concept ids) newer than the watermark and collapse them to one row per
         person (earliest diagnosis date, latest ADC_UPDT).
      4. Right-join death records onto the cohort so every cohort person is
         kept, deriving vital_status and years_diagnosistodeath.
      5. Attach the last encounter date and derive years_lastfollowup.

    Returns:
        DataFrame: one row per person_id with the columns selected (and cast) at
        the end of this function. Recurrence / metastasis columns and other
        fields with no source yet are emitted as typed NULLs.

    Notes:
        - Year intervals are whole years: day difference / 365, truncated by the
          integer cast.
        - years_lastfollowup is measured to the date of death for deceased
          patients and to the last encounter otherwise, as the schema comment
          for that column specifies.
        - Only diagnosis ADC_UPDT drives the increment here; a new death or
          encounter record alone does not select a person.
    """
    target_table = get_target_table("pharos_followup")
    max_adc_updt = get_max_timestamp(target_table, ts_column="ADC_UPDT")

    death = spark.table("4_prod.bronze.map_death")
    diagnosis = spark.table("4_prod.bronze.map_diagnosis")

    # Aggregate encounters to one row per person BEFORE joining so the join
    # below cannot multiply rows.
    encounter = (
        spark.table("4_prod.bronze.map_encounter")
        .groupBy("PERSON_ID")
        .agg(
            F.max(F.coalesce("DEPART_DT_TM", "ARRIVE_DT_TM")).alias("date_of_last_followup")
        )
    )

    # Filter cohort by ICD-10 and incremental update logic
    breast_cancer_cohort = (
        diagnosis
        .filter(col("ADC_UPDT") > max_adc_updt)
        .filter(
            col("ICD10_CODE").like("C50%") |
            col("OMOP_CONCEPT_ID").isin(45768522, 35624616, 602331)
        )
        .select("PERSON_ID", "earliest_diagnosis_date", "ADC_UPDT")
        .groupBy("PERSON_ID")
        .agg(
            F.min("earliest_diagnosis_date").alias("earliest_diagnosis_date"),
            F.max("ADC_UPDT").alias("ADC_UPDT")
        )
    )

    # Process death records and calculate survival time (years).
    # Right join: cohort persons without a death record are kept and flagged alive.
    death_processed = (
        death.alias("d")
        .join(breast_cancer_cohort.alias("cohort"), "PERSON_ID", "right")
        .withColumn(
            "vital_status",
            when(col("d.DECEASED_DT_TM").isNotNull(), lit("1 Deceased"))
            .otherwise(lit("0 Alive"))
        )
        .withColumn(
            "years_diagnosistodeath",
            when(col("d.DECEASED_DT_TM").isNotNull(),
                (datediff(col("d.DECEASED_DT_TM"), col("cohort.earliest_diagnosis_date")).cast(DoubleType()) / 365).cast(IntegerType()))
            .otherwise(lit(None))
        )
        .select(
            col("PERSON_ID"),
            col("vital_status"),
            col("d.DECEASED_DT_TM").alias("date_of_death"),
            col("years_diagnosistodeath"),
            col("cohort.earliest_diagnosis_date").alias("earliest_diagnosis_date"),
            col("cohort.ADC_UPDT").alias("ADC_UPDT")
        )
    )

    # Merge mortality data with encounter history
    final_df = (
        death_processed
        .join(encounter, "PERSON_ID", "left")
        .withColumn(
            "years_lastfollowup",
            # Deceased: measure to the death date. Alive (date_of_death is NULL):
            # coalesce falls through to the last encounter date.
            (
                datediff(
                    F.coalesce(col("date_of_death"), col("date_of_last_followup")),
                    col("earliest_diagnosis_date")
                ).cast(DoubleType()) / 365
            ).cast(IntegerType())
        )
        .select(
            col("PERSON_ID").cast(LongType()).alias("person_id"),
            lit(None).cast(StringType()).alias("local_recurrence"),
            lit(None).cast(StringType()).alias("localrec_dates"),
            lit(None).cast(StringType()).alias("localrec_sites"),
            lit(None).cast(StringType()).alias("distant_metastasis"),
            lit(None).cast(StringType()).alias("metastasis_dates"),
            lit(None).cast(StringType()).alias("metastasis_sites"),
            lit(None).cast(IntegerType()).alias("years_diagnosis_metastasis"),
            lit(None).cast(IntegerType()).alias("years_diagnosis_localrec"),
            lit(None).cast(StringType()).alias("denovo_metastasis"),
            col("vital_status").cast(StringType()),
            col("date_of_death").cast(TimestampType()),
            col("years_diagnosistodeath").cast(IntegerType()),
            lit(None).cast(StringType()).alias("cause_of_death"),
            col("date_of_last_followup").cast(TimestampType()),
            lit(None).cast(StringType()).alias("followup_status"),
            col("years_lastfollowup").cast(IntegerType()),
            lit(None).cast(StringType()).alias("lost_to_followup"),
            col("ADC_UPDT").cast(TimestampType())
        )
        # Guard for persons with more than one death record; which record
        # survives is not defined by this call.
        .dropDuplicates(["person_id"])
    )

    return final_df


# Build the incremental follow-up DataFrame and pass it to update_table with
# person_id as the key, plus the target schema and table comment.
followup_df = create_pharos_followup_incr()
update_table(
    followup_df,
    get_target_table("pharos_followup"),
    "person_id",
    schema_pharos_followup,
    pharos_followup_comment,
)


In [0]:
# Print the end-of-run summary: target location, the tables this notebook
# processed, and the table that only has a schema so far.
print("\n".join([
    "=" * 60,
    "PHAROS Pipeline Execution Summary",
    "=" * 60,
    f"Target Catalog: {TARGET_CATALOG}",
    f"Target Schema: {TARGET_SCHEMA}",
    "",
    "Tables processed:",
    *(
        f"  - {get_target_table(processed_name)}"
        for processed_name in (
            "pharos_person",
            "pharos_medical_history",
            "pharos_tumour",
            "pharos_treatment",
            "pharos_followup",
        )
    ),
    "",
    "Tables pending implementation:",
    f"  - {get_target_table('pharos_pathology')} (schema defined only)",
    "=" * 60,
]))