In [0]:
# Load the Synthea conditions Delta table from its volume path
conditions_df = spark.read.load(
    '/Volumes/workspace/synthea/synthea_datasets/conditions_delta',
    format="delta",
)

# Render the loaded DataFrame with the notebook's display() helper
display(conditions_df)

In [0]:
# Load the Synthea patients Delta table from its volume path
patients_df = spark.read.load(
    '/Volumes/workspace/synthea/synthea_datasets/patients_delta',
    format="delta",
)

# Render the loaded DataFrame with the notebook's display() helper
display(patients_df)

In [0]:
# Left join on patients.Id = conditions.PATIENT: every patient row is kept,
# including patients that have no matching condition row.
joined_df = patients_df.join(
    conditions_df,
    on=patients_df["Id"] == conditions_df["PATIENT"],
    how='left',
)
display(joined_df)

In [0]:
# Load the Synthea encounters Delta table from its volume path
encounters_df = spark.read.load(
    '/Volumes/workspace/synthea/synthea_datasets/encounters_delta',
    format="delta",
)

# Render the loaded DataFrame with the notebook's display() helper
display(encounters_df)

In [0]:
# Columns per table that the missing-value checks should inspect,
# keyed by logical table name.
critical_columns = dict(
    patients=["Id", "BIRTHDATE", "DEATHDATE", "GENDER"],
    encounters=["Id", "PATIENT", "START", "STOP", "TOTAL_CLAIM_COST"],
    conditions=["PATIENT", "CODE", "DESCRIPTION", "START"],
    medications=["PATIENT", "START", "STOP", "TOTALCOST"],
    observations=["PATIENT", "DATE", "VALUE"],
)

In [0]:
# Early draft of the missing-value check
def check_missing_values(table_name, critical_columns):
    """Unfinished draft: read a Delta table by name and count its rows.

    The row count is computed but neither used nor returned, and
    ``critical_columns`` is not read, so this version always returns None.
    A later cell in this notebook defines ``check_missing_values(path,
    critical_columns)``; once that cell runs it replaces this definition.

    Args:
        table_name: Name interpolated into ``delta.<table_name>`` and passed
            to ``spark.read.format("delta").table(...)``.
        critical_columns: Accepted but currently unused.

    NOTE(review): the other cells load tables by volume path; confirm that a
    table resolvable as ``delta.<table_name>`` actually exists.
    """
    table_df = spark.read.format("delta").table(f"delta.{table_name}")
    # Triggers the count action; the result is discarded in this draft.
    table_df.count()

In [0]:
from pyspark.sql.functions import col, count, when, sum as sum_

def check_missing_values(path, critical_columns):
    """Print and return the percentage of NULLs in selected columns.

    Args:
        path: Location of the Delta table to load.
        critical_columns: Column names to inspect. It is iterated twice, so
            pass a re-iterable collection such as a list.

    Returns:
        A single-row DataFrame with one ``<column>_missing_pct`` column per
        requested column, holding ``null_count / total_rows * 100``.
    """
    source_df = spark.read.format("delta").load(path)
    row_total = source_df.count()

    # Per column: sum a 1/0 flag marking NULL values, i.e. the NULL count.
    null_sum_exprs = []
    for name in critical_columns:
        is_missing = when(col(name).isNull(), 1).otherwise(0)
        null_sum_exprs.append(sum_(is_missing).alias(name))
    missing_counts = source_df.select(null_sum_exprs)

    # Convert each NULL count into a percentage of all rows.
    pct_exprs = []
    for name in critical_columns:
        share = (col(name) / row_total) * 100
        pct_exprs.append(share.alias(f"{name}_missing_pct"))
    missing_percentages = missing_counts.select(pct_exprs)

    print(f"Missing Values for {path}:")
    missing_percentages.show()
    return missing_percentages

# Example: missing-value percentages for the patients table
check_missing_values(
    path="/Volumes/workspace/synthea/synthea_datasets/patients_delta",
    critical_columns=["Id", "BIRTHDATE", "DEATHDATE", "GENDER"],
)


In [0]:
from pyspark.sql.functions import sum, when, col, count
from pyspark.sql.types import StructType, StructField, StringType, DateType, DoubleType
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Root folder that holds every Synthea Delta table
base_path = "/Volumes/workspace/synthea/synthea_datasets"

# Logical table name -> Delta location; each table lives at <base_path>/<name>_delta
delta_tables = {
    name: f"{base_path}/{name}_delta"
    for name in (
        "patients",
        "encounters",
        "conditions",
        "medications",
        "observations",
        "procedures",
        "allergies",
        "careplans",
        "immunizations",
        "organizations",
        "providers",
    )
}

# Path-based Delta loader keyed by logical table name
def read_delta_table(table_name):
    """Load the Delta table registered under ``table_name``.

    Args:
        table_name: Key to look up in the ``delta_tables`` mapping.

    Returns:
        The DataFrame loaded from the mapped path.

    Raises:
        ValueError: If ``table_name`` has no (non-empty) path in ``delta_tables``.
    """
    location = delta_tables.get(table_name)
    if location:
        return spark.read.format("delta").load(location)

    raise ValueError(f"Table {table_name} not found in delta_tables mapping")

In [0]:
# Smoke-test the loader on one table: print its schema, then its row count
patients_df = read_delta_table("patients")
print("Patients table schema:")
patients_df.printSchema()
print(f"Patients count: {patients_df.count()}")

In [0]:
def analyze_table_schema(table_name):
    """Print a row/column/null profile of a Delta table and return the table.

    The row count is taken once and the null counts of all columns are
    computed in a single aggregation, instead of running several Spark jobs
    per column.

    Args:
        table_name: Key understood by ``read_delta_table``.

    Returns:
        The DataFrame returned by ``read_delta_table(table_name)``.

    Raises:
        ValueError: Propagated from ``read_delta_table`` for an unknown name.
    """
    df = read_delta_table(table_name)

    print(f"\n{'='*60}")
    print(f"TABLE ANALYSIS: {table_name.upper()}")
    print(f"{'='*60}")

    # Basic info. Count once; every percentage below reuses this value.
    total_rows = df.count()
    print(f"Total rows: {total_rows:,}")
    print(f"Total columns: {len(df.columns)}")

    fields = df.schema.fields
    if fields:
        # when() without otherwise() yields NULL for non-matching rows and
        # count() skips NULLs, so each expression counts that column's NULLs.
        # The aggregated row is read by position, in schema order.
        null_counts = list(
            df.select(
                [count(when(col(field.name).isNull(), 1)).alias(field.name) for field in fields]
            ).first()
        )
    else:
        # Nothing to aggregate for a table without columns.
        null_counts = []

    # Schema info
    print("\nSchema:")
    for field, null_count in zip(fields, null_counts):
        null_pct = (null_count / total_rows) * 100 if total_rows > 0 else 0
        print(f"  {field.name}: {field.dataType} (Null: {null_count:,} - {null_pct:.1f}%)")

    # Sample data
    #print(f"\nSample data (first 5 rows):")
    #df.limit(5).show(vertical=True, truncate=False)

    return df

# Profile every registered table and keep each resulting DataFrame by name.
# A failure on one table is reported and the loop moves on to the next.
table_dfs = {}
for table_name in delta_tables:
    try:
        table_dfs[table_name] = analyze_table_schema(table_name)
        print(f"\n✓ Successfully analyzed {table_name}")
    except Exception as e:
        print(f"\n✗ Failed to analyze {table_name}: {str(e)}")