# Parameters Section

In [5]:
# Parameters
source_workspace_name = "std-000-datamovement"
source_lakehouse_name = "lh_staging"
source_schema_name = "dbo"
source_table_name = "Person_Person"

destination_workspace_name = "std-000-datamovement"
destination_lakehouse_name = "lh_operations"
destination_schema_name = "dbo"
destination_table_name = "Person"

merge_ID_column ="BusinessEntityID"


StatementMeta(, a5ed9434-b35e-48b2-a425-5d89c8867427, 7, Finished, Available, Finished)

In [6]:
# Formulate the full table names
source_full_table_name = f"`{source_lakehouse_name}`.`{source_table_name}`"
destination_full_table_name = f"`{destination_lakehouse_name}`.`{destination_table_name}`"

StatementMeta(, a5ed9434-b35e-48b2-a425-5d89c8867427, 8, Finished, Available, Finished)

# Function to Create or Update the Table Schema

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from py4j.protocol import Py4JJavaError
from datetime import datetime

# Function to create or update the table schema
def create_or_update_table(source_full_table_name, destination_full_table_name):
    try:
        table_exists = False
        try:
            # Check if table exists
            spark.sql(f"DESCRIBE {destination_full_table_name}")
            table_exists = True
            print(f"Table {destination_full_table_name} already exists.")
            description =f"Table {destination_full_table_name} already exists."
            status="Succeed"
        except (AnalysisException, Py4JJavaError):
            pass
        
        if table_exists:
            # Get existing columns from the persistent staging area table
            existing_columns = {row[0]: row[1] for row in spark.sql(f"DESCRIBE {destination_full_table_name}").collect()}
            
            # Get columns and their data types from the source table
            source_columns = {row[0]: row[1] for row in spark.sql(f"DESCRIBE {source_full_table_name}").collect()}
            
            # Add new columns and change data types if necessary
            for col, dtype in source_columns.items():
                if col not in existing_columns:
                    # Add new column
                    alter_table_query = f"ALTER TABLE {destination_full_table_name} ADD COLUMNS ({col} {dtype})"
                    spark.sql(alter_table_query)
                    print(f"Added new column {col} to {destination_full_table_name}.")
                elif existing_columns[col] != dtype:
                    # Change data type
                    alter_table_query = f"ALTER TABLE {destination_full_table_name} CHANGE COLUMN {col} {col} {dtype}"
                    spark.sql(alter_table_query)
                    print(f"Changed data type of column {col} to {dtype} in {destination_full_table_name}.")
                
                description =f"Table {destination_full_table_name} columns modified."
                status="Succeed"
        
        else:
            # If table does not exist, create it
            source_columns = [f"{row[0]} {row[1]}" for row in spark.sql(f"DESCRIBE {source_full_table_name}").collect()]
            schema_str = ", ".join(source_columns)
            additional_columns = "DWIsCurrent BOOLEAN, DWStartDate TIMESTAMP, DWEndDate TIMESTAMP"
            create_table_query = f"""
            CREATE TABLE {destination_full_table_name} (
                {schema_str},
                {additional_columns}
            )
            """
            print(create_table_query)
            spark.sql(create_table_query)
            print(f"Table {destination_full_table_name} created with additional columns.")
            description =f"Table {destination_full_table_name} created with additional columns."
            status="Succeed"
        return status, description
    except Exception as e:
        print(f"An error occurred: {e}")
        description =f"An error occurred: {e}"
        status="Error"
        return status, description

StatementMeta(, a5ed9434-b35e-48b2-a425-5d89c8867427, 9, Finished, Available, Finished)

# Main Script Execution

In [8]:
from pyspark.sql import SparkSession
import json

# Create or update the persistent staging table schema
status, description = create_or_update_table(source_full_table_name, destination_full_table_name)

# Prepare the result as a JSON string
result = {
    "status": status,
    "description": description
}

# Check the merge status and exit with an error if the merge failed
if status == "Error":
    print(f"Error: {description}")
    raise Exception(result)

# If the merge succeeded, exit normally
mssparkutils.notebook.exit(result)

StatementMeta(, a5ed9434-b35e-48b2-a425-5d89c8867427, 10, Finished, Available, Finished)


            CREATE TABLE `lh_operations`.`Person` (
                BusinessEntityID int, PersonType string, NameStyle boolean, Title string, FirstName string, MiddleName string, LastName string, Suffix string, EmailPromotion int, AdditionalContactInfo string, Demographics string, rowguid string, ModifiedDate timestamp,
                DWIsCurrent BOOLEAN, DWStartDate TIMESTAMP, DWEndDate TIMESTAMP
            )
            
Table `lh_operations`.`Person` created with additional columns.
ExitValue: {'status': 'Succeed', 'description': 'Table `lh_operations`.`Person` created with additional columns.'}