# Setup Environment and Dependencies
Install the necessary libraries (PySpark, the Snowflake connector, and the Iceberg connector), then configure the Spark environment with the required dependencies.

In [None]:
# Install necessary libraries
!pip install pyspark
!pip install snowflake-connector-python
!pip install iceberg

# Configure the Spark environment with the required dependencies
from pyspark.sql import SparkSession

# Initialize Spark session with Snowflake and Iceberg connectors
spark = SparkSession.builder \
    .appName("DataReplicationFramework") \
    .config("spark.jars.packages", "net.snowflake:snowflake-jdbc,org.apache.iceberg:iceberg-spark-runtime-3.2_2.12") \
    .getOrCreate()

# Verify Spark session
spark.sql("SELECT 'Spark session initialized' AS status").show()

# Define Configuration Parameters
Define all configuration parameters in a single cell, including source and target table names, database credentials, processing modes, and logging levels.

In [None]:
# Define Configuration Parameters
import os

# Configuration parameters for source and target tables
source_table = "source_table_name"
target_table = "target_table_name"

# Snowflake connection parameters.
# Values are read from environment variables so that real credentials never have to be
# typed into (and saved with) the notebook; the placeholder strings remain the defaults
# when a variable is not set.
snowflake_options = {
    "sfURL": os.environ.get("SNOWFLAKE_URL", "your_snowflake_account_url"),
    "sfUser": os.environ.get("SNOWFLAKE_USER", "your_snowflake_username"),
    "sfPassword": os.environ.get("SNOWFLAKE_PASSWORD", "your_snowflake_password"),
    "sfDatabase": os.environ.get("SNOWFLAKE_DATABASE", "your_snowflake_database"),
    "sfSchema": os.environ.get("SNOWFLAKE_SCHEMA", "your_snowflake_schema"),
    "sfWarehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", "your_snowflake_warehouse"),
}

# Iceberg connection parameters: together with the target table name these form the
# three-part identifier "<catalog>.<namespace>.<table>" used by the write helpers.
iceberg_catalog = "your_iceberg_catalog"
iceberg_namespace = "your_iceberg_namespace"

# Processing modes
processing_mode = "full_load"  # Options: full_load, chunk_load, partition_load, bucketing, incremental_load

# Logging levels
logging_level = "INFO"  # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL

# Control table name (queried through spark.sql by the control-table helpers)
control_table = "control_table_name"

# Spark session catalog configuration (Iceberg, not Snowflake): route the built-in
# `spark_catalog` through Iceberg's SparkSessionCatalog, using a Hive-type catalog
# whose metastore is addressed at thrift://localhost:9083.
# NOTE(review): these are set on an already-created session - confirm they take effect
# at this point rather than needing to be passed to the session builder.
spark.conf.set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
spark.conf.set("spark.sql.catalog.spark_catalog.type", "hive")
spark.conf.set("spark.sql.catalog.spark_catalog.uri", "thrift://localhost:9083")

# Function to log messages
def log_message(level, message):
    """Print ``message`` prefixed with its severity label.

    Args:
        level: Severity name such as "DEBUG", "INFO", "WARNING", "ERROR" or
            "CRITICAL". The name is upper-cased before printing, so "info" and
            "INFO" produce the same output.
        message: Text to print after the label.

    A level outside the five standard names is printed with its own label instead
    of being dropped, so a typo in the level never hides a message.
    """
    label = str(level).upper()
    print(f"{label}: {message}")

# Emit a confirmation message at the configured level. This call only logs one
# message labelled with logging_level; it does not install any logging threshold.
log_message(logging_level, "Configuration parameters defined successfully.")

# Initialize Spark Session
Initialize a Spark session with the necessary configurations for connecting to Snowflake and Iceberg.

In [None]:
from pyspark.sql import SparkSession

# getOrCreate() hands back an already-running session, and settings such as
# spark.sql.extensions are only read when a session is created. Stop any session left
# over from an earlier cell so the configuration below actually takes effect.
# NOTE(review): spark.jars.packages is presumably resolved when the JVM starts, so a
# changed package list may still need a kernel restart - confirm for your deployment.
existing_session = SparkSession.getActiveSession()
if existing_session is not None:
    existing_session.stop()

# spark.jars.packages only accepts full Maven coordinates (groupId:artifactId:version).
# The Spark-Snowflake connector is listed because the notebook reads with
# format("snowflake"), which the JDBC driver alone does not provide.
# NOTE(review): these versions are believed to be a compatible Spark 3.2 set - align
# them with the Spark/Scala version of your cluster.
spark_packages = ",".join([
    "net.snowflake:snowflake-jdbc:3.13.30",
    "net.snowflake:spark-snowflake_2.12:2.12.0-spark_3.2",
    "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:1.4.3",
])

# Initialize Spark session with Snowflake and Iceberg connectors
spark = (
    SparkSession.builder
    .appName("DataReplicationFramework")
    .config("spark.jars.packages", spark_packages)
    # Iceberg SQL extensions (row-level commands such as MERGE INTO)
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Route the built-in session catalog through Iceberg, backed by a Hive metastore
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    .config("spark.sql.catalog.spark_catalog.uri", "thrift://localhost:9083")
    .getOrCreate()
)

# Verify Spark session
spark.sql("SELECT 'Spark session initialized' AS status").show()

# Define Utility Functions
Define utility functions for logging, error handling, and schema conversion.

In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.utils import AnalysisException

# Utility function for logging
# Note: this redefines the identically named helper declared in an earlier cell.
def log_message(level, message):
    """Print ``message`` prefixed with its severity label.

    Args:
        level: Severity name such as "DEBUG", "INFO", "WARNING", "ERROR" or
            "CRITICAL". The name is upper-cased before printing, so "info" and
            "INFO" produce the same output.
        message: Text to print after the label.

    A level outside the five standard names is printed with its own label instead
    of being dropped, so a typo in the level never hides a message.
    """
    label = str(level).upper()
    print(f"{label}: {message}")

# Utility function for error handling
def handle_error(e):
    """Log the exception text at ERROR level, then re-raise the same exception.

    Args:
        e: The exception instance to report.

    Raises:
        The exception passed in; this function never returns normally.
    """
    error_text = str(e)
    log_message("ERROR", error_text)
    raise e

# Utility function for schema conversion from Snowflake to Iceberg
def convert_schema(snowflake_df: DataFrame) -> str:
    """Render a DataFrame's schema as a comma-separated "name type" list.

    Args:
        snowflake_df: DataFrame whose schema fields are described.

    Returns:
        One "<column name> <Spark simpleString type>" entry per field, joined
        with ", ".

    Any exception raised while building the string is passed to handle_error,
    which logs it and re-raises.
    """
    try:
        column_definitions = []
        for column in snowflake_df.schema.fields:
            type_text = column.dataType.simpleString()
            column_definitions.append(f"{column.name} {type_text}")
        return ", ".join(column_definitions)
    except Exception as exc:
        handle_error(exc)

# Example usage of utility functions: read a single row from the source table
# through the Snowflake connector and log the schema string derived from it.
try:
    snowflake_query = f"SELECT * FROM {source_table} LIMIT 1"
    snowflake_df = (
        spark.read
        .format("snowflake")
        .options(**snowflake_options)
        .option("query", snowflake_query)
        .load()
    )

    iceberg_schema = convert_schema(snowflake_df)
    log_message("INFO", f"Converted schema: {iceberg_schema}")
except AnalysisException as exc:
    # handle_error logs the failure and re-raises it
    handle_error(exc)

# Implement Schema Handling
Implement functions to fetch the schema from the Snowflake source table and convert it to a compatible Iceberg schema.

In [None]:
# Implement Schema Handling

from pyspark.sql import DataFrame
from pyspark.sql.utils import AnalysisException

# Utility function for logging
# Note: this redefines the identically named helper declared in earlier cells.
def log_message(level, message):
    """Print ``message`` prefixed with its severity label.

    Args:
        level: Severity name such as "DEBUG", "INFO", "WARNING", "ERROR" or
            "CRITICAL". The name is upper-cased before printing, so "info" and
            "INFO" produce the same output.
        message: Text to print after the label.

    A level outside the five standard names is printed with its own label instead
    of being dropped, so a typo in the level never hides a message.
    """
    label = str(level).upper()
    print(f"{label}: {message}")

# Utility function for error handling
def handle_error(e):
    """Log the exception text at ERROR level, then re-raise the same exception.

    Args:
        e: The exception instance to report.

    Raises:
        The exception passed in; this function never returns normally.
    """
    error_text = str(e)
    log_message("ERROR", error_text)
    raise e

# Utility function for schema conversion from Snowflake to Iceberg
def convert_schema(snowflake_df: DataFrame) -> str:
    """Render a DataFrame's schema as a comma-separated "name type" list.

    Args:
        snowflake_df: DataFrame whose schema fields are described.

    Returns:
        One "<column name> <Spark simpleString type>" entry per field, joined
        with ", ".

    Any exception raised while building the string is passed to handle_error,
    which logs it and re-raises.
    """
    try:
        column_definitions = []
        for column in snowflake_df.schema.fields:
            type_text = column.dataType.simpleString()
            column_definitions.append(f"{column.name} {type_text}")
        return ", ".join(column_definitions)
    except Exception as exc:
        handle_error(exc)

# Example usage of utility functions: read a single row from the source table
# through the Snowflake connector and log the schema string derived from it.
try:
    snowflake_query = f"SELECT * FROM {source_table} LIMIT 1"
    snowflake_df = (
        spark.read
        .format("snowflake")
        .options(**snowflake_options)
        .option("query", snowflake_query)
        .load()
    )

    iceberg_schema = convert_schema(snowflake_df)
    log_message("INFO", f"Converted schema: {iceberg_schema}")
except AnalysisException as exc:
    # handle_error logs the failure and re-raises it
    handle_error(exc)

# Implement Control Table Management
Implement functions to manage the control table, including inserting new table entries, updating batch sequence keys, and reverting updates on failure. Use SQL queries for control table management.

In [None]:
# Implement Control Table Management

from pyspark.sql import DataFrame
from pyspark.sql.utils import AnalysisException

# Function to insert a new entry into the control table
def insert_control_table_entry(table_name: str, batch_sk: int):
    """Add a ``(table_name, batch_sk)`` row to the control table via spark.sql.

    Args:
        table_name: Value stored in the ``table_name`` column (interpolated into
            the statement as a quoted literal).
        batch_sk: Value stored in the ``batch_sk`` column (interpolated unquoted).

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    sql_text = f"""
        INSERT INTO {control_table} (table_name, batch_sk)
        VALUES ('{table_name}', {batch_sk})
        """
    try:
        spark.sql(sql_text)
        log_message("INFO", f"Inserted new entry into control table: {table_name}, batch_sk: {batch_sk}")
    except AnalysisException as exc:
        handle_error(exc)

# Function to update the batch_sk in the control table
def update_control_table_batch_sk(table_name: str, batch_sk: int):
    """Set ``batch_sk`` on every control-table row whose ``table_name`` matches.

    Args:
        table_name: Value compared against the ``table_name`` column.
        batch_sk: New value written to the ``batch_sk`` column.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    sql_text = f"""
        UPDATE {control_table}
        SET batch_sk = {batch_sk}
        WHERE table_name = '{table_name}'
        """
    try:
        spark.sql(sql_text)
        log_message("INFO", f"Updated batch_sk in control table: {table_name}, batch_sk: {batch_sk}")
    except AnalysisException as exc:
        handle_error(exc)

# Function to revert the batch_sk update in the control table
def revert_control_table_batch_sk(table_name: str, previous_batch_sk: int):
    """Write ``previous_batch_sk`` back to the control-table rows of ``table_name``.

    Args:
        table_name: Value compared against the ``table_name`` column.
        previous_batch_sk: Value restored into the ``batch_sk`` column.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    sql_text = f"""
        UPDATE {control_table}
        SET batch_sk = {previous_batch_sk}
        WHERE table_name = '{table_name}'
        """
    try:
        spark.sql(sql_text)
        log_message("INFO", f"Reverted batch_sk in control table: {table_name}, batch_sk: {previous_batch_sk}")
    except AnalysisException as exc:
        handle_error(exc)

# Example usage of control table management functions.
# Runs against the real control table: registers the source table with batch_sk 0,
# moves it to 1, then puts it back to 0.
try:
    insert_control_table_entry(table_name=source_table, batch_sk=0)
    update_control_table_batch_sk(table_name=source_table, batch_sk=1)
    revert_control_table_batch_sk(table_name=source_table, previous_batch_sk=0)
except AnalysisException as exc:
    # handle_error logs the failure and re-raises it
    handle_error(exc)

# Implement Data Loading Strategies
Implement different data loading strategies such as full load, chunk load, partition load, and incremental load. Provide sample queries for verification.

In [None]:
# Implement Data Loading Strategies

from pyspark.sql import DataFrame
from pyspark.sql.utils import AnalysisException

# Function to perform full load
def full_load(source_table: str, target_table: str):
    """Copy the whole Snowflake table into the Iceberg target, replacing its content.

    Args:
        source_table: Snowflake table read through the ``dbtable`` option.
        target_table: Table name appended to ``iceberg_catalog`` and
            ``iceberg_namespace`` to form the target identifier.

    The write goes through the DataFrameWriterV2 ``createOrReplace`` operation, so
    the target is created on the first run and replaced on later runs. The
    previous ``mode("overwrite").save(...)`` call required the table to exist
    already, which made the initial load fail.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    try:
        # Load data from Snowflake
        snowflake_df = (
            spark.read
            .format("snowflake")
            .options(**snowflake_options)
            .option("dbtable", source_table)
            .load()
        )

        # Create the Iceberg table from the DataFrame, or replace it if present
        target_identifier = f"{iceberg_catalog}.{iceberg_namespace}.{target_table}"
        snowflake_df.writeTo(target_identifier).using("iceberg").createOrReplace()

        log_message("INFO", f"Full load completed for table: {source_table}")
    except AnalysisException as e:
        handle_error(e)

# Function to perform chunk load
def chunk_load(source_table: str, target_table: str, chunk_size: int, order_by: str = None):
    """Append the source table to the Iceberg target in LIMIT/OFFSET pages.

    Args:
        source_table: Snowflake table to page through.
        target_table: Table name appended to ``iceberg_catalog`` and
            ``iceberg_namespace`` to form the target identifier.
        chunk_size: Number of rows requested per page; must be positive.
        order_by: Optional ORDER BY expression (ideally a unique key). Without a
            stable ordering, LIMIT/OFFSET pages are not guaranteed to be disjoint,
            so rows can be duplicated or skipped; a warning is logged in that case.

    Raises:
        ValueError: If ``chunk_size`` is not positive.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    target_identifier = f"{iceberg_catalog}.{iceberg_namespace}.{target_table}"
    order_clause = f" ORDER BY {order_by}" if order_by else ""
    if not order_by:
        log_message(
            "WARNING",
            f"chunk_load for {source_table} runs without order_by; "
            "LIMIT/OFFSET paging may duplicate or skip rows",
        )

    try:
        offset = 0
        while True:
            # Load chunk of data from Snowflake
            snowflake_query = f"SELECT * FROM {source_table}{order_clause} LIMIT {chunk_size} OFFSET {offset}"
            chunk_df = (
                spark.read
                .format("snowflake")
                .options(**snowflake_options)
                .option("query", snowflake_query)
                .load()
                # Cache so the count and the write below share one read of the source
                .persist()
            )
            try:
                row_count = chunk_df.count()
                if row_count == 0:
                    break

                # Write chunk of data to Iceberg
                chunk_df.write \
                    .format("iceberg") \
                    .mode("append") \
                    .save(target_identifier)
            finally:
                chunk_df.unpersist()

            # Report the offset that was just loaded, then advance to the next page
            log_message("INFO", f"Chunk load completed for offset: {offset} ({row_count} rows)")
            offset += chunk_size

            # A short page means the source is exhausted; skip the extra empty query
            if row_count < chunk_size:
                break
    except AnalysisException as e:
        handle_error(e)

# Function to perform partition load
def partition_load(source_table: str, target_table: str, partition_column: str):
    """Copy the whole Snowflake table into an Iceberg target partitioned by a column.

    Args:
        source_table: Snowflake table read through the ``dbtable`` option.
        target_table: Table name appended to ``iceberg_catalog`` and
            ``iceberg_namespace`` to form the target identifier.
        partition_column: Column the Iceberg table is partitioned by.

    The write goes through the DataFrameWriterV2 ``partitionedBy(...)`` and
    ``createOrReplace()`` operations, so the table is created (or replaced) with
    the requested partitioning. The previous ``mode("overwrite").save(...)`` call
    required the table to exist already.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    # Local import: only this helper needs a Column object
    from pyspark.sql.functions import col

    try:
        # Load data from Snowflake
        snowflake_df = (
            spark.read
            .format("snowflake")
            .options(**snowflake_options)
            .option("dbtable", source_table)
            .load()
        )

        # Create or replace the Iceberg table with the requested partitioning
        target_identifier = f"{iceberg_catalog}.{iceberg_namespace}.{target_table}"
        (
            snowflake_df.writeTo(target_identifier)
            .using("iceberg")
            .partitionedBy(col(partition_column))
            .createOrReplace()
        )

        log_message("INFO", f"Partition load completed for table: {source_table}")
    except AnalysisException as e:
        handle_error(e)

# Function to perform incremental load
def incremental_load(source_table: str, target_table: str, batch_sk_column: str):
    """Append source rows newer than the recorded watermark to the Iceberg target.

    The watermark is ``MAX(batch_sk)`` for ``source_table`` in the control table.
    When the control table has no entry yet (the maximum is NULL) every source
    row is treated as new. After a successful write, the largest
    ``batch_sk_column`` value that was loaded is recorded in the control table.

    Args:
        source_table: Snowflake table to read from.
        target_table: Table name appended to ``iceberg_catalog`` and
            ``iceberg_namespace`` to form the target identifier.
        batch_sk_column: Source column compared against the watermark.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    try:
        # Get the max batch_sk from control table (None when no entry exists)
        max_batch_sk_query = f"SELECT MAX(batch_sk) AS max_batch_sk FROM {control_table} WHERE table_name = '{source_table}'"
        max_batch_sk = spark.sql(max_batch_sk_query).collect()[0]["max_batch_sk"]

        # Load new data from Snowflake; without a watermark there is nothing to filter on
        snowflake_query = f"SELECT * FROM {source_table}"
        if max_batch_sk is not None:
            snowflake_query += f" WHERE {batch_sk_column} > {max_batch_sk}"

        snowflake_df = (
            spark.read
            .format("snowflake")
            .options(**snowflake_options)
            .option("query", snowflake_query)
            .load()
            # Cache so the watermark below is computed from the same rows that are
            # written, rather than from a second read of the source
            .persist()
        )
        try:
            new_max_batch_sk = snowflake_df.agg({batch_sk_column: "max"}).collect()[0][0]
            if new_max_batch_sk is None:
                # Nothing to load; recording a None watermark would produce invalid SQL
                log_message("INFO", f"No rows with a new {batch_sk_column} value for table: {source_table}")
                return

            # Write new data to Iceberg
            snowflake_df.write \
                .format("iceberg") \
                .mode("append") \
                .save(f"{iceberg_catalog}.{iceberg_namespace}.{target_table}")

            # Update control table with new max batch_sk
            insert_control_table_entry(source_table, new_max_batch_sk)
        finally:
            snowflake_df.unpersist()

        log_message("INFO", f"Incremental load completed for table: {source_table}")
    except AnalysisException as e:
        handle_error(e)

# Example usage of data loading strategies: run the loader selected by processing_mode
try:
    if processing_mode == "full_load":
        full_load(source_table, target_table)
    elif processing_mode == "chunk_load":
        chunk_load(source_table, target_table, chunk_size=1000)
    elif processing_mode == "partition_load":
        partition_load(source_table, target_table, partition_column="partition_column_name")
    elif processing_mode == "incremental_load":
        incremental_load(source_table, target_table, batch_sk_column="batch_sk")
    else:
        # e.g. "bucketing" is listed as an option in the configuration cell but has
        # no loader here; say so instead of silently loading nothing
        log_message("WARNING", f"No loader implemented for processing_mode: {processing_mode}")
except AnalysisException as e:
    handle_error(e)

# Implement Data Change Handling
Implement logic to capture INSERT, UPDATE, and DELETE operations from Snowflake and reflect these changes in Iceberg efficiently.

In [None]:
# Implement Data Change Handling

from pyspark.sql import DataFrame
from pyspark.sql.utils import AnalysisException

# Function to capture INSERT operations from Snowflake and reflect in Iceberg
def handle_insert_operations(source_table: str, target_table: str, batch_sk_column: str):
    """Append source rows newer than the control-table watermark to the Iceberg target.

    Args:
        source_table: Snowflake table to read from.
        target_table: Table name appended to ``iceberg_catalog`` and
            ``iceberg_namespace`` to form the target identifier.
        batch_sk_column: Source column compared against the watermark.

    The watermark is looked up with spark.sql, the same way the control-table
    helpers access that table. The previous version embedded the lookup as a
    sub-select in the query sent to Snowflake, where the control table is not
    available. With no watermark recorded yet, all source rows are treated as new.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    try:
        # Resolve the watermark on the Spark side (None when no entry exists)
        watermark_query = f"SELECT MAX(batch_sk) AS max_batch_sk FROM {control_table} WHERE table_name = '{source_table}'"
        max_batch_sk = spark.sql(watermark_query).collect()[0]["max_batch_sk"]

        # Load new data from Snowflake
        snowflake_query = f"SELECT * FROM {source_table}"
        if max_batch_sk is not None:
            snowflake_query += f" WHERE {batch_sk_column} > {max_batch_sk}"

        snowflake_df = (
            spark.read
            .format("snowflake")
            .options(**snowflake_options)
            .option("query", snowflake_query)
            .load()
            # Cache so the new watermark is computed from the rows actually written
            .persist()
        )
        try:
            new_max_batch_sk = snowflake_df.agg({batch_sk_column: "max"}).collect()[0][0]
            if new_max_batch_sk is None:
                # Nothing to append; a None watermark would produce invalid SQL
                log_message("INFO", f"No inserted rows found for table: {source_table}")
                return

            # Write new data to Iceberg
            snowflake_df.write \
                .format("iceberg") \
                .mode("append") \
                .save(f"{iceberg_catalog}.{iceberg_namespace}.{target_table}")

            # Update control table with new max batch_sk
            insert_control_table_entry(source_table, new_max_batch_sk)
        finally:
            snowflake_df.unpersist()

        log_message("INFO", f"Insert operations handled for table: {source_table}")
    except AnalysisException as e:
        handle_error(e)

# Function to capture UPDATE operations from Snowflake and reflect in Iceberg
def handle_update_operations(source_table: str, target_table: str, batch_sk_column: str, primary_key: str = "id"):
    """Upsert source rows newer than the control-table watermark into the Iceberg target.

    Args:
        source_table: Snowflake table to read from.
        target_table: Table name appended to ``iceberg_catalog`` and
            ``iceberg_namespace`` to form the target identifier.
        batch_sk_column: Source column compared against the watermark.
        primary_key: Column used to match source rows to target rows.

    Changes compared with the previous version:
      * The watermark is looked up with spark.sql instead of a sub-select inside
        the Snowflake query, where the control table is not available.
      * Changed rows are applied with MERGE INTO. The previous
        ``mode("overwrite")`` write replaced the entire target table with only
        the changed rows (and emptied it when there were none).

    NOTE(review): MERGE INTO on Iceberg relies on the Iceberg Spark SQL
    extensions being enabled on the session, and fails if several source rows
    share one ``primary_key`` value - confirm both hold for your data.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    try:
        # Resolve the watermark on the Spark side (None when no entry exists)
        watermark_query = f"SELECT MAX(batch_sk) AS max_batch_sk FROM {control_table} WHERE table_name = '{source_table}'"
        max_batch_sk = spark.sql(watermark_query).collect()[0]["max_batch_sk"]

        # Load updated data from Snowflake
        snowflake_query = f"SELECT * FROM {source_table}"
        if max_batch_sk is not None:
            snowflake_query += f" WHERE {batch_sk_column} > {max_batch_sk}"

        snowflake_df = (
            spark.read
            .format("snowflake")
            .options(**snowflake_options)
            .option("query", snowflake_query)
            .load()
            # Cache so the merge and the new watermark see the same rows
            .persist()
        )
        staging_view = f"{target_table}_update_staging".replace(".", "_")
        try:
            new_max_batch_sk = snowflake_df.agg({batch_sk_column: "max"}).collect()[0][0]
            if new_max_batch_sk is None:
                # Nothing changed; leave the target and the control table untouched
                log_message("INFO", f"No updated rows found for table: {source_table}")
                return

            # Expose the changed rows to SQL and merge them into the target
            snowflake_df.createOrReplaceTempView(staging_view)
            target_identifier = f"{iceberg_catalog}.{iceberg_namespace}.{target_table}"
            merge_query = f"""
            MERGE INTO {target_identifier} AS target
            USING {staging_view} AS source
            ON target.{primary_key} = source.{primary_key}
            WHEN MATCHED THEN UPDATE SET *
            WHEN NOT MATCHED THEN INSERT *
            """
            spark.sql(merge_query)

            # Update control table with new max batch_sk
            insert_control_table_entry(source_table, new_max_batch_sk)
        finally:
            spark.catalog.dropTempView(staging_view)
            snowflake_df.unpersist()

        log_message("INFO", f"Update operations handled for table: {source_table}")
    except AnalysisException as e:
        handle_error(e)

# Function to capture DELETE operations from Snowflake and reflect in Iceberg
def handle_delete_operations(source_table: str, target_table: str, batch_sk_column: str, primary_key_columns=None):
    """Delete from the Iceberg target the rows whose keys appear in the source delta.

    Args:
        source_table: Snowflake table to read from.
        target_table: Table name appended to ``iceberg_catalog`` and
            ``iceberg_namespace`` to form the target identifier.
        batch_sk_column: Source column compared against the watermark.
        primary_key_columns: Optional list of key column names. When omitted,
            the columns whose schema metadata carries a truthy ``primary_key``
            entry are used, as before.

    Raises:
        ValueError: If no key column can be determined. The previous version
            built ``DELETE ... WHERE`` with an empty condition in that case.

    Changes compared with the previous version:
      * The watermark is looked up with spark.sql instead of a sub-select inside
        the Snowflake query, where the control table is not available.
      * The target is addressed by its full catalog.namespace.table identifier,
        like every other write in this notebook.
      * Matching rows are removed with a single MERGE ... WHEN MATCHED THEN DELETE
        instead of one DELETE statement per collected row.

    NOTE(review): as before, every source row newer than the watermark is
    treated as a delete marker. Confirm the source really exposes deletions this
    way (e.g. a change-log or soft-delete feed); rows physically removed from
    the source cannot be detected by this query.
    NOTE(review): MERGE INTO on Iceberg relies on the Iceberg Spark SQL
    extensions being enabled on the session - confirm.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    try:
        # Resolve the watermark on the Spark side (None when no entry exists)
        watermark_query = f"SELECT MAX(batch_sk) AS max_batch_sk FROM {control_table} WHERE table_name = '{source_table}'"
        max_batch_sk = spark.sql(watermark_query).collect()[0]["max_batch_sk"]

        # Load deleted data from Snowflake
        snowflake_query = f"SELECT * FROM {source_table}"
        if max_batch_sk is not None:
            snowflake_query += f" WHERE {batch_sk_column} > {max_batch_sk}"

        snowflake_df = (
            spark.read
            .format("snowflake")
            .options(**snowflake_options)
            .option("query", snowflake_query)
            .load()
            # Cache so the delete and the new watermark see the same rows
            .persist()
        )
        staging_view = f"{target_table}_delete_staging".replace(".", "_")
        try:
            new_max_batch_sk = snowflake_df.agg({batch_sk_column: "max"}).collect()[0][0]
            if new_max_batch_sk is None:
                # Nothing to delete; leave the target and the control table untouched
                log_message("INFO", f"No deleted rows found for table: {source_table}")
                return

            # Get primary key column(s) for deletion
            key_columns = list(primary_key_columns) if primary_key_columns else [
                field.name for field in snowflake_df.schema.fields
                if field.metadata.get("primary_key", False)
            ]
            if not key_columns:
                raise ValueError(
                    f"No primary key columns available for {source_table}; "
                    "pass primary_key_columns explicitly"
                )

            # Delete data from Iceberg in one statement, matching on the key columns
            snowflake_df.select(*key_columns).createOrReplaceTempView(staging_view)
            target_identifier = f"{iceberg_catalog}.{iceberg_namespace}.{target_table}"
            match_condition = " AND ".join(
                f"target.{key} = source.{key}" for key in key_columns
            )
            delete_query = f"""
            MERGE INTO {target_identifier} AS target
            USING {staging_view} AS source
            ON {match_condition}
            WHEN MATCHED THEN DELETE
            """
            spark.sql(delete_query)

            # Update control table with new max batch_sk
            insert_control_table_entry(source_table, new_max_batch_sk)
        finally:
            spark.catalog.dropTempView(staging_view)
            snowflake_df.unpersist()

        log_message("INFO", f"Delete operations handled for table: {source_table}")
    except AnalysisException as e:
        handle_error(e)

# Example usage of data change handling functions: apply the insert, update and
# delete handlers in that order, all keyed on the "batch_sk" column.
try:
    handle_insert_operations(
        source_table,
        target_table,
        batch_sk_column="batch_sk",
    )
    handle_update_operations(
        source_table,
        target_table,
        batch_sk_column="batch_sk",
    )
    handle_delete_operations(
        source_table,
        target_table,
        batch_sk_column="batch_sk",
    )
except AnalysisException as exc:
    # handle_error logs the failure and re-raises it
    handle_error(exc)

# Implement Merge Strategies
Implement configurable merge strategies such as append, overwrite, and merge (upsert) for handling data changes in Iceberg.

In [None]:
# Implement Merge Strategies

from pyspark.sql import DataFrame
from pyspark.sql.utils import AnalysisException

# Function to perform append operation
def append_data(source_df: DataFrame, target_table: str):
    """Append ``source_df`` to the Iceberg target table.

    Args:
        source_df: Rows to add.
        target_table: Table name appended to ``iceberg_catalog`` and
            ``iceberg_namespace`` to form the target identifier.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    destination = f"{iceberg_catalog}.{iceberg_namespace}.{target_table}"
    try:
        iceberg_writer = source_df.write.format("iceberg").mode("append")
        iceberg_writer.save(destination)
        log_message("INFO", f"Append operation completed for table: {target_table}")
    except AnalysisException as exc:
        handle_error(exc)

# Function to perform overwrite operation
def overwrite_data(source_df: DataFrame, target_table: str):
    """Write ``source_df`` to the Iceberg target table in overwrite mode.

    Args:
        source_df: Rows to write.
        target_table: Table name appended to ``iceberg_catalog`` and
            ``iceberg_namespace`` to form the target identifier.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    destination = f"{iceberg_catalog}.{iceberg_namespace}.{target_table}"
    try:
        iceberg_writer = source_df.write.format("iceberg").mode("overwrite")
        iceberg_writer.save(destination)
        log_message("INFO", f"Overwrite operation completed for table: {target_table}")
    except AnalysisException as exc:
        handle_error(exc)

# Function to perform merge (upsert) operation
def merge_data(source_df: DataFrame, target_table: str, primary_key: str):
    """Upsert ``source_df`` into the Iceberg target table, matching on ``primary_key``.

    Rows whose key already exists in the target are updated with the source
    values; rows with a new key are inserted.

    Args:
        source_df: Rows to merge; its columns are expected to line up with the
            target's, since the statement uses ``UPDATE SET *`` / ``INSERT *``.
        target_table: Table name appended to ``iceberg_catalog`` and
            ``iceberg_namespace`` to form the target identifier.
        primary_key: Column used to match source rows to target rows.

    The previous version passed SQL expression strings ("coalesce(...) as col")
    to ``DataFrame.select``, which interprets strings as column names, so the
    merge could not run. It is replaced by a single MERGE INTO statement, which
    also avoids rewriting the whole table through a full outer join.

    NOTE(review): MERGE INTO on Iceberg relies on the Iceberg Spark SQL
    extensions being enabled on the session, and fails if several source rows
    share one ``primary_key`` value - confirm both hold for your data.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    target_identifier = f"{iceberg_catalog}.{iceberg_namespace}.{target_table}"
    staging_view = f"{target_table}_merge_staging".replace(".", "_")
    try:
        # Expose the incoming rows to SQL under a temporary name
        source_df.createOrReplaceTempView(staging_view)
        try:
            # Perform merge (upsert) operation
            merge_query = f"""
            MERGE INTO {target_identifier} AS target
            USING {staging_view} AS source
            ON target.{primary_key} = source.{primary_key}
            WHEN MATCHED THEN UPDATE SET *
            WHEN NOT MATCHED THEN INSERT *
            """
            spark.sql(merge_query)
        finally:
            spark.catalog.dropTempView(staging_view)

        log_message("INFO", f"Merge (upsert) operation completed for table: {target_table}")
    except AnalysisException as e:
        handle_error(e)

# Example usage of merge strategies: read the full source table once, then run the
# append, overwrite and merge (upsert) helpers against the same target in turn.
try:
    snowflake_query = f"SELECT * FROM {source_table}"
    source_df = (
        spark.read
        .format("snowflake")
        .options(**snowflake_options)
        .option("query", snowflake_query)
        .load()
    )

    append_data(source_df, target_table)
    overwrite_data(source_df, target_table)
    # The upsert matches rows on the "id" column
    merge_data(source_df, target_table, primary_key="id")
except AnalysisException as exc:
    # handle_error logs the failure and re-raises it
    handle_error(exc)

# Implement Logging
Implement logging at each step to track progress and potential errors. Use a logging framework to manage different logging levels.

In [None]:
import logging

# Configure logging
# The threshold comes from logging_level (basicConfig accepts a level name string);
# each record is rendered as "<timestamp> - <LEVEL> - <message>".
# Note: basicConfig does nothing if the root logger already has handlers, so
# re-running this cell with a different logging_level will not change the threshold.
logging.basicConfig(level=logging_level, format='%(asctime)s - %(levelname)s - %(message)s')
# getLogger() without a name returns the root logger
logger = logging.getLogger()

# Utility function for logging
# Note: this redefines the print-based helper of the same name from earlier cells,
# so calls made after this cell runs go through the `logger` object instead.
def log_message(level, message):
    """Send ``message`` to ``logger`` at the severity named by ``level``.

    Args:
        level: Severity name: "DEBUG", "INFO", "WARNING", "ERROR" or "CRITICAL".
            Matching is case-insensitive.
        message: Text to log.

    A level outside those five names is logged at WARNING with the original
    label kept in the text, instead of being dropped silently.
    """
    emitters = {
        "DEBUG": logger.debug,
        "INFO": logger.info,
        "WARNING": logger.warning,
        "ERROR": logger.error,
        "CRITICAL": logger.critical,
    }
    emit = emitters.get(str(level).upper())
    if emit is None:
        logger.warning(f"[unknown level {level!r}] {message}")
        return
    emit(message)

# Example usage of logging: emits one INFO record through the log_message helper
# defined in this cell.
log_message("INFO", "Logging is configured successfully.")

# Define Main Replication Function
Define the main replication function that orchestrates the entire data replication process, including schema handling, control table management, data loading, and merge strategies.

In [None]:
# Define Main Replication Function

def replicate_data(
    source_table: str,
    target_table: str,
    batch_sk_column: str,
    processing_mode: str,
    merge_strategy: str = "append",
    primary_key: str = "id",
    chunk_size: int = 1000,
    partition_column: str = "partition_column_name",
):
    """Run the replication steps for one table, in a fixed order.

    Steps: (1) read one source row and log its schema, (2) read the current
    watermark from the control table, (3) run the loader chosen by
    ``processing_mode``, (4) read the rows newer than the watermark and apply
    ``merge_strategy``, (5) run the insert/update/delete handlers.

    Args:
        source_table: Snowflake table to replicate.
        target_table: Iceberg target table name.
        batch_sk_column: Source column compared against the watermark.
        processing_mode: "full_load", "chunk_load", "partition_load" or
            "incremental_load". Any other value logs a warning and skips step 3.
        merge_strategy: "append", "overwrite" or "merge". Previously a constant
            inside the function; the default keeps the old behavior.
        primary_key: Key column used when ``merge_strategy`` is "merge".
        chunk_size: Page size passed to chunk_load.
        partition_column: Column passed to partition_load.

    NOTE(review): steps 3, 4 and 5 each write source rows to the same target, so
    with the default "append" strategy rows loaded in step 3 are appended again
    in step 4 and handled once more in step 5. Confirm which of these steps is
    meant to run for each processing mode.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    try:
        # Step 1: Schema Handling
        snowflake_query = f"SELECT * FROM {source_table} LIMIT 1"
        snowflake_df = (
            spark.read
            .format("snowflake")
            .options(**snowflake_options)
            .option("query", snowflake_query)
            .load()
        )

        iceberg_schema = convert_schema(snowflake_df)
        log_message("INFO", f"Schema converted: {iceberg_schema}")

        # Step 2: Control Table Management
        # MAX() is NULL when the table has no control entry yet; fall back to 0
        max_batch_sk_query = f"SELECT MAX(batch_sk) AS max_batch_sk FROM {control_table} WHERE table_name = '{source_table}'"
        max_batch_sk_df = spark.sql(max_batch_sk_query)
        max_batch_sk = max_batch_sk_df.collect()[0]["max_batch_sk"] or 0

        # Step 3: Data Loading
        if processing_mode == "full_load":
            full_load(source_table, target_table)
        elif processing_mode == "chunk_load":
            chunk_load(source_table, target_table, chunk_size=chunk_size)
        elif processing_mode == "partition_load":
            partition_load(source_table, target_table, partition_column=partition_column)
        elif processing_mode == "incremental_load":
            incremental_load(source_table, target_table, batch_sk_column)
        else:
            log_message("WARNING", f"No loader implemented for processing_mode: {processing_mode}")

        # Step 4: Merge Strategy
        snowflake_query = f"SELECT * FROM {source_table} WHERE {batch_sk_column} > {max_batch_sk}"
        source_df = (
            spark.read
            .format("snowflake")
            .options(**snowflake_options)
            .option("query", snowflake_query)
            .load()
        )

        if merge_strategy == "append":
            append_data(source_df, target_table)
        elif merge_strategy == "overwrite":
            overwrite_data(source_df, target_table)
        elif merge_strategy == "merge":
            merge_data(source_df, target_table, primary_key=primary_key)
        else:
            log_message("WARNING", f"Unknown merge_strategy: {merge_strategy}")

        # Step 5: Data Change Handling
        handle_insert_operations(source_table, target_table, batch_sk_column)
        handle_update_operations(source_table, target_table, batch_sk_column)
        handle_delete_operations(source_table, target_table, batch_sk_column)

        log_message("INFO", f"Data replication completed for table: {source_table}")
    except AnalysisException as e:
        handle_error(e)

# Example usage of the main replication function: replicate the configured table
# with the configured processing mode, keyed on the "batch_sk" column.
try:
    replicate_data(
        source_table,
        target_table,
        batch_sk_column="batch_sk",
        processing_mode=processing_mode,
    )
except AnalysisException as exc:
    # handle_error logs the failure and re-raises it
    handle_error(exc)

# Test Replication Framework
Test the replication framework with different scenarios, including initial table creation, new data insertion, and data change handling. Verify the data in Iceberg against the source data in Snowflake.

In [None]:
# Test Replication Framework

# Test the replication framework with different scenarios, including initial table creation, new data insertion, and data change handling.
# Verify the data in Iceberg against the source data in Snowflake.

# Function to verify data in Iceberg against the source data in Snowflake
def verify_data(source_table: str, target_table: str, batch_sk_column: str):
    """Compare the row count of the Snowflake source with the Iceberg target.

    Args:
        source_table: Snowflake table to count.
        target_table: Table name appended to ``iceberg_catalog`` and
            ``iceberg_namespace`` to form the target identifier.
        batch_sk_column: Accepted for signature compatibility; not used by the
            comparison.

    Returns:
        True when both counts are equal, False otherwise. Previously the outcome
        was only logged, so callers could not act on a failed verification.

    Only row counts are compared; equal counts do not prove equal content.

    An AnalysisException from Spark is logged and re-raised by handle_error.
    """
    try:
        # Load data from Snowflake
        snowflake_query = f"SELECT * FROM {source_table}"
        snowflake_df = (
            spark.read
            .format("snowflake")
            .options(**snowflake_options)
            .option("query", snowflake_query)
            .load()
        )

        # Load data from Iceberg
        iceberg_df = (
            spark.read
            .format("iceberg")
            .load(f"{iceberg_catalog}.{iceberg_namespace}.{target_table}")
        )

        # Compare data
        snowflake_count = snowflake_df.count()
        iceberg_count = iceberg_df.count()
        counts_match = snowflake_count == iceberg_count

        if counts_match:
            log_message("INFO", f"Data verification successful for table: {source_table}")
        else:
            log_message("ERROR", f"Data verification failed for table: {source_table}. Snowflake count: {snowflake_count}, Iceberg count: {iceberg_count}")
        return counts_match
    except AnalysisException as e:
        handle_error(e)

# Test Case 1: Initial Table Creation & Data Insertion
# Full-load replication followed by a row-count comparison.
try:
    replicate_data(
        source_table,
        target_table,
        batch_sk_column="batch_sk",
        processing_mode="full_load",
    )
    verify_data(source_table, target_table, batch_sk_column="batch_sk")
except AnalysisException as exc:
    handle_error(exc)

# Test Case 2: Existing Table with New Data
# Incremental replication followed by a row-count comparison.
try:
    # Insert new data into Snowflake (this is a placeholder, actual insertion code will depend on your setup)
    # spark.sql("INSERT INTO source_table_name (columns) VALUES (values)")

    replicate_data(
        source_table,
        target_table,
        batch_sk_column="batch_sk",
        processing_mode="incremental_load",
    )
    verify_data(source_table, target_table, batch_sk_column="batch_sk")
except AnalysisException as exc:
    handle_error(exc)

# Test Case 3: Data Change Handling
# Insert, update and delete handlers in that order, then a row-count comparison.
try:
    handle_insert_operations(source_table, target_table, batch_sk_column="batch_sk")
    handle_update_operations(source_table, target_table, batch_sk_column="batch_sk")
    handle_delete_operations(source_table, target_table, batch_sk_column="batch_sk")

    verify_data(source_table, target_table, batch_sk_column="batch_sk")
except AnalysisException as exc:
    handle_error(exc)