# Summary

This is a simple training script for exploring the Change Data Feed (CDF) feature of Delta tables.

# Imports

In [None]:
import os
import sys

import pyspark.sql.functions as F
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

# Spark reads these variables to find the Python executable; use the
# interpreter that is running this notebook for both of them.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

# Start Spark App

In [None]:
# Stop any session left over from a previous run of this cell.
# getOrCreate() reuses an existing session, in which case session-level
# settings such as spark.jars.packages may not take effect.
spark = SparkSession.getActiveSession()
if spark is not None:
    spark.stop()

spark = (SparkSession.builder
    .appName("spark_delta")
    # These 3 lines configure Spark to work with Delta Tables.
    # If the dependencies are not present in your classpath yet, they will be
    # downloaded automatically.
    # It is important to have an adequate Java environment: having more than
    # one version installed, or env variables not properly set, may lead to errors.
    .config("spark.jars.packages", "io.delta:delta-spark_2.13:4.0.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Set this to prevent Spark from creating a directory called
    # spark-warehouse where data is persisted; ../data/ is used instead.
    .config("spark.sql.warehouse.dir", "../data/")
    .getOrCreate())

# Reading data

In [None]:
netflix_df = (
    spark.read
    # The first line of the file is a header with the column names
    .option('header', True)
    # A single row may span several lines (values can contain line breaks)
    .option('multiLine', True)
    # Character that wraps values which may contain the separator
    .option('quote', '"')
    # Character that escapes a quote inside an already quoted value
    .option('escape', '"')
    .csv('../data/netflix_titles.csv')
)

In [None]:
# Display the number of rows read from the CSV (quick sanity check of the
# parsing options above)
netflix_df.count()

# Write to Delta Table

In [None]:
# Name of the Delta table that netflix_df is saved to; reused by the
# cells below when writing to and querying the table
netflix_delta_table_name = "netflix_delta"

## write_delta_table

Helper function for creating the table and appending data to it

In [None]:
def write_delta_table(mode: str):
    """Save ``netflix_df`` as the Delta table named ``netflix_delta_table_name``.

    Both the DataFrame and the table name are read from the notebook's
    global scope.

    Args:
        mode: Save mode passed to ``DataFrameWriter.mode``; this notebook
            calls it with ``'overwrite'`` and ``'append'``.
    """
    delta_writer = netflix_df.write.format('delta').mode(mode)
    delta_writer.saveAsTable(f"{netflix_delta_table_name}")

In [None]:
# Initial load: write the CSV data with mode 'overwrite', replacing whatever
# the table held before
write_delta_table('overwrite')

In [None]:
# List the table properties: right after the first write, CDF
# (Change Data Feed) is not among them because it is not enabled by default
(spark
 .sql(f'SHOW TBLPROPERTIES {netflix_delta_table_name}')
 .show(truncate=False))

# Enable CDF

In [None]:
# Turn on the Change Data Feed for the table created above. The table name
# comes from netflix_delta_table_name so this cell stays in sync with the
# cells that write to and read from the table.
# NOTE: per the Delta docs, only changes committed after the property is set
# are recorded in the feed.
spark.sql(
    f"ALTER TABLE {netflix_delta_table_name} "
    "SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")

# Append data

In [None]:
# Append the same rows again, purely to produce a new table version
# so there is a difference between versions to look at
write_delta_table('append')

# Get latest version

In [None]:
# Handle to the Delta table, looked up by the same name variable used for
# writing so the two cannot drift apart
netflix_delta_table = DeltaTable.forName(spark, netflix_delta_table_name)

In [None]:
# From the table history, keep only the entries whose operation is
# WRITE or RESTORE
write_operations_df = netflix_delta_table.history().filter(
    "operation IN ('WRITE', 'RESTORE')"
)
write_operations_df.show()

In [None]:
# Highest version number among the filtered history entries
last_write_version = (
    write_operations_df
    .agg(F.max('version').alias('last_version'))
    .first()['last_version']
)
last_write_version

# Read changes between versions

## Spark SQL way

In [None]:
# table_changes(<table name>, <starting version>) exposes the change feed
# through SQL, starting at last_write_version
last_changes_df_sql = spark.sql(
    f"SELECT * FROM table_changes('{netflix_delta_table_name}',{last_write_version})"
)
last_changes_df_sql.show()

## Delta Table way

In [None]:
# Same read through the DataFrame reader: readChangeFeed switches the read to
# the change feed and startingVersion sets where it begins.
# An "endingVersion" option (e.g. 10) can be added to set an upper bound.
last_changes_df_dt = (
    spark.read
    .format("delta")
    .options(readChangeFeed="true", startingVersion=last_write_version)
    .table(netflix_delta_table_name)
)
last_changes_df_dt.show()