In [0]:
df = spark.read.load("/databricks-datasets/learning-spark-v2/people/people-10m.delta")

# Write the data to a table.
table_name = "people_10m"
df.write.saveAsTable(table_name)



In [0]:
display(spark.sql('DESCRIBE DETAIL people_10m'))

format,id,name,description,location,createdAt,lastModified,partitionColumns,clusteringColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics,clusterByAuto
delta,9f91a622-46fb-433a-895a-5ae7262e3c04,hexadatabrickswp_v2.default.people_10m,,abfss://unity-catalog-storage@dbstoragehqcphemuvt2a2.dfs.core.windows.net/837983396487364/__unitystorage/catalogs/7649e656-9f74-4d84-a5f1-cdb7afe69d9e/tables/a8da6df9-26c2-4b0b-b236-59b5679c2d56,2025-08-11T06:35:33.515Z,2025-08-11T06:35:48.000Z,List(),List(),4,218583738,Map(delta.enableDeletionVectors -> true),3,7,"List(appendOnly, deletionVectors, invariants)","Map(numRowsDeletedByDeletionVectors -> 0, numDeletionVectors -> 0)",False


In [0]:
%sql
CREATE TABLE IF NOT EXISTS people10m (
  id INT,
  firstName STRING,
  middleName STRING,
  lastName STRING,
  gender STRING,
  birthDate TIMESTAMP,
  ssn STRING,
  salary INT
)


In [0]:
# Use the existing catalog, for example 'hive_metastore'
spark.sql("USE CATALOG hive_metastore")

# Create the schema if it does not exist
spark.sql("CREATE SCHEMA IF NOT EXISTS default")

from delta.tables import DeltaTable
DeltaTable.createIfNotExists(spark) \
    .tableName("default.people_10m") \
    .addColumn("id", "INT") \
    .addColumn("firstName", "STRING") \
    .addColumn("middleName", "STRING") \
    .addColumn("lastName", "STRING", comment="surname") \
    .addColumn("gender", "STRING") \
    .addColumn("birthDate", "TIMESTAMP") \
    .addColumn("ssn", "STRING") \
    .addColumn("salary", "INT") \
    .execute()

<delta.connect.tables.DeltaTable at 0x7f62c49b2fd0>

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from datetime import date

schema = StructType([
  StructField("id", IntegerType(), True),
  StructField("firstName", StringType(), True),
  StructField("middleName", StringType(), True),
  StructField("lastName", StringType(), True),
  StructField("gender", StringType(), True),
  StructField("birthDate", DateType(), True),
  StructField("ssn", StringType(), True),
  StructField("salary", IntegerType(), True)
])

data = [
  (9999998, 'Billy', 'Tommie', 'Luppitt', 'M', date.fromisoformat('1992-09-17'), '953-38-9452', 55250),
  (9999999, 'Elias', 'Cyril', 'Leadbetter', 'M', date.fromisoformat('1984-05-22'), '906-51-2137', 48500),
  (10000000, 'Joshua', 'Chas', 'Broggio', 'M', date.fromisoformat('1968-07-22'), '988-61-6247', 90000),
  (20000001, 'John', '', 'Doe', 'M', date.fromisoformat('1978-01-14'), '345-67-8901', 55500),
  (20000002, 'Mary', '', 'Smith', 'F', date.fromisoformat('1982-10-29'), '456-78-9012', 98250),
  (20000003, 'Jane', '', 'Doe', 'F', date.fromisoformat('1981-06-25'), '567-89-0123', 89900)
]

people_10m_updates = spark.createDataFrame(data, schema)
people_10m_updates.createOrReplaceTempView("people_10m_updates")

# Check available catalogs
spark.sql("SHOW CATALOGS")

# Use an existing catalog or create a new one
spark.sql("USE CATALOG hive_metastore")

# Create the Delta table if it does not exist
spark.sql("""
CREATE TABLE IF NOT EXISTS default.people_10m (
  id INT,
  firstName STRING,
  middleName STRING,
  lastName STRING,
  gender STRING,
  birthDate DATE,
  ssn STRING,
  salary INT
) USING DELTA
""")

from delta.tables import DeltaTable

deltaTable = DeltaTable.forName(spark, 'hive_metastore.default.people_10m')

(deltaTable.alias("people_10m")
  .merge(
    people_10m_updates.alias("people_10m_updates"),
    "people_10m.id = people_10m_updates.id")
  .whenMatchedUpdateAll()
  .whenNotMatchedInsertAll()
  .execute()
)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
df = spark.read.table("default.people_10m")
df_filtered = df.filter(df["id"] >= 9999998)
display(df_filtered)

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
9999999,Elias,Cyril,Leadbetter,M,1984-05-22T00:00:00.000Z,906-51-2137,48500
9999998,Billy,Tommie,Luppitt,M,1992-09-17T00:00:00.000Z,953-38-9452,55250
10000000,Joshua,Chas,Broggio,M,1968-07-22T00:00:00.000Z,988-61-6247,90000
20000002,Mary,,Smith,F,1982-10-29T00:00:00.000Z,456-78-9012,98250
20000001,John,,Doe,M,1978-01-14T00:00:00.000Z,345-67-8901,55500
20000003,Jane,,Doe,F,1981-06-25T00:00:00.000Z,567-89-0123,89900


In [0]:
people_df = spark.read.table("default.people_10m")
display(people_df)

id,firstName,middleName,lastName,gender,birthDate,ssn,salary
9999999,Elias,Cyril,Leadbetter,M,1984-05-22T00:00:00.000Z,906-51-2137,48500
9999998,Billy,Tommie,Luppitt,M,1992-09-17T00:00:00.000Z,953-38-9452,55250
10000000,Joshua,Chas,Broggio,M,1968-07-22T00:00:00.000Z,988-61-6247,90000
20000002,Mary,,Smith,F,1982-10-29T00:00:00.000Z,456-78-9012,98250
20000001,John,,Doe,M,1978-01-14T00:00:00.000Z,345-67-8901,55500
20000003,Jane,,Doe,F,1981-06-25T00:00:00.000Z,567-89-0123,89900


In [0]:
people_df.write.mode("append").saveAsTable("default.people_10m")

In [0]:
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forName(spark, "default.people_10m")

# Declare the predicate by using a SQL-formatted string.
deltaTable.update(
  condition = "gender = 'F'",
  set = { "gender": "'Female'" }
)

# Declare the predicate by using Spark SQL functions.
deltaTable.update(
  condition = col('gender') == 'M',
  set = { 'gender': lit('Male') }
)

DataFrame[num_affected_rows: bigint]

In [0]:
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forName(spark, "default.people_10m")

# Declare the predicate by using a SQL-formatted string.
deltaTable.delete("birthDate < '1955-01-01'")

# Declare the predicate by using Spark SQL functions.
deltaTable.delete(col('birthDate') < '1960-01-01')

DataFrame[num_affected_rows: bigint]

In [0]:
from delta.tables import *

deltaTable = DeltaTable.forName(spark, "default.people_10m")
display(deltaTable.history())

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
9,2025-08-11T07:49:01.000Z,146576795388468,azuser4018_mml.local@techademy.com,DELETE,"Map(predicate -> [""(birthDate#14209 < 1960-01-01 00:00:00)""])",,,0811-063515-jsf2de8u-v2n,8.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 143, numDeletionVectorsUpdated -> 0, numDeletedRows -> 0, scanTimeMs -> 143, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 0)",,Databricks-Runtime/17.0.x-photon-scala2.13
8,2025-08-11T07:49:00.000Z,146576795388468,azuser4018_mml.local@techademy.com,DELETE,"Map(predicate -> [""(birthDate#14107 < 1955-01-01 00:00:00)""])",,,0811-063515-jsf2de8u-v2n,7.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 145, numDeletionVectorsUpdated -> 0, numDeletedRows -> 0, scanTimeMs -> 143, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 0)",,Databricks-Runtime/17.0.x-photon-scala2.13
7,2025-08-11T07:47:42.000Z,146576795388468,azuser4018_mml.local@techademy.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,,0811-063515-jsf2de8u-v2n,6.0,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 12, numOutputBytes -> 2580)",,Databricks-Runtime/17.0.x-photon-scala2.13
6,2025-08-11T07:47:09.000Z,146576795388468,azuser4018_mml.local@techademy.com,OPTIMIZE,"Map(predicate -> [], auto -> true, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,,0811-063515-jsf2de8u-v2n,5.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 2, numRemovedBytes -> 5039, p25FileSize -> 2580, numDeletionVectorsRemoved -> 1, minFileSize -> 2580, numAddedFiles -> 1, maxFileSize -> 2580, p75FileSize -> 2580, p50FileSize -> 2580, numAddedBytes -> 2580)",,Databricks-Runtime/17.0.x-photon-scala2.13
5,2025-08-11T07:47:08.000Z,146576795388468,azuser4018_mml.local@techademy.com,UPDATE,"Map(predicate -> [""(gender#13294 = M)""])",,,0811-063515-jsf2de8u-v2n,3.0,WriteSerializable,False,"Map(numRemovedFiles -> 5, numRemovedBytes -> 10493, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 1, numAddedChangeFiles -> 0, executionTimeMs -> 1267, conflictDetectionTimeMs -> 370, numDeletionVectorsUpdated -> 0, scanTimeMs -> 580, numAddedFiles -> 1, numUpdatedRows -> 8, numAddedBytes -> 2468, rewriteTimeMs -> 686)",,Databricks-Runtime/17.0.x-photon-scala2.13
4,2025-08-11T07:47:05.000Z,146576795388468,azuser4018_mml.local@techademy.com,OPTIMIZE,"Map(predicate -> [], auto -> true, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,,0811-063515-jsf2de8u-v2n,3.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 6, numRemovedBytes -> 12809, p25FileSize -> 2571, numDeletionVectorsRemoved -> 1, minFileSize -> 2571, numAddedFiles -> 1, maxFileSize -> 2571, p75FileSize -> 2571, p50FileSize -> 2571, numAddedBytes -> 2571)",,Databricks-Runtime/17.0.x-photon-scala2.13
3,2025-08-11T07:47:04.000Z,146576795388468,azuser4018_mml.local@techademy.com,UPDATE,"Map(predicate -> [""(gender#12881 = F)""])",,,0811-063515-jsf2de8u-v2n,2.0,WriteSerializable,False,"Map(numRemovedFiles -> 2, numRemovedBytes -> 3975, numCopiedRows -> 0, numDeletionVectorsAdded -> 1, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1387, numDeletionVectorsUpdated -> 0, scanTimeMs -> 658, numAddedFiles -> 1, numUpdatedRows -> 4, numAddedBytes -> 2316, rewriteTimeMs -> 723)",,Databricks-Runtime/17.0.x-photon-scala2.13
2,2025-08-11T07:44:32.000Z,146576795388468,azuser4018_mml.local@techademy.com,WRITE,"Map(mode -> Append, statsOnLoad -> false, partitionBy -> [])",,,0811-063515-jsf2de8u-v2n,1.0,WriteSerializable,False,"Map(numFiles -> 1, numOutputRows -> 6, numOutputBytes -> 2387)",,Databricks-Runtime/17.0.x-photon-scala2.13
1,2025-08-11T07:40:21.000Z,146576795388468,azuser4018_mml.local@techademy.com,MERGE,"Map(predicate -> [""(id#12224 = id#12256)""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [{""actionType"":""insert""}])",,,0811-063515-jsf2de8u-v2n,0.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 6, numTargetBytesAdded -> 12081, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 0, executionTimeMs -> 1141, materializeSourceTimeMs -> 75, numTargetRowsInserted -> 6, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 419, numTargetRowsUpdated -> 0, numOutputRows -> 6, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 6, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 616)",,Databricks-Runtime/17.0.x-photon-scala2.13
0,2025-08-11T07:29:51.000Z,146576795388468,azuser4018_mml.local@techademy.com,CREATE TABLE,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> false)",,,0811-063515-jsf2de8u-v2n,,WriteSerializable,True,Map(),,Databricks-Runtime/17.0.x-photon-scala2.13


In [0]:
from delta.tables import *

deltaTable = DeltaTable.forName(spark, "default.people_10m")
deltaHistory = deltaTable.history()

display(deltaHistory.where("version == 0"))
# Or:
display(deltaHistory.where("timestamp == '2024-05-15T22:43:15.000+00:00'"))

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2025-08-11T07:29:51.000Z,146576795388468,azuser4018_mml.local@techademy.com,CREATE TABLE,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableDeletionVectors"":""true""}, statsOnLoad -> false)",,,0811-063515-jsf2de8u-v2n,,WriteSerializable,True,Map(),,Databricks-Runtime/17.0.x-photon-scala2.13


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo


In [0]:
df = spark.read.option('versionAsOf', 0).table("default.people_10m")

display(df)

id,firstName,middleName,lastName,gender,birthDate,ssn,salary


In [0]:
from delta.tables import *

deltaTable = DeltaTable.forName(spark, "default.people_10m")
deltaTable.optimize().executeCompaction()

DataFrame[path: string, metrics: struct<autoCompactParallelismStats:void,clusteringMetrics:void,clusteringStats:void,deletionVectorStats:struct<numDeletionVectorRowsRemoved:bigint,numDeletionVectorsRemoved:bigint>,endTimeMs:bigint,filesAdded:struct<avg:double,max:bigint,min:bigint,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<avg:double,max:bigint,min:bigint,totalFiles:bigint,totalSize:bigint>,numBatches:bigint,numBins:bigint,numBytesSkippedToReduceWriteAmplification:bigint,numFilesAdded:bigint,numFilesRemoved:bigint,numFilesSkippedToReduceWriteAmplification:bigint,numTableColumns:bigint,numTableColumnsWithStats:bigint,partitionsOptimized:bigint,preserveInsertionOrder:boolean,recompressionCodec:void,skippedArchivedFiles:bigint,startTimeMs:bigint,totalClusterParallelism:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,totalScheduledTasks:bigint,totalTaskExecutionTimeMs:bigint,zOrderStats:void>]

In [0]:
from delta.tables import *

deltaTable = DeltaTable.forName(spark, "default.people_10m")
deltaTable.optimize().executeZOrderBy("gender")

DataFrame[path: string, metrics: struct<autoCompactParallelismStats:void,clusteringMetrics:void,clusteringStats:void,deletionVectorStats:struct<numDeletionVectorRowsRemoved:bigint,numDeletionVectorsRemoved:bigint>,endTimeMs:bigint,filesAdded:struct<avg:double,max:void,min:void,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<avg:double,max:void,min:void,totalFiles:bigint,totalSize:bigint>,numBatches:bigint,numBins:bigint,numBytesSkippedToReduceWriteAmplification:bigint,numFilesAdded:bigint,numFilesRemoved:bigint,numFilesSkippedToReduceWriteAmplification:bigint,numTableColumns:bigint,numTableColumnsWithStats:bigint,partitionsOptimized:bigint,preserveInsertionOrder:boolean,recompressionCodec:void,skippedArchivedFiles:bigint,startTimeMs:bigint,totalClusterParallelism:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,totalScheduledTasks:bigint,totalTaskExecutionTimeMs:bigint,zOrderStats:struct<inputCubeFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,inputOtherFil

In [0]:
from delta.tables import *

deltaTable = DeltaTable.forName(spark, "default.people_10m")
deltaTable.vacuum()