In [1]:
# Source - https://stackoverflow.com/a/69297835
# Posted by jmPicaza
# Retrieved 2026-01-17, License - CC BY-SA 4.0

from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))


In [1]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName('iceberg-intro')
    
    # Iceberg package (same as --packages)
    .config(
        "spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.2"
    )
    
    # Catalog configuration
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "/Users/codebase/Documents/codebase/Courses/LearnApacheIceberg/warehouse")
        
    .getOrCreate()
    )

spark

26/01/18 11:01:52 WARN Utils: Your hostname, codebase.local resolves to a loopback address: 127.0.0.1; using 192.168.1.7 instead (on interface en0)
26/01/18 11:01:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/codebase/.ivy2/cache
The jars for the packages stored in: /Users/codebase/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4763e84e-f0d0-41ef-a3b2-0062a8961b57;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.9.2 in central
:: resolution report :: resolve 46ms :: artifacts dl 1ms
	:: modules in use:
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.9.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	-

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


26/01/18 11:01:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


#### Create an Iceberg Table

In [3]:
# Create a table using Iceberg
spark.sql("""
    CREATE TABLE local.db.iceberg_table (
        id BIGINT,
        name STRING,
        age INT,
        ts TIMESTAMP
    )
    USING iceberg
    TBLPROPERTIES (
      'write.metadata.metrics.default' = 'truncate(16)'
    )
    PARTITIONED BY (days(ts))
""")

DataFrame[]

#### Insert Data into iceberg table

In [4]:
# Insert rows using SQL
spark.sql("""
    INSERT INTO local.db.iceberg_table VALUES
    (1, 'Alice', 30, CURRENT_TIMESTAMP()),
    (2, 'Bob', 25, CURRENT_TIMESTAMP())
""")

DataFrame[]

In [5]:
# Insert rows using PySpark DataFrame
from datetime import datetime as dt

data = [(3, "Charlie", 28, dt.now()),
        (4, "Diana", 34, dt.now())]

df = spark.createDataFrame(data, ["id", "name", "age", "ts"])

df.writeTo("local.db.iceberg_table").append()

#### Querying the Iceberg Table

In [6]:
# Query the Iceberg table
result = spark.sql("SELECT * FROM local.db.iceberg_table WHERE age > 25")
result.show()

+---+-------+---+--------------------+
| id|   name|age|                  ts|
+---+-------+---+--------------------+
|  3|Charlie| 28|2026-01-17 14:40:...|
|  4|  Diana| 34|2026-01-17 14:40:...|
|  1|  Alice| 30|2026-01-17 14:40:...|
+---+-------+---+--------------------+



#### Time Travel

In [7]:
history = spark.sql(
    """
    SELECT * FROM local.db.iceberg_table.history
    """
)
history.show()

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2026-01-17 14:40:...|3929309049952964673|               NULL|               true|
|2026-01-17 14:40:...|8166654969485390864|3929309049952964673|               true|
+--------------------+-------------------+-------------------+-------------------+



In [8]:
# Use snapshot ID to time travel
snapshot_id = history.collect()[0]['snapshot_id']  # Fetching the snapshot ID


time_travel_df = spark.read \
    .format("iceberg") \
    .option("snapshot-id", snapshot_id) \
    .load("local.db.iceberg_table")

time_travel_df.show()

+---+-----+---+--------------------+
| id| name|age|                  ts|
+---+-----+---+--------------------+
|  1|Alice| 30|2026-01-17 14:40:...|
|  2|  Bob| 25|2026-01-17 14:40:...|
+---+-----+---+--------------------+



In [9]:
history.collect()[1]['snapshot_id']

8166654969485390864

#### Query by timestamp

In [10]:
# timestamp_in_ms = 1768638825.557978  # Example: Jan 17, 2026, in milliseconds

# time_travel_df = spark.read \
#     .format("iceberg") \
#     .option("as-of-timestamp", ) \
#     .load("local.db.iceberg_table")

# time_travel_df.show()

#### Schema Evolution

In [11]:
# Add a new column to the table
spark.sql("ALTER TABLE local.db.iceberg_table ADD COLUMN email STRING")

DataFrame[]

In [12]:
# Drop the 'age' column from the table
spark.sql("ALTER TABLE local.db.iceberg_table DROP COLUMN age")

DataFrame[]

In [13]:
spark.sql(
    """
    SELECT * FROM local.db.iceberg_table.history
    """
).show()

# No change in history -> because the data is not yet added after ALTER Column

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2026-01-17 14:40:...|3929309049952964673|               NULL|               true|
|2026-01-17 14:40:...|8166654969485390864|3929309049952964673|               true|
+--------------------+-------------------+-------------------+-------------------+



#### Partitioning

In [14]:
# Create a partitioned Iceberg table by day based on the timestamp 'ts' column
spark.sql("""
    CREATE TABLE local.db.partitioned_table (
        id BIGINT,
        name STRING,
        age INT,
        ts TIMESTAMP
    )
    USING iceberg
    PARTITIONED BY (days(ts))
""")

DataFrame[]

#### Table metadata

In [15]:
# View table schema
schema = spark.sql("DESCRIBE local.db.iceberg_table")
schema.show()

+--------------+---------+-------+
|      col_name|data_type|comment|
+--------------+---------+-------+
|            id|   bigint|   NULL|
|          name|   string|   NULL|
|            ts|timestamp|   NULL|
|         email|   string|   NULL|
|              |         |       |
|# Partitioning|         |       |
|        Part 0| days(ts)|       |
+--------------+---------+-------+



In [16]:
# View table history
history = spark.sql(
    """
    SELECT * FROM local.db.iceberg_table.history
    """
)

history.show()

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2026-01-17 14:40:...|3929309049952964673|               NULL|               true|
|2026-01-17 14:40:...|8166654969485390864|3929309049952964673|               true|
+--------------------+-------------------+-------------------+-------------------+



In [17]:
spark.sql(
    """
    SELECT MAX(upper_bounds['ts'])
    FROM local.db.iceberg_table.files
    """
).show(truncate=False)

+---------------------+
|max(upper_bounds[ts])|
+---------------------+
|NULL                 |
+---------------------+



In [18]:
spark.sql(
    """
    SELECT MAX(data_file.upper_bounds['ts']) AS max_ts
    FROM local.db.iceberg_table.entries
    """
).show(truncate=False)

+------+
|max_ts|
+------+
|NULL  |
+------+



In [19]:
spark.sql(
    """
    SHOW TBLPROPERTIES local.db.iceberg_table
    """
).show(truncate=False)

+-------------------------------+-------------------+
|key                            |value              |
+-------------------------------+-------------------+
|current-snapshot-id            |8166654969485390864|
|format                         |iceberg/parquet    |
|format-version                 |2                  |
|write.metadata.metrics.default |truncate(16)       |
|write.parquet.compression-codec|zstd               |
+-------------------------------+-------------------+



In [33]:
spark.sql(
    """
    SELECT MAX(upper_bounds['ts'])
    FROM local.db.iceberg_table.files;
    """
).show()
# MAX(partition.ts_day)

+---------------------+
|max(upper_bounds[ts])|
+---------------------+
|                 NULL|
+---------------------+



In [35]:
time_travel_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- ts: timestamp (nullable = true)



In [36]:
spark.sql(
    """
    ALTER TABLE local.db.iceberg_table SET TBLPROPERTIES (
      'write.metadata.metrics.default' = 'none',
      'write.metadata.metrics.ts' = 'full',
      'write.metadata.metrics.age' = 'full'
    );
    """
)

DataFrame[]

In [37]:
spark.sql(
    """
    INSERT OVERWRITE local.db.iceberg_table
    SELECT * FROM local.db.iceberg_table;

    """
)

DataFrame[]

In [41]:
spark.sql(
    """
    SELECT
      upper_bounds,
      lower_bounds
    FROM local.db.iceberg_table.files
    LIMIT 1;
    """
).show()
# MAX(partition.ts_day)

+------------+------------+
|upper_bounds|lower_bounds|
+------------+------------+
|          {}|          {}|
+------------+------------+

