In [1]:
import os

## import pyspark
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Assemble the Spark configuration: the application name first, then the
# remaining settings applied in order from a list of (key, value) pairs.
conf = pyspark.SparkConf().setAppName('app_name').setAll([
    # Packages: Iceberg Spark runtime plus the AWS SDK artifacts,
    # joined into one comma-separated coordinate string.
    ('spark.jars.packages', ','.join([
        'org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.3.1',
        'software.amazon.awssdk:bundle:2.17.178',
        'software.amazon.awssdk:url-connection-client:2.17.178',
    ])),
    # SQL extensions
    ('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions'),
    # Catalog configuration: the built-in session catalog and a separate
    # catalog named `iceberg` of type `hadoop` with its warehouse location.
    ('spark.sql.catalog.spark_catalog', 'org.apache.iceberg.spark.SparkSessionCatalog'),
    ('spark.sql.catalog.spark_catalog.type', 'hive'),
    ('spark.sql.catalog.iceberg', 'org.apache.iceberg.spark.SparkCatalog'),
    ('spark.sql.catalog.iceberg.type', 'hadoop'),
    ('spark.sql.catalog.iceberg.warehouse', 'iceberg-warehouse'),
])

spark.app.name (set above via `setAppName`): The name of the Spark application.
spark.jars.packages: A comma-separated list of Maven coordinates (`groupId:artifactId:version`) of the JARs that should be resolved and added to the Spark classpath. These JARs are necessary for using Iceberg with Spark.
spark.sql.extensions: The name of the SQL extension that should be used for Iceberg.
spark.sql.catalog.iceberg: Defines a catalog named `iceberg`; the value is the catalog implementation class that should be used for Iceberg.
spark.sql.catalog.iceberg.type: The type of the catalog that should be used for Iceberg.
spark.sql.catalog.iceberg.warehouse: The location of the warehouse for the Iceberg catalog.


In [3]:
## Create the Spark session from `conf`, or reuse one that already exists
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)
print("Spark Running")

:: loading settings :: url = jar:file:/opt/anaconda3/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/anshumanr/.ivy2/cache
The jars for the packages stored in: /Users/anshumanr/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.4_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5ad9f415-e843-40f4-823b-61a468442c94;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.4_2.12;1.3.1 in central
	found software.amazon.awssdk#bundle;2.17.178 in central
	found software.amazon.eventstream#eventstream;1.0.1 in central
	found software.amazon.awssdk#url-connection-client;2.17.178 in central
	found software.amazon.awssdk#utils;2.17.178 in central
	found org.reactivestreams#reactive-streams;1.0.3 in central
	found software.amazon.awssdk#annotations;2.17.178 in central
	found org.slf4j#slf4j-api;1.7.30 in local-m2-cache
	found software.amazon.awssdk#http-client-spi;2.17.178 in central


Spark Running


# Creating a table inside the warehouse

In [None]:
## Remove any copy of the table left over from an earlier run.
## CREATE TABLE raises if the table already exists, which would make this
## cell fail on "Restart Kernel -> Run All" once the table has been created.
spark.sql("DROP TABLE IF EXISTS iceberg.table1")

## Run a Query to create a table
spark.sql("CREATE TABLE iceberg.table1 (id integer,name string) USING iceberg;")

## Run a Query to insert into the table
## (the table was just recreated, so re-running does not duplicate rows)
spark.sql("INSERT INTO iceberg.table1 VALUES (1,'Alex'), (2,'Dipankar'), (3,'Jason')")


In [4]:
## Read every row of iceberg.table1 into a DataFrame, then print it
df = spark.sql(
    "SELECT * FROM iceberg.table1"
)
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+---+--------+
| id|    name|
+---+--------+
|  1|    Alex|
|  2|Dipankar|
|  3|   Jason|
+---+--------+



                                                                                

# Creating a table inside a database

In [5]:
# Drop any copy of the table left over from an earlier run.
# CREATE TABLE raises if the table already exists, and a leftover table
# would also carry the `email` column added further down, which breaks the
# two-column INSERT that follows this cell.
spark.sql("DROP TABLE IF EXISTS iceberg.db.my_table")

# Create the table with its initial two-column schema.
spark.sql("CREATE TABLE iceberg.db.my_table (name string, age int) USING iceberg;")

DataFrame[]

In [7]:
# Make `iceberg` the current catalog; the statement returns an empty result,
# which is printed as an empty table.
(
    spark
    .sql("use iceberg")
    .show()
)

++
||
++
++



In [8]:
# List the databases (namespaces) visible in the current catalog.
(
    spark
    .sql("show databases")
    .show()
)

+---------+
|namespace|
+---------+
|       db|
+---------+



In [9]:
# Add four (name, age) rows to the table.
spark.sql(
    "INSERT INTO iceberg.db.my_table VALUES ('Bob', 20), ('Steve', 36), ('Fiona', 25), ('Roger', 25);"
)

[Stage 1:>                                                          (0 + 4) / 4]                                                                                

DataFrame[]

In [10]:
# Print every row currently in the table.
(
    spark
    .sql("select * from iceberg.db.my_table")
    .show()
)

+-----+---+
| name|age|
+-----+---+
|  Bob| 20|
|Steve| 36|
|Fiona| 25|
|Roger| 25|
+-----+---+



# Update records in table

In [11]:
# Rename 'Steve' to 'Alex' with a row-level UPDATE.
spark.sql(
    "UPDATE iceberg.db.my_table SET name='Alex' WHERE name='Steve';"
)

[Stage 3:>                                                          (0 + 1) / 1]                                                                                

DataFrame[]

In [12]:
# Print the table again to see the effect of the UPDATE.
(
    spark
    .sql("select * from iceberg.db.my_table")
    .show()
)

+-----+---+
| name|age|
+-----+---+
| Alex| 36|
|  Bob| 20|
|Fiona| 25|
|Roger| 25|
+-----+---+



# Changing schema of table

In [13]:
# Evolve the schema: add a string column named `email`.
spark.sql(
    "ALTER TABLE iceberg.db.my_table ADD COLUMNS (email string) "
)

23/08/10 15:37:32 WARN BaseTransaction: Failed to load metadata for a committed snapshot, skipping clean-up


DataFrame[]

In [14]:
# Print the table after the schema change.
(
    spark
    .sql("select * from iceberg.db.my_table")
    .show()
)

+-----+---+-----+
| name|age|email|
+-----+---+-----+
| Alex| 36| null|
|  Bob| 20| null|
|Fiona| 25| null|
|Roger| 25| null|
+-----+---+-----+



In [15]:
# Insert one row that supplies all three columns (name, age, email).
spark.sql(
    "INSERT INTO iceberg.db.my_table VALUES ('John', 56, 'email@email.email');"
)

DataFrame[]

In [16]:
# Print the table, now including the newly inserted row.
(
    spark
    .sql("select * from iceberg.db.my_table")
    .show()
)

+-----+---+-----------------+
| name|age|            email|
+-----+---+-----------------+
| John| 56|email@email.email|
|  Bob| 20|             null|
| Alex| 36|             null|
|Fiona| 25|             null|
|Roger| 25|             null|
+-----+---+-----------------+



In [17]:
# Set the email address on the row whose name is 'Alex'.
spark.sql(
    "UPDATE iceberg.db.my_table SET email='Alex@gmail.com' WHERE name='Alex';"
)

DataFrame[]

In [18]:
# Print the table to confirm the email update.
(
    spark
    .sql("select * from iceberg.db.my_table")
    .show()
)

+-----+---+-----------------+
| name|age|            email|
+-----+---+-----------------+
| Alex| 36|   Alex@gmail.com|
|  Bob| 20|             null|
| John| 56|email@email.email|
|Fiona| 25|             null|
|Roger| 25|             null|
+-----+---+-----------------+



# Inspecting our table

In [None]:
#inspecting table history

In [45]:
# Query the table's `history` metadata table and print it.
df = spark.sql("SELECT * FROM iceberg.db.my_table.history")
df.show()

# Persist a copy of the result.
# - The format is now explicit. The original gave none and passed
#   `header=True`, which is a CSV option and has no effect on Parquet output.
# - mode("overwrite") lets the cell be re-run; the default save mode raises
#   when the target directory already exists.
# NOTE(review): the target is an absolute, user-specific path that appears to
# sit inside the table's own warehouse directory. Consider a configurable
# export location outside the table directory.
df.write.mode("overwrite").format("parquet").save(
    "/Users/anshumanr/Documents/Iceberg/Apache_iceberg/iceberg-warehouse/db/my_table/history"
)

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2023-08-10 15:36:...|2311303090824495162|               null|               true|
|2023-08-10 15:37:...|2559888429599540403|2311303090824495162|               true|
|2023-08-10 15:42:...|4939622892415754228|2559888429599540403|               true|
|2023-08-10 15:42:...|6500316511377472248|4939622892415754228|               true|
+--------------------+-------------------+-------------------+-------------------+



In [46]:
#inspecting snapshots

In [47]:
# Query the table's `snapshots` metadata table and print it.
df = spark.sql("select * from iceberg.db.my_table.snapshots")
df.show()

# Persist a copy of the result.
# - The format is now explicit. The original gave none and passed
#   `header=True`, which is a CSV option and has no effect on Parquet output.
# - mode("overwrite") lets the cell be re-run; the default save mode raises
#   when the target directory already exists.
# NOTE(review): the target is an absolute, user-specific path that appears to
# sit inside the table's own warehouse directory. Consider a configurable
# export location outside the table directory.
df.write.mode("overwrite").format("parquet").save(
    "/Users/anshumanr/Documents/Iceberg/Apache_iceberg/iceberg-warehouse/db/my_table/snapshots"
)

+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|        committed_at|        snapshot_id|          parent_id|operation|       manifest_list|             summary|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|2023-08-10 15:36:...|2311303090824495162|               null|   append|iceberg-warehouse...|{spark.app.id -> ...|
|2023-08-10 15:37:...|2559888429599540403|2311303090824495162|overwrite|iceberg-warehouse...|{spark.app.id -> ...|
|2023-08-10 15:42:...|4939622892415754228|2559888429599540403|   append|iceberg-warehouse...|{spark.app.id -> ...|
|2023-08-10 15:42:...|6500316511377472248|4939622892415754228|overwrite|iceberg-warehouse...|{spark.app.id -> ...|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+



In [43]:
# inspecting file paths

In [48]:
# Query the table's `files` metadata table and print it.
df = spark.sql("SELECT * FROM iceberg.db.my_table.files")
df.show()

# Persist a copy of the result.
# - The format is now explicit. The original gave none and passed
#   `header=True`, which is a CSV option and has no effect on Parquet output.
# - mode("overwrite") lets the cell be re-run; the default save mode raises
#   when the target directory already exists.
# NOTE(review): the target is an absolute, user-specific path that appears to
# sit inside the table's own warehouse directory. Consider a configurable
# export location outside the table directory.
df.write.mode("overwrite").format("parquet").save(
    "/Users/anshumanr/Documents/Iceberg/Apache_iceberg/iceberg-warehouse/db/my_table/inspect_files"
)

+-------+--------------------+-----------+-------+------------+------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+------------+-------------+------------+-------------+--------------------+
|content|           file_path|file_format|spec_id|record_count|file_size_in_bytes|        column_sizes|        value_counts|   null_value_counts|nan_value_counts|        lower_bounds|        upper_bounds|key_metadata|split_offsets|equality_ids|sort_order_id|    readable_metrics|
+-------+--------------------+-----------+-------+------------+------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+------------+-------------+------------+-------------+--------------------+
|      0|iceberg-warehouse...|    PARQUET|      0|           1|               976|{1 -> 55, 2 -> 51...|{1 -> 1, 2 -> 1, ...|{1 -> 0, 2 -> 0, ...|              {

[Stage 37:>                                                         (0 + 1) / 1]                                                                                

In [51]:
# Query the table's `manifests` metadata table and print it.
df = spark.sql("SELECT * FROM iceberg.db.my_table.manifests;")
df.show()

# Persist a copy of the result.
# - The format is now explicit. The original gave none and passed
#   `header=True`, which is a CSV option and has no effect on Parquet output.
# - mode("overwrite") lets the cell be re-run; the default save mode raises
#   when the target directory already exists.
# NOTE(review): the target is an absolute, user-specific path that appears to
# sit inside the table's own warehouse directory. Consider a configurable
# export location outside the table directory.
df.write.mode("overwrite").format("parquet").save(
    "/Users/anshumanr/Documents/Iceberg/Apache_iceberg/iceberg-warehouse/db/my_table/manifests"
)

+-------+--------------------+------+-----------------+-------------------+----------------------+-------------------------+------------------------+------------------------+---------------------------+--------------------------+-------------------+
|content|                path|length|partition_spec_id|  added_snapshot_id|added_data_files_count|existing_data_files_count|deleted_data_files_count|added_delete_files_count|existing_delete_files_count|deleted_delete_files_count|partition_summaries|
+-------+--------------------+------+-----------------+-------------------+----------------------+-------------------------+------------------------+------------------------+---------------------------+--------------------------+-------------------+
|      0|iceberg-warehouse...|  5852|                0|6500316511377472248|                     1|                        0|                       0|                       0|                          0|                         0|                 []|


In [53]:
# Query the table's `partitions` metadata table and print it.
df = spark.sql("SELECT * FROM iceberg.db.my_table.partitions")
df.show()

# Persist a copy of the result.
# - The format is now explicit. The original gave none and passed
#   `header=True`, which is a CSV option and has no effect on Parquet output.
# - mode("overwrite") lets the cell be re-run; the default save mode raises
#   when the target directory already exists.
# NOTE(review): the target is an absolute, user-specific path that appears to
# sit inside the table's own warehouse directory. Consider a configurable
# export location outside the table directory.
df.write.mode("overwrite").format("parquet").save(
    "/Users/anshumanr/Documents/Iceberg/Apache_iceberg/iceberg-warehouse/db/my_table/partitions"
)

+------------+----------+----------------------------+--------------------------+----------------------------+--------------------------+
|record_count|file_count|position_delete_record_count|position_delete_file_count|equality_delete_record_count|equality_delete_file_count|
+------------+----------+----------------------------+--------------------------+----------------------------+--------------------------+
|           5|         5|                           0|                         0|                           0|                         0|
+------------+----------+----------------------------+--------------------------+----------------------------+--------------------------+

