In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, StructType, IntegerType

In [2]:
# Build the Spark session used by every cell below: Iceberg tables registered
# in a Hive metastore, with data files reached through the S3A connector.

# The metastore URI is needed by both the Hive client and the Iceberg catalog,
# so it is defined once here instead of being repeated in two config entries.
hive_metastore_uri = "thrift://hive-metastore:9083"

# Object-store credentials are taken from the environment so they do not have
# to be edited into (and committed with) the notebook. The fallbacks are the
# values this notebook previously hardcoded, so behavior is unchanged when the
# variables are not set.
s3a_access_key = os.environ.get("S3A_ACCESS_KEY", "dp-root-user")
s3a_secret_key = os.environ.get("S3A_SECRET_KEY", "dp-root-password")

spark = (
    SparkSession.builder
    .appName("IcebergTableLoad")
    .config("spark.master", "spark://spark-master:7077")
    # Hive metastore + Iceberg catalog: `spark_catalog` is served by Iceberg's
    # SparkCatalog in "hive" mode, pointing at the same metastore.
    .config("hive.metastore.uris", hive_metastore_uri)
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    .config("spark.sql.catalog.spark_catalog.uri", hive_metastore_uri)
    # S3A connector settings for the `minio:9000` endpoint: static key/secret,
    # path-style bucket addressing, plain HTTP.
    .config("spark.hadoop.fs.s3a.access.key", s3a_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.sql.catalogImplementation", "hive")
    # NOTE(review): the Iceberg runtime artifact below is the Spark 3.2 /
    # Scala 2.12 build -- confirm it matches the Spark version of the cluster.
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:1.2.0,org.apache.hadoop:hadoop-aws:3.3.4")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .enableHiveSupport()
    .getOrCreate()
)


In [3]:
# Bucket that holds the table, and the s3a:// prefix of the `people` table
# directory inside it.
bucket_name = "testing-iceberg"
file_path = "s3a://{}/people/".format(bucket_name)

In [4]:
# Register `default.people` as an external Iceberg table rooted at `file_path`.
# IF NOT EXISTS makes the cell safe to re-run once the table is registered.
create_people_sql = f"""
CREATE EXTERNAL TABLE IF NOT EXISTS default.people 
USING ICEBERG
LOCATION '{file_path}'
"""
spark.sql(create_people_sql)

DataFrame[]

In [5]:
# List the tables visible in the current namespace to confirm the registration.
(
    spark
    .sql("SHOW TABLES;")
    .show()
)

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|   people|      false|
+---------+---------+-----------+



In [6]:
# Print the table's columns and detailed metadata. show() uses its defaults
# here, so long values and rows past the default limit are cut off.
(
    spark
    .sql("DESCRIBE FORMATTED people;")
    .show()
)

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|                name|              string|   NULL|
|                 age|              bigint|   NULL|
|                    |                    |       |
|  # Metadata Columns|                    |       |
|            _spec_id|                 int|       |
|          _partition|            struct<>|       |
|               _file|              string|       |
|                _pos|              bigint|       |
|            _deleted|             boolean|       |
|                    |                    |       |
|# Detailed Table ...|                    |       |
|                Name|spark_catalog.def...|       |
|                Type|            EXTERNAL|       |
|            Location|s3a://testing-ice...|       |
|            Provider|             iceberg|       |
|               Owner|              jovyan|       |
|    Table P

In [7]:
# Read the table through Spark SQL and print its rows.
(
    spark
    .sql("SELECT * FROM people")
    .show()
)

+------+---+
|  name|age|
+------+---+
|  John| 30|
| Steve| 25|
|  Bill| 40|
|Donald| 45|
| Jenny| 23|
| Lucas| 27|
|  Emma| 35|
| Grace| 28|
|  Liam| 32|
|Claire| 29|
+------+---+



In [8]:
# Load the same table through the DataFrame reader API (format "iceberg")
# instead of SQL, then print its rows.
df = spark.read.format("iceberg").load("people")

df.show()

+------+---+
|  name|age|
+------+---+
|  John| 30|
| Steve| 25|
|  Bill| 40|
|Donald| 45|
| Jenny| 23|
| Lucas| 27|
|  Emma| 35|
| Grace| 28|
|  Liam| 32|
|Claire| 29|
+------+---+

