# Hive Catalog

This notebook provides an example of external table registration on Hive Metastore and Iceberg table interaction.

In [1]:
import os

from dotenv import load_dotenv, find_dotenv
from pyspark.sql import SparkSession

load_dotenv(find_dotenv("../.env", raise_error_if_not_found=True))

os.environ["PYSPARK_SUBMIT_ARGS"] = (
            "--packages org.apache.hadoop:hadoop-aws:3.3.4,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.8.1 pyspark-shell"
        )

print("Initializing spark...")
print(os.getenv("AWS_ACCESS_KEY_ID"))
print(os.getenv("AWS_SECRET_ACCESS_KEY"))
spark = (
    SparkSession.builder.appName("Test")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.spark_catalog.uri", "thrift://localhost:9083")
    .config('spark.sql.catalog.spark_catalog.warehouse', "s3a://lakehouse/")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    .config("spark.hive.metastore.uris", "thrift://localhost:9083")
    .config("spark.sql.catalogImplementation", "hive")
    .config('spark.sql.warehouse.dir', "s3a://lakehouse/")
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000")
    .config("spark.hadoop.fs.s3a.access.key", os.getenv("AWS_ACCESS_KEY_ID"))
    .config("spark.hadoop.fs.s3a.secret.key", os.getenv("AWS_SECRET_ACCESS_KEY"))
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

sc = spark.sparkContext
sc.setLogLevel("WARN")

Initializing spark...
admin
password


your 131072x1 screen size is bogus. expect trouble
25/08/12 09:46:17 WARN Utils: Your hostname, CPC-12806 resolves to a loopback address: 127.0.1.1; using 172.26.242.248 instead (on interface eth0)
25/08/12 09:46:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/arthur/dev/dbt-test/.venv/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/arthur/.ivy2/cache
The jars for the packages stored in: /home/arthur/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3b381d59-3707-46b7-97c6-cdcba6999f20;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.8.1 in central
:: resolution report :: resolve 193ms :: artifacts dl 8ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.8.1 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	-----------------

In [2]:
spark.sql("SHOW CATALOGS;").show()

25/08/12 09:46:22 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


+-------------+
|      catalog|
+-------------+
|spark_catalog|
+-------------+



In [5]:
spark.sql("""
    CREATE TABLE default.sample_iceberg (
        id INT,
        name STRING
    )
    USING iceberg
    LOCATION 's3a://lakehouse/iceberg/sample_iceberg';
"""
)

DataFrame[]

In [6]:
spark.sql("""
INSERT INTO default.sample_iceberg VALUES
    (1, 'Alice'),
    (2, 'Bob'),
    (3, 'Charlie');
""")

25/08/12 09:47:31 WARN S3ABlockOutputStream: Application invoked the Syncable API against stream writing to iceberg/sample_iceberg/data/00002-2-f0520798-09ba-4bf1-9c3f-42abfb0f12b5-0-00001.parquet. This is unsupported
                                                                                

DataFrame[]

In [7]:
spark.sql("SELECT * FROM default.sample_iceberg;").show()

+---+-------+
| id|   name|
+---+-------+
|  1|  Alice|
|  2|    Bob|
|  3|Charlie|
+---+-------+

