In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Default storage root for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build the Hive-enabled session as one parenthesised chain (no backslash
# continuations): the warehouse directory and the metastore thrift URI are
# passed as config entries before the session is obtained.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# List the databases (namespaces) visible through the session's catalog.
databases_df = spark.sql(
    """
    SHOW DATABASES
    """
)
databases_df.show()

+--------------+
|     namespace|
+--------------+
|americancrimes|
|       default|
|       economy|
|      politics|
+--------------+



In [3]:
# Lists the databases again.
# NOTE(review): this runs the same SHOW DATABASES statement as the previous
# cell and prints the same listing -- candidate for removal.
show_databases_sql = """
    SHOW DATABASES
    """
spark.sql(show_databases_sql).show()

+--------------+
|     namespace|
+--------------+
|americancrimes|
|       default|
|       economy|
|      politics|
+--------------+



In [4]:
# List the tables registered in the `americancrimes` database.
americancrimes_tables_df = spark.sql(
    """
    SHOW TABLES FROM americancrimes
    """
)
americancrimes_tables_df.show()

+--------------+----------------+-----------+
|      database|       tableName|isTemporary|
+--------------+----------------+-----------+
|americancrimes|civillian_deaths|      false|
|americancrimes|    crime_by_age|      false|
|americancrimes|   crime_by_race|      false|
|americancrimes|          crimes|      false|
|americancrimes|      demography|      false|
|americancrimes|         economy|      false|
|americancrimes|   police_deaths|      false|
+--------------+----------------+-----------+



In [5]:
# (Re)define americancrimes.politics: drop any existing definition first so
# the cell can be re-run, then create the table again.
drop_politics_sql = """
    DROP TABLE IF EXISTS americancrimes.politics
    """

# Parquet-backed table, partitioned by `year`, stored at an explicit HDFS
# LOCATION.
# NOTE(review): with an explicit LOCATION the table is presumably registered
# as external, so the DROP above would leave the files in place -- confirm.
create_politics_sql = """
    CREATE TABLE americancrimes.politics (
        state VARCHAR(70),
        party VARCHAR(50),
        candidatevotes INT,
        totalvotes INT
    )
    STORED AS PARQUET
    PARTITIONED BY (
        year DATE
    )
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/americancrimes.db/politics/'
    """

spark.sql(drop_politics_sql)

# Kept as the cell's final bare expression so the notebook still displays
# its result (`DataFrame[]`).
spark.sql(create_politics_sql)

DataFrame[]

In [6]:
# Partitions must be recovered so that the Hive Metastore (catalog) is
# updated; otherwise Hive and the query engines do not know that there are
# new partitions in the partitioned table.
spark.catalog.recoverPartitions("americancrimes.politics")

# Display the table contents after the catalog refresh.
politics_rows_df = spark.sql(
    """
    SELECT *
    FROM americancrimes.politics
    """
)
politics_rows_df.show()

+-----+-----+--------------+----------+----+
|state|party|candidatevotes|totalvotes|year|
+-----+-----+--------------+----------+----+
+-----+-----+--------------+----------+----+

