In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Root HDFS directory that Spark uses as the default location for managed
# databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build (or reuse) a Hive-enabled session wired to the Hive Metastore service.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# List every database registered in the metastore.
spark.sql(
    """
    SHOW DATABASES
    """
).show()

+--------------+
|     namespace|
+--------------+
|americancrimes|
|       default|
|       economy|
|      politics|
+--------------+



In [3]:
# Tables in the americancrimes database before the economy table is created.
spark.sql(
    """
    SHOW TABLES FROM americancrimes
    """
).show()

+--------------+----------------+-----------+
|      database|       tableName|isTemporary|
+--------------+----------------+-----------+
|americancrimes|civillian_deaths|      false|
|americancrimes|    crime_by_age|      false|
|americancrimes|   crime_by_race|      false|
|americancrimes|          crimes|      false|
|americancrimes|      demography|      false|
+--------------+----------------+-----------+



In [4]:
# Drop any previous definition first so this cell can be re-run safely.
# The table created below is EXTERNAL, so dropping it only removes the
# metastore entry; files already stored under LOCATION are not deleted.
spark.sql(
    """
    DROP TABLE IF EXISTS americancrimes.economy
    """
)

# EXTERNAL is spelled out because Spark already treats any table created with
# an explicit LOCATION as external; stating it makes the DROP semantics above
# obvious to the reader. PARTITIONED BY is placed before STORED AS so the same
# statement is also valid HiveQL (Spark accepts the clauses in any order).
spark.sql(
    """
    CREATE EXTERNAL TABLE americancrimes.economy (
        state VARCHAR(70),
        household_median_income FLOAT,
        min_wage_state FLOAT,
        min_wage_federal FLOAT,
        cpi_average FLOAT,
        unemployed INT,
        employed INT,
        employable_pop INT,
        gdp_state FLOAT
    )
    PARTITIONED BY (
        year DATE
    )
    STORED AS PARQUET
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/americancrimes.db/economy/'
    """
)

DataFrame[]

In [5]:
# List the tables again to confirm that economy is now registered.
spark.sql(
    """
    SHOW TABLES FROM americancrimes
    """
).show()

+--------------+----------------+-----------+
|      database|       tableName|isTemporary|
+--------------+----------------+-----------+
|americancrimes|civillian_deaths|      false|
|americancrimes|    crime_by_age|      false|
|americancrimes|   crime_by_race|      false|
|americancrimes|          crimes|      false|
|americancrimes|      demography|      false|
|americancrimes|         economy|      false|
+--------------+----------------+-----------+



In [6]:
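# Manual step: inspect HDFS and check the table location
# (/warehouse/americancrimes.db/economy/) used by the CREATE TABLE statement above.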

In [7]:
# Show the table's column definitions as a pandas DataFrame for rich display.
spark.sql(
    """
    DESCRIBE FORMATTED americancrimes.economy
    """
).toPandas()

Unnamed: 0,col_name,data_type,comment
0,state,varchar(70),
1,household_median_income,float,
2,min_wage_state,float,
3,min_wage_federal,float,
4,cpi_average,float,
5,unemployed,int,
6,employed,int,
7,employable_pop,int,
8,gdp_state,float,
9,year,date,


In [8]:
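# Manual step: upload the Parquet data files into the table's HDFS location,
# organised in one sub-directory per partition value of `year`.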

In [9]:
# Recovering partitions is required so that the Hive Metastore (the catalog)
# is updated. Without it, Hive and the query engines do not know that new
# partitions have been added to the partitioned table.
spark.catalog.recoverPartitions("americancrimes.economy")

# Read the table back to check what is visible through the catalog.
spark.sql(
    """
    SELECT *
    FROM americancrimes.economy
    """
).toPandas()

Unnamed: 0,state,household_median_income,min_wage_state,min_wage_federal,cpi_average,unemployed,employed,employable_pop,gdp_state,year


In [10]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()