In [5]:
# Prerequisites for the spark-xml dependency configured below:
# (1) Download the Databricks spark-xml library (jar) from
#     https://repo1.maven.org/maven2/com/databricks/spark-xml_2.12/0.12.0/spark-xml_2.12-0.12.0.jar
# (2) Upload the jar file to the HDFS root
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# HDFS directory handed to Spark SQL as its warehouse root.
warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Hive-enabled session: points at the Hive metastore Thrift endpoint and
# registers the spark-xml jar stored at the HDFS root.
# getOrCreate() hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.jars", "hdfs://hdfs-nn:9000/spark-xml_2.12-0.12.0.jar")
    .enableHiveSupport()
    .getOrCreate()
)

In [6]:
# List every table currently registered in the `americancrimes` database.
show_tables_query = """
    SHOW TABLES FROM americancrimes
    """
spark.sql(show_tables_query).show()

+--------------+--------------------+-----------+
|      database|           tableName|isTemporary|
+--------------+--------------------+-----------+
|americancrimes|    civillian_deaths|      false|
|americancrimes|        crime_by_age|      false|
|americancrimes|       crime_by_race|      false|
|americancrimes|              crimes|      false|
|americancrimes|          demography|      false|
|americancrimes|derived_country_s...|      false|
|americancrimes|  derived_demography|      false|
|americancrimes| derived_state_stats|      false|
|americancrimes|             economy|      false|
|americancrimes|       police_deaths|      false|
|americancrimes|            politics|      false|
+--------------+--------------------+-----------+



In [7]:
# Rebuild the table definition from scratch so this cell can be re-run safely:
# first remove any previous definition, then declare the table again.
drop_social_stats_sql = """
    DROP TABLE IF EXISTS americancrimes.derived_social_stats
    """

# Parquet-backed table partitioned by `year`, stored at an explicit HDFS
# location inside the americancrimes database directory.
create_social_stats_sql = """
    CREATE TABLE americancrimes.derived_social_stats (
        state VARCHAR(70),
        household_median_income FLOAT,
        min_wage_state FLOAT,
        gdp_state FLOAT,
        candidatevotes INT,
        totalvotes INT,
        violent_crime INT,
        property_crime INT,
        total_pop INT,
        party VARCHAR(40),
        party_results_ratio DOUBLE,
        crimes_100k DOUBLE
    )
    STORED AS PARQUET
    PARTITIONED BY (
        year DATE
    )
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/americancrimes.db/derived_social_stats/'
    
    """

spark.sql(drop_social_stats_sql)
spark.sql(create_social_stats_sql)

DataFrame[]

In [8]:
# Re-list the database's tables; `derived_social_stats` should now appear.
tables_in_database = spark.sql(
    """
    SHOW TABLES FROM americancrimes
    """
)
tables_in_database.show()

+--------------+--------------------+-----------+
|      database|           tableName|isTemporary|
+--------------+--------------------+-----------+
|americancrimes|    civillian_deaths|      false|
|americancrimes|        crime_by_age|      false|
|americancrimes|       crime_by_race|      false|
|americancrimes|              crimes|      false|
|americancrimes|          demography|      false|
|americancrimes|derived_country_s...|      false|
|americancrimes|  derived_demography|      false|
|americancrimes|derived_social_stats|      false|
|americancrimes| derived_state_stats|      false|
|americancrimes|             economy|      false|
|americancrimes|       police_deaths|      false|
|americancrimes|            politics|      false|
+--------------+--------------------+-----------+



In [9]:
# Inspect the new table's metadata (column names, data types, comments).
# The result is converted to pandas and left as the cell's final expression
# so the notebook displays it.
social_stats_description = spark.sql(
    """
    DESCRIBE FORMATTED americancrimes.derived_social_stats
    """
)
social_stats_description.toPandas()

Unnamed: 0,col_name,data_type,comment
0,state,varchar(70),
1,household_median_income,float,
2,min_wage_state,float,
3,gdp_state,float,
4,candidatevotes,int,
5,totalvotes,int,
6,violent_crime,int,
7,property_crime,int,
8,total_pop,int,
9,party,varchar(40),


In [10]:
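# Before running the next cell, upload the data files into HDFS under the
# table's LOCATION (the partitioned table expects one `year=<value>`
# subdirectory per partition).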

In [11]:
# Files copied straight into HDFS are invisible to the Hive Metastore (the
# catalog) until its partition list is refreshed. Recovering partitions
# registers the new partitions of the partitioned table, so that Hive and
# the other query engines can see them.
social_stats_table = "americancrimes.derived_social_stats"
spark.catalog.recoverPartitions(social_stats_table)

# Preview the table contents after the refresh.
select_all_query = """
    SELECT *
    FROM americancrimes.derived_social_stats
    """
spark.sql(select_all_query).show()

+-----+-----------------------+--------------+---------+--------------+----------+-------------+--------------+---------+-----+-------------------+-----------+----+
|state|household_median_income|min_wage_state|gdp_state|candidatevotes|totalvotes|violent_crime|property_crime|total_pop|party|party_results_ratio|crimes_100k|year|
+-----+-----------------------+--------------+---------+--------------+----------+-------------+--------------+---------+-----+-------------------+-----------+----+
+-----+-----------------------+--------------+---------+--------------+----------+-------------+--------------+---------+-----+-------------------+-----------+----+

