In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Default HDFS location under which managed databases and tables are stored.
warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Build (or reuse) a Hive-enabled session: the warehouse directory is set to
# the location above and the metastore is reached through its thrift URI.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# Which databases do we have?
show_databases_sql = """
    SHOW DATABASES
    """
spark.sql(show_databases_sql).show()

+--------------+
|     namespace|
+--------------+
|americancrimes|
|       default|
|       economy|
|      politics|
+--------------+



In [3]:
# List the tables registered in the americancrimes database
# before the crimes table is (re)created.
tables_before_sql = """
    SHOW TABLES FROM americancrimes
    """
spark.sql(tables_before_sql).show()

+--------------+----------------+-----------+
|      database|       tableName|isTemporary|
+--------------+----------------+-----------+
|americancrimes|civillian_deaths|      false|
+--------------+----------------+-----------+



In [4]:
# Remove any previous definition first so this cell can be re-run safely.
drop_crimes_sql = """
    DROP TABLE IF EXISTS americancrimes.crimes
    """

# External Parquet table partitioned by `year`, backed by an explicit
# HDFS location.
# NOTE(review): the partition column is named `year` but typed DATE —
# confirm this matches the partition directory values written at LOCATION.
create_crimes_sql = """
    CREATE EXTERNAL TABLE americancrimes.crimes (
        violent_crime INT,
        property_crime INT,
        state STRING,
        state_abb STRING
    )
    STORED AS PARQUET
    PARTITIONED BY (
        year DATE
    )
    LOCATION 'hdfs://hdfs-nn:9000/warehouse/americancrimes.db/crimes/'
    """

spark.sql(drop_crimes_sql)
spark.sql(create_crimes_sql)

DataFrame[]

In [5]:
# List the tables again to confirm that `crimes` is now registered
# in the americancrimes database.
tables_after_sql = """
    SHOW TABLES FROM americancrimes
    """
spark.sql(tables_after_sql).show()

+--------------+----------------+-----------+
|      database|       tableName|isTemporary|
+--------------+----------------+-----------+
|americancrimes|civillian_deaths|      false|
|americancrimes|          crimes|      false|
+--------------+----------------+-----------+



In [6]:
# Preview the rows of the newly created table through the catalog.
select_crimes_sql = """
    SELECT *
    FROM americancrimes.crimes
    """
spark.sql(select_crimes_sql).show()

+-------------+--------------+-----+---------+----+
|violent_crime|property_crime|state|state_abb|year|
+-------------+--------------+-----+---------+----+
+-------------+--------------+-----+---------+----+



In [7]:
# Inspect the table metadata (columns, partition information, table details);
# converting to pandas gives a rich, untruncated display in the notebook.
describe_crimes_sql = """
    DESCRIBE FORMATTED americancrimes.crimes
    """
crimes_metadata = spark.sql(describe_crimes_sql)
crimes_metadata.toPandas()

Unnamed: 0,col_name,data_type,comment
0,violent_crime,int,
1,property_crime,int,
2,state,string,
3,state_abb,string,
4,year,date,
5,# Partition Information,,
6,# col_name,data_type,comment
7,year,date,
8,,,
9,# Detailed Table Information,,


In [8]:
# Recovering partitions is needed so that the Hive Metastore (catalog)
# is updated. Otherwise Hive and the querying engines do not know
# that there are new partitions in the partitioned table.
spark.catalog.recoverPartitions("americancrimes.crimes")

# Filter on the partition column with a typed, single-quoted DATE literal.
# Single quotes are the standard SQL string delimiter (double quotes can be
# interpreted as identifiers under ANSI-style settings), and the DATE prefix
# avoids relying on an implicit string-to-date cast in the comparison.
spark.sql(
    """
    SELECT *
    FROM americancrimes.crimes
    WHERE year = DATE '2011-01-01'
    """
).show()

+-------------+--------------+-----+---------+----+
|violent_crime|property_crime|state|state_abb|year|
+-------------+--------------+-----+---------+----+
+-------------+--------------+-----+---------+----+



In [9]:
# Stop the Spark session now that the notebook is done with it.
spark.stop()