In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip

# Default location for managed databases and tables.
warehouse = 'hdfs://hdfs-nn:9000/warehouse'

# Build a Hive-enabled session with the Delta Lake SQL extension and catalog.
# The chain is wrapped in parentheses instead of using backslash continuations,
# so no dangling `\` is left after the last call.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip presumably sets
    # spark.jars.packages itself from the installed delta package version,
    # which would make this explicit pin redundant -- confirm before removing.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Single assignment (the original repeated the target: `spark = spark = ...`).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Preview the first 50 rows of the source table.
jobs_query = """
    SELECT * FROM database2.jobs
    """
jobs_preview = spark.sql(jobs_query)
jobs_preview.show(50)

+--------------------+--------+--------------------+--------------+----+---------+
|           pais_name|pais_cod|      indicador_name|indicador_code| ano|    valor|
+--------------------+--------+--------------------+--------------+----+---------+
|East Asia & Pacif...|     EAP|Access to electri...|EG.ELC.ACCS.ZS|1990| 82.59917|
|East Asia & Pacif...|     EAP|Access to electri...|EG.ELC.ACCS.ZS|1991|81.601685|
|East Asia & Pacif...|     EAP|Access to electri...|EG.ELC.ACCS.ZS|1992| 83.88937|
|East Asia & Pacif...|     EAP|Access to electri...|EG.ELC.ACCS.ZS|1993| 83.24599|
|East Asia & Pacif...|     EAP|Access to electri...|EG.ELC.ACCS.ZS|1994|  84.5359|
|East Asia & Pacif...|     EAP|Access to electri...|EG.ELC.ACCS.ZS|1995| 85.45995|
|East Asia & Pacif...|     EAP|Access to electri...|EG.ELC.ACCS.ZS|1996| 86.57579|
|East Asia & Pacif...|     EAP|Access to electri...|EG.ELC.ACCS.ZS|1997| 87.74911|
|East Asia & Pacif...|     EAP|Access to electri...|EG.ELC.ACCS.ZS|1998| 88.50378|
|Eas

In [3]:
# Select, for Portugal in 2010, the rows whose indicator name mentions either
# "Time required to start a business" or "Own-account", and keep only the
# descriptive columns (the country/indicator code columns are dropped).
#
# Written as one chained expression so the cell is idempotent: `tempo_nec`
# is always rebuilt from the source table rather than re-filtered in place.
tempo_nec = (
    spark.table("database2.jobs")
    .filter(
        col("indicador_name").like("%Time required to start a business%")
        | col("indicador_name").like("%Own-account%")
    )
    .filter(col("pais_name").like("%Portugal%"))
    # Exact match, so use equality rather than a wildcard-free LIKE pattern.
    # NOTE(review): if `ano` is stored as a numeric column, compare against
    # the integer 2010 instead -- confirm against the table schema.
    .filter(col("ano") == "2010")
    # Drop by column name: passing several Column objects to drop() is only
    # accepted by newer PySpark releases, while names work on all versions.
    .drop("pais_cod", "indicador_code")
)
tempo_nec.show()

+---------+--------------------+----+------+
|pais_name|      indicador_name| ano| valor|
+---------+--------------------+----+------+
| Portugal|Own-account worke...|2010|16.272|
| Portugal|Own-account worke...|2010|17.693|
| Portugal|Own-account worke...|2010|17.016|
| Portugal|Time required to ...|2010|   5.5|
+---------+--------------------+----+------+



In [4]:
# Persist the filtered rows as a Delta dataset, replacing any previous
# contents at the target path.
gold_path = "hdfs://hdfs-nn:9000/Projeto/gold/TabelaTempNec/"
delta_writer = (
    tempo_nec.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
delta_writer.save(gold_path)

In [5]:
# Read the table back through the metastore and display it.
tempo_nec_query = """
    SELECT * FROM database2.TempoNec
    """
tempo_nec_table = spark.sql(tempo_nec_query)
tempo_nec_table.show()

+---------+--------------------+----+------+
|pais_name|      indicador_name| ano| valor|
+---------+--------------------+----+------+
| Portugal|Own-account worke...|2010|16.272|
| Portugal|Own-account worke...|2010|17.693|
| Portugal|Own-account worke...|2010|17.016|
| Portugal|Time required to ...|2010|   5.5|
+---------+--------------------+----+------+



In [6]:
# Stop the Spark session created in the first cell now that the notebook is done.
spark.stop()