In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip

# `warehouse` is the default location for managed databases and tables.
warehouse = 'hdfs://hdfs-nn:9000/warehouse'

# Build the session configuration: Hive metastore access plus the Delta Lake
# SQL extension and catalog. The chain is parenthesised rather than joined with
# trailing backslashes, so it no longer depends on a blank line following the
# last call to terminate the statement.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# NOTE(review): configure_spark_with_delta_pip is understood to set
# spark.jars.packages itself from the installed delta-spark version -- confirm
# whether the explicit package setting above still takes effect.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Load each source table and narrow it to the slice used in the comparison.
# Every condition is applied as its own filter, so a row must satisfy all of
# them. A LIKE pattern without wildcards ("2014") only matches that exact text.

# Own-account indicator rows for Portugal in 2014.
own_account = (
    spark.table("database2.jobs")
    .filter(col("indicador_name").like("%Own-account%"))
    .filter(col("pais_name").like("%Portugal%"))
    .filter(col("ano").like("2014"))
)

# "Saved at a financial institution" indicator rows for the United States in 2014.
publicpayment = (
    spark.table("database2.gender")
    .filter(col("nome_pais").like("%United States%"))
    .filter(col("nome_indicador").like("%Saved at a financial institution%"))
    .filter(col("ano").like("2014"))
)

In [3]:
# Keep only the columns needed downstream.
own_account = own_account.select(
    "ano",
    "indicador_name",
    "valor",
)
publicpayment = publicpayment.select(
    "ano",
    "nome_pais",
    "nome_indicador",
    "valor",
)


In [4]:
# Give the year and value columns table-specific names so the two frames can
# sit side by side after the join, then drop every row that contains a null.
own_account = (
    own_account
    .withColumnRenamed("ano", "ano_own_account")
    .withColumnRenamed("valor", "valor_own_account")
    .dropna()
)
publicpayment = (
    publicpayment
    .withColumnRenamed("ano", "ano_publicpayment")
    .withColumnRenamed("valor", "valor_publicpayment")
    .dropna()
)

# Print both prepared frames for inspection.
own_account.show()
publicpayment.show()

+---------------+--------------------+-----------------+
|ano_own_account|      indicador_name|valor_own_account|
+---------------+--------------------+-----------------+
|           2014|Own-account worke...|           11.158|
|           2014|Own-account worke...|           16.705|
|           2014|Own-account worke...|           14.019|
+---------------+--------------------+-----------------+

+-----------------+-------------+--------------------+-------------------+
|ano_publicpayment|    nome_pais|      nome_indicador|valor_publicpayment|
+-----------------+-------------+--------------------+-------------------+
|             2014|United States|Saved at a financ...|              54.11|
|             2014|United States|Saved at a financ...|              53.47|
|             2014|United States|Saved at a financ...|              54.77|
+-----------------+-------------+--------------------+-------------------+



In [5]:
# Inner-join the two frames on their year columns.
# NOTE(review): both inputs were filtered to ano 2014 earlier, so a year-only
# key pairs every own_account row with every publicpayment row. Confirm that
# this full pairing is intended, or add a more specific join key.
joined_data = own_account.join(
    publicpayment,
    on=own_account["ano_own_account"] == publicpayment["ano_publicpayment"],
    how="inner",
)


In [6]:
# Turn on Delta schema auto-merge for this session.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

# Persist the joined result as a Delta table, replacing what is already stored
# at the target path.
# NOTE(review): the write uses mode "overwrite" together with mergeSchema --
# confirm whether overwriteSchema was the intended option.
(
    joined_data.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save("hdfs://hdfs-nn:9000/Projeto/gold/TabelaPublicPayment")
)


In [7]:
# Display up to 20 rows of the joined result.
joined_data.show(n=20)

+---------------+--------------------+-----------------+-----------------+-------------+--------------------+-------------------+
|ano_own_account|      indicador_name|valor_own_account|ano_publicpayment|    nome_pais|      nome_indicador|valor_publicpayment|
+---------------+--------------------+-----------------+-----------------+-------------+--------------------+-------------------+
|           2014|Own-account worke...|           14.019|             2014|United States|Saved at a financ...|              54.11|
|           2014|Own-account worke...|           16.705|             2014|United States|Saved at a financ...|              54.11|
|           2014|Own-account worke...|           11.158|             2014|United States|Saved at a financ...|              54.11|
|           2014|Own-account worke...|           14.019|             2014|United States|Saved at a financ...|              53.47|
|           2014|Own-account worke...|           16.705|             2014|United States|Sa

In [8]:
# Stop the Spark session now that the notebook's work is finished.
spark.stop()