In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip

# `warehouse` is the default storage location for managed databases and tables.
warehouse = 'hdfs://hdfs-nn:9000/warehouse'

# Session configuration: Hive metastore access plus the Delta Lake SQL
# extension and catalog. The chain is wrapped in parentheses instead of using
# backslash continuations, so no trailing "\" can accidentally swallow the
# statement that follows it.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# NOTE(review): configure_spark_with_delta_pip may set "spark.jars.packages"
# itself from the installed delta-spark version -- confirm that it agrees with
# the 2.4.0 artifact pinned above.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Load the two source tables from the metastore.
own_account = spark.table("database2.jobs")
borrowed_money = spark.table("database2.gender")

# Keep only the 2014 United States rows of the "Own-account" indicator.
own_account = own_account.filter(
    col("indicador_name").like("%Own-account%")
    & col("pais_name").like("%United States%")
    & col("ano").like("2014")
)

# Keep only the 2014 United States rows of the "Owns a credit card" indicator.
# (Despite its name, `borrowed_money` holds credit-card ownership data.)
borrowed_money = borrowed_money.filter(
    col("nome_pais").like("%United States%")
    & col("nome_indicador").like("%Owns a credit card%")
    & col("ano").like("2014")
)

In [3]:
# Project each table down to the columns needed for the join.
own_account = own_account.select(["ano", "indicador_name", "valor"])
borrowed_money = borrowed_money.select(
    ["ano", "nome_pais", "nome_indicador", "valor"]
)

# Preview up to 10 rows of each filtered table (credit-card data first).
borrowed_money.show(n=10)
own_account.show(n=10)

+----+-------------+--------------------+-----+
| ano|    nome_pais|      nome_indicador|valor|
+----+-------------+--------------------+-----+
|2014|United States|Owns a credit car...|60.13|
|2014|United States|Owns a credit car...|61.55|
|2014|United States|Owns a credit car...|58.71|
+----+-------------+--------------------+-----+

+----+--------------------+-----+
| ano|      indicador_name|valor|
+----+--------------------+-----+
|2014|Own-account worke...| 5.21|
|2014|Own-account worke...|6.988|
|2014|Own-account worke...| 6.17|
+----+--------------------+-----+



In [4]:
# Give the shared column names ("ano", "valor") a per-table suffix so the two
# sides remain distinguishable after the join.
own_account = (
    own_account
    .withColumnRenamed("ano", "ano_own_account")
    .withColumnRenamed("valor", "valor_own_account")
)
borrowed_money = (
    borrowed_money
    .withColumnRenamed("ano", "ano_CreditCard")
    .withColumnRenamed("valor", "valor_CreditCard")
)


In [5]:
# Inner-join the two indicators on their year columns.
# NOTE(review): the year is the only join key, so every own-account row is
# paired with every credit-card row of the same year (many-to-many) -- confirm
# this is intended or add a more specific key.
joined_data = own_account.join(
    borrowed_money,
    on=own_account["ano_own_account"] == borrowed_money["ano_CreditCard"],
    how="inner",
)


In [6]:
# Turn on the session-level Delta schema auto-merge setting before writing.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true")

# Persist the joined result as a Delta table, replacing whatever is currently
# stored at the target path.
(
    joined_data.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save("hdfs://hdfs-nn:9000/Projeto/gold/TabelaCreditCard")
)

In [7]:
# Display the joined result (first 20 rows, long cell values truncated).
joined_data.show(n=20, truncate=True)

+---------------+--------------------+-----------------+--------------+-------------+--------------------+----------------+
|ano_own_account|      indicador_name|valor_own_account|ano_CreditCard|    nome_pais|      nome_indicador|valor_CreditCard|
+---------------+--------------------+-----------------+--------------+-------------+--------------------+----------------+
|           2014|Own-account worke...|             6.17|          2014|United States|Owns a credit car...|           60.13|
|           2014|Own-account worke...|            6.988|          2014|United States|Owns a credit car...|           60.13|
|           2014|Own-account worke...|             5.21|          2014|United States|Owns a credit car...|           60.13|
|           2014|Own-account worke...|             6.17|          2014|United States|Owns a credit car...|           61.55|
|           2014|Own-account worke...|            6.988|          2014|United States|Owns a credit car...|           61.55|
|       

In [8]:
# Stop the Spark session now that the notebook's work is done.
spark.stop()