In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip

# `warehouse` is the default storage location for managed databases and tables.
warehouse = 'hdfs://hdfs-nn:9000/warehouse'

# Build the session configuration. The chain is wrapped in parentheses instead
# of backslash continuations so that no trailing `\` can accidentally swallow
# whatever line happens to follow the last call.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse)
    # Remote Hive metastore that holds the table definitions.
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip (called below) also sets
    # spark.jars.packages, presumably to the artifact matching the installed
    # delta-spark package, which would supersede this pin -- confirm and keep
    # only one of the two.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Create the session, or reuse the one already running in this kernel.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# List every table registered under the `database` namespace as a quick
# sanity check that the metastore connection works.
tables_in_database = spark.sql(
    """
    SHOW TABLES FROM database
    """
)
tables_in_database.show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
| database|       gender|      false|
| database|         jobs|      false|
| database|literaciagold|      false|
+---------+-------------+-----------+



In [3]:
# Raw inputs, kept under their own names so this cell can be re-run safely and
# the unfiltered frames stay available for inspection.
jobs_df = spark.table("database.jobs")
# NOTE(review): confirm that `database.population` exists in the metastore;
# spark.table fails if the table cannot be resolved.
popu_df = spark.table("database.population")

# Keep only the literacy-rate indicator rows, discard the identifier columns
# that are no longer needed, attach the population figures for the same
# country/year, and derive `valorI` as the integer product of both values.
literacia_df = (
    jobs_df
    .filter(col("indicador_name").like("%Literacy rate%"))
    # Column names are passed as plain strings: passing several Column objects
    # to drop() raises TypeError on PySpark releases before 3.4.
    .drop("indicador_name", "pais_cod", "indicador_code")
    .join(popu_df, ["pais_name", "ano"])
    # NOTE(review): the product is cast to a 32-bit int; if `value` is a
    # percentage and `value_in_total_df` a head count this may overflow --
    # confirm the units and consider "long".
    .withColumn("valorI", (col("value") * col("value_in_total_df")).cast("int"))
)
literacia_df.show()

root
 |-- pais_name: string (nullable = true)
 |-- ano: integer (nullable = true)
 |-- valor: float (nullable = true)



In [4]:
spark.stop()