In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip

# `warehouse` is the default location for managed databases and tables.
warehouse = 'hdfs://hdfs-nn:9000/warehouse'

# Session builder wired to the Hive metastore and the Delta Lake SQL
# extension/catalog. The chain is parenthesised so that no trailing line
# continuation can accidentally swallow the statement that follows it.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Pass the builder through delta's pip helper, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Source tables read from the metastore.
literacia_df = spark.table("database.jobs")
popu_df = spark.table("database.population")

# Keep only the literacy-rate indicator rows, then discard the descriptive
# columns. Column names are passed as plain strings: that form of drop() works
# on every PySpark version, whereas passing several Column objects raises
# TypeError before PySpark 3.4.
literacia_df = (
    literacia_df
    .filter(col("indicador_name").like("%Literacy rate%"))
    .drop("indicador_name", "pais_cod", "indicador_code")
)

# Keep only rows whose indicator code matches SP.POP.TOTL, drop the
# identifier/descriptive columns, and rename the remaining columns so they
# line up with the literacy frame's join keys. The value column becomes
# "valorI" so it does not collide with the literacy "valor" after the join.
# NOTE(review): the drop list mixes Portuguese and English column names;
# drop() ignores names that are not in the schema -- confirm which set exists.
popu_df = (
    popu_df
    .filter(col("Indicator_code").like("%SP.POP.TOTL%"))
    .drop(
        "cod_pais",
        "cod_indicador",
        "nome_indicador",
        "nome_pais",
        "Country_code",
        "Indicator_name",
        "Indicator_code",
    )
    .withColumnRenamed("Country_name", "pais_name")
    .withColumnRenamed("valor", "valorI")
)

# Inner join on country name and year.
literacia_df = literacia_df.join(popu_df, ["pais_name", "ano"])

# "valor" is treated as a percentage: valor / 100 * valorI, truncated to int.
# presumably valorI is the total population at this point -- TODO confirm.
literacia_df = literacia_df.withColumn(
    "valorI",
    ((col("valor") / 100) * col("valorI")).cast("int"),
)

In [3]:
# Restrict the data to the countries listed below.
countries_to_filter = [
    "United States",
    "Canada",
    "United Kingdom",
    "Germany",
    "France",
]
literacia_df = literacia_df.where(
    col("pais_name").isin(countries_to_filter)
)

In [4]:
from pyspark.sql import functions as F
def _regional_average(source_df, region_name, countries):
    """Average ``valor`` and ``valorI`` per year over the given countries.

    Args:
        source_df: DataFrame with ``pais_name``, ``ano``, ``valor`` and
            ``valorI`` columns.
        region_name: Label written to the ``regiao`` column of every row.
        countries: ``pais_name`` values that belong to the region.

    Returns:
        DataFrame with columns ``regiao``, ``ano``, ``valor``, ``valorI``,
        one row per year present for the selected countries.
    """
    return (
        source_df
        .filter(F.col("pais_name").isin(*countries))
        .withColumn("regiao", F.lit(region_name))
        .groupBy("regiao", "ano")
        .agg(F.avg("valor").alias("valor"), F.avg("valorI").alias("valorI"))
    )


# One aggregated DataFrame per region, both built by the same helper.
north_america_df = _regional_average(
    literacia_df, "North America", ["United States", "Canada"]
)
europe_df = _regional_average(
    literacia_df, "Europe", ["United Kingdom", "Germany", "France"]
)

# Stack the two regions (both frames share the same column order), then fix
# the output types: float for "valor", integer for "valorI".
literacia_df = (
    north_america_df.unionAll(europe_df)
    .withColumn("valor", F.col("valor").cast(FloatType()))
    .withColumn("valorI", F.col("valorI").cast(IntegerType()))
)


In [5]:
# Write the aggregated frame as a Delta table at the HDFS path below,
# overwriting whatever is already stored there.
(
    literacia_df.write
    .format("delta")
    .mode("overwrite")
    .save("hdfs://hdfs-nn:9000/Projeto/gold/TabelaLiteracia/")
)

In [6]:
# Preview the table through the metastore.
# NOTE(review): database.LiteraciaGold is not created in any visible cell;
# presumably it is an external table over the Delta path written above -- confirm.
gold_preview_df = spark.sql(
    """
    SELECT * FROM database.LiteraciaGold
    """
)
gold_preview_df.show()

+-------------+----+-----+------+
|       regiao| ano|valor|valorI|
+-------------+----+-----+------+
|North America|1994| null|  null|
|North America|2015| null|  null|
|North America|2013| null|  null|
|North America|1990| null|  null|
|North America|1993| null|  null|
|North America|2006| null|  null|
|North America|1998| null|  null|
|North America|2003| null|  null|
|North America|2008| null|  null|
|North America|2001| null|  null|
|North America|2000| null|  null|
|North America|2011| null|  null|
|North America|1996| null|  null|
|North America|2014| null|  null|
|North America|2004| null|  null|
|North America|1991| null|  null|
|North America|2002| null|  null|
|North America|2012| null|  null|
|North America|1997| null|  null|
|North America|1999| null|  null|
+-------------+----+-----+------+
only showing top 20 rows



In [7]:
# Stop the Spark session created in the first cell.
spark.stop()