In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip, lit, avg, when, broadcast

# `warehouse` is the default location for managed databases and tables.
warehouse = 'hdfs://hdfs-nn:9000/warehouse'

# Session builder: Hive metastore connection plus the Delta Lake SQL extension
# and catalog. The chain is parenthesized rather than backslash-continued; the
# previous form ended in a dangling backslash that only parsed because a blank
# line happened to follow it.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip (below) presumably sets
    # spark.jars.packages itself from the installed delta-spark version, which
    # would override this pin -- confirm against the Delta documentation.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Single assignment (was the redundant `spark = spark = ...`).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Jobs table: keep only rows whose indicator name mentions tertiary enrollment,
# then discard the two code columns.
jobs_df = (
    spark.table("database.jobs")
    .filter(col("indicador_name").like("%enrollment, tertiary%"))
    .drop(col("pais_cod"), col("indicador_code"))
)

# Population table: keep only rows whose indicator code matches SP.POP.TOTL,
# drop the identifier/label columns, and rename `valor` -> `valorI` and
# `nome_pais` -> `pais_name` so the frame shares the `pais_name`/`ano` column
# names with jobs_df without colliding on `valor`. `ano` is cast to an integer.
popu_df = (
    spark.table("database.population")
    .filter(col("cod_indicador").like("%SP.POP.TOTL%"))
    .drop(
        col("cod_pais"),
        col("cod_indicador"),
        col("nome_indicador"),
        col("Country_name"),
        col("Country_code"),
        col("Indicator_name"),
        col("Indicator_code"),
    )
    .withColumnRenamed("valor", "valorI")
    .withColumnRenamed("nome_pais", "pais_name")
    .withColumn("ano", col("ano").cast(IntegerType()))
)

In [3]:
# SQL LIKE patterns for the countries of interest and the years to keep.
paises = ["%Liechtenstein%", "%Germany%", "%Denmark%", "%Switzerland%", "%Poland%", "%Czech%", 
          "%Slovakia%", "%Slovenia%", "%Hungary%", "%Austria%"]
anos = ["2010", "2011", "2012", "2013", "2014", "2015"]


def _filter_by_country_and_year(df, value_col, country_patterns, years):
    """Keep rows for the selected countries and years that have a value.

    Replaces the two copy-pasted ten-way OR chains with a single definition.

    Args:
        df: DataFrame with `pais_name` and `ano` columns.
        value_col: Name of the column that must be non-null.
        country_patterns: Non-empty sequence of SQL LIKE patterns; a row is
            kept when `pais_name` matches at least one of them.
        years: Values `ano` must be in (passed straight to `Column.isin`).

    Returns:
        The filtered DataFrame.

    Raises:
        ValueError: If `country_patterns` is empty.
    """
    if not country_patterns:
        raise ValueError("country_patterns must contain at least one LIKE pattern")

    # Fold the patterns into one left-associative OR predicate, in list order.
    country_match = col("pais_name").like(country_patterns[0])
    for pattern in country_patterns[1:]:
        country_match = country_match | col("pais_name").like(pattern)

    return df.filter(country_match).filter(
        col("ano").isin(years) & col(value_col).isNotNull()
    )


jobs_df = _filter_by_country_and_year(jobs_df, "valor", paises, anos)
popu_df = _filter_by_country_and_year(popu_df, "valorI", paises, anos)

In [4]:
# `valor` is divided by 100 (treated as a percentage) and applied to half of
# `valorI`; the product is truncated to an integer and overwrites `valorI`.
# NOTE(review): the division by 2 presumably approximates the population of a
# single gender -- confirm.
valor_fraction = col("valor") / 100
half_valorI = col("valorI") / 2

# Left join keeps every population row; `valor` is null where jobs_df has no
# matching (pais_name, ano) row.
literacia_df = (
    popu_df.join(jobs_df, on=["pais_name", "ano"], how="left")
    .withColumn("valorI", (valor_fraction * half_valorI).cast("int"))
    .withColumnRenamed("pais_name", "paises")
)

In [5]:
# Gender code derived from the indicator text: "F" when it contains " female",
# otherwise "M" when it contains " male"; null when neither matches. The
# leading space in "% male%" keeps it from matching inside "female".
genero_col = (
    when(col("indicador_name").like("% female%"), "F")
    .when(col("indicador_name").like("% male%"), "M")
)

# Attach the code, discard rows without one, then drop the indicator text.
literacia_df = (
    literacia_df
    .withColumn("genero", genero_col)
    .filter(col("genero").isNotNull())
    .drop(col("indicador_name"))
)

In [6]:
# Persist the result as a Delta table, replacing whatever is already stored
# at the target path.
gold_path = "hdfs://hdfs-nn:9000/Projeto/gold/TabelaMatriculaS/"

(
    literacia_df.write
    .format("delta")
    .mode("overwrite")
    .save(gold_path)
)

In [7]:
# Preview the gold table through the catalog.
# NOTE(review): database.MatriculaSGold is not created anywhere in this
# notebook; presumably it is an external table registered over the Delta path
# written above -- confirm it exists before relying on Run All.
matricula_gold_query = """
    SELECT * FROM database.MatriculaSGold
    """

matricula_gold_df = spark.sql(matricula_gold_query)
matricula_gold_df.show()

+-------+----+--------+--------+------+
| paises| ano|   valor|  valorI|genero|
+-------+----+--------+--------+------+
|Austria|2014|86.10368| 3679363|     F|
|Austria|2015| 87.8598| 3796729|     F|
|Austria|2014|72.15393| 3083265|     M|
|Austria|2015|73.60846| 3180878|     M|
|Denmark|2010|87.44201| 2425502|     F|
|Denmark|2011|90.58639| 2523089|     F|
|Denmark|2012|93.11905| 2603409|     F|
|Denmark|2013|94.50449| 2653181|     F|
|Denmark|2014|95.34254| 2690316|     F|
|Denmark|2015|96.29233| 2736379|     F|
|Denmark|2010| 60.3624| 1674357|     M|
|Denmark|2011|63.56284| 1770406|     M|
|Denmark|2012|65.77505| 1838929|     M|
|Denmark|2013|68.03267| 1909994|     M|
|Denmark|2014|67.44238| 1903047|     M|
|Denmark|2015|68.70282| 1952356|     M|
|Germany|2013|58.47609|23579198|     F|
|Germany|2014|62.61369|25353064|     F|
|Germany|2015| 65.5982|26792471|     F|
|Germany|2013| 62.3437|25138728|     M|
+-------+----+--------+--------+------+
only showing top 20 rows



In [8]:
# Shut down the Spark session now that the notebook's work is finished.
spark.stop()