In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip, when  # Importe a função 'when'

# Default location for managed databases and tables.
warehouse = 'hdfs://hdfs-nn:9000/warehouse'

# Session builder: Hive metastore access plus the Delta Lake SQL extension,
# catalog and delta-core package.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# configure_spark_with_delta_pip presumably comes from the `from delta import *`
# wildcard import — TODO confirm. The session is created (or reused) from its result.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [2]:
# Countries to keep in the education dataset.
# NOTE(review): the list is used with an exact-match `isin` filter below, so
# entries such as "Czech" and "Slovakia" only match if the source table spells
# the country names exactly this way — TODO confirm against pais_name values.
desired_countries_educ = [
    "Liechtenstein",
    "Germany",
    "Denmark",
    "Switzerland",
    "Poland",
    "Czech",
    "Slovakia",
    "Slovenia",
    "Hungary",
    "Austria",
]

In [3]:
# Build the education dataset from database2.jobs: keep the two indicator
# families of interest, the selected countries and the years 2010 to 2015,
# then tag every row with a gender code.

# Indicator-name predicates for the two indicator families kept.
is_advanced_education = col("indicador_name").like("%Labor force with advanced education%")
is_own_account = col("indicador_name").like("%Own-account%")

# Gender code derived from the indicator name: "F" for names containing
# " female", "M" for names containing " male". Rows matching neither pattern
# get no value and are removed by the isNotNull filter at the end of the chain.
gender_code = (
    when(col("indicador_name").like("% female%"), "F")
    .when(col("indicador_name").like("% male%"), "M")
)

educ = (
    spark.table("database2.jobs")
    .filter(is_advanced_education | is_own_account)
    .filter(col("pais_name").isin(desired_countries_educ))
    .filter(col("ano").between("2010", "2015"))  # inclusive range 2010 to 2015
    .drop(col("pais_cod"), col("indicador_code"))
    .withColumn("genero", gender_code)
    .filter(col("genero").isNotNull())
)
educ.show()

+---------+--------------------+----+-------+------+
|pais_name|      indicador_name| ano|  valor|genero|
+---------+--------------------+----+-------+------+
|  Austria|Labor force with ...|2010|75.7348|     F|
|  Austria|Labor force with ...|2011|75.4548|     F|
|  Austria|Labor force with ...|2012|75.6985|     F|
|  Austria|Labor force with ...|2013|75.9328|     F|
|  Austria|Labor force with ...|2014|76.1737|     F|
|  Austria|Labor force with ...|2015|76.4943|     F|
|  Austria|Labor force with ...|2010| 75.245|     M|
|  Austria|Labor force with ...|2011|75.9108|     M|
|  Austria|Labor force with ...|2012|76.5963|     M|
|  Austria|Labor force with ...|2013|76.9093|     M|
|  Austria|Labor force with ...|2014|76.7986|     M|
|  Austria|Labor force with ...|2015|76.8852|     M|
|  Austria|Own-account worke...|2010|  6.176|     F|
|  Austria|Own-account worke...|2011|  6.052|     F|
|  Austria|Own-account worke...|2012|  6.086|     F|
|  Austria|Own-account worke...|2013|  6.317| 

In [4]:
# Persist the education dataset as a Delta table in the gold layer,
# replacing whatever was previously stored at this path.
gold_education_path = "hdfs://hdfs-nn:9000/Projeto/gold/TabelaEducation/"

(
    educ.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(gold_education_path)
)

In [5]:
# Read the education table back through the metastore and display it.
# NOTE(review): database2.Education is not created anywhere in this notebook;
# presumably it is an external table registered over the gold Delta path
# written in the previous cell — TODO confirm.
education_query = """
    SELECT * FROM database2.Education
    """
spark.sql(education_query).show()

+---------+--------------------+----+-------+------+
|pais_name|      indicador_name| ano|  valor|genero|
+---------+--------------------+----+-------+------+
|  Austria|Labor force with ...|2010|75.7348|     F|
|  Austria|Labor force with ...|2011|75.4548|     F|
|  Austria|Labor force with ...|2012|75.6985|     F|
|  Austria|Labor force with ...|2013|75.9328|     F|
|  Austria|Labor force with ...|2014|76.1737|     F|
|  Austria|Labor force with ...|2015|76.4943|     F|
|  Austria|Labor force with ...|2010| 75.245|     M|
|  Austria|Labor force with ...|2011|75.9108|     M|
|  Austria|Labor force with ...|2012|76.5963|     M|
|  Austria|Labor force with ...|2013|76.9093|     M|
|  Austria|Labor force with ...|2014|76.7986|     M|
|  Austria|Labor force with ...|2015|76.8852|     M|
|  Austria|Own-account worke...|2010|  6.176|     F|
|  Austria|Own-account worke...|2011|  6.052|     F|
|  Austria|Own-account worke...|2012|  6.086|     F|
|  Austria|Own-account worke...|2013|  6.317| 

In [6]:
# Stop the Spark session now that all of the notebook's work is done.
spark.stop()