In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip, when  # Importe a função 'when'

# Root location used for managed databases and tables (passed below as
# spark.sql.warehouse.dir).
warehouse = 'hdfs://hdfs-nn:9000/warehouse'

# Session builder: Hive metastore access plus the Delta Lake SQL extension
# and catalog. The chain is wrapped in parentheses so no line-continuation
# backslashes are needed.
# NOTE(review): configure_spark_with_delta_pip (presumably provided by the
# `delta` wildcard import) may set spark.jars.packages itself; confirm the
# explicit package pin below is still what ends up being used.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Create the session, or reuse one that is already running.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [2]:
# Countries to keep in the analysis.
# NOTE(review): these names are matched exactly (via isin on pais_name in a
# later cell); confirm that spellings such as "Czech" and "Slovakia" match
# the values actually present in the source table.
desired_countries = [
    "Liechtenstein",
    "Germany",
    "Denmark",
    "Switzerland",
    "Poland",
    "Czech",
    "Slovakia",
    "Slovenia",
    "Hungary",
    "Austria",
]


In [3]:
# Build the filtered view of database2.jobs in a single chained expression:
#   1. keep rows whose indicator name mentions the labor force participation
#      rate or own-account workers;
#   2. keep only the countries listed in desired_countries;
#   3. keep years 2010 through 2015 (between is inclusive on both ends);
#   4. drop the country-code and indicator-code columns;
#   5. derive `genero` from the indicator name: "F" for "% female%",
#      "M" for "% male%"; with no otherwise() branch every other row gets
#      NULL and is removed by the final filter.
# NOTE(review): the year bounds are passed as string literals; confirm the
# type of the `ano` column so the comparison behaves as intended.
tempo_nec = (
    spark.table("database2.jobs")
    .filter(
        col("indicador_name").like("%Labor force participation rate%")
        | col("indicador_name").like("%Own-account%")
    )
    .filter(col("pais_name").isin(desired_countries))
    .filter(col("ano").between("2010", "2015"))
    .drop(col("pais_cod"), col("indicador_code"))
    .withColumn(
        "genero",
        when(col("indicador_name").like("% female%"), "F")
        .when(col("indicador_name").like("% male%"), "M"),
    )
    .filter(col("genero").isNotNull())
)

# Preview the first rows of the result.
tempo_nec.show()

+---------+--------------------+----+------+------+
|pais_name|      indicador_name| ano| valor|genero|
+---------+--------------------+----+------+------+
|  Austria|Labor force parti...|2010|53.507|     F|
|  Austria|Labor force parti...|2011| 53.84|     F|
|  Austria|Labor force parti...|2012|54.336|     F|
|  Austria|Labor force parti...|2013|54.597|     F|
|  Austria|Labor force parti...|2014|54.561|     F|
|  Austria|Labor force parti...|2015|54.536|     F|
|  Austria|Labor force parti...|2010|66.864|     M|
|  Austria|Labor force parti...|2011|66.625|     M|
|  Austria|Labor force parti...|2012|66.648|     M|
|  Austria|Labor force parti...|2013|66.642|     M|
|  Austria|Labor force parti...|2014|66.102|     M|
|  Austria|Labor force parti...|2015|65.994|     M|
|  Austria|Own-account worke...|2010| 6.176|     F|
|  Austria|Own-account worke...|2011| 6.052|     F|
|  Austria|Own-account worke...|2012| 6.086|     F|
|  Austria|Own-account worke...|2013| 6.317|     F|
|  Austria|O

In [4]:
# Write the filtered frame in Delta format to the gold-layer path below.
# mode("overwrite") replaces any data already stored there, and the
# mergeSchema option is passed through to the writer.
(
    tempo_nec.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save("hdfs://hdfs-nn:9000/Projeto/gold/TabelaTempNec/")
)

In [5]:
# Read the table back through the metastore and preview it.
# NOTE(review): database.TempoNec is not created in the visible cells and its
# name differs from the path written earlier (.../TabelaTempNec/); presumably
# it is registered elsewhere over that location - confirm.
tempo_nec_query = """
    SELECT * FROM database.TempoNec
    """
spark.sql(tempo_nec_query).show()

+---------+--------------------+----+------+------+
|pais_name|      indicador_name| ano| valor|genero|
+---------+--------------------+----+------+------+
|   Poland|Labor force parti...|2010|48.272|     F|
|   Poland|Labor force parti...|2011|48.372|     F|
|   Poland|Labor force parti...|2012|48.764|     F|
|   Poland|Labor force parti...|2013|48.778|     F|
|   Poland|Labor force parti...|2014|49.164|     F|
|   Poland|Labor force parti...|2015|48.996|     F|
|   Poland|Labor force parti...|2010|64.264|     M|
|   Poland|Labor force parti...|2011|64.483|     M|
|   Poland|Labor force parti...|2012| 64.73|     M|
|   Poland|Labor force parti...|2013|64.874|     M|
|   Poland|Labor force parti...|2014|65.231|     M|
|   Poland|Labor force parti...|2015|65.082|     M|
|   Poland|Own-account worke...|2010|11.781|     F|
|   Poland|Own-account worke...|2011|11.697|     F|
|   Poland|Own-account worke...|2012|11.361|     F|
|   Poland|Own-account worke...|2013|10.885|     F|
|   Poland|O

In [6]:
# Shut down the Spark session now that the notebook's work is finished.
spark.stop()