In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from delta import *
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip, when  # Importe a função 'when'

# `warehouse` is the default location for managed databases and tables.
warehouse = 'hdfs://hdfs-nn:9000/warehouse'

# Session builder with Hive support enabled, pointed at the Hive metastore and
# configured with the Delta Lake SQL extension, catalog and package.
builder = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# The builder is passed through configure_spark_with_delta_pip (from the
# `delta` star import) before the session is created or reused.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [None]:
# Countries to keep in the self-employment dataset.
# NOTE(review): `isin` below is an exact match, so entries such as "Czech" and
# "Slovakia" only match if `pais_name` stores exactly these spellings — confirm
# against the values in database2.jobs.
desired_countries_self_employed = [
    "Liechtenstein", "Germany", "Denmark", "Switzerland", "Poland",
    "Czech", "Slovakia", "Slovenia", "Hungary", "Austria",
]

# Build the self-employment subset of database2.jobs in a single chain:
# filter by indicator, country and year, drop the code columns, then derive a
# gender flag from the indicator name.
self_employed = (
    spark.table("database2.jobs")
    # Keep only the "Self-employed" / "Own-account" indicators.
    .filter(
        col("indicador_name").like("%Self-employed%")
        | col("indicador_name").like("%Own-account%")
    )
    .filter(col("pais_name").isin(desired_countries_self_employed))
    # Years 2010 through 2015; `between` includes both bounds.
    # NOTE(review): the bounds are passed as strings — confirm this matches the
    # type of the `ano` column.
    .filter(col("ano").between("2010", "2015"))
    # Column names are passed as plain strings: PySpark releases before 3.4
    # reject several Column objects in one drop() call, while names work on
    # every version.
    .drop("pais_cod", "indicador_code")
    # "F" when the indicator name contains " female", "M" when it contains
    # " male"; with no otherwise() branch every other row gets null.
    .withColumn(
        "genero",
        when(col("indicador_name").like("% female%"), "F")
        .when(col("indicador_name").like("% male%"), "M"),
    )
    # Discard the rows that received no gender flag.
    .filter(col("genero").isNotNull())
)

self_employed.show()

In [None]:
# Save the filtered data in Delta format at the gold path, using overwrite
# mode with the mergeSchema option switched on.
(
    self_employed.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save("hdfs://hdfs-nn:9000/Projeto/gold/TabelaSelfEmployed/")
)

In [None]:
# Query the metastore table and display its rows.
# NOTE(review): database2.SelfEmployed is not created in the cells visible here;
# presumably it is registered elsewhere over the Delta path written by the
# previous cell — confirm before relying on this check.
self_employed_table = spark.sql(
    """
    SELECT * FROM database2.SelfEmployed
    """
)
self_employed_table.show()

In [None]:
# Stop the Spark session now that the notebook's work is complete.
spark.stop()