In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip,concat,lit
from delta import *

# Default storage root used by Spark for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Session builder: local master with 2 threads, Hive metastore support, and the
# Delta Lake SQL extension + catalog. Parentheses are used instead of trailing
# backslashes so the chain cannot accidentally swallow the following line.
# NOTE(review): configure_spark_with_delta_pip appears to set
# spark.jars.packages itself - confirm the explicit delta-core pin is still needed.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Create the session (or reuse the one already running in this kernel).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
path = "hdfs://hdfs-nn:9000/Projeto/bronze/Proportion_Of_Time.csv"

# Explicit schema for the bronze CSV, as (column name, Spark type) pairs.
# Every column is declared nullable; "Value" is read as a string here and
# converted to a numeric type in a later cell.
_schema_columns = [
    ("Indicator_name", StringType()),
    ("Indicator_name_2", StringType()),
    ("Indicator_code", StringType()),
    ("Country_name", StringType()),
    ("Country_code", StringType()),
    ("Year", IntegerType()),
    ("Value", StringType()),
    ("Disaggregation", StringType()),
]
Custom_schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _schema_columns]
)

# Read the comma-delimited file, treating the first row as a header and
# applying the schema above instead of inferring one.
time = (
    spark.read
    .option("delimiter", ",")
    .option("header", "true")
    .schema(Custom_schema)
    .csv(path)
)

time.show()

+--------------------+--------------------+--------------+------------+------------+----+--------+--------------+
|      Indicator_name|    Indicator_name_2|Indicator_code|Country_name|Country_code|Year|   Value|Disaggregation|
+--------------------+--------------------+--------------+------------+------------+----+--------+--------------+
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|     Albania|         ALB|2011|21.73611|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|     Algeria|         DZA|2012|21.66667|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|   Argentina|         ARG|2013|23.41864|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|   Argentina|         ARG|2010|    17.5|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|   Argentina|         ARG|2005|16.73611|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|     Armenia|         ARM|2008

In [3]:
# Three-letter codes to drop. They have the same format as the values of the
# Country_code column (e.g. ALB, DZA, ARG), not of Indicator_code (e.g.
# SG.TIM.UWRK.FE), so they must be matched against Country_code; matching them
# against Indicator_code never excludes any row.
country_codes_to_exclude = [
    "EGY", "HKG", "IRN", "KOR"
]
# Original (misleading) name kept as an alias for any cell that still uses it.
indicatores_to_exclude = country_codes_to_exclude

# Countries to drop by their full name.
countries_to_exclude = [
    "West Bank and Gaza"
]

# Keep only rows whose country code AND country name are not in the exclusion lists.
filtered_time = time.filter(
    (~col("Country_code").isin(country_codes_to_exclude)) &
    (~col("Country_name").isin(countries_to_exclude))
)
# Show the resulting DataFrame
filtered_time.show()

+--------------------+--------------------+--------------+------------+------------+----+--------+--------------+
|      Indicator_name|    Indicator_name_2|Indicator_code|Country_name|Country_code|Year|   Value|Disaggregation|
+--------------------+--------------------+--------------+------------+------------+----+--------+--------------+
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|     Albania|         ALB|2011|21.73611|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|     Algeria|         DZA|2012|21.66667|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|   Argentina|         ARG|2013|23.41864|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|   Argentina|         ARG|2010|    17.5|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|   Argentina|         ARG|2005|16.73611|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|     Armenia|         ARM|2008

In [4]:
# List the tables registered under the `database` namespace.
show_tables_query = """
    SHOW TABLES FROM database
    """
spark.sql(show_tables_query).show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
| database|     time|      false|
+---------+---------+-----------+



In [5]:
# Build the final projection in a single select:
#   - merge the two indicator-name columns into one, separated by a space;
#   - rename every column to its Portuguese name;
#   - cast the value column (declared as a string in the read schema) to float.
# Columns are referenced with the exact case declared in the schema ("Value",
# not "value"), so the cell does not rely on Spark's case-insensitive column
# resolution.
# NOTE: this rebinds `filtered_time`; re-running the cell without re-running
# the filter cell fails because the source columns no longer exist.
filtered_time = filtered_time.select(
    concat(col("Indicator_name"), lit(" "), col("Indicator_name_2")).alias("nome_indicador"),
    col("Indicator_code").alias("cod_indicador"),
    col("Country_name").alias("nome_pais"),
    col("Country_code").alias("cod_pais"),
    col("Year").alias("ano"),
    col("Value").cast("float").alias("valor"),
    col("Disaggregation").alias("desagregacao"),
)

In [6]:
# Storage path of the external Delta Lake table
tabela_time_path = "hdfs://hdfs-nn:9000/Projeto/silver/TabelaTime"

# Write the filtered_time DataFrame in Delta format to that path,
# overwriting whatever is already stored there.
(
    filtered_time.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(tabela_time_path)
)

In [7]:
# Query the `database.time` table and display its rows.
# NOTE(review): presumably this table is backed by the Delta path written in
# the previous cell - confirm against the table definition.
select_time_query = """
    SELECT * FROM database.time
    """
spark.sql(select_time_query).show()

+--------------------+--------------+----------+--------+----+--------+------------+
|      nome_indicador| cod_indicador| nome_pais|cod_pais| ano|   valor|desagregacao|
+--------------------+--------------+----------+--------+----+--------+------------+
|Proportion of tim...|SG.TIM.UWRK.FE|   Albania|     ALB|2011|21.73611|      female|
|Proportion of tim...|SG.TIM.UWRK.FE|   Algeria|     DZA|2012|21.66667|      female|
|Proportion of tim...|SG.TIM.UWRK.FE| Argentina|     ARG|2013|23.41864|      female|
|Proportion of tim...|SG.TIM.UWRK.FE| Argentina|     ARG|2010|    17.5|      female|
|Proportion of tim...|SG.TIM.UWRK.FE| Argentina|     ARG|2005|16.73611|      female|
|Proportion of tim...|SG.TIM.UWRK.FE|   Armenia|     ARM|2008|21.65833|      female|
|Proportion of tim...|SG.TIM.UWRK.FE|   Armenia|     ARM|2004|24.02778|      female|
|Proportion of tim...|SG.TIM.UWRK.FE| Australia|     AUS|2006|20.83333|      female|
|Proportion of tim...|SG.TIM.UWRK.FE|   Austria|     AUT|2009|19.

In [8]:
# Stop the Spark session created in the first cell; later cells cannot use `spark` after this.
spark.stop()