In [1]:
pip install delta-spark==2.4.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip
from delta import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# The builder chain is wrapped in parentheses rather than joined with
# backslashes: the previous form ended its last call with a dangling "\" and
# was only valid because a blank line happened to follow it.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Enable the Delta Lake SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip presumably sets
    # spark.jars.packages itself, which would make this entry redundant - confirm.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Let the delta helper finish configuring the builder, then create (or reuse)
# the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
# Bronze-layer CSV to load.
path = "hdfs://hdfs-nn:9000/Projeto/bronze/Proportion_Of_Time.csv"

# Explicit schema for the CSV. The first two field names previously ended with
# a trailing space ("Indicator_name ", "Indicator_name_2 "), which made the
# columns impossible to reference by their natural names; the stray spaces are
# removed here.
# NOTE(review): "Value" is read as a string although the sampled rows look
# numeric - confirm whether non-numeric values exist before casting.
Custom_schema = StructType([
    StructField("Indicator_name", StringType(), True),
    StructField("Indicator_name_2", StringType(), True),
    StructField("Indicator_code", StringType(), True),
    StructField("Country_name", StringType(), True),
    StructField("Country_code", StringType(), True),
    StructField("Year", IntegerType(), True),
    StructField("Value", StringType(), True),
    StructField("Disaggregation", StringType(), True)
])

# Read the comma-delimited file, treating its first line as a header and
# applying the schema above instead of inferring types.
time = (
    spark.read
    .option("delimiter", ",")
    .option("header", "true")
    .schema(Custom_schema)
    .csv(path)
)

# Print up to 300 rows for visual inspection.
time.show(300)

+--------------------+--------------------+--------------+------------------+------------+----+--------+--------------+
|     Indicator_name |   Indicator_name_2 |Indicator_code|      Country_name|Country_code|Year|   Value|Disaggregation|
+--------------------+--------------------+--------------+------------------+------------+----+--------+--------------+
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|           Albania|         ALB|2011|21.73611|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|           Algeria|         DZA|2012|21.66667|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|         Argentina|         ARG|2013|23.41864|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|         Argentina|         ARG|2010|    17.5|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|         Argentina|         ARG|2005|16.73611|        female|
|Proportion of tim...| female (% of 24 .

In [4]:
# Three-letter country codes (same format as the Country_code column) whose
# rows must be dropped.
country_codes_to_exclude = [
    "EGY", "HKG", "IRN", "KOR"
]
# Original variable name kept as an alias for any code that still refers to it.
indicatores_to_exclude = country_codes_to_exclude

# Countries that are excluded by their display name.
countries_to_exclude = [
    "West Bank and Gaza"
]

# The code list is matched against Country_code. It was previously compared
# with Indicator_code (values such as "SG.TIM.UWRK.FE"), so it could never
# match and no row was excluded by it.
filtered_time = time.filter(
    (~col("Country_code").isin(country_codes_to_exclude)) &
    (~col("Country_name").isin(countries_to_exclude))
)
# Show the resulting DataFrame
filtered_time.show(300)

+--------------------+--------------------+--------------+------------------+------------+----+--------+--------------+
|     Indicator_name |   Indicator_name_2 |Indicator_code|      Country_name|Country_code|Year|   Value|Disaggregation|
+--------------------+--------------------+--------------+------------------+------------+----+--------+--------------+
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|           Albania|         ALB|2011|21.73611|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|           Algeria|         DZA|2012|21.66667|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|         Argentina|         ARG|2013|23.41864|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|         Argentina|         ARG|2010|    17.5|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|         Argentina|         ARG|2005|16.73611|        female|
|Proportion of tim...| female (% of 24 .

In [5]:
# List the tables registered in the `database2` database.
show_tables_query = """
    SHOW TABLES FROM database2
    """
spark.sql(show_tables_query).show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|database2|   gender|      false|
|database2|     time|      false|
+---------+---------+-----------+



In [6]:
# Preview the rows currently stored in `database2.time`.
# NOTE(review): the recorded output shows only the header and no rows, and no
# cell visible in this notebook writes `filtered_time` into this table -
# confirm whether a write step is missing.
select_time_query = """
    SELECT * FROM database2.time
    """
spark.sql(select_time_query).show()

+--------------+----------------+-------------+---------+--------+---+-----+------------+
|nome_indicador|nome_indicador_2|cod_indicador|nome_pais|cod_pais|ano|valor|desagregacao|
+--------------+----------------+-------------+---------+--------+---+-----+------------+
+--------------+----------------+-------------+---------+--------+---+-----+------------+



In [7]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()