In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip
from delta import *

# Default HDFS location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# The chain is wrapped in parentheses rather than joined with backslashes:
# the previous version ended with a dangling "\" that only parsed because a
# blank line happened to follow it.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # NOTE(review): configure_spark_with_delta_pip below may set
    # spark.jars.packages itself - confirm this explicit pin still takes effect.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# Create the session, or reuse one that is already running in this kernel.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Source CSV in the bronze layer.
path = "hdfs://hdfs-nn:9000/Projeto/bronze/JobsData.csv"

# Four descriptive columns followed by one column per year, 1990 through 2016.
descriptor_fields = ["Country_name", "Country_code", "Indicator_name", "Indicator_code"]
year_fields = [str(year) for year in range(1990, 2017)]

# Every column is declared as a nullable string; numeric casting happens later
# in the notebook.
Custom_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in descriptor_fields + year_fields
    ]
)

# Read the comma-delimited file, treating its first line as a header and
# applying the explicit schema instead of inferring one.
jobs = (
    spark.read
    .schema(Custom_schema)
    .options(delimiter=",", header="true")
    .csv(path)
)

+------------+------------+--------------------+-----------------+-----------------+----------------+----------------+-----------------+----------------+------------------+----------------+------------------+----------------+----------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------------+------------------+----------------+
|Country_name|Country_code|      Indicator_name|   Indicator_code|             1990|            1991|            1992|             1993|            1994|              1995|            1996|              1997|            1998|            1999|             2000|            2001|            2002|            2003|            2004|            2005|            2006|             2007|            2008|            2009|            2010|            2011|            201

In [3]:
# Country_name values to drop: regional and income-group labels such as
# "World" or "High income", as opposed to individual countries.
# NOTE(review): isin() is an exact string match - confirm these spellings
# (e.g. "EuroArea", "Middle Income") against the values present in the CSV.
countries_to_exclude = [
    "Arab World",
    "East Asia & Pacific",
    "EuroArea",
    "Europe and Central Asia",
    "European Union",
    "Heavily indebted poor countries",
    "High income",
    "Latin America & Caribbean",
    "Least developed countries",
    "Low & middle income",
    "Low income",
    "Lower middle income",
    "Middle East & North Africa",
    "Middle Income",
    "North America",
    "OECD members",
    "South Asia",
    "Sub-Saharan Africa",
    "Upper middle income",
    "World",
]

# Keep only the rows whose Country_name is not in the exclusion list.
# NOTE: rows with a null Country_name are dropped as well, because the negated
# predicate evaluates to null for them.
is_excluded_name = col("Country_name").isin(countries_to_exclude)
filtered_jobs = jobs.where(~is_excluded_name)


+--------------------+------------+--------------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+
|        Country_name|Country_code|      Indicator_name|   Indicator_code|            1990|            1991|            1992|            1993|            1994|            1995|            1996|            1997|            1998|            1999|            2000|            2001|            2002|            2003|            2004|            2005|            2006|            2007|            2008|            2009|            2010|            2011|            2012| 

In [4]:
# Year columns to unpivot from wide to long format: '1990' through '2016'.
melt_colunas = [str(year) for year in range(1990, 2017)]

# Prefix the purely numeric column names with "year_" so they can be used as
# plain identifiers inside the SQL expression below.
for col_name in melt_colunas:
    filtered_jobs = filtered_jobs.withColumnRenamed(col_name, f"year_{col_name}")

# Build the stack() call from melt_colunas so the row count and the
# (label, column) pairs always agree with the list above, instead of being
# hard-coded a second time.
stack_pairs = ", ".join(f"'{year}', year_{year}" for year in melt_colunas)
stack_expr = f"stack({len(melt_colunas)}, {stack_pairs}) as (Ano, Valor)"

# One output row per (country, indicator, year): Ano holds the year label and
# Valor the value taken from that year's column.
# NOTE: this cell rebinds filtered_jobs, so re-running it without first
# re-running the filter cell fails (the year columns no longer exist).
filtered_jobs = filtered_jobs.selectExpr(
    "Country_name",
    "Country_code",
    "Indicator_name",
    "Indicator_code",
    stack_expr,
)

In [6]:
# Current column name -> final column name used in the silver table.
column_renames = {
    "Country_name": "pais_name",
    "Country_code": "pais_cod",
    "Indicator_name": "indicador_name",
    "Indicator_code": "indicador_code",
    "Valor": "valor",
    "Ano": "ano",
}
for current_name, final_name in column_renames.items():
    filtered_jobs = filtered_jobs.withColumnRenamed(current_name, final_name)

# Both columns are still strings at this point; cast them to numeric types.
# NOTE(review): "float" is single precision - confirm that is enough for valor.
filtered_jobs = (
    filtered_jobs
    .withColumn("ano", col("ano").cast("int"))
    .withColumn("valor", col("valor").cast("float"))
)

In [7]:
# Destination of the cleaned table in the silver layer.
silver_jobs_path = "hdfs://hdfs-nn:9000/Projeto/silver/TabelaJobs/"

# Persist the frame in Delta format, replacing whatever is already stored at
# the destination.
(
    filtered_jobs.write
    .format("delta")
    .mode("overwrite")
    .save(silver_jobs_path)
)

In [10]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()