In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip
from delta import *

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

builder = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark DataFrames and SQL") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .enableHiveSupport() \

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
path = "hdfs://hdfs-nn:9000/Projeto/bronze/Gender_Stats.csv"

years = [str(year) for year in range(1960, 2023)]

# Create the StructType schema
Custom_schema = StructType([
    StructField("Country_name", StringType(), True),
    StructField("Country_code", StringType(), True),
    StructField("Indicator_name", StringType(), True),
    StructField("Indicator_code", StringType(), True)
] + [StructField(year, StringType(), True) for year in years])


gender = spark \
    .read \
    .option("delimiter",",") \
    .option("header","true") \
    .schema(Custom_schema) \
    .csv(path)

In [3]:
for i in range(1960,1990):
    gender = gender.drop(str(i))

In [4]:
melt_colunas = ['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']


for col_name in melt_colunas:
    gender = gender.withColumnRenamed(col_name, f"year_{col_name}")

# Unpivot the DataFrame
unpivot_gender = gender.selectExpr(
    "Country_name",
    "Country_code",
    "Indicator_name",
    "Indicator_code",
    "stack(33, '1990', year_1990, '1991', year_1991, '1992', year_1992, '1993', year_1993, '1994', year_1994, '1995', year_1995, '1996', year_1996, '1997', year_1997, '1998', year_1998, '1999', year_1999, '2000', year_2000, '2001', year_2001, '2002', year_2002, '2003', year_2003, '2004', year_2004, '2005', year_2005, '2006', year_2006, '2007', year_2007, '2008', year_2008, '2009', year_2009, '2010', year_2010, '2011', year_2011, '2012', year_2012, '2013', year_2013, '2014', year_2014, '2015', year_2015, '2016', year_2016, '2017', year_2017, '2018', year_2018, '2019', year_2019, '2020', year_2020, '2021', year_2021, '2022', year_2022) as (Ano, Valor)"
)

In [5]:
unpivot_gender.select("Country_name","Country_code").distinct().show(500)

+--------------------+------------+
|        Country_name|Country_code|
+--------------------+------------+
|       Faroe Islands|         FRO|
|              Bhutan|         BTN|
|           Euro area|         EMU|
|               Aruba|         ABW|
|     Channel Islands|         CHI|
|        Bahamas, The|         BHS|
|              France|         FRA|
|               Gabon|         GAB|
|       Guinea-Bissau|         GNB|
|            Cameroon|         CMR|
|                Fiji|         FJI|
|          Low income|         LIC|
|           Australia|         AUS|
|Middle East & Nor...|         MEA|
|          Azerbaijan|         AZE|
|Bosnia and Herzeg...|         BIH|
|            Botswana|         BWA|
|             Algeria|         DZA|
|Least developed c...|         LDC|
|Middle East & Nor...|         MNA|
|          Cabo Verde|         CPV|
|             Denmark|         DNK|
|  Dominican Republic|         DOM|
|Middle East & Nor...|         TMN|
|             Bermuda|      

In [6]:
remover = ["ARB", "AFE", "AFW", "EMU", "LIC", "MEA", "LDC", "MNA",
    "TMN", "ECA", "TEA", "IDA", "MIC", "LAC", "SSA", "IBD", "SSF", "HIC",
    "LTE", "OSS", "LCN", "EAP", "NAC", "IBD", "SAS", "EUU", "FCS", "TSS",
    "LMC", "UMC", "IDX", "SST", "PRE", "HPC", "CEB", "PST", "OED", "WLD",
    "EAS", "ECS", "IBT", "EAR", "LMY", "PSE", "ARE"
]

pais_gender = unpivot_gender.filter(
    (~col("Country_code").isin(remover))
)

In [7]:
pais_gender = pais_gender \
    .withColumn("nome_pais", col("Country_name").cast(StringType())) \
    .withColumn("cod_pais", col("Country_code").cast(StringType())) \
    .withColumn("nome_indicador", col("Indicator_name").cast(StringType())) \
    .withColumn("cod_indicador", col("Indicator_code").cast(StringType())) \
    .withColumn("valor", col("Valor").cast(FloatType())) \
    .withColumn("ano", col("Ano").cast(FloatType()))


In [8]:
pais_gender \
    .select("nome_pais","cod_pais","nome_indicador","cod_indicador","valor","ano") \
    .write \
    .option("mergeSchema", "true") \
    .format("delta") \
    .mode("overwrite") \
    .save("hdfs://hdfs-nn:9000/Projeto/silver/TabelaGender/")

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_com

Py4JError: An error occurred while calling o303.save

In [None]:
spark.sql(
    """
    SHOW TABLES FROM database
    """
).show()

In [None]:
spark.sql(
    """
    SELECT * FROM database.gender
    """
).show()

In [None]:
spark.stop()