In [1]:
pip install delta-spark==2.4.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import expr, array, col

# Default root location for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# Build (or reuse) a local two-thread session with Hive support, wired to the
# warehouse directory above and the remote Hive metastore.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
path = "hdfs://hdfs-nn:9000/Projeto/bronze/Gender_Stats.csv"

# Year columns 1960..2022 inclusive, as strings.
years = list(map(str, range(1960, 2023)))

# Explicit schema: four identifying columns followed by one nullable string
# column per year.
id_fields = [
    StructField(field_name, StringType(), True)
    for field_name in ("Country_name", "Country_code", "Indicator_name", "Indicator_code")
]
year_fields = [StructField(year_name, StringType(), True) for year_name in years]
Custom_schema = StructType(id_fields + year_fields)

# Read the comma-delimited CSV (first line is a header) using the schema above.
csv_reader = spark.read.option("delimiter", ",").option("header", "true")
gender = csv_reader.schema(Custom_schema).csv(path)

gender.show()

+--------------------+------------+--------------------+-----------------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+------------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------+----------------+----------------+----------------+----------------+--------------+----------------+----------------+----+
|        Country_name|Country_code|      Indicator_name|   Indicator_code|1960|1961|1962|1963|1964|1965|1966|1967|1968|1969|1970|1971|1972|1973|1974|1975|1976|1977|1978|1979|1980|1981|1982|1983|1984|1985|1986|1987|1988|1989|1990|1991|1992|1993|1994|1995|1996|1997|1998|1999|2000|              2001|             2002|             2003|            2004|      

In [4]:
# Drop the 1960..1989 year columns in a single call. DataFrame.drop accepts
# several column names at once, so there is no need to reassign the frame once
# per column. Names that are not present are ignored, which keeps the cell
# safe to re-run.
gender = gender.drop(*[str(year) for year in range(1960, 1990)])

gender.printSchema()

root
 |-- Country_name: string (nullable = true)
 |-- Country_code: string (nullable = true)
 |-- Indicator_name: string (nullable = true)
 |-- Indicator_code: string (nullable = true)
 |-- 1990: string (nullable = true)
 |-- 1991: string (nullable = true)
 |-- 1992: string (nullable = true)
 |-- 1993: string (nullable = true)
 |-- 1994: string (nullable = true)
 |-- 1995: string (nullable = true)
 |-- 1996: string (nullable = true)
 |-- 1997: string (nullable = true)
 |-- 1998: string (nullable = true)
 |-- 1999: string (nullable = true)
 |-- 2000: string (nullable = true)
 |-- 2001: string (nullable = true)
 |-- 2002: string (nullable = true)
 |-- 2003: string (nullable = true)
 |-- 2004: string (nullable = true)
 |-- 2005: string (nullable = true)
 |-- 2006: string (nullable = true)
 |-- 2007: string (nullable = true)
 |-- 2008: string (nullable = true)
 |-- 2009: string (nullable = true)
 |-- 2010: string (nullable = true)
 |-- 2011: string (nullable = true)
 |-- 2012: string (null

In [11]:
# Unpivot the per-year columns (1990..2022) into (Year, Value) rows.
#
# Each stack() pair is "'<year>' , `<year>`": the single-quoted token is the
# string label that ends up in Year, and the backtick-quoted token is the
# *column* whose value ends up in Value. The backticks are required: a bare
# numeral such as 1990 is parsed by Spark SQL as an integer literal, which
# would make Value simply repeat the year number instead of the data.
unpivot_years = [str(year) for year in range(1990, 2023)]
stack_pairs = ", ".join(f"'{year}', `{year}`" for year in unpivot_years)

unpivot_gender = gender.selectExpr(
    "Country_name",
    "Country_code",
    "Indicator_name",
    "Indicator_code",
    # The row count passed to stack() is derived from the list so it cannot
    # drift out of sync with the number of pairs.
    f"stack({len(unpivot_years)}, {stack_pairs}) as (Year, Value)"
)

unpivot_gender.show()

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_com

ConnectionRefusedError: [Errno 111] Connection refused

In [35]:
# Persist the wide (one column per year) frame as a Parquet table named
# `literacia_jobs` in the "database" database.
#
# saveAsTable() registers the table in the metastore under the current
# database, so `SELECT * FROM literacia_jobs` can resolve it. A plain
# .save("literacia_jobs") would only write files to a relative path and would
# not create or refresh any table. The CSV-only `delimiter` / `header` options
# have no meaning for Parquet and are therefore not set.
spark.catalog.setCurrentDatabase("database")
(
    gender.write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("literacia_jobs")
)

In [3]:

# Ask the catalog for the tables it currently exposes.
tables = spark.catalog.listTables()

# Print one table name per line.
for table_name in (entry.name for entry in tables):
    print(table_name)

In [33]:
# Read the whole `literacia_jobs` table back and preview it.
literacia_jobs_query = """
    SELECT * FROM literacia_jobs
    """
literacia_jobs_df = spark.sql(literacia_jobs_query)
literacia_jobs_df.show()

+------------+------------+--------------------+-----------------+-----------------+----------------+----------------+-----------------+----------------+------------------+----------------+------------------+----------------+----------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------------+------------------+----------------+
|Country_name|Country_code|      Indicator_name|   Indicator_code|             1990|            1991|            1992|             1993|            1994|              1995|            1996|              1997|            1998|            1999|             2000|            2001|            2002|            2003|            2004|            2005|            2006|             2007|            2008|            2009|            2010|            2011|            201

In [None]:
# Stop the Spark session now that the notebook has finished with it.
spark.stop()