In [1]:
pip install delta-spark==2.4.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip

# warehouse_location points to the default location for managed databases and tables
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

spark = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("Python Spark SQL Hive integration example") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .config("hive.metastore.uris", "thrift://hive-metastore:9083") \
    .enableHiveSupport() \
    .getOrCreate()

In [3]:
path = "hdfs://hdfs-nn:9000/Projeto/bronze/Gender_Stats.csv"

years = [str(year) for year in range(1960, 2023)]

# Create the StructType schema
Custom_schema = StructType([
    StructField("Country_name", StringType(), True),
    StructField("Country_code", StringType(), True),
    StructField("Indicator_name", StringType(), True),
    StructField("Indicator_code", StringType(), True)
] + [StructField(year, StringType(), True) for year in years])


gender = spark \
    .read \
    .option("delimiter",",") \
    .option("header","true") \
    .schema(Custom_schema) \
    .csv(path)

gender.show()

+--------------------+------------+--------------------+-----------------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+------------------+-----------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------+----------------+----------------+----------------+----------------+--------------+----------------+----------------+----+
|        Country_name|Country_code|      Indicator_name|   Indicator_code|1960|1961|1962|1963|1964|1965|1966|1967|1968|1969|1970|1971|1972|1973|1974|1975|1976|1977|1978|1979|1980|1981|1982|1983|1984|1985|1986|1987|1988|1989|1990|1991|1992|1993|1994|1995|1996|1997|1998|1999|2000|              2001|             2002|             2003|            2004|      

In [4]:
for i in range(1960,1990):
    gender = gender.drop(str(i))

gender.printSchema()

root
 |-- Country_name: string (nullable = true)
 |-- Country_code: string (nullable = true)
 |-- Indicator_name: string (nullable = true)
 |-- Indicator_code: string (nullable = true)
 |-- 1990: string (nullable = true)
 |-- 1991: string (nullable = true)
 |-- 1992: string (nullable = true)
 |-- 1993: string (nullable = true)
 |-- 1994: string (nullable = true)
 |-- 1995: string (nullable = true)
 |-- 1996: string (nullable = true)
 |-- 1997: string (nullable = true)
 |-- 1998: string (nullable = true)
 |-- 1999: string (nullable = true)
 |-- 2000: string (nullable = true)
 |-- 2001: string (nullable = true)
 |-- 2002: string (nullable = true)
 |-- 2003: string (nullable = true)
 |-- 2004: string (nullable = true)
 |-- 2005: string (nullable = true)
 |-- 2006: string (nullable = true)
 |-- 2007: string (nullable = true)
 |-- 2008: string (nullable = true)
 |-- 2009: string (nullable = true)
 |-- 2010: string (nullable = true)
 |-- 2011: string (nullable = true)
 |-- 2012: string (null

In [5]:
melt_colunas = ['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']


for col_name in melt_colunas:
    gender = gender.withColumnRenamed(col_name, f"year_{col_name}")

# Unpivot the DataFrame
unpivot_gender = gender.selectExpr(
    "Country_name",
    "Country_code",
    "Indicator_name",
    "Indicator_code",
    "stack(33, '1990', year_1990, '1991', year_1991, '1992', year_1992, '1993', year_1993, '1994', year_1994, '1995', year_1995, '1996', year_1996, '1997', year_1997, '1998', year_1998, '1999', year_1999, '2000', year_2000, '2001', year_2001, '2002', year_2002, '2003', year_2003, '2004', year_2004, '2005', year_2005, '2006', year_2006, '2007', year_2007, '2008', year_2008, '2009', year_2009, '2010', year_2010, '2011', year_2011, '2012', year_2012, '2013', year_2013, '2014', year_2014, '2015', year_2015, '2016', year_2016, '2017', year_2017, '2018', year_2018, '2019', year_2019, '2020', year_2020, '2021', year_2021, '2022', year_2022) as (Ano, Valor)"
)

unpivot_gender.show()

+--------------------+------------+--------------------+--------------+----+-----+
|        Country_name|Country_code|      Indicator_name|Indicator_code| Ano|Valor|
+--------------------+------------+--------------------+--------------+----+-----+
|Africa Eastern an...|         AFE|A woman can apply...|SG.APL.PSPT.EQ|1990| null|
|Africa Eastern an...|         AFE|A woman can apply...|SG.APL.PSPT.EQ|1991| null|
|Africa Eastern an...|         AFE|A woman can apply...|SG.APL.PSPT.EQ|1992| null|
|Africa Eastern an...|         AFE|A woman can apply...|SG.APL.PSPT.EQ|1993| null|
|Africa Eastern an...|         AFE|A woman can apply...|SG.APL.PSPT.EQ|1994| null|
|Africa Eastern an...|         AFE|A woman can apply...|SG.APL.PSPT.EQ|1995| null|
|Africa Eastern an...|         AFE|A woman can apply...|SG.APL.PSPT.EQ|1996| null|
|Africa Eastern an...|         AFE|A woman can apply...|SG.APL.PSPT.EQ|1997| null|
|Africa Eastern an...|         AFE|A woman can apply...|SG.APL.PSPT.EQ|1998| null|
|Afr

In [6]:
unpivot_gender.select("Country_name","Country_code").distinct().show(500)

+--------------------+------------+
|        Country_name|Country_code|
+--------------------+------------+
|       Faroe Islands|         FRO|
|              Bhutan|         BTN|
|           Euro area|         EMU|
|               Aruba|         ABW|
|     Channel Islands|         CHI|
|        Bahamas, The|         BHS|
|              France|         FRA|
|               Gabon|         GAB|
|       Guinea-Bissau|         GNB|
|            Cameroon|         CMR|
|                Fiji|         FJI|
|          Low income|         LIC|
|           Australia|         AUS|
|Middle East & Nor...|         MEA|
|          Azerbaijan|         AZE|
|Bosnia and Herzeg...|         BIH|
|            Botswana|         BWA|
|             Algeria|         DZA|
|Least developed c...|         LDC|
|Middle East & Nor...|         MNA|
|          Cabo Verde|         CPV|
|             Denmark|         DNK|
|  Dominican Republic|         DOM|
|Middle East & Nor...|         TMN|
|             Bermuda|      

In [7]:
remover = ["ARB", "AFE", "AFW", "EMU", "LIC", "MEA", "LDC", "MNA",
    "TMN", "ECA", "TEA", "IDA", "MIC", "LAC", "SSA", "IBD", "SSF", "HIC",
    "LTE", "OSS", "LCN", "EAP", "NAC", "IBD", "SAS", "EUU", "FCS", "TSS",
    "LMC", "UMC", "IDX", "SST", "PRE", "HPC", "CEB", "PST", "OED", "WLD",
    "EAS", "ECS", "IBT", "EAR", "LMY", "PSE", "ARE"
]

pais_gender = unpivot_gender.filter(
    (~col("Country_code").isin(remover))
)
pais_gender.show(100)

+--------------------+------------+--------------------+--------------+----+-----+
|        Country_name|Country_code|      Indicator_name|Indicator_code| Ano|Valor|
+--------------------+------------+--------------------+--------------+----+-----+
|Caribbean small s...|         CSS|A woman can apply...|SG.APL.PSPT.EQ|1990| null|
|Caribbean small s...|         CSS|A woman can apply...|SG.APL.PSPT.EQ|1991| null|
|Caribbean small s...|         CSS|A woman can apply...|SG.APL.PSPT.EQ|1992| null|
|Caribbean small s...|         CSS|A woman can apply...|SG.APL.PSPT.EQ|1993| null|
|Caribbean small s...|         CSS|A woman can apply...|SG.APL.PSPT.EQ|1994| null|
|Caribbean small s...|         CSS|A woman can apply...|SG.APL.PSPT.EQ|1995| null|
|Caribbean small s...|         CSS|A woman can apply...|SG.APL.PSPT.EQ|1996| null|
|Caribbean small s...|         CSS|A woman can apply...|SG.APL.PSPT.EQ|1997| null|
|Caribbean small s...|         CSS|A woman can apply...|SG.APL.PSPT.EQ|1998| null|
|Car

In [57]:
from delta import *

pais_gender \
    .write \
    .mode("overwrite") \
    .format("delta") \
    .option("delimiter",",")\
    .option("header","true") \
    .save("hdfs://hdfs-nn:9000/database/gender")

Py4JJavaError: An error occurred while calling o815.save.
: org.apache.spark.SparkClassNotFoundException: [DATA_SOURCE_NOT_FOUND] Failed to find the data source: delta. Please find packages at `https://spark.apache.org/third-party-projects.html`.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.dataSourceNotFoundError(QueryExecutionErrors.scala:738)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:697)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:256)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.ClassNotFoundException: delta.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:592)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:525)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:633)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:633)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:633)
	... 16 more


In [None]:
spark.sql(
    """
    SHOW TABLES FROM database
    """
).show()

In [33]:
spark.sql(
    """
    SELECT * FROM literacia_jobs
    """
).show()

+------------+------------+--------------------+-----------------+-----------------+----------------+----------------+-----------------+----------------+------------------+----------------+------------------+----------------+----------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------------+------------------+----------------+
|Country_name|Country_code|      Indicator_name|   Indicator_code|             1990|            1991|            1992|             1993|            1994|              1995|            1996|              1997|            1998|            1999|             2000|            2001|            2002|            2003|            2004|            2005|            2006|             2007|            2008|            2009|            2010|            2011|            201

In [None]:
spark.stop()