pip install delta-spark==2.4.0

In [1]:
from os import PathLike
from hdfs import InsecureClient
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType

# HDFS directory used as the default location for managed databases and tables.
warehouse_location = "hdfs://hdfs-nn:9000/warehouse"

# Build (or reuse) a Hive-enabled session that runs on a local master with two
# threads and talks to the external metastore over Thrift.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

# Every unqualified table name used below resolves against this database.
spark.catalog.setCurrentDatabase("database")

In [2]:
# Bronze-layer CSV file that this cell loads.
path = "hdfs://hdfs-nn:9000/Projeto/bronze/Proportion_Of_Time.csv"

# Explicit schema, declared as (column name, Spark type) pairs in file order.
# Every column is nullable.
# NOTE(review): "Indicator Name 2" presumably absorbs the tail of an indicator
# name that contains an unquoted comma -- confirm against the raw file.
Custom_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Indicator Name", StringType()),
        ("Indicator Name 2", StringType()),
        ("Indicator Code", StringType()),
        ("Country Name", StringType()),
        ("Country Code", StringType()),
        ("Year", IntegerType()),
        ("Value", StringType()),
        ("Disaggregation", StringType()),
    )
])

# Read the comma-delimited file, treating its first line as a header and
# applying the schema above instead of inferring one.
time = (
    spark.read
    .option("delimiter", ",")
    .option("header", "true")
    .schema(Custom_schema)
    .csv(path)
)

# Preview the first rows of the loaded DataFrame.
time.show()

+--------------------+--------------------+--------------+------------+------------+----+--------+--------------+
|      Indicator Name|    Indicator Name 2|Indicator Code|Country Name|Country Code|Year|   Value|Disaggregation|
+--------------------+--------------------+--------------+------------+------------+----+--------+--------------+
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|     Albania|         ALB|2011|21.73611|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|     Algeria|         DZA|2012|21.66667|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|   Argentina|         ARG|2013|23.41864|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|   Argentina|         ARG|2010|    17.5|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|   Argentina|         ARG|2005|16.73611|        female|
|Proportion of tim...| female (% of 24 ...|SG.TIM.UWRK.FE|     Armenia|         ARM|2008

In [3]:
# Persist the DataFrame as the table "proporcao_time" in the current database,
# stored as comma-delimited CSV with a header row. "overwrite" mode replaces
# any data already saved under that table name.
(
    time.write
    .mode("overwrite")
    .format("csv")
    .option("delimiter", ",")
    .option("header", "true")
    .saveAsTable("proporcao_time")
)

In [4]:
# Select the working database, then fetch the catalog entries for its tables.
spark.catalog.setCurrentDatabase("database")
tables = spark.catalog.listTables()

# Print one table name per line.
for table_name in (entry.name for entry in tables):
    print(table_name)

proporcao_time


In [5]:
# Query the saved table through Spark SQL to confirm that it can be read back.
proporcao_time_rows = spark.sql(
    """
    SELECT * FROM proporcao_time
    """
)

# Preview the first rows returned by the query.
proporcao_time_rows.show()

+--------------------+--------------------+--------------+------------+------------+----+--------+--------------+
|      Indicator Name|    Indicator Name 2|Indicator Code|Country Name|Country Code|Year|   Value|Disaggregation|
+--------------------+--------------------+--------------+------------+------------+----+--------+--------------+
|Proportion of tim...|female (% of 24 h...|SG.TIM.UWRK.FE|     Albania|         ALB|2011|21.73611|        female|
|Proportion of tim...|female (% of 24 h...|SG.TIM.UWRK.FE|     Algeria|         DZA|2012|21.66667|        female|
|Proportion of tim...|female (% of 24 h...|SG.TIM.UWRK.FE|   Argentina|         ARG|2013|23.41864|        female|
|Proportion of tim...|female (% of 24 h...|SG.TIM.UWRK.FE|   Argentina|         ARG|2010|    17.5|        female|
|Proportion of tim...|female (% of 24 h...|SG.TIM.UWRK.FE|   Argentina|         ARG|2005|16.73611|        female|
|Proportion of tim...|female (% of 24 h...|SG.TIM.UWRK.FE|     Armenia|         ARM|2008

In [6]:
# Shut down the SparkSession created at the top of the notebook now that all
# reads, writes and queries are done.
spark.stop()