In [1]:
from os import PathLike
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import LongType, StringType, StructField, StructType, BooleanType, ArrayType, IntegerType, FloatType
from pyspark.sql.functions import expr, array, col, explode, arrays_zip
from delta import *

# Default storage root for managed databases and tables.
warehouse_location = 'hdfs://hdfs-nn:9000/warehouse'

# The chain is wrapped in parentheses so that no line relies on a trailing
# backslash to continue onto the next one.
builder = (
    SparkSession.builder
    .master("local[2]")
    .appName("Python Spark DataFrames and SQL")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    # Delta Lake SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0")
    .enableHiveSupport()
)

# configure_spark_with_delta_pip is brought in by the `from delta import *` line.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Bronze-layer source file (semicolon-delimited CSV with a header row).
path = "hdfs://hdfs-nn:9000/Projeto/bronze/Global_Population.csv"

# One column per year, 1960 through 2022 inclusive.
years = list(map(str, range(1960, 2023)))

# Explicit schema: four descriptive columns followed by the year columns.
# Every field is read as a nullable string.
Custom_schema = StructType([
    StructField(column, StringType(), True)
    for column in [
        "Country_name",
        "Country_code",
        "Indicator_name",
        "Indicator_code",
        *years,
    ]
])

popu = (
    spark.read
    .option("header", "true")
    .option("delimiter", ";")
    .schema(Custom_schema)
    .csv(path)
)

popu.printSchema()

root
 |-- Country_name: string (nullable = true)
 |-- Country_code: string (nullable = true)
 |-- Indicator_name: string (nullable = true)
 |-- Indicator_code: string (nullable = true)
 |-- 1960: string (nullable = true)
 |-- 1961: string (nullable = true)
 |-- 1962: string (nullable = true)
 |-- 1963: string (nullable = true)
 |-- 1964: string (nullable = true)
 |-- 1965: string (nullable = true)
 |-- 1966: string (nullable = true)
 |-- 1967: string (nullable = true)
 |-- 1968: string (nullable = true)
 |-- 1969: string (nullable = true)
 |-- 1970: string (nullable = true)
 |-- 1971: string (nullable = true)
 |-- 1972: string (nullable = true)
 |-- 1973: string (nullable = true)
 |-- 1974: string (nullable = true)
 |-- 1975: string (nullable = true)
 |-- 1976: string (nullable = true)
 |-- 1977: string (nullable = true)
 |-- 1978: string (nullable = true)
 |-- 1979: string (nullable = true)
 |-- 1980: string (nullable = true)
 |-- 1981: string (nullable = true)
 |-- 1982: string (null

In [3]:
# Discard the 1960-1989 year columns in a single variadic drop.
popu = popu.drop(*(str(year) for year in range(1960, 1990)))

In [4]:
# Year columns to unpivot: 1990 through 2022 inclusive (33 columns).
melt_colunas = [str(year) for year in range(1990, 2023)]

# Give every year column a "year_" prefix; the stack expression below
# refers to the columns by these prefixed names.
for col_name in melt_colunas:
    popu = popu.withColumnRenamed(col_name, f"year_{col_name}")

# Arguments for stack(): a ('<year>', year_<year>) pair per year column.
stack_args = ", ".join(f"'{year}', year_{year}" for year in melt_colunas)

# Unpivot: one output row per (input row, year), exposed as (Ano, Valor).
popu = popu.selectExpr(
    "Country_name",
    "Country_code",
    "Indicator_name",
    "Indicator_code",
    f"stack({len(melt_colunas)}, {stack_args}) as (Ano, Valor)",
)

In [5]:
# Country_code values whose rows are removed from the dataset.
remover = [
    "EMU", "LIC", "MEA", "LDC", "MNA", "TLA", "ECA", "IDA", "MIC", "TEA",
    "ARB", "LAC", "SSA", "PSS", "IBD", "HIC", "TMN", "LTE", "OSS", "LCN",
    "ADW", "EAP", "NAC", "IDB", "DCS", "INX", "TSA", "LMC", "UMC", "SST",
    "PRE", "HPC", "CEB", "PST", "OED", "CAF", "TEC", "EAS", "IBT", "EAC",
    "LMY", "ECS",
]

# Keep only the rows whose Country_code is not in the list above.
is_removed = col("Country_code").isin(remover)
popu = popu.filter(~is_removed)

In [6]:
popu = popu \
    .withColumnRenamed("nome_pais", "Country_name") \
    .withColumnRenamed("cod_pais", "Country_code") \
    .withColumnRenamed("nome_indicador", "Indicator_name") \
    .withColumnRenamed("cod_indicador", "Indicator_code") \
    .withColumn("valor", col("Valor").cast(FloatType())) \
    .withColumn("ano", col("Ano").cast(FloatType()))


In [None]:
popu \
    .write \
    .format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .save("hdfs://hdfs-nn:9000/Projeto/silver/population/")

In [None]:
# Shut down the Spark session created in the first cell.
spark.stop()