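Iniciem una sessió de Spark (amb el driver JDBC de DuckDB) i carreguem la taula `income_data` de la formatted zone; al final del quadern les dades es desen en format Delta Lake.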

In [21]:
from pyspark.sql import SparkSession
import duckdb
from outliers_data_Q import *

# Build (or reuse) the Spark session, shipping the DuckDB JDBC driver jar with it.
spark = (
    SparkSession.builder
    .config("spark.jars", "./duckdb.jar")
    .appName("FormattedZone")
    .getOrCreate()
)

# Load the `income_data` table from the formatted-zone DuckDB file over JDBC.
income = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:duckdb:formatted_zone.duckdb")
    .option("dbtable", "income_data")
    .option("driver", "org.duckdb.DuckDBDriver")
    .load()
)

# Show the first rows to confirm the table was loaded correctly.
income.show()

+-----+-------+-----------------+---------------------------+-----------+-----------------------------------+-------------------+----------------+-------------------------------------+---------------------+------------------+
|STATE|ZIPCODE|Number of returns|Adjusted gross income (AGI)|    Avg AGI|Number of returns with total income|Total income amount|Avg total income|Number of returns with taxable income|Taxable income amount|Avg taxable income|
+-----+-------+-----------------+---------------------------+-----------+-----------------------------------+-------------------+----------------+-------------------------------------+---------------------+------------------+
|   AL|  35004|             4930|                     255534|51.83245436|                               4930|             258024|     52.33752535|                                 4020|               163859|       40.76094527|
|   AL|  35005|             3300|                     128387|38.90515152|                       

In [22]:
# Drop the "(AGI)" suffix from the adjusted-gross-income column name, then preview the result.
income = income.withColumnRenamed(
    'Adjusted gross income (AGI)',
    'Adjusted gross income',
)
income.show()

+-----+-------+-----------------+---------------------+-----------+-----------------------------------+-------------------+----------------+-------------------------------------+---------------------+------------------+
|STATE|ZIPCODE|Number of returns|Adjusted gross income|    Avg AGI|Number of returns with total income|Total income amount|Avg total income|Number of returns with taxable income|Taxable income amount|Avg taxable income|
+-----+-------+-----------------+---------------------+-----------+-----------------------------------+-------------------+----------------+-------------------------------------+---------------------+------------------+
|   AL|  35004|             4930|               255534|51.83245436|                               4930|             258024|     52.33752535|                                 4020|               163859|       40.76094527|
|   AL|  35005|             3300|               128387|38.90515152|                               3300|             1293

In [None]:
# Save the table: write `income` into the trusted-zone DuckDB file over JDBC
# as `cleaned_income`, using overwrite mode.
(
    income.write
    .format("jdbc")
    .option("url", "jdbc:duckdb:trusted_zone.duckdb")
    .option("dbtable", "cleaned_income")
    .option("driver", "org.duckdb.DuckDBDriver")
    .mode("overwrite")
    .save()
)

In [24]:
# Print the DataFrame's schema (column names, types and nullability).
income.printSchema()

root
 |-- STATE: string (nullable = true)
 |-- ZIPCODE: decimal(20,0) (nullable = true)
 |-- Number of returns: decimal(20,0) (nullable = true)
 |-- Adjusted gross income: decimal(20,0) (nullable = true)
 |-- Avg AGI: double (nullable = true)
 |-- Number of returns with total income: decimal(20,0) (nullable = true)
 |-- Total income amount: decimal(20,0) (nullable = true)
 |-- Avg total income: double (nullable = true)
 |-- Number of returns with taxable income: decimal(20,0) (nullable = true)
 |-- Taxable income amount: decimal(20,0) (nullable = true)
 |-- Avg taxable income: double (nullable = true)



In [None]:
# The only categorical variable in the table.
categorical_columns = ["STATE"]

# Every remaining column is handled as numerical.
# NOTE(review): ZIPCODE is an identifier rather than a measurement — confirm it
# should be profiled together with the numeric columns.
numerical_columns = [
    "ZIPCODE",
    "Number of returns",
    "Adjusted gross income",
    "Avg AGI",
    "Number of returns with total income",
    "Total income amount",
    "Avg total income",
    "Number of returns with taxable income",
    "Taxable income amount",
    "Avg taxable income",
]

In [None]:
# Run the categorical-column helper on `income` for the columns in categorical_columns.
# NOTE(review): plots_cate is not defined in this notebook — presumably it is a
# plotting helper brought in by `from outliers_data_Q import *`; confirm.
plots_cate(income, categorical_columns)

In [None]:
# Run the numerical-column helper on `income` for the columns in numerical_columns.
# NOTE(review): plots_num is not defined in this notebook — presumably it is a
# plotting helper brought in by `from outliers_data_Q import *`; confirm.
plots_num(income, numerical_columns)

In [4]:
# Characters that Delta Lake / Parquet reject in column names: " ,;{}()\n\t=".
# The comma was missing from the original list.
invalid_chars = [' ', ',', ';', '{', '}', '(', ')', '\n', '\t', '=']


def clean_column_name(column_name):
    """Return ``column_name`` with every invalid character replaced by ``_``.

    Args:
        column_name: Original column name (a ``str``).

    Returns:
        A new string in which each character listed in ``invalid_chars`` has
        been replaced by an underscore; all other characters are unchanged.
    """
    # Build the table at call time so later edits to invalid_chars are honoured.
    replacements = str.maketrans({char: "_" for char in invalid_chars})
    return column_name.translate(replacements)

# Rename every column through clean_column_name. toDF renames positionally, so
# column order and data are unchanged and no `col` import is required.
cleaned_income = income.toDF(*[clean_column_name(c) for c in income.columns])

# Save the file: persist the *cleaned* frame as a Delta table. The original cell
# wrote `income`, whose column names still contain spaces, so the cleaning above
# had no effect on what was saved.
# NOTE(review): the Spark session built earlier in this notebook only sets
# spark.jars for DuckDB — confirm the Delta Lake package is available to it.
cleaned_income.write.mode("overwrite").format("delta").save("./deltalake/income_data/")

In [None]:
# finalitzar sessió de Spark
spark.stop()