Per entrar a una sessió de Spark i iniciar un builder en DeltaLake

In [6]:
import pyspark
from delta import *
from pyspark.sql.functions import col  
import plotly.express as px 

builder = pyspark.sql.SparkSession.builder.appName("Shops_Deltalake") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

## Comandes com si estessis a una sessió de Spark

Llegir els arxius i guardar-los en format DeltaLake, que ens garantitza que es compleixen les restriccions ACID, i dona avanatatges 




Carregar i llegir els 3 arxius de tenim

In [14]:
#Arxiu Parquet 
income = spark.read.parquet("./datalake/income_data/2024-04-17_IRSIncomeByZipCode_NoStateTotalsNoSmallZips.parquet")
income = income.withColumnRenamed('Adjusted gross income (AGI)', 'Adjusted gross income')
income.show()

+-----+-------+-----------------+---------------------+-----------+-----------------------------------+-------------------+----------------+-------------------------------------+---------------------+------------------+
|STATE|ZIPCODE|Number of returns|Adjusted gross income|    Avg AGI|Number of returns with total income|Total income amount|Avg total income|Number of returns with taxable income|Taxable income amount|Avg taxable income|
+-----+-------+-----------------+---------------------+-----------+-----------------------------------+-------------------+----------------+-------------------------------------+---------------------+------------------+
|   AL|  35004|             4930|               255534|51.83245436|                               4930|             258024|     52.33752535|                                 4020|               163859|       40.76094527|
|   AL|  35005|             3300|               128387|38.90515152|                               3300|             1293

In [8]:
income.printSchema()

root
 |-- STATE: string (nullable = true)
 |-- ZIPCODE: long (nullable = true)
 |-- Number of returns: long (nullable = true)
 |-- Adjusted gross income: long (nullable = true)
 |-- Avg AGI: double (nullable = true)
 |-- Number of returns with total income: long (nullable = true)
 |-- Total income amount: long (nullable = true)
 |-- Avg total income: double (nullable = true)
 |-- Number of returns with taxable income: long (nullable = true)
 |-- Taxable income amount: long (nullable = true)
 |-- Avg taxable income: double (nullable = true)



In [9]:
categorical_columns = ["STATE"] #variable categorica
numerical_columns = ["ZIPCODE", "Number of returns", "Adjusted gross income", "Avg AGI",
                     "Number of returns with total income", "Total income amount",
                     "Avg total income", "Number of returns with taxable income",
                     "Taxable income amount", "Avg taxable income"]

In [10]:
# Generación de gráficos para variables categóricas
for feature in categorical_columns:
    df_pd = income.groupBy(feature).count().withColumn('percent', col('count') / income.count() * 100).toPandas()
    fig = px.bar(df_pd, x=feature, y='count', text='percent', color=feature,
                 title=f'Distribución de la variable {feature}',
                 labels={'count': 'Conteo', 'percent': 'Porcentaje'})
    fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
    fig.show()

In [11]:
# Generación de gráficos para variables numéricas
for feature in numerical_columns:
    df_pd = income.select(feature).toPandas()
    fig = px.histogram(df_pd, x=feature, marginal="box", title=f'Histograma de {feature}')
    fig.update_layout(xaxis_title=feature, yaxis_title='Conteo')
    fig.show()


In [15]:
invalid_chars = [' ', ';', '{', '}', '(', ')', '\n', '\t', '=']

# Función para limpiar los nombres de las columnas reemplazando los caracteres no válidos
def clean_column_name(column_name):
    for invalid_char in invalid_chars:
        column_name = column_name.replace(invalid_char, "_")  # Reemplaza por subrayado o cualquier otro caracter válido que prefieras
    return column_name

# Aplicar la función de limpieza a cada columna
cleaned_sales = income.select([col(c).alias(clean_column_name(c)) for c in income.columns])


#Guardar l'arxiu
income.write.mode("overwrite").format("delta").save("./deltalake/income_data/")

AnalysisException: [DELTA_INVALID_CHARACTERS_IN_COLUMN_NAMES] Found invalid character(s) among ' ,;{}()\n\t=' in the column names of your schema. Please use other characters and try again.