In [83]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, explode
import os
import re

In [84]:
# Se crea la sesión de Spark
spark = SparkSession.builder \
        .appName('Clima Procesamiento') \
        .getOrCreate()

In [88]:
def obtener_ultimo_archivo(directorio, extension = '*.json'):
    
    '''
    Obtiene el archivo más reciente de un directorio con la extensión especificada.

    Args:
        directorio (str): Ruta del directorio donde buscar los archivos.
        extension (str): Extensión de los archivos a buscar (por defecto '*.json').

    Returns:
        str: Ruta completa del archivo más reciente con la extensión especificada.
        '''
    
    try:
        # Obtener la lista de rutas completas de los archivos que coinciden con la extensión
        archivos = [os.path.join(directorio, archivo) for archivo in os.listdir(directorio) if archivo.endswith(extension)]
        
        # Verificar si no hay archivos en la lista
        if not archivos:
            raise FileNotFoundError('No se encontraron archivos en el directorio especificado.')
        
        # Obtener el archivo más reciente basado en la fecha de modificación
        ultimo_archivo = max(archivos, key = os.path.getmtime)
        
        # Retorna ruta completa de último archivo
        return ultimo_archivo
    
    except FileNotFoundError as e:
        print(f'Error: {e}')
        raise
    except Exception as e:
        print(f'Error inesperado: {e}')
        raise

data_dir = 'Datos'

ultimo_archivo = obtener_ultimo_archivo(data_dir, extension='.json')

In [89]:
df = spark.read.json(ultimo_archivo)
df.show()


+-------+------+--------------------+--------------------+--------------------+--------+---------+---------+---------------+--------------------+-----------+--------+
|address|alerts|   currentConditions|                days|         description|latitude|longitude|queryCost|resolvedAddress|            stations|   timezone|tzoffset|
+-------+------+--------------------+--------------------+--------------------+--------+---------+---------+---------------+--------------------+-----------+--------+
|Sicilia|    []|{25.0, Partially ...|[{42.1, Partially...|Similar temperatu...| 38.1221|  13.3611|        1|Sicilia, Italia|{{0.0, 54398.0, C...|Europe/Rome|     1.0|
+-------+------+--------------------+--------------------+--------------------+--------+---------+---------+---------------+--------------------+-----------+--------+



In [38]:
df.printSchema()

root
 |-- address: string (nullable = true)
 |-- alerts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentConditions: struct (nullable = true)
 |    |-- cloudcover: double (nullable = true)
 |    |-- conditions: string (nullable = true)
 |    |-- datetime: string (nullable = true)
 |    |-- datetimeEpoch: long (nullable = true)
 |    |-- dew: double (nullable = true)
 |    |-- feelslike: double (nullable = true)
 |    |-- humidity: double (nullable = true)
 |    |-- icon: string (nullable = true)
 |    |-- moonphase: double (nullable = true)
 |    |-- precip: double (nullable = true)
 |    |-- precipprob: double (nullable = true)
 |    |-- preciptype: string (nullable = true)
 |    |-- pressure: double (nullable = true)
 |    |-- snow: double (nullable = true)
 |    |-- snowdepth: double (nullable = true)
 |    |-- solarenergy: double (nullable = true)
 |    |-- solarradiation: double (nullable = true)
 |    |-- source: string (nullable = true)
 |    

In [92]:
from pyspark.sql.types import StructType, ArrayType, StringType, DoubleType, LongType, FloatType

def explotar_columnas_array(df, diccionario_resultado, sufijo_explode):
    
    for columna in df.schema:
        if isinstance(columna.dataType, ArrayType):
            columna_nombre = columna.name
            diccionario_resultado[columna_nombre] = df.select(explode(columna_nombre)).alias(f'{columna_nombre}_{sufijo_explode}')
            

def desanidar_columnas_struct(df, diccionario_resultado, sufijo_desanidado): 
    
    for columna in df.schema:
        if isinstance(columna.dataType, StructType):
            columna_nombre = columna.name
            
            campos_struct = [
                col(f'{columna_nombre}.{subfield.name}').alias(f'{columna_nombre}_{subfield.name}')
                for subfield in columna.dataType.fields
            ]
            diccionario_resultado[columna_nombre] = df.select(*campos_struct)
            

dataframes_exploded_Alerts_Days = {}
dataframes_exploded_Days = {}
dataframes_exploded_DaysHours = {}


explotar_columnas_array(df, dataframes_exploded_Alerts_Days, 'explode')

for key_exploded, df_explode in dataframes_exploded_Alerts_Days.items():
    desanidar_columnas_struct(df_explode, dataframes_exploded_Days, 'struct')
    
for key_exploded_2, df_exploded_2 in dataframes_exploded_Days.items():
    explotar_columnas_array(df_exploded_2, dataframes_exploded_DaysHours, "explode3")

# Verificar resultados
print("DataFrames exploded (Alerts_Days):")
for key, df_res in dataframes_exploded_Alerts_Days.items():
    print(f" - {key}")
    df_res.show()

print("\nDataFrames desanidados (Days):")
for key, df_res in dataframes_exploded_Days.items():
    print(f" - {key}")
    df_res.show()

print("\nDataFrames exploded adicionales (DaysHours):")
for key, df_res in dataframes_exploded_DaysHours.items():
    print(f" - {key}")
    df_res.show()

DataFrames exploded (Alerts_Days):
 - alerts
+---+
|col|
+---+
+---+

 - days
+--------------------+
|                 col|
+--------------------+
|{77.2, Partially ...|
+--------------------+


DataFrames desanidados (Days):
 - col
+--------------+----------------+------------+-----------------+--------------------+-------+-------------+----------------+----------------+--------------------+------------+-----------------+-------------+----------+---------------+--------------+--------------+------------+--------------+--------+-------------+---------------+------------------+----------+-------------------+-----------+----------------+----------+---------------+--------+-----------+-----------+-----------+--------------+-----------+------------+-------------+
|col_cloudcover|  col_conditions|col_datetime|col_datetimeEpoch|     col_description|col_dew|col_feelslike|col_feelslikemax|col_feelslikemin|           col_hours|col_humidity|         col_icon|col_moonphase|col_precip|col_precipco

In [89]:
dataframes_exploded_Days

{'col': DataFrame[col_cloudcover: double, col_conditions: string, col_datetime: string, col_datetimeEpoch: bigint, col_description: string, col_dew: double, col_feelslike: double, col_feelslikemax: double, col_feelslikemin: double, col_hours: array<struct<cloudcover:double,conditions:string,datetime:string,datetimeEpoch:bigint,dew:double,feelslike:double,humidity:double,icon:string,precip:double,precipprob:double,preciptype:string,pressure:double,severerisk:double,snow:double,snowdepth:double,solarenergy:double,solarradiation:double,source:string,stations:array<string>,temp:double,uvindex:double,visibility:double,winddir:double,windgust:double,windspeed:double>>, col_humidity: double, col_icon: string, col_moonphase: double, col_precip: double, col_precipcover: double, col_precipprob: double, col_preciptype: string, col_pressure: double, col_severerisk: double, col_snow: double, col_snowdepth: double, col_solarenergy: double, col_solarradiation: double, col_source: string, col_stations

In [73]:
dataframes_exploded_Alerts_Days

{'alerts': DataFrame[alerts_explode: string],
 'days': DataFrame[days_explode: struct<cloudcover:double,conditions:string,datetime:string,datetimeEpoch:bigint,description:string,dew:double,feelslike:double,feelslikemax:double,feelslikemin:double,hours:array<struct<cloudcover:double,conditions:string,datetime:string,datetimeEpoch:bigint,dew:double,feelslike:double,humidity:double,icon:string,precip:double,precipprob:double,preciptype:string,pressure:double,severerisk:double,snow:double,snowdepth:double,solarenergy:double,solarradiation:double,source:string,stations:array<string>,temp:double,uvindex:double,visibility:double,winddir:double,windgust:double,windspeed:double>>,humidity:double,icon:string,moonphase:double,precip:double,precipcover:double,precipprob:double,preciptype:string,pressure:double,severerisk:double,snow:double,snowdepth:double,solarenergy:double,solarradiation:double,source:string,stations:array<string>,sunrise:string,sunriseEpoch:bigint,sunset:string,sunsetEpoch:bigi

In [78]:
dataframes_exploded_Days

{'days_explode': DataFrame[days_explode_cloudcover: double, days_explode_conditions: string, days_explode_datetime: string, days_explode_datetimeEpoch: bigint, days_explode_description: string, days_explode_dew: double, days_explode_feelslike: double, days_explode_feelslikemax: double, days_explode_feelslikemin: double, days_explode_hours: array<struct<cloudcover:double,conditions:string,datetime:string,datetimeEpoch:bigint,dew:double,feelslike:double,humidity:double,icon:string,precip:double,precipprob:double,preciptype:string,pressure:double,severerisk:double,snow:double,snowdepth:double,solarenergy:double,solarradiation:double,source:string,stations:array<string>,temp:double,uvindex:double,visibility:double,winddir:double,windgust:double,windspeed:double>>, days_explode_humidity: double, days_explode_icon: string, days_explode_moonphase: double, days_explode_precip: double, days_explode_precipcover: double, days_explode_precipprob: double, days_explode_preciptype: string, days_explo

In [81]:
dataframes_exploded_DaysHours

{'days_explode_hours': DataFrame[days_explode_hours_explode3: struct<cloudcover:double,conditions:string,datetime:string,datetimeEpoch:bigint,dew:double,feelslike:double,humidity:double,icon:string,precip:double,precipprob:double,preciptype:string,pressure:double,severerisk:double,snow:double,snowdepth:double,solarenergy:double,solarradiation:double,source:string,stations:array<string>,temp:double,uvindex:double,visibility:double,winddir:double,windgust:double,windspeed:double>],
 'days_explode_stations': DataFrame[days_explode_stations_explode3: string]}

In [83]:
for key, df in dataframes_exploded_DaysHours.items():
    print(f'- LLAVE DF: {key}')
    df.show()

- LLAVE DF: days_explode_hours
+---------------------------+
|days_explode_hours_explode3|
+---------------------------+
|       {76.2, Partially ...|
|       {100.0, Overcast,...|
|       {96.7, Overcast, ...|
|       {98.0, Overcast, ...|
|       {91.8, Overcast, ...|
|       {53.7, Partially ...|
|       {85.8, Partially ...|
|       {100.0, Overcast,...|
|       {99.4, Overcast, ...|
|       {100.0, Overcast,...|
|       {100.0, Overcast,...|
|       {100.0, Overcast,...|
|       {25.0, Partially ...|
|       {25.0, Partially ...|
|       {25.0, Partially ...|
|       {25.0, Partially ...|
|       {25.0, Partially ...|
|       {26.6, Partially ...|
|       {100.0, Overcast,...|
|       {100.0, Overcast,...|
+---------------------------+
only showing top 20 rows

- LLAVE DF: days_explode_stations
+------------------------------+
|days_explode_stations_explode3|
+------------------------------+
|                         C6242|
|                          LICJ|
|                       

In [49]:
for key, df in dataframes_desanidados.items():
    print(f'- LLAVE DF: {key}')
    df.printSchema()

- LLAVE DF: currentConditions
root
 |-- currentConditions_cloudcover_desanidado: double (nullable = true)
 |-- currentConditions_conditions_desanidado: string (nullable = true)
 |-- currentConditions_datetime_desanidado: string (nullable = true)
 |-- currentConditions_datetimeEpoch_desanidado: long (nullable = true)
 |-- currentConditions_dew_desanidado: double (nullable = true)
 |-- currentConditions_feelslike_desanidado: double (nullable = true)
 |-- currentConditions_humidity_desanidado: double (nullable = true)
 |-- currentConditions_icon_desanidado: string (nullable = true)
 |-- currentConditions_moonphase_desanidado: double (nullable = true)
 |-- currentConditions_precip_desanidado: double (nullable = true)
 |-- currentConditions_precipprob_desanidado: double (nullable = true)
 |-- currentConditions_preciptype_desanidado: string (nullable = true)
 |-- currentConditions_pressure_desanidado: double (nullable = true)
 |-- currentConditions_snow_desanidado: double (nullable = true)
 

In [34]:
def procesamiento_datos(ultimo_archivo):
    
    '''
    Esta función procesa un archivo JSON crudo obtenido de datos climáticos y realiza transformaciones para 
    generar un DataFrame de Spark desanidado, explotando los datos anidados de días y horas, y agregando 
    información de la ciudad y zona horaria.

    Argumentos:
        ultimo_archivo (str): Ruta del archivo JSON que contiene los datos climáticos crudos.

    Retorna:
        DataFrame: Un DataFrame de Spark procesado con datos desanidados, explotados y enriquecidos con 
        información de la ciudad.
    '''
    
    try:
        # Leer el archivo JSON en dataframe de Spark
        df = spark.read.json(ultimo_archivo)
        
        # Explode de la columna "days", convirtiendo cada día en una fila separada
        df_days_exploded = df.withColumn('day', explode('days')).drop('days')

        # Seleccionar y desanidar columnas del nivel "day"
        df_desanidado_day = df_days_exploded.select(
            col("day.cloudcover").alias("day_cloudcover"),
            col("day.conditions").alias("day_conditions"),
            col("day.datetime").alias("day_datetime"),
            col("day.datetimeEpoch").alias("day_datetimeEpoch"),
            col("day.description").alias("day_description"),
            col("day.dew").alias("day_dew"),
            col("day.feelslike").alias("day_feelslike"),
            col("day.feelslikemax").alias("day_feelslikemax"),
            col("day.feelslikemin").alias("day_feelslikemin"),
            col("day.hours").alias("day_hours"), # ARRAY
            col("day.humidity").alias("day_humidity"),
            col("day.icon").alias("day_icon"),
            col("day.moonphase").alias("day_moonphase"),
            col("day.precip").alias("day_precip"),
            col("day.precipcover").alias("day_precipcover"),
            col("day.precipprob").alias("day_precipprob"),
            col("day.preciptype").alias("day_preciptype"), # ARRAY
            col("day.pressure").alias("day_pressure"),
            col("day.severerisk").alias("day_severerisk"),
            col("day.snow").alias("day_snow"),
            col("day.snowdepth").alias("day_snowdepth"),
            col("day.solarenergy").alias("day_solarenergy"),
            col("day.solarradiation").alias("day_solarradiation"),
            col("day.source").alias("day_source"),
            col("day.stations").alias("day_stations"), # ARRAY
            col("day.sunrise").alias("day_sunrise"),
            col("day.sunriseEpoch").alias("day_sunriseEpoch"),
            col("day.sunset").alias("day_sunset"),
            col("day.sunsetEpoch").alias("day_sunsetEpoch"),
            col("day.temp").alias("day_temp"),
            col("day.tempmax").alias("day_tempmax"),
            col("day.tempmin").alias("day_tempmin"),
            col("day.uvindex").alias("day_uvindex"),
            col("day.visibility").alias("day_visibility"),
            col("day.winddir").alias("day_winddir"),
            col("day.windgust").alias("day_windgust"),
            col("day.windspeed").alias("day_windspeed")
        )
        
        # Explode de la columna "day_hours", creando filas separadas para cada hora
        df_Day = df_desanidado_day.withColumn('day_hour', explode('day_hours')).drop('day_hours')

        # Desanidar columnas del nivel "hour" (hora)
        df_DayHour = df_Day.select(
            col('day_hour.cloudcover').alias('dayHour_cloudcover'),
            col('day_hour.conditions').alias('dayHour_conditions'),
            col('day_hour.datetime').alias('dayHour_datetime'),
            col('day_hour.datetimeEpoch').alias('dayHour_datetimeEpoch'),
            col('day_hour.dew').alias('dayHour_dew'),
            col('day_hour.feelslike').alias('dayHour_feelslike'),
            col('day_hour.humidity').alias('dayHour_humidity'),
            col('day_hour.icon').alias('dayHour_icon'),
            col('day_hour.precip').alias('dayHour_precip'),
            col('day_hour.precipprob').alias('dayHour_precipprob'),
            col('day_hour.preciptype').alias('dayHour_preciptype'),
            col('day_hour.pressure').alias('dayHour_pressure'),
            col('day_hour.severerisk').alias('dayHour_severerisk'),
            col('day_hour.snow').alias('dayHour_snow'),
            col('day_hour.snowdepth').alias('dayHour_snowdepth'),
            col('day_hour.solarenergy').alias('dayHour_solarenergy'),
            col('day_hour.solarradiation').alias('dayHour_solarradiation'),
            col('day_hour.source').alias('dayHour_source'),
            col('day_hour.stations').alias('dayHour_stations'),
            col('day_hour.temp').alias('dayHour_temp'),
            col('day_hour.uvindex').alias('dayHour_uvindex'),
            col('day_hour.visibility').alias('dayHour_visibility'),
            col('day_hour.winddir').alias('dayHour_winddir'),
            col('day_hour.windgust').alias('dayHour_windgust'),
            col('day_hour.windspeed').alias('dayHour_windspeed')
        )
        
        # Reemplazo de valores NULL en 'dayHour_preciptype' con el valor 'Sin Dato'
        # df_DayHour_filled = (
        #     df_DayHour
        #     .withColumn(
        #         'dayHour_preciptype', when(
        #             col('dayHour_preciptype').isNull(),
        #             lit(['Sin Dato'])
        #         ).otherwise(col('dayHour_preciptype'))
        # ))
        
        # valores_reemplazo = {}
        # for columna, tipo in df_DayHour.dtypes:
        #     if tipo == 'string':
        #         valores_reemplazo[columna] = 'Sin Dato'
            
        #     elif tipo == ['int', 'bigint', 'double', 'float']:
        #         valores_reemplazo[columna] = 0
        
        # df_DayHour_filled = df_DayHour.na.fill(valores_reemplazo)
        

        # Explode de las columnas 'dayHour_preciptype' y 'dayHour_stations'
        df_explode_DayHour = (
            df_DayHour
            .withColumn('dayHour_preciptype_explode', explode('dayHour_preciptype'))
            .withColumn('dayHour_stations_explode', explode('dayHour_stations'))
            .drop('dayHour_preciptype', 'dayHour_stations')
        )
        
        # Extraer la fecha del nombre del archivo
        fecha_extraida = re.search(r'\d{4}-\d{2}-\d{2}', ultimo_archivo).group(0)

        # Agregar la columna 'day_datetime' al DataFrame basado en la fecha extraída
        df_explode_DayHour = df_explode_DayHour.withColumn('day_datetime', lit(fecha_extraida))
        
        # Obtener información constante de la ciudad del DataFrame original
        datos_ciudad = df.select('address', 'description', 'latitude', 'longitude', 'resolvedAddress', 'timezone', 'tzoffset').first()

        # Añadir columnas constantes de la ciudad al DataFrame final
        df_explode_DayHour = (
            df_explode_DayHour
            .withColumn('address', lit(datos_ciudad['address']))
            .withColumn('description', lit(datos_ciudad['description']))
            .withColumn('latitude', lit(datos_ciudad['latitude']))
            .withColumn('longitude', lit(datos_ciudad['longitude']))
            .withColumn('resolvedAddress', lit(datos_ciudad['resolvedAddress']))
            .withColumn('timezone', lit(datos_ciudad['timezone']))
            .withColumn('tzoffset', lit(datos_ciudad['tzoffset']))
        )
        
        # Agregar columna ajustada para la zona horaria Argentina
        df_explode_DayHour = df_explode_DayHour.withColumn('tzoffset_Arg', lit(-3.0))
        
        # Se crea una lista con el nombre del orden deseado de las columnas
        columnas_ordenadas = ['resolvedAddress', 'latitude', 'longitude', 'day_datetime', 'timezone', 'dayHour_datetime', 'tzoffset', 'dayHour_stations_explode', 
                      'dayHour_cloudcover', 'dayHour_conditions', 'dayHour_datetimeEpoch', 'dayHour_dew', 'dayHour_feelslike', 'dayHour_humidity', 
                      'dayHour_icon', 'dayHour_preciptype_explode', 'dayHour_precip', 'dayHour_precipprob', 'dayHour_pressure', 'dayHour_severerisk', 'dayHour_snow', 'dayHour_snowdepth', 'dayHour_solarenergy',
                      'dayHour_solarradiation', 'dayHour_source', 'dayHour_temp', 'dayHour_uvindex', 'dayHour_visibility', 'dayHour_winddir', 'dayHour_windgust', 'dayHour_windspeed']

        # Se crea una lista con el nuevo nombre deseado de las columnas
        nombres_nuevos = ['resolvedAddress', 'latitude', 'longitude', 'day', 'timezone', 'datetime', 'tzoffset', 'stations', 'cloudcover', 'conditions', 'datetimeEpoch', 'dew', 
                  'feelsLike', 'humidity', 'icon', 'precip', 'preipProb', 'pressure', 'severerisk', 'snow', 'snowDepth', 'solarEnergy', 'solarRadiation', 'source', 'temp', 
                  'uvindex', 'visibility', 'windDir', 'windGust', 'windSpeed', 'precipType']

        # Se realizar un "select" para ordenar las columnas como se especificó en la lista
        df_organizado = df_explode_DayHour.select(columnas_ordenadas)
        
        # Se realiza un for para modificar el antiguo nombre de las columnas por el nuevo
        df_renombrado_columnas = df_organizado.select(
        [col(nombre_antiguo).alias(nombre_nuevo) for nombre_antiguo, nombre_nuevo in zip(df_organizado.columns, nombres_nuevos)])
        
        # Retornar el DataFrame procesado
        return df_renombrado_columnas
    
    except FileNotFoundError:
        print(f'Error: el archivo {ultimo_archivo} no existe.')
        raise
    except ValueError:
        print('Error: los datos no están en el formato esperado.')
        raise
    # except pyspark.sql.utils.AnalysisException as e:
    #     print(f'Error en operaciones de Spark: {e}')
    #     raise
    except Exception as e:
        print(f'Error: {e}')
        raise

In [35]:
df_procesado = procesamiento_datos(ultimo_archivo)

Error: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "explode(dayHour_preciptype)" due to data type mismatch: Parameter 1 requires the ("ARRAY" or "MAP") type, however "dayHour_preciptype" has the type "STRING".;
'Project [dayHour_cloudcover#4791, dayHour_conditions#4792, dayHour_datetime#4793, dayHour_datetimeEpoch#4794L, dayHour_dew#4795, dayHour_feelslike#4796, dayHour_humidity#4797, dayHour_icon#4798, dayHour_precip#4799, dayHour_precipprob#4800, dayHour_preciptype#4801, dayHour_pressure#4802, dayHour_severerisk#4803, dayHour_snow#4804, dayHour_snowdepth#4805, dayHour_solarenergy#4806, dayHour_solarradiation#4807, dayHour_source#4808, dayHour_stations#4809, dayHour_temp#4810, dayHour_uvindex#4811, dayHour_visibility#4812, dayHour_winddir#4813, dayHour_windgust#4814, ... 2 more fields]
+- Project [day_hour#4715.cloudcover AS dayHour_cloudcover#4791, day_hour#4715.conditions AS dayHour_conditions#4792, day_hour#4715.datetime AS dayHour_datetime#4793, day_hour#4715.datetime

AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "explode(dayHour_preciptype)" due to data type mismatch: Parameter 1 requires the ("ARRAY" or "MAP") type, however "dayHour_preciptype" has the type "STRING".;
'Project [dayHour_cloudcover#4791, dayHour_conditions#4792, dayHour_datetime#4793, dayHour_datetimeEpoch#4794L, dayHour_dew#4795, dayHour_feelslike#4796, dayHour_humidity#4797, dayHour_icon#4798, dayHour_precip#4799, dayHour_precipprob#4800, dayHour_preciptype#4801, dayHour_pressure#4802, dayHour_severerisk#4803, dayHour_snow#4804, dayHour_snowdepth#4805, dayHour_solarenergy#4806, dayHour_solarradiation#4807, dayHour_source#4808, dayHour_stations#4809, dayHour_temp#4810, dayHour_uvindex#4811, dayHour_visibility#4812, dayHour_winddir#4813, dayHour_windgust#4814, ... 2 more fields]
+- Project [day_hour#4715.cloudcover AS dayHour_cloudcover#4791, day_hour#4715.conditions AS dayHour_conditions#4792, day_hour#4715.datetime AS dayHour_datetime#4793, day_hour#4715.datetimeEpoch AS dayHour_datetimeEpoch#4794L, day_hour#4715.dew AS dayHour_dew#4795, day_hour#4715.feelslike AS dayHour_feelslike#4796, day_hour#4715.humidity AS dayHour_humidity#4797, day_hour#4715.icon AS dayHour_icon#4798, day_hour#4715.precip AS dayHour_precip#4799, day_hour#4715.precipprob AS dayHour_precipprob#4800, day_hour#4715.preciptype AS dayHour_preciptype#4801, day_hour#4715.pressure AS dayHour_pressure#4802, day_hour#4715.severerisk AS dayHour_severerisk#4803, day_hour#4715.snow AS dayHour_snow#4804, day_hour#4715.snowdepth AS dayHour_snowdepth#4805, day_hour#4715.solarenergy AS dayHour_solarenergy#4806, day_hour#4715.solarradiation AS dayHour_solarradiation#4807, day_hour#4715.source AS dayHour_source#4808, day_hour#4715.stations AS dayHour_stations#4809, day_hour#4715.temp AS dayHour_temp#4810, day_hour#4715.uvindex AS dayHour_uvindex#4811, day_hour#4715.visibility AS dayHour_visibility#4812, day_hour#4715.winddir AS dayHour_winddir#4813, day_hour#4715.windgust AS dayHour_windgust#4814, day_hour#4715.windspeed AS dayHour_windspeed#4815]
   +- Project [day_cloudcover#4603, day_conditions#4604, day_datetime#4605, day_datetimeEpoch#4606L, day_description#4607, day_dew#4608, day_feelslike#4609, day_feelslikemax#4610, day_feelslikemin#4611, day_humidity#4613, day_icon#4614, day_moonphase#4615, day_precip#4616, day_precipcover#4617, day_precipprob#4618, day_preciptype#4619, day_pressure#4620, day_severerisk#4621, day_snow#4622, day_snowdepth#4623, day_solarenergy#4624, day_solarradiation#4625, day_source#4626, day_stations#4627, ... 13 more fields]
      +- Project [day_cloudcover#4603, day_conditions#4604, day_datetime#4605, day_datetimeEpoch#4606L, day_description#4607, day_dew#4608, day_feelslike#4609, day_feelslikemax#4610, day_feelslikemin#4611, day_hours#4612, day_humidity#4613, day_icon#4614, day_moonphase#4615, day_precip#4616, day_precipcover#4617, day_precipprob#4618, day_preciptype#4619, day_pressure#4620, day_severerisk#4621, day_snow#4622, day_snowdepth#4623, day_solarenergy#4624, day_solarradiation#4625, day_source#4626, ... 14 more fields]
         +- Generate explode(day_hours#4612), false, [day_hour#4715]
            +- Project [day#4576.cloudcover AS day_cloudcover#4603, day#4576.conditions AS day_conditions#4604, day#4576.datetime AS day_datetime#4605, day#4576.datetimeEpoch AS day_datetimeEpoch#4606L, day#4576.description AS day_description#4607, day#4576.dew AS day_dew#4608, day#4576.feelslike AS day_feelslike#4609, day#4576.feelslikemax AS day_feelslikemax#4610, day#4576.feelslikemin AS day_feelslikemin#4611, day#4576.hours AS day_hours#4612, day#4576.humidity AS day_humidity#4613, day#4576.icon AS day_icon#4614, day#4576.moonphase AS day_moonphase#4615, day#4576.precip AS day_precip#4616, day#4576.precipcover AS day_precipcover#4617, day#4576.precipprob AS day_precipprob#4618, day#4576.preciptype AS day_preciptype#4619, day#4576.pressure AS day_pressure#4620, day#4576.severerisk AS day_severerisk#4621, day#4576.snow AS day_snow#4622, day#4576.snowdepth AS day_snowdepth#4623, day#4576.solarenergy AS day_solarenergy#4624, day#4576.solarradiation AS day_solarradiation#4625, day#4576.source AS day_source#4626, ... 13 more fields]
               +- Project [address#4551, alerts#4552, currentConditions#4553, description#4555, latitude#4556, longitude#4557, queryCost#4558L, resolvedAddress#4559, stations#4560, timezone#4561, tzoffset#4562, day#4576]
                  +- Project [address#4551, alerts#4552, currentConditions#4553, days#4554, description#4555, latitude#4556, longitude#4557, queryCost#4558L, resolvedAddress#4559, stations#4560, timezone#4561, tzoffset#4562, day#4576]
                     +- Generate explode(days#4554), false, [day#4576]
                        +- Relation [address#4551,alerts#4552,currentConditions#4553,days#4554,description#4555,latitude#4556,longitude#4557,queryCost#4558L,resolvedAddress#4559,stations#4560,timezone#4561,tzoffset#4562] json


In [None]:
df_explode = df.wit

### PROCESAMIENTO DE DATOS 

In [43]:
df.printSchema()

root
 |-- address: string (nullable = true)
 |-- alerts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentConditions: struct (nullable = true)
 |    |-- cloudcover: double (nullable = true)
 |    |-- conditions: string (nullable = true)
 |    |-- datetime: string (nullable = true)
 |    |-- datetimeEpoch: long (nullable = true)
 |    |-- dew: double (nullable = true)
 |    |-- feelslike: double (nullable = true)
 |    |-- humidity: double (nullable = true)
 |    |-- icon: string (nullable = true)
 |    |-- moonphase: double (nullable = true)
 |    |-- precip: double (nullable = true)
 |    |-- precipprob: double (nullable = true)
 |    |-- preciptype: string (nullable = true)
 |    |-- pressure: double (nullable = true)
 |    |-- snow: double (nullable = true)
 |    |-- snowdepth: double (nullable = true)
 |    |-- solarenergy: double (nullable = true)
 |    |-- solarradiation: double (nullable = true)
 |    |-- source: string (nullable = true)
 |    

In [42]:
df.show()

+-------+------+--------------------+--------------------+--------------------+--------+---------+---------+---------------+--------------------+-----------+--------+
|address|alerts|   currentConditions|                days|         description|latitude|longitude|queryCost|resolvedAddress|            stations|   timezone|tzoffset|
+-------+------+--------------------+--------------------+--------------------+--------+---------+---------+---------------+--------------------+-----------+--------+
|Sicilia|    []|{25.0, Partially ...|[{77.2, Partially...|Similar temperatu...| 38.1221|  13.3611|        1|Sicilia, Italia|{{0.0, 54398.0, C...|Europe/Rome|     1.0|
+-------+------+--------------------+--------------------+--------------------+--------+---------+---------+---------------+--------------------+-----------+--------+



In [39]:
from pyspark.sql.functions import col
from pyspark.sql.functions import explode

# Desanidamiento de columna "currentConditions" y asignación de nuevos alias
df_desanidado_currentConditions = df.select(
    col('currentConditions.cloudcover').alias('cloudcover'),
    col('currentConditions.conditions').alias('conditions'),
    col('currentConditions.datetime').alias('datetime'),
    col('currentConditions.datetimeEpoch').alias('datetimeEpoch'),
    col('currentConditions.dew').alias('dew'),
    col('currentConditions.feelslike').alias('feelslike'),
    col('currentConditions.humidity').alias('humidity'),
    col('currentConditions.icon').alias('icon'),
    col('currentConditions.moonphase').alias('moonphase'),
    col('currentConditions.precip').alias('precip'),
    col('currentConditions.precipprob').alias('precipprob'),
    col('currentConditions.preciptype').alias('preciptype'),
    col('currentConditions.pressure').alias('pressure'),
    col('currentConditions.snow').alias('snow'),
    col('currentConditions.snowdepth').alias('snowdepth'),
    col('currentConditions.solarenergy').alias('solarenergy'),
    col('currentConditions.solarradiation').alias('solarradiation'),
    col('currentConditions.source').alias('source'),
    col('currentConditions.stations').alias('stations'),
    col('currentConditions.sunrise').alias('sunrise'),
    col('currentConditions.sunriseEpoch').alias('sunriseEpoch'),
    col('currentConditions.sunset').alias('sunset'),
    col('currentConditions.sunsetEpoch').alias('sunsetEpoch'),
    col('currentConditions.temp').alias('temp'),
    col('currentConditions.uvindex').alias('uvindex'),
    col('currentConditions.visibility').alias('visibility'),
    col('currentConditions.winddir').alias('winddir'),
    col('currentConditions.windgust').alias('windgust'),
    col('currentConditions.windspeed').alias('windspeed') 
)

# Explode de columna "stations" (la cual se encuentra dentro de "currenConditions")
df_final_currentConditions = df_desanidado_currentConditions.withColumn('station', explode('stations')).drop('stations')
df_final_currentConditions.show()

+----------+----------------+--------+-------------+----+---------+--------+-----------------+---------+------+----------+----------+--------+----+---------+-----------+--------------+------+--------+------------+--------+-----------+----+-------+----------+-------+--------+---------+-------+
|cloudcover|      conditions|datetime|datetimeEpoch| dew|feelslike|humidity|             icon|moonphase|precip|precipprob|preciptype|pressure|snow|snowdepth|solarenergy|solarradiation|source| sunrise|sunriseEpoch|  sunset|sunsetEpoch|temp|uvindex|visibility|winddir|windgust|windspeed|station|
+----------+----------------+--------+-------------+----+---------+--------+-----------------+---------+------+----------+----------+--------+----+---------+-----------+--------------+------+--------+------------+--------+-----------+----+-------+----------+-------+--------+---------+-------+
|      25.0|Partially cloudy|16:20:00|   1737645600|47.0|     65.1|    52.0|partly-cloudy-day|     0.81|   0.0|       

In [41]:
# Explode de columna "days"
df_days_exploded = df.withColumn('day', explode('days')).drop('days')
df_days_exploded.show()

+-------+------+--------------------+--------------------+--------+---------+---------+---------------+--------------------+-----------+--------+--------------------+
|address|alerts|   currentConditions|         description|latitude|longitude|queryCost|resolvedAddress|            stations|   timezone|tzoffset|                 day|
+-------+------+--------------------+--------------------+--------+---------+---------+---------------+--------------------+-----------+--------+--------------------+
|Sicilia|    []|{25.0, Partially ...|Similar temperatu...| 38.1221|  13.3611|        1|Sicilia, Italia|{{0.0, 54398.0, C...|Europe/Rome|     1.0|{77.2, Partially ...|
+-------+------+--------------------+--------------------+--------+---------+---------+---------------+--------------------+-----------+--------+--------------------+



In [21]:
# Desanidamiento de columna "days" (original). 
df_desanidado_day = df_days_exploded.select(
    col("day.cloudcover").alias("day_cloudcover"),
    col("day.conditions").alias("day_conditions"),
    col("day.datetime").alias("day_datetime"),
    col("day.datetimeEpoch").alias("day_datetimeEpoch"),
    col("day.description").alias("day_description"),
    col("day.dew").alias("day_dew"),
    col("day.feelslike").alias("day_feelslike"),
    col("day.feelslikemax").alias("day_feelslikemax"),
    col("day.feelslikemin").alias("day_feelslikemin"),
    col("day.hours").alias("day_hours"), # ARRAY
    col("day.humidity").alias("day_humidity"),
    col("day.icon").alias("day_icon"),
    col("day.moonphase").alias("day_moonphase"),
    col("day.precip").alias("day_precip"),
    col("day.precipcover").alias("day_precipcover"),
    col("day.precipprob").alias("day_precipprob"),
    col("day.preciptype").alias("day_preciptype"), # ARRAY
    col("day.pressure").alias("day_pressure"),
    col("day.severerisk").alias("day_severerisk"),
    col("day.snow").alias("day_snow"),
    col("day.snowdepth").alias("day_snowdepth"),
    col("day.solarenergy").alias("day_solarenergy"),
    col("day.solarradiation").alias("day_solarradiation"),
    col("day.source").alias("day_source"),
    col("day.stations").alias("day_stations"), # ARRAY
    col("day.sunrise").alias("day_sunrise"),
    col("day.sunriseEpoch").alias("day_sunriseEpoch"),
    col("day.sunset").alias("day_sunset"),
    col("day.sunsetEpoch").alias("day_sunsetEpoch"),
    col("day.temp").alias("day_temp"),
    col("day.tempmax").alias("day_tempmax"),
    col("day.tempmin").alias("day_tempmin"),
    col("day.uvindex").alias("day_uvindex"),
    col("day.visibility").alias("day_visibility"),
    col("day.winddir").alias("day_winddir"),
    col("day.windgust").alias("day_windgust"),
    col("day.windspeed").alias("day_windspeed")
)

df_Day = df_desanidado_day.withColumn('day_hour', explode('day_hours')).drop('day_hours')
df_Day.show(5)
 

+--------------+----------------+------------+-----------------+--------------------+-------+-------------+----------------+----------------+------------+-----------------+-------------+----------+---------------+--------------+--------------+------------+--------------+--------+-------------+---------------+------------------+----------+-------------------+-----------+----------------+----------+---------------+--------+-----------+-----------+-----------+--------------+-----------+------------+-------------+--------------------+
|day_cloudcover|  day_conditions|day_datetime|day_datetimeEpoch|     day_description|day_dew|day_feelslike|day_feelslikemax|day_feelslikemin|day_humidity|         day_icon|day_moonphase|day_precip|day_precipcover|day_precipprob|day_preciptype|day_pressure|day_severerisk|day_snow|day_snowdepth|day_solarenergy|day_solarradiation|day_source|       day_stations|day_sunrise|day_sunriseEpoch|day_sunset|day_sunsetEpoch|day_temp|day_tempmax|day_tempmin|day_uvindex|da

In [24]:
df_DayHour = df_Day.select(
    col('day_hour.cloudcover').alias('dayHour_cloudcover'),
    col('day_hour.conditions').alias('dayHour_conditions'),
    col('day_hour.datetime').alias('dayHour_datetime'),
    col('day_hour.datetimeEpoch').alias('dayHour_datetimeEpoch'),
    col('day_hour.dew').alias('dayHour_dew'),
    col('day_hour.feelslike').alias('dayHour_feelslike'),
    col('day_hour.humidity').alias('dayHour_humidity'),
    col('day_hour.icon').alias('dayHour_icon'),
    col('day_hour.precip').alias('dayHour_precip'),
    col('day_hour.precipprob').alias('dayHour_precipprob'),
    col('day_hour.preciptype').alias('dayHour_preciptype'),
    col('day_hour.pressure').alias('dayHour_pressure'),
    col('day_hour.severerisk').alias('dayHour_severerisk'),
    col('day_hour.snow').alias('dayHour_snow'),
    col('day_hour.snowdepth').alias('dayHour_snowdepth'),
    col('day_hour.solarenergy').alias('dayHour_solarenergy'),
    col('day_hour.solarradiation').alias('dayHour_solarradiation'),
    col('day_hour.source').alias('dayHour_source'),
    col('day_hour.stations').alias('dayHour_stations'),
    col('day_hour.temp').alias('dayHour_temp'),
    col('day_hour.uvindex').alias('dayHour_uvindex'),
    col('day_hour.visibility').alias('dayHour_visibility'),
    col('day_hour.winddir').alias('dayHour_winddir'),
    col('day_hour.windgust').alias('dayHour_windgust'),
    col('day_hour.windspeed').alias('dayHour_windspeed')
)
df_DayHour.show(5)

+------------------+------------------+----------------+---------------------+-----------+-----------------+----------------+-------------------+--------------+------------------+------------------+----------------+------------------+------------+-----------------+-------------------+----------------------+--------------+-------------------+------------+---------------+------------------+---------------+----------------+-----------------+
|dayHour_cloudcover|dayHour_conditions|dayHour_datetime|dayHour_datetimeEpoch|dayHour_dew|dayHour_feelslike|dayHour_humidity|       dayHour_icon|dayHour_precip|dayHour_precipprob|dayHour_preciptype|dayHour_pressure|dayHour_severerisk|dayHour_snow|dayHour_snowdepth|dayHour_solarenergy|dayHour_solarradiation|dayHour_source|   dayHour_stations|dayHour_temp|dayHour_uvindex|dayHour_visibility|dayHour_winddir|dayHour_windgust|dayHour_windspeed|
+------------------+------------------+----------------+---------------------+-----------+-----------------+------

In [25]:
from pyspark.sql.functions import when, lit

# Reemplazo de valores NULL en columna 'dayHour_preciptype' para posterior 'explode'
df_DayHour_filled = (
    df_DayHour
    .withColumn(
        'dayHour_preciptype', when(
            col('dayHour_preciptype').isNull(),
            lit(['Sin Dato'])
        ).otherwise(col('dayHour_preciptype'))
))

# Eexplode para el dataframe "df_DayHour_filled"
df_explode_DayHour = (
    df_DayHour_filled
    .withColumn('dayHour_preciptype_explode', explode('dayHour_preciptype'))
    .withColumn('dayHour_stations_explode', explode('dayHour_stations'))
    .drop('dayHour_preciptype', 'dayHour_stations')
)

In [34]:
# Extrae la fecha del archivo JSON para agregar al dataframe de "df_explode_DayHour"
from pyspark.sql.functions import lit
import re

ruta_archivo = ultimo_archivo

fecha_extraida = re.search(r'\d{4}-\d{2}-\d{2}', ruta_archivo).group(0)

df_explode_DayHour = df_explode_DayHour.withColumn('day_datetime', lit(fecha_extraida))

# APLICAR FUNCIÓN "OBTENER_ULTIMO_ARCHIVO" PARA EXTRAER LA FECHA DEL JSON

In [40]:
df_explode_DayHour.orderBy(col('dayHour_datetime').desc()).limit(5).show()

+------------------+------------------+----------------+---------------------+-----------+-----------------+----------------+-------------------+--------------+------------------+----------------+------------------+------------+-----------------+-------------------+----------------------+--------------+------------+---------------+------------------+---------------+----------------+-----------------+--------------------------+------------------------+------------+
|dayHour_cloudcover|dayHour_conditions|dayHour_datetime|dayHour_datetimeEpoch|dayHour_dew|dayHour_feelslike|dayHour_humidity|       dayHour_icon|dayHour_precip|dayHour_precipprob|dayHour_pressure|dayHour_severerisk|dayHour_snow|dayHour_snowdepth|dayHour_solarenergy|dayHour_solarradiation|dayHour_source|dayHour_temp|dayHour_uvindex|dayHour_visibility|dayHour_winddir|dayHour_windgust|dayHour_windspeed|dayHour_preciptype_explode|dayHour_stations_explode|day_datetime|
+------------------+------------------+----------------+------

In [36]:
df_desanidado_currentConditions.show(5)

+----------+----------------+--------+-------------+----+---------+--------+-------------------+---------+------+----------+----------+--------+----+---------+-----------+--------------+------+--------------------+--------+------------+--------+-----------+----+-------+----------+-------+--------+---------+
|cloudcover|      conditions|datetime|datetimeEpoch| dew|feelslike|humidity|               icon|moonphase|precip|precipprob|preciptype|pressure|snow|snowdepth|solarenergy|solarradiation|source|            stations| sunrise|sunriseEpoch|  sunset|sunsetEpoch|temp|uvindex|visibility|winddir|windgust|windspeed|
+----------+----------------+--------+-------------+----+---------+--------+-------------------+---------+------+----------+----------+--------+----+---------+-----------+--------------+------+--------------------+--------+------------+--------+-----------+----+-------+----------+-------+--------+---------+
|      31.1|Partially cloudy|19:20:00|   1737397200|49.2|     52.0|    90

In [46]:
# Se agregan los datos de la ciudad (df) al df de los datos por hora (df_explode_DayHour)
datos_ciudad = df.select('address', 'description', 'latitude', 'longitude', 'resolvedAddress', 'timezone', 'tzoffset').first()

df_explode_DayHour = (
    df_explode_DayHour
    .withColumn('address', lit(datos_ciudad['address']))
    .withColumn('description', lit(datos_ciudad['description']))
    .withColumn('latitude', lit(datos_ciudad['latitude']))
    .withColumn('longitude', lit(datos_ciudad['longitude']))
    .withColumn('resolvedAddress', lit(datos_ciudad['resolvedAddress']))
    .withColumn('timezone', lit(datos_ciudad['timezone']))
    .withColumn('tzoffset', lit(datos_ciudad['tzoffset']))
)

df_explode_DayHour.show()

+------------------+------------------+----------------+---------------------+-----------+-----------------+----------------+-------------------+--------------+------------------+----------------+------------------+------------+-----------------+-------------------+----------------------+--------------+------------+---------------+------------------+---------------+----------------+-----------------+--------------------------+------------------------+------------+-------+--------------------+--------+---------+---------------+-----------+--------+
|dayHour_cloudcover|dayHour_conditions|dayHour_datetime|dayHour_datetimeEpoch|dayHour_dew|dayHour_feelslike|dayHour_humidity|       dayHour_icon|dayHour_precip|dayHour_precipprob|dayHour_pressure|dayHour_severerisk|dayHour_snow|dayHour_snowdepth|dayHour_solarenergy|dayHour_solarradiation|dayHour_source|dayHour_temp|dayHour_uvindex|dayHour_visibility|dayHour_winddir|dayHour_windgust|dayHour_windspeed|dayHour_preciptype_explode|dayHour_stations

In [47]:
# Se modifica la columna "tzoffset" ajustando el valor en -3 (valor el cual, al restarlo a conicide con la hora Argentina en que se tomaron los datos)
df_explode_DayHour = df_explode_DayHour.withColumn('tzoffset', lit(-3.0))


In [69]:
columnas_ordenadas = ['resolvedAddress', 'latitude', 'longitude', 'day_datetime', 'timezone', 'dayHour_datetime', 'tzoffset', 'dayHour_stations_explode', 
                      'dayHour_cloudcover', 'dayHour_conditions', 'dayHour_datetimeEpoch', 'dayHour_dew', 'dayHour_feelslike', 'dayHour_humidity', 
                      'dayHour_icon', 'dayHour_preciptype_explode', 'dayHour_precip', 'dayHour_precipprob', 'dayHour_pressure', 'dayHour_severerisk', 'dayHour_snow', 'dayHour_snowdepth', 'dayHour_solarenergy',
                      'dayHour_solarradiation', 'dayHour_source', 'dayHour_temp', 'dayHour_uvindex', 'dayHour_visibility', 'dayHour_winddir', 'dayHour_windgust', 'dayHour_windspeed']

nombres_nuevos = ['resolvedAddress', 'latitude', 'longitude', 'day', 'timezone', 'datetime', 'tzoffset', 'stations', 'cloudcover', 'conditions', 'datetimeEpoch', 'dew', 
                  'feelsLike', 'humidity', 'icon', 'precip', 'preipProb', 'pressure', 'severerisk', 'snow', 'snowDepth', 'solarEnergy', 'solarRadiation', 'source', 'temp', 
                  'uvindex', 'visibility', 'windDir', 'windGust', 'windSpeed', 'precipType']

df_organizado = df_explode_DayHour.select(columnas_ordenadas)
df_organizado.show(5)

+---------------+--------+---------+------------+-----------+----------------+--------+------------------------+------------------+------------------+---------------------+-----------+-----------------+----------------+-------------------+--------------------------+--------------+------------------+----------------+------------------+------------+-----------------+-------------------+----------------------+--------------+------------+---------------+------------------+---------------+----------------+-----------------+
|resolvedAddress|latitude|longitude|day_datetime|   timezone|dayHour_datetime|tzoffset|dayHour_stations_explode|dayHour_cloudcover|dayHour_conditions|dayHour_datetimeEpoch|dayHour_dew|dayHour_feelslike|dayHour_humidity|       dayHour_icon|dayHour_preciptype_explode|dayHour_precip|dayHour_precipprob|dayHour_pressure|dayHour_severerisk|dayHour_snow|dayHour_snowdepth|dayHour_solarenergy|dayHour_solarradiation|dayHour_source|dayHour_temp|dayHour_uvindex|dayHour_visibility|da

In [67]:
df_organizado.show(5)

+---------------+--------+---------+------------+-----------+----------------+--------+------------------------+------------------+------------------+---------------------+-----------+-----------------+----------------+-------------------+--------------+------------------+----------------+------------------+------------+-----------------+-------------------+----------------------+--------------+------------+---------------+------------------+---------------+----------------+-----------------+--------------------------+
|resolvedAddress|latitude|longitude|day_datetime|   timezone|dayHour_datetime|tzoffset|dayHour_stations_explode|dayHour_cloudcover|dayHour_conditions|dayHour_datetimeEpoch|dayHour_dew|dayHour_feelslike|dayHour_humidity|       dayHour_icon|dayHour_precip|dayHour_precipprob|dayHour_pressure|dayHour_severerisk|dayHour_snow|dayHour_snowdepth|dayHour_solarenergy|dayHour_solarradiation|dayHour_source|dayHour_temp|dayHour_uvindex|dayHour_visibility|dayHour_winddir|dayHour_windg

In [44]:
df_stations = df.withColumn('station', explode('stations')).drop('stations')


AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "explode(stations)" due to data type mismatch: Parameter 1 requires the ("ARRAY" or "MAP") type, however "stations" has the type "STRUCT<C6242: STRUCT<contribution: DOUBLE, distance: DOUBLE, id: STRING, latitude: DOUBLE, longitude: DOUBLE, name: STRING, quality: BIGINT, useCount: BIGINT>, D2770: STRUCT<contribution: DOUBLE, distance: DOUBLE, id: STRING, latitude: DOUBLE, longitude: DOUBLE, name: STRING, quality: BIGINT, useCount: BIGINT>, LICJ: STRUCT<contribution: DOUBLE, distance: DOUBLE, id: STRING, latitude: DOUBLE, longitude: DOUBLE, name: STRING, quality: BIGINT, useCount: BIGINT>, LICT: STRUCT<contribution: DOUBLE, distance: DOUBLE, id: STRING, latitude: DOUBLE, longitude: DOUBLE, name: STRING, quality: BIGINT, useCount: BIGINT>>".;
'Project [address#4907, alerts#4908, currentConditions#4909, days#4910, description#4911, latitude#4912, longitude#4913, queryCost#4914L, resolvedAddress#4915, stations#4916, timezone#4917, tzoffset#4918, explode(stations#4916) AS station#5408]
+- Relation [address#4907,alerts#4908,currentConditions#4909,days#4910,description#4911,latitude#4912,longitude#4913,queryCost#4914L,resolvedAddress#4915,stations#4916,timezone#4917,tzoffset#4918] json


In [70]:
df_renombrado_columnas = df_organizado.select(
    [col(nombre_antiguo).alias(nombre_nuevo) for nombre_antiguo, nombre_nuevo in zip(df_organizado.columns, nombres_nuevos)]
)

In [1]:
df_renombrado_columnas.show(5)

NameError: name 'df_renombrado_columnas' is not defined

In [7]:
from pyspark.sql.functions import col, when, lit, array, struct, expr
from pyspark.sql.types import StructType, ArrayType, StringType, DoubleType, LongType, FloatType

valores_reemplazo = {
    StringType(): "Sin Dato",
    DoubleType(): 0.0,
    LongType(): 0,
    FloatType(): 0.0
}

def reemplazar_nulos(df, schema, valores_reemplazo):
    for field in schema.fields:
        column_name = field.name
        data_type = field.dataType
        
        if isinstance(data_type, StructType):
            struct_cols = [
                when(col(f'{column_name}.{sub_field.name}').isNull(),
                     lit(valores_reemplazo.get(sub_field.dataType, None)))
                .otherwise(col(f'{column_name}.{sub_field.name}'))
                .alias(sub_field.name)
                for sub_field in data_type.fields
            ]
            df = df.withColumn(column_name, struct(*struct_cols))
            return df
        
        
df_reemplazado = reemplazar_nulos(df, df.schema, valores_reemplazo)


In [67]:
df_reemplazado.printSchema()

root
 |-- address: string (nullable = true)
 |-- alerts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentConditions: struct (nullable = false)
 |    |-- cloudcover: double (nullable = true)
 |    |-- conditions: string (nullable = true)
 |    |-- datetime: string (nullable = true)
 |    |-- datetimeEpoch: long (nullable = true)
 |    |-- dew: double (nullable = true)
 |    |-- feelslike: double (nullable = true)
 |    |-- humidity: double (nullable = true)
 |    |-- icon: string (nullable = true)
 |    |-- moonphase: double (nullable = true)
 |    |-- precip: double (nullable = true)
 |    |-- precipprob: double (nullable = true)
 |    |-- preciptype: string (nullable = true)
 |    |-- pressure: double (nullable = true)
 |    |-- snow: double (nullable = true)
 |    |-- snowdepth: double (nullable = true)
 |    |-- solarenergy: double (nullable = true)
 |    |-- solarradiation: double (nullable = true)
 |    |-- source: string (nullable = true)
 |   

In [90]:
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType, ArrayType, StringType, DoubleType, LongType, FloatType
from pyspark.sql.types import StringType, IntegerType, LongType, DoubleType



def explotar_columnas_array(df, diccionario_resultado, sufijo_explode=None, columnas_target=None):
    
    for columna in df.schema:
        if isinstance(columna.dataType, ArrayType):
            columna_nombre = columna.name
            if columnas_target is None or columna_nombre in columnas_target:
                diccionario_resultado[columna_nombre] = df.select(explode(col(columna_nombre)).alias(f'{columna_nombre}'))
            
def desanidar_columnas_struct(df, diccionario_resultado, sufijo_desanidado=None, columnas_target=None):
    
    for columna in df.schema:
        if isinstance(columna.dataType, StructType):
            columna_nombre = columna.name
            if columnas_target is None or columna_nombre in columnas_target:

                campos_struct = [
                    col(f'{columna_nombre}.{subfield.name}').alias(f'{columna_nombre}_{subfield.name}')
                    for subfield in columna.dataType.fields
                ]
                diccionario_resultado[columna_nombre] = df.select(*campos_struct)
                    
def aplicar_dataframe(metodo:str, diccionario_df, diccionario_dfResultado, sufijo=None, columnas_target=None):
    
    if metodo == 'explotar':
        if isinstance(diccionario_df, DataFrame):
            explotar_columnas_array(diccionario_df, diccionario_dfResultado, sufijo, columnas_target)
        
        elif isinstance(diccionario_df, dict):
            for key, df in diccionario_df.items():
                explotar_columnas_array(df, diccionario_dfResultado, sufijo, columnas_target)
    
    elif metodo == 'desanidar':
        if isinstance(diccionario_df, DataFrame):
            desanidar_columnas_struct(diccionario_df, diccionario_dfResultado, sufijo, columnas_target)
        
        elif isinstance(diccionario_df, dict):
            for key, df in diccionario_df.items():
                desanidar_columnas_struct(df, diccionario_dfResultado, sufijo, columnas_target)
        
        


# REEMPLAZO VALORES NULOS

valores_reemplazo = {
    StringType : 'Sin Dato',
    IntegerType : 0,
    LongType : 0,
    DoubleType : 0.0
    }


def reemplazar_nulos(diccionario_df):
    
    if isinstance(diccionario_df, dict):
        
        for key, df in diccionario_df.items():
            # Reemplazar según tipo de dato
            for columna in df.schema.fields:
                tipo = type(columna.dataType)
                if tipo in valores_reemplazo:
                    df = df.fillna({columna.name : valores_reemplazo[tipo]})
            
            diccionario_df[key] = df

In [91]:

dfExplodedArray_Alerts_Days_1 = {}
dfDesanidadoStruct_Days_2 = {}
dfExplodeArray_DaysHours_DayStation_3 = {} 
dfDesanidadoStruct_DaysHours_4 = {} 

dfDesanidadoStruct_Current_Station_1 = {}

dfDesanidadoStruct_Stations_2 = {}


columnas_array_1 = ['alerts', 'days']
columnas_struct_1 = ['currentConditions', 'stations']

columnas_struct_2 = ['days']
columnas_array_2 = ['days_hours', 'days_stations']

columnas_struct_3 = ['days_hours']

columnas_struct_4 = {'stations_C6242', 'stations_D2770', 'stations_LICJ', 'stations_LICT'}


aplicar_dataframe('explotar', df, dfExplodedArray_Alerts_Days_1, 'explode1', columnas_array_1)
aplicar_dataframe('desanidar', df, dfDesanidadoStruct_Current_Station_1, 'desanidar1', columnas_struct_1)
aplicar_dataframe('desanidar', dfExplodedArray_Alerts_Days_1, dfDesanidadoStruct_Days_2, 'desanidar2', columnas_struct_2)
aplicar_dataframe('explotar', dfDesanidadoStruct_Days_2, dfExplodeArray_DaysHours_DayStation_3, 'desanidar2', columnas_array_2)
aplicar_dataframe('desanidar', dfExplodeArray_DaysHours_DayStation_3, dfDesanidadoStruct_DaysHours_4, 'desanidar2', columnas_struct_3)
aplicar_dataframe('desanidar', dfDesanidadoStruct_Current_Station_1, dfDesanidadoStruct_Stations_2, 'desanidar2', columnas_struct_4)


# REEMPLAZAR VALORES NULOS DE CADA DATAFRAME FINAL EN LOS DICCIONARIOS


In [76]:
for key, df in dfDesanidadoStruct_DaysHours_4.items():
        print(f'- DATAFRAME {key}:')
        df.show()


- DATAFRAME days_hours:
+---------------------+---------------------+-------------------+------------------------+--------------+--------------------+-------------------+-------------------+-----------------+---------------------+---------------------+-------------------+---------------------+---------------+--------------------+----------------------+-------------------------+-----------------+-------------------+---------------+------------------+---------------------+------------------+-------------------+--------------------+
|days_hours_cloudcover|days_hours_conditions|days_hours_datetime|days_hours_datetimeEpoch|days_hours_dew|days_hours_feelslike|days_hours_humidity|    days_hours_icon|days_hours_precip|days_hours_precipprob|days_hours_preciptype|days_hours_pressure|days_hours_severerisk|days_hours_snow|days_hours_snowdepth|days_hours_solarenergy|days_hours_solarradiation|days_hours_source|days_hours_stations|days_hours_temp|days_hours_uvindex|days_hours_visibility|days_hours_wi

In [None]:
from pyspark.sql.functions import col, sum, when

for key, df in dfDesanidadoStruct_Current_Station_1.items():
        print(f'- DATAFRAME {key}:')
        df.filter(
                [(col(c).isNull()).alias(c) for c in df.columns]
        ).show

### REEMPLAZO VALORES NULOS

In [92]:
from pyspark.sql.types import StringType, IntegerType, LongType, DoubleType

valores_reemplazo = {
    StringType : 'Sin Dato',
    IntegerType : 0,
    LongType : 0,
    DoubleType : 0.0
    }


def reemplazar_nulos(diccionario_df):
    
    if isinstance(diccionario_df, dict):
        
        for key, df in diccionario_df.items():
            # Reemplazar según tipo de dato
            for columna in df.schema.fields:
                tipo = type(columna.dataType)
                if tipo in valores_reemplazo:
                    df = df.fillna({columna.name : valores_reemplazo[tipo]})
            
            diccionario_df[key] = df

In [93]:
reemplazar_nulos(dfExplodedArray_Alerts_Days_1)
reemplazar_nulos(dfDesanidadoStruct_Days_2)
reemplazar_nulos(dfExplodeArray_DaysHours_DayStation_3)
reemplazar_nulos(dfDesanidadoStruct_DaysHours_4)
reemplazar_nulos(dfDesanidadoStruct_Current_Station_1)
reemplazar_nulos(dfDesanidadoStruct_Stations_2)

In [96]:
for key, df in dfDesanidadoStruct_DaysHours_4.items():
        print(f'- DATAFRAME {key}:')
        df.show()

- DATAFRAME days_hours:
+---------------------+---------------------+-------------------+------------------------+--------------+--------------------+-------------------+-------------------+-----------------+---------------------+---------------------+-------------------+---------------------+---------------+--------------------+----------------------+-------------------------+-----------------+-------------------+---------------+------------------+---------------------+------------------+-------------------+--------------------+
|days_hours_cloudcover|days_hours_conditions|days_hours_datetime|days_hours_datetimeEpoch|days_hours_dew|days_hours_feelslike|days_hours_humidity|    days_hours_icon|days_hours_precip|days_hours_precipprob|days_hours_preciptype|days_hours_pressure|days_hours_severerisk|days_hours_snow|days_hours_snowdepth|days_hours_solarenergy|days_hours_solarradiation|days_hours_source|days_hours_stations|days_hours_temp|days_hours_uvindex|days_hours_visibility|days_hours_wi