In [51]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, explode
import os
import re

In [2]:
# Se crea la sesión de Spark
spark = SparkSession.builder \
        .appName('Clima Procesamiento') \
        .getOrCreate()

In [29]:
def obtener_ultimo_archivo(directorio, extension = '*.json'):
    
    '''
    Obtiene el archivo más reciente de un directorio con la extensión especificada.

    Args:
        directorio (str): Ruta del directorio donde buscar los archivos.
        extension (str): Extensión de los archivos a buscar (por defecto '*.json').

    Returns:
        str: Ruta completa del archivo más reciente con la extensión especificada.
        '''
    
    try:
        # Obtener la lista de rutas completas de los archivos que coinciden con la extensión
        archivos = [os.path.join(directorio, archivo) for archivo in os.listdir(directorio) if archivo.endswith(extension)]
        
        # Verificar si no hay archivos en la lista
        if not archivos:
            raise FileNotFoundError('No se encontraron archivos en el directorio especificado.')
        
        # Obtener el archivo más reciente basado en la fecha de modificación
        ultimo_archivo = max(archivos, key = os.path.getmtime)
        
        # Retorna ruta completa de último archivo
        return ultimo_archivo
    
    except FileNotFoundError as e:
        print(f'Error: {e}')
        raise
    except Exception as e:
        print(f'Error inesperado: {e}')
        raise

data_dir = 'Datos'

ultimo_archivo = obtener_ultimo_archivo(data_dir, extension='.json')

In [72]:
def procesamiento_datos(ultimo_archivo):
    
    '''
    Esta función procesa un archivo JSON crudo obtenido de datos climáticos y realiza transformaciones para 
    generar un DataFrame de Spark desanidado, explotando los datos anidados de días y horas, y agregando 
    información de la ciudad y zona horaria.

    Argumentos:
        ultimo_archivo (str): Ruta del archivo JSON que contiene los datos climáticos crudos.

    Retorna:
        DataFrame: Un DataFrame de Spark procesado con datos desanidados, explotados y enriquecidos con 
        información de la ciudad.
    '''
    
    try:
        # Leer el archivo JSON en dataframe de Spark
        df = spark.read.json(ultimo_archivo)
        
        # Explode de la columna "days", convirtiendo cada día en una fila separada
        df_days_exploded = df.withColumn('day', explode('days')).drop('days')

        # Seleccionar y desanidar columnas del nivel "day"
        df_desanidado_day = df_days_exploded.select(
            col("day.cloudcover").alias("day_cloudcover"),
            col("day.conditions").alias("day_conditions"),
            col("day.datetime").alias("day_datetime"),
            col("day.datetimeEpoch").alias("day_datetimeEpoch"),
            col("day.description").alias("day_description"),
            col("day.dew").alias("day_dew"),
            col("day.feelslike").alias("day_feelslike"),
            col("day.feelslikemax").alias("day_feelslikemax"),
            col("day.feelslikemin").alias("day_feelslikemin"),
            col("day.hours").alias("day_hours"), # ARRAY
            col("day.humidity").alias("day_humidity"),
            col("day.icon").alias("day_icon"),
            col("day.moonphase").alias("day_moonphase"),
            col("day.precip").alias("day_precip"),
            col("day.precipcover").alias("day_precipcover"),
            col("day.precipprob").alias("day_precipprob"),
            col("day.preciptype").alias("day_preciptype"), # ARRAY
            col("day.pressure").alias("day_pressure"),
            col("day.severerisk").alias("day_severerisk"),
            col("day.snow").alias("day_snow"),
            col("day.snowdepth").alias("day_snowdepth"),
            col("day.solarenergy").alias("day_solarenergy"),
            col("day.solarradiation").alias("day_solarradiation"),
            col("day.source").alias("day_source"),
            col("day.stations").alias("day_stations"), # ARRAY
            col("day.sunrise").alias("day_sunrise"),
            col("day.sunriseEpoch").alias("day_sunriseEpoch"),
            col("day.sunset").alias("day_sunset"),
            col("day.sunsetEpoch").alias("day_sunsetEpoch"),
            col("day.temp").alias("day_temp"),
            col("day.tempmax").alias("day_tempmax"),
            col("day.tempmin").alias("day_tempmin"),
            col("day.uvindex").alias("day_uvindex"),
            col("day.visibility").alias("day_visibility"),
            col("day.winddir").alias("day_winddir"),
            col("day.windgust").alias("day_windgust"),
            col("day.windspeed").alias("day_windspeed")
        )
        
        # Explode de la columna "day_hours", creando filas separadas para cada hora
        df_Day = df_desanidado_day.withColumn('day_hour', explode('day_hours')).drop('day_hours')

        # Desanidar columnas del nivel "hour" (hora)
        df_DayHour = df_Day.select(
            col('day_hour.cloudcover').alias('dayHour_cloudcover'),
            col('day_hour.conditions').alias('dayHour_conditions'),
            col('day_hour.datetime').alias('dayHour_datetime'),
            col('day_hour.datetimeEpoch').alias('dayHour_datetimeEpoch'),
            col('day_hour.dew').alias('dayHour_dew'),
            col('day_hour.feelslike').alias('dayHour_feelslike'),
            col('day_hour.humidity').alias('dayHour_humidity'),
            col('day_hour.icon').alias('dayHour_icon'),
            col('day_hour.precip').alias('dayHour_precip'),
            col('day_hour.precipprob').alias('dayHour_precipprob'),
            col('day_hour.preciptype').alias('dayHour_preciptype'),
            col('day_hour.pressure').alias('dayHour_pressure'),
            col('day_hour.severerisk').alias('dayHour_severerisk'),
            col('day_hour.snow').alias('dayHour_snow'),
            col('day_hour.snowdepth').alias('dayHour_snowdepth'),
            col('day_hour.solarenergy').alias('dayHour_solarenergy'),
            col('day_hour.solarradiation').alias('dayHour_solarradiation'),
            col('day_hour.source').alias('dayHour_source'),
            col('day_hour.stations').alias('dayHour_stations'),
            col('day_hour.temp').alias('dayHour_temp'),
            col('day_hour.uvindex').alias('dayHour_uvindex'),
            col('day_hour.visibility').alias('dayHour_visibility'),
            col('day_hour.winddir').alias('dayHour_winddir'),
            col('day_hour.windgust').alias('dayHour_windgust'),
            col('day_hour.windspeed').alias('dayHour_windspeed')
        )
        
        # Reemplazo de valores NULL en 'dayHour_preciptype' con el valor 'Sin Dato'
        df_DayHour_filled = (
            df_DayHour
            .withColumn(
                'dayHour_preciptype', when(
                    col('dayHour_preciptype').isNull(),
                    lit(['Sin Dato'])
                ).otherwise(col('dayHour_preciptype'))
        ))

        # Explode de las columnas 'dayHour_preciptype' y 'dayHour_stations'
        df_explode_DayHour = (
            df_DayHour_filled
            .withColumn('dayHour_preciptype_explode', explode('dayHour_preciptype'))
            .withColumn('dayHour_stations_explode', explode('dayHour_stations'))
            .drop('dayHour_preciptype', 'dayHour_stations')
        )
        
        # Extraer la fecha del nombre del archivo
        fecha_extraida = re.search(r'\d{4}-\d{2}-\d{2}', ultimo_archivo).group(0)

        # Agregar la columna 'day_datetime' al DataFrame basado en la fecha extraída
        df_explode_DayHour = df_explode_DayHour.withColumn('day_datetime', lit(fecha_extraida))
        
        # Obtener información constante de la ciudad del DataFrame original
        datos_ciudad = df.select('address', 'description', 'latitude', 'longitude', 'resolvedAddress', 'timezone', 'tzoffset').first()

        # Añadir columnas constantes de la ciudad al DataFrame final
        df_explode_DayHour = (
            df_explode_DayHour
            .withColumn('address', lit(datos_ciudad['address']))
            .withColumn('description', lit(datos_ciudad['description']))
            .withColumn('latitude', lit(datos_ciudad['latitude']))
            .withColumn('longitude', lit(datos_ciudad['longitude']))
            .withColumn('resolvedAddress', lit(datos_ciudad['resolvedAddress']))
            .withColumn('timezone', lit(datos_ciudad['timezone']))
            .withColumn('tzoffset', lit(datos_ciudad['tzoffset']))
        )
        
        # Agregar columna ajustada para la zona horaria Argentina
        df_explode_DayHour = df_explode_DayHour.withColumn('tzoffset_Arg', lit(-3.0))
        
        # Se crea una lista con el nombre del orden deseado de las columnas
        columnas_ordenadas = ['resolvedAddress', 'latitude', 'longitude', 'day_datetime', 'timezone', 'dayHour_datetime', 'tzoffset', 'dayHour_stations_explode', 
                      'dayHour_cloudcover', 'dayHour_conditions', 'dayHour_datetimeEpoch', 'dayHour_dew', 'dayHour_feelslike', 'dayHour_humidity', 
                      'dayHour_icon', 'dayHour_preciptype_explode', 'dayHour_precip', 'dayHour_precipprob', 'dayHour_pressure', 'dayHour_severerisk', 'dayHour_snow', 'dayHour_snowdepth', 'dayHour_solarenergy',
                      'dayHour_solarradiation', 'dayHour_source', 'dayHour_temp', 'dayHour_uvindex', 'dayHour_visibility', 'dayHour_winddir', 'dayHour_windgust', 'dayHour_windspeed']

        # Se crea una lista con el nuevo nombre deseado de las columnas
        nombres_nuevos = ['resolvedAddress', 'latitude', 'longitude', 'day', 'timezone', 'datetime', 'tzoffset', 'stations', 'cloudcover', 'conditions', 'datetimeEpoch', 'dew', 
                  'feelsLike', 'humidity', 'icon', 'precip', 'preipProb', 'pressure', 'severerisk', 'snow', 'snowDepth', 'solarEnergy', 'solarRadiation', 'source', 'temp', 
                  'uvindex', 'visibility', 'windDir', 'windGust', 'windSpeed', 'precipType']

        # Se realizar un "select" para ordenar las columnas como se especificó en la lista
        df_organizado = df_explode_DayHour.select(columnas_ordenadas)
        
        # Se realiza un for para modificar el antiguo nombre de las columnas por el nuevo
        df_renombrado_columnas = df_organizado.select(
        [col(nombre_antiguo).alias(nombre_nuevo) for nombre_antiguo, nombre_nuevo in zip(df_organizado.columns, nombres_nuevos)])
        
        # Retornar el DataFrame procesado
        return df_renombrado_columnas
    
    except FileNotFoundError:
        print(f'Error: el archivo {ultimo_archivo} no existe.')
        raise
    except ValueError:
        print('Error: los datos no están en el formato esperado.')
        raise
    except pyspark.sql.utils.AnalysisException as e:
        print(f'Error en operaciones de Spark: {e}')
        raise
    except Exception as e:
        print(f'Error: {e}')
        raise

In [73]:
df_procesado = procesamiento_datos(ultimo_archivo)

In [74]:
df_procesado.show(5)

+---------------+--------+---------+----------+-----------+--------+--------+--------+----------+----------------+-------------+----+---------+--------+-------------------+--------+---------+--------+----------+----+---------+-----------+--------------+------+----+-------+----------+-------+--------+---------+----------+
|resolvedAddress|latitude|longitude|       day|   timezone|datetime|tzoffset|stations|cloudcover|      conditions|datetimeEpoch| dew|feelsLike|humidity|               icon|  precip|preipProb|pressure|severerisk|snow|snowDepth|solarEnergy|solarRadiation|source|temp|uvindex|visibility|windDir|windGust|windSpeed|precipType|
+---------------+--------+---------+----------+-----------+--------+--------+--------+----------+----------------+-------------+----+---------+--------+-------------------+--------+---------+--------+----------+----+---------+-----------+--------------+------+----+-------+----------+-------+--------+---------+----------+
|Sicilia, Italia| 38.1221|  13.

### PROCESAMIENTO DE DATOS 

In [16]:
from pyspark.sql.functions import col
from pyspark.sql.functions import explode

# Desanidamiento de columna "currentConditions" y asignación de nuevos alias
df_desanidado_currentConditions = df.select(
    col('currentConditions.cloudcover').alias('cloudcover'),
    col('currentConditions.conditions').alias('conditions'),
    col('currentConditions.datetime').alias('datetime'),
    col('currentConditions.datetimeEpoch').alias('datetimeEpoch'),
    col('currentConditions.dew').alias('dew'),
    col('currentConditions.feelslike').alias('feelslike'),
    col('currentConditions.humidity').alias('humidity'),
    col('currentConditions.icon').alias('icon'),
    col('currentConditions.moonphase').alias('moonphase'),
    col('currentConditions.precip').alias('precip'),
    col('currentConditions.precipprob').alias('precipprob'),
    col('currentConditions.preciptype').alias('preciptype'),
    col('currentConditions.pressure').alias('pressure'),
    col('currentConditions.snow').alias('snow'),
    col('currentConditions.snowdepth').alias('snowdepth'),
    col('currentConditions.solarenergy').alias('solarenergy'),
    col('currentConditions.solarradiation').alias('solarradiation'),
    col('currentConditions.source').alias('source'),
    col('currentConditions.stations').alias('stations'),
    col('currentConditions.sunrise').alias('sunrise'),
    col('currentConditions.sunriseEpoch').alias('sunriseEpoch'),
    col('currentConditions.sunset').alias('sunset'),
    col('currentConditions.sunsetEpoch').alias('sunsetEpoch'),
    col('currentConditions.temp').alias('temp'),
    col('currentConditions.uvindex').alias('uvindex'),
    col('currentConditions.visibility').alias('visibility'),
    col('currentConditions.winddir').alias('winddir'),
    col('currentConditions.windgust').alias('windgust'),
    col('currentConditions.windspeed').alias('windspeed') 
)

# Explode de columna "stations" (la cual se encuentra dentro de "currenConditions")
df_final_currentConditions = df_desanidado_currentConditions.withColumn('station', explode('stations')).drop('stations')
df_final_currentConditions.show()

+----------+----------------+--------+-------------+----+---------+--------+-------------------+---------+------+----------+----------+--------+----+---------+-----------+--------------+------+--------+------------+--------+-----------+----+-------+----------+-------+--------+---------+-------+
|cloudcover|      conditions|datetime|datetimeEpoch| dew|feelslike|humidity|               icon|moonphase|precip|precipprob|preciptype|pressure|snow|snowdepth|solarenergy|solarradiation|source| sunrise|sunriseEpoch|  sunset|sunsetEpoch|temp|uvindex|visibility|winddir|windgust|windspeed|station|
+----------+----------------+--------+-------------+----+---------+--------+-------------------+---------+------+----------+----------+--------+----+---------+-----------+--------------+------+--------+------------+--------+-----------+----+-------+----------+-------+--------+---------+-------+
|      31.1|Partially cloudy|19:20:00|   1737397200|49.2|     52.0|    90.2|partly-cloudy-night|     0.71|   0.0

In [15]:
# Explode de columna "days"
df_days_exploded = df.withColumn('day', explode('days')).drop('days')
df_days_exploded.show()

+-------+--------------------+--------------------+--------------------+--------+---------+---------+---------------+--------------------+-----------+--------+--------------------+
|address|              alerts|   currentConditions|         description|latitude|longitude|queryCost|resolvedAddress|            stations|   timezone|tzoffset|                 day|
+-------+--------------------+--------------------+--------------------+--------+---------+---------+---------------+--------------------+-----------+--------+--------------------+
|Sicilia|[{No Special Awar...|{31.1, Partially ...|Similar temperatu...| 38.1221|  13.3611|        1|Sicilia, Italia|{{0.0, 54398.0, C...|Europe/Rome|     1.0|{31.7, Partially ...|
+-------+--------------------+--------------------+--------------------+--------+---------+---------+---------------+--------------------+-----------+--------+--------------------+



In [21]:
# Desanidamiento de columna "days" (original). 
df_desanidado_day = df_days_exploded.select(
    col("day.cloudcover").alias("day_cloudcover"),
    col("day.conditions").alias("day_conditions"),
    col("day.datetime").alias("day_datetime"),
    col("day.datetimeEpoch").alias("day_datetimeEpoch"),
    col("day.description").alias("day_description"),
    col("day.dew").alias("day_dew"),
    col("day.feelslike").alias("day_feelslike"),
    col("day.feelslikemax").alias("day_feelslikemax"),
    col("day.feelslikemin").alias("day_feelslikemin"),
    col("day.hours").alias("day_hours"), # ARRAY
    col("day.humidity").alias("day_humidity"),
    col("day.icon").alias("day_icon"),
    col("day.moonphase").alias("day_moonphase"),
    col("day.precip").alias("day_precip"),
    col("day.precipcover").alias("day_precipcover"),
    col("day.precipprob").alias("day_precipprob"),
    col("day.preciptype").alias("day_preciptype"), # ARRAY
    col("day.pressure").alias("day_pressure"),
    col("day.severerisk").alias("day_severerisk"),
    col("day.snow").alias("day_snow"),
    col("day.snowdepth").alias("day_snowdepth"),
    col("day.solarenergy").alias("day_solarenergy"),
    col("day.solarradiation").alias("day_solarradiation"),
    col("day.source").alias("day_source"),
    col("day.stations").alias("day_stations"), # ARRAY
    col("day.sunrise").alias("day_sunrise"),
    col("day.sunriseEpoch").alias("day_sunriseEpoch"),
    col("day.sunset").alias("day_sunset"),
    col("day.sunsetEpoch").alias("day_sunsetEpoch"),
    col("day.temp").alias("day_temp"),
    col("day.tempmax").alias("day_tempmax"),
    col("day.tempmin").alias("day_tempmin"),
    col("day.uvindex").alias("day_uvindex"),
    col("day.visibility").alias("day_visibility"),
    col("day.winddir").alias("day_winddir"),
    col("day.windgust").alias("day_windgust"),
    col("day.windspeed").alias("day_windspeed")
)

df_Day = df_desanidado_day.withColumn('day_hour', explode('day_hours')).drop('day_hours')
df_Day.show(5)
 

+--------------+----------------+------------+-----------------+--------------------+-------+-------------+----------------+----------------+------------+-----------------+-------------+----------+---------------+--------------+--------------+------------+--------------+--------+-------------+---------------+------------------+----------+-------------------+-----------+----------------+----------+---------------+--------+-----------+-----------+-----------+--------------+-----------+------------+-------------+--------------------+
|day_cloudcover|  day_conditions|day_datetime|day_datetimeEpoch|     day_description|day_dew|day_feelslike|day_feelslikemax|day_feelslikemin|day_humidity|         day_icon|day_moonphase|day_precip|day_precipcover|day_precipprob|day_preciptype|day_pressure|day_severerisk|day_snow|day_snowdepth|day_solarenergy|day_solarradiation|day_source|       day_stations|day_sunrise|day_sunriseEpoch|day_sunset|day_sunsetEpoch|day_temp|day_tempmax|day_tempmin|day_uvindex|da

In [24]:
df_DayHour = df_Day.select(
    col('day_hour.cloudcover').alias('dayHour_cloudcover'),
    col('day_hour.conditions').alias('dayHour_conditions'),
    col('day_hour.datetime').alias('dayHour_datetime'),
    col('day_hour.datetimeEpoch').alias('dayHour_datetimeEpoch'),
    col('day_hour.dew').alias('dayHour_dew'),
    col('day_hour.feelslike').alias('dayHour_feelslike'),
    col('day_hour.humidity').alias('dayHour_humidity'),
    col('day_hour.icon').alias('dayHour_icon'),
    col('day_hour.precip').alias('dayHour_precip'),
    col('day_hour.precipprob').alias('dayHour_precipprob'),
    col('day_hour.preciptype').alias('dayHour_preciptype'),
    col('day_hour.pressure').alias('dayHour_pressure'),
    col('day_hour.severerisk').alias('dayHour_severerisk'),
    col('day_hour.snow').alias('dayHour_snow'),
    col('day_hour.snowdepth').alias('dayHour_snowdepth'),
    col('day_hour.solarenergy').alias('dayHour_solarenergy'),
    col('day_hour.solarradiation').alias('dayHour_solarradiation'),
    col('day_hour.source').alias('dayHour_source'),
    col('day_hour.stations').alias('dayHour_stations'),
    col('day_hour.temp').alias('dayHour_temp'),
    col('day_hour.uvindex').alias('dayHour_uvindex'),
    col('day_hour.visibility').alias('dayHour_visibility'),
    col('day_hour.winddir').alias('dayHour_winddir'),
    col('day_hour.windgust').alias('dayHour_windgust'),
    col('day_hour.windspeed').alias('dayHour_windspeed')
)
df_DayHour.show(5)

+------------------+------------------+----------------+---------------------+-----------+-----------------+----------------+-------------------+--------------+------------------+------------------+----------------+------------------+------------+-----------------+-------------------+----------------------+--------------+-------------------+------------+---------------+------------------+---------------+----------------+-----------------+
|dayHour_cloudcover|dayHour_conditions|dayHour_datetime|dayHour_datetimeEpoch|dayHour_dew|dayHour_feelslike|dayHour_humidity|       dayHour_icon|dayHour_precip|dayHour_precipprob|dayHour_preciptype|dayHour_pressure|dayHour_severerisk|dayHour_snow|dayHour_snowdepth|dayHour_solarenergy|dayHour_solarradiation|dayHour_source|   dayHour_stations|dayHour_temp|dayHour_uvindex|dayHour_visibility|dayHour_winddir|dayHour_windgust|dayHour_windspeed|
+------------------+------------------+----------------+---------------------+-----------+-----------------+------

In [25]:
from pyspark.sql.functions import when, lit

# Reemplazo de valores NULL en columna 'dayHour_preciptype' para posterior 'explode'
df_DayHour_filled = (
    df_DayHour
    .withColumn(
        'dayHour_preciptype', when(
            col('dayHour_preciptype').isNull(),
            lit(['Sin Dato'])
        ).otherwise(col('dayHour_preciptype'))
))

# Eexplode para el dataframe "df_DayHour_filled"
df_explode_DayHour = (
    df_DayHour_filled
    .withColumn('dayHour_preciptype_explode', explode('dayHour_preciptype'))
    .withColumn('dayHour_stations_explode', explode('dayHour_stations'))
    .drop('dayHour_preciptype', 'dayHour_stations')
)

In [34]:
# Extrae la fecha del archivo JSON para agregar al dataframe de "df_explode_DayHour"
from pyspark.sql.functions import lit
import re

ruta_archivo = ultimo_archivo

fecha_extraida = re.search(r'\d{4}-\d{2}-\d{2}', ruta_archivo).group(0)

df_explode_DayHour = df_explode_DayHour.withColumn('day_datetime', lit(fecha_extraida))

# APLICAR FUNCIÓN "OBTENER_ULTIMO_ARCHIVO" PARA EXTRAER LA FECHA DEL JSON

In [40]:
df_explode_DayHour.orderBy(col('dayHour_datetime').desc()).limit(5).show()

+------------------+------------------+----------------+---------------------+-----------+-----------------+----------------+-------------------+--------------+------------------+----------------+------------------+------------+-----------------+-------------------+----------------------+--------------+------------+---------------+------------------+---------------+----------------+-----------------+--------------------------+------------------------+------------+
|dayHour_cloudcover|dayHour_conditions|dayHour_datetime|dayHour_datetimeEpoch|dayHour_dew|dayHour_feelslike|dayHour_humidity|       dayHour_icon|dayHour_precip|dayHour_precipprob|dayHour_pressure|dayHour_severerisk|dayHour_snow|dayHour_snowdepth|dayHour_solarenergy|dayHour_solarradiation|dayHour_source|dayHour_temp|dayHour_uvindex|dayHour_visibility|dayHour_winddir|dayHour_windgust|dayHour_windspeed|dayHour_preciptype_explode|dayHour_stations_explode|day_datetime|
+------------------+------------------+----------------+------

In [36]:
df_desanidado_currentConditions.show(5)

+----------+----------------+--------+-------------+----+---------+--------+-------------------+---------+------+----------+----------+--------+----+---------+-----------+--------------+------+--------------------+--------+------------+--------+-----------+----+-------+----------+-------+--------+---------+
|cloudcover|      conditions|datetime|datetimeEpoch| dew|feelslike|humidity|               icon|moonphase|precip|precipprob|preciptype|pressure|snow|snowdepth|solarenergy|solarradiation|source|            stations| sunrise|sunriseEpoch|  sunset|sunsetEpoch|temp|uvindex|visibility|winddir|windgust|windspeed|
+----------+----------------+--------+-------------+----+---------+--------+-------------------+---------+------+----------+----------+--------+----+---------+-----------+--------------+------+--------------------+--------+------------+--------+-----------+----+-------+----------+-------+--------+---------+
|      31.1|Partially cloudy|19:20:00|   1737397200|49.2|     52.0|    90

In [46]:
# Se agregan los datos de la ciudad (df) al df de los datos por hora (df_explode_DayHour)
datos_ciudad = df.select('address', 'description', 'latitude', 'longitude', 'resolvedAddress', 'timezone', 'tzoffset').first()

df_explode_DayHour = (
    df_explode_DayHour
    .withColumn('address', lit(datos_ciudad['address']))
    .withColumn('description', lit(datos_ciudad['description']))
    .withColumn('latitude', lit(datos_ciudad['latitude']))
    .withColumn('longitude', lit(datos_ciudad['longitude']))
    .withColumn('resolvedAddress', lit(datos_ciudad['resolvedAddress']))
    .withColumn('timezone', lit(datos_ciudad['timezone']))
    .withColumn('tzoffset', lit(datos_ciudad['tzoffset']))
)

df_explode_DayHour.show()

+------------------+------------------+----------------+---------------------+-----------+-----------------+----------------+-------------------+--------------+------------------+----------------+------------------+------------+-----------------+-------------------+----------------------+--------------+------------+---------------+------------------+---------------+----------------+-----------------+--------------------------+------------------------+------------+-------+--------------------+--------+---------+---------------+-----------+--------+
|dayHour_cloudcover|dayHour_conditions|dayHour_datetime|dayHour_datetimeEpoch|dayHour_dew|dayHour_feelslike|dayHour_humidity|       dayHour_icon|dayHour_precip|dayHour_precipprob|dayHour_pressure|dayHour_severerisk|dayHour_snow|dayHour_snowdepth|dayHour_solarenergy|dayHour_solarradiation|dayHour_source|dayHour_temp|dayHour_uvindex|dayHour_visibility|dayHour_winddir|dayHour_windgust|dayHour_windspeed|dayHour_preciptype_explode|dayHour_stations

In [47]:
# Se modifica la columna "tzoffset" ajustando el valor en -3 (valor el cual, al restarlo a conicide con la hora Argentina en que se tomaron los datos)
df_explode_DayHour = df_explode_DayHour.withColumn('tzoffset', lit(-3.0))


In [69]:
columnas_ordenadas = ['resolvedAddress', 'latitude', 'longitude', 'day_datetime', 'timezone', 'dayHour_datetime', 'tzoffset', 'dayHour_stations_explode', 
                      'dayHour_cloudcover', 'dayHour_conditions', 'dayHour_datetimeEpoch', 'dayHour_dew', 'dayHour_feelslike', 'dayHour_humidity', 
                      'dayHour_icon', 'dayHour_preciptype_explode', 'dayHour_precip', 'dayHour_precipprob', 'dayHour_pressure', 'dayHour_severerisk', 'dayHour_snow', 'dayHour_snowdepth', 'dayHour_solarenergy',
                      'dayHour_solarradiation', 'dayHour_source', 'dayHour_temp', 'dayHour_uvindex', 'dayHour_visibility', 'dayHour_winddir', 'dayHour_windgust', 'dayHour_windspeed']

nombres_nuevos = ['resolvedAddress', 'latitude', 'longitude', 'day', 'timezone', 'datetime', 'tzoffset', 'stations', 'cloudcover', 'conditions', 'datetimeEpoch', 'dew', 
                  'feelsLike', 'humidity', 'icon', 'precip', 'preipProb', 'pressure', 'severerisk', 'snow', 'snowDepth', 'solarEnergy', 'solarRadiation', 'source', 'temp', 
                  'uvindex', 'visibility', 'windDir', 'windGust', 'windSpeed', 'precipType']

df_organizado = df_explode_DayHour.select(columnas_ordenadas)
df_organizado.show(5)

+---------------+--------+---------+------------+-----------+----------------+--------+------------------------+------------------+------------------+---------------------+-----------+-----------------+----------------+-------------------+--------------------------+--------------+------------------+----------------+------------------+------------+-----------------+-------------------+----------------------+--------------+------------+---------------+------------------+---------------+----------------+-----------------+
|resolvedAddress|latitude|longitude|day_datetime|   timezone|dayHour_datetime|tzoffset|dayHour_stations_explode|dayHour_cloudcover|dayHour_conditions|dayHour_datetimeEpoch|dayHour_dew|dayHour_feelslike|dayHour_humidity|       dayHour_icon|dayHour_preciptype_explode|dayHour_precip|dayHour_precipprob|dayHour_pressure|dayHour_severerisk|dayHour_snow|dayHour_snowdepth|dayHour_solarenergy|dayHour_solarradiation|dayHour_source|dayHour_temp|dayHour_uvindex|dayHour_visibility|da

In [67]:
df_organizado.show(5)

+---------------+--------+---------+------------+-----------+----------------+--------+------------------------+------------------+------------------+---------------------+-----------+-----------------+----------------+-------------------+--------------+------------------+----------------+------------------+------------+-----------------+-------------------+----------------------+--------------+------------+---------------+------------------+---------------+----------------+-----------------+--------------------------+
|resolvedAddress|latitude|longitude|day_datetime|   timezone|dayHour_datetime|tzoffset|dayHour_stations_explode|dayHour_cloudcover|dayHour_conditions|dayHour_datetimeEpoch|dayHour_dew|dayHour_feelslike|dayHour_humidity|       dayHour_icon|dayHour_precip|dayHour_precipprob|dayHour_pressure|dayHour_severerisk|dayHour_snow|dayHour_snowdepth|dayHour_solarenergy|dayHour_solarradiation|dayHour_source|dayHour_temp|dayHour_uvindex|dayHour_visibility|dayHour_winddir|dayHour_windg

In [70]:
df_renombrado_columnas = df_organizado.select(
    [col(nombre_antiguo).alias(nombre_nuevo) for nombre_antiguo, nombre_nuevo in zip(df_organizado.columns, nombres_nuevos)]
)

In [71]:
df_renombrado_columnas.show(5)

+---------------+--------+---------+----------+-----------+--------+--------+--------+----------+----------------+-------------+----+---------+--------+-------------------+--------+---------+--------+----------+----+---------+-----------+--------------+------+----+-------+----------+-------+--------+---------+----------+
|resolvedAddress|latitude|longitude|       day|   timezone|datetime|tzoffset|stations|cloudcover|      conditions|datetimeEpoch| dew|feelsLike|humidity|               icon|  precip|preipProb|pressure|severerisk|snow|snowDepth|solarEnergy|solarRadiation|source|temp|uvindex|visibility|windDir|windGust|windSpeed|precipType|
+---------------+--------+---------+----------+-----------+--------+--------+--------+----------+----------------+-------------+----+---------+--------+-------------------+--------+---------+--------+----------+----+---------+-----------+--------------+------+----+-------+----------+-------+--------+---------+----------+
|Sicilia, Italia| 38.1221|  13.