# Libraries

In [2]:
# Standard library
import calendar
import os
import sys
from functools import reduce

# Set the PySpark interpreter variables to the Python executable that is
# running this notebook. Done before pyspark is imported, as in the original cell.
os.environ.update({
    'PYSPARK_DRIVER_PYTHON_OPTS': "notebook",
    'PYSPARK_DRIVER_PYTHON': sys.executable,
    'PYSPARK_PYTHON': sys.executable,
})

# Third-party
import requests
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, unix_timestamp
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

#### Calling the API and creating a DataFrame with PySpark

In [3]:
# Obtain the Spark session used by the rest of the notebook
# (getOrCreate returns an already-running session when there is one).
spark = (
    SparkSession.builder
    .appName('data_exploration')
    .getOrCreate()
)

In [None]:
# Download one calendar year of hourly prices from the REE API, one request
# per month, turn each month into a typed Spark DataFrame, and concatenate the
# twelve monthly frames into `df_yearly_prices`.

# Year to download; change this single value to fetch a different year.
YEAR = 2022

# Seconds to wait on the API before aborting a request, so a stalled
# connection cannot hang the cell forever.
REQUEST_TIMEOUT = 30

# Mapping of month number to English month name; used below to name the
# per-month DataFrames (df_prices_January, df_prices_February, ...).
months = {
    1: 'January',
    2: 'February',
    3: 'March',
    4: 'April',
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
    11: 'November',
    12: 'December'
}

# Accumulates the DataFrame of each month, in calendar order
dfs_monthly_prices = []

# Query parameters shared by every request; start_date / end_date are filled
# in per month inside the loop.
params = {
    'time_trunc': 'hour',
    'geo_limit': 'peninsular',
    'geo_ids': '8741'
}

# Base URL for the API request
base_url = 'https://apidatos.ree.es/es/datos/mercados/precios-mercados-tiempo-real?'

# Schema of the raw records. Every field is read as a string first and cast
# afterwards. It is identical for every month, so it is built once here.
schema = StructType([
    StructField("value", StringType(), True),
    StructField("percentage", StringType(), True),
    StructField("datetime", StringType(), True)
])

# Iterate through the months of the year
for month in range(1, 13):  # 1 to 12 for January to December
    # Last day of the current month (handles 28/29/30/31-day months)
    last_day = calendar.monthrange(YEAR, month)[1]

    # Start and end of the month, at minute resolution
    start_date = f'{YEAR}-{month:02d}-01T00:00'
    end_date = f'{YEAR}-{month:02d}-{last_day:02d}T23:59'

    # Add the dates to the parameters
    params['start_date'] = start_date
    params['end_date'] = end_date

    # Make the HTTP request; fail immediately with the HTTP error instead of
    # a confusing KeyError / JSON decode error further down.
    response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    data = response.json()

    # Extract the price records for the current month.
    # NOTE(review): this takes the first series under 'included' -- confirm
    # that it is always the price series wanted here.
    df_monthly_prices = data['included'][0]['attributes']['values']

    # Create the DataFrame for the current month
    df = spark.createDataFrame(df_monthly_prices, schema=schema)

    # Convert the string columns to their final types.
    # NOTE(review): the integer cast on "percentage" would drop any fractional
    # part if the API ever returns non-integer values -- confirm this is intended.
    df = df.withColumn("value", col("value").cast(DoubleType()))
    df = df.withColumn("percentage", col("percentage").cast(IntegerType()))
    df = df.withColumn("datetime", unix_timestamp(col("datetime"), "yyyy-MM-dd'T'HH:mm:ss.SSSXXX").cast(TimestampType()))

    # Add the DataFrame of the current month to the list
    dfs_monthly_prices.append(df)

    # Also expose the month as a module-level variable named after the month
    # in English (e.g. df_prices_January), kept so later cells can use it.
    month_name = months[month]
    globals()[f'df_prices_{month_name}'] = df


# Concatenate all the monthly DataFrames into a single one
df_yearly_prices = reduce(DataFrame.union, dfs_monthly_prices)





In [5]:
# Preview the first five rows of the combined yearly DataFrame
df_yearly_prices.show(n=5)


+------+----------+-------------------+
| value|percentage|           datetime|
+------+----------+-------------------+
|204.51|         1|2022-01-01 00:00:00|
|171.35|         1|2022-01-01 01:00:00|
| 172.7|         1|2022-01-01 02:00:00|
|156.07|         1|2022-01-01 03:00:00|
|159.08|         1|2022-01-01 04:00:00|
+------+----------+-------------------+
only showing top 5 rows

