#### Libraries

In [55]:
from sklearn.impute import SimpleImputer

In [57]:
import pandas as pd
import requests
import calendar
from datetime import datetime, timedelta
import numpy as np



#### Create Functions

In [5]:
# function to clean columns and transform columns to datetime

def process_dataframe(df):
    # Verifica si la columna 'datetime' existe en el DataFrame
    if 'datetime' in df.columns:
        # Convierte la columna 'datetime' a tipo datetime con zona horaria UTC
        df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
        
        # Elimina la columna 'percentage' si existe
        if 'percentage' in df.columns:
            df = df.drop('percentage', axis=1)
        
        # Elimina la información de la zona horaria para que sea similar a tu ejemplo
        df['datetime'] = df['datetime'].dt.tz_localize(None)
    
    return df


# Calling the Api's

#### Price

In [6]:
# Define common parameters
params = {
    'time_trunc': 'hour',
    'geo_limit': 'peninsular',
    'geo_ids': '8741'
}

# Define the base URL
base_url = 'https://apidatos.ree.es/es/datos/mercados/precios-mercados-tiempo-real?'

# Get the current date and time
current_datetime = datetime.now()

# Calculate the start date as one year before the current date
start_date = current_datetime - timedelta(days=365)

# Initialize a list to store the DataFrames of prices
dfs_prices = []

# Create a loop to fetch data month by month
while start_date <= current_datetime:
    # Calculate the end date for the current month
    year = start_date.year
    month = start_date.month
    last_day_of_month = (start_date.replace(day=28) + timedelta(days=4)).replace(day=1) - timedelta(days=1)
    end_date = last_day_of_month.replace(hour=23, minute=59, second=59)

    # Add the dates to the parameters
    params['start_date'] = start_date.isoformat()
    params['end_date'] = end_date.isoformat()

    # Make the HTTP request
    response = requests.get(base_url, params=params)
    data = response.json()

    # Extract the price data for the current month
    monthly_prices = data['included'][0]['attributes']['values']

    # Create a DataFrame for the current month and add it to the list
    df_month = pd.DataFrame(monthly_prices)
    dfs_prices.append(df_month)

    # Move to the next month
    start_date = last_day_of_month + timedelta(days=1)

# Concatenate all the DataFrames into one
df_all_prices = pd.concat(dfs_prices, ignore_index=True)


In [7]:
df_all_prices = process_dataframe(df_all_prices)

In [8]:
df_all_prices.to_csv('../../data/df_yearly_prices.csv', index=False)

#### Max Monthly demand

In [9]:
# Define common parameters
params = {
    'time_trunc': 'month',
    'geo_limit': 'peninsular',
    'geo_ids': '8741'
}

# Get the current date and time
current_datetime = datetime.now()

# Round the current date to the previous night at 00:00
end_date = current_datetime.replace(hour=0, minute=0, second=0, microsecond=0) - timedelta(days=1)

# Calculate the start date as one year ago at 00:00
start_date = end_date - timedelta(days=365)

# Convert the dates to text strings in the "00:00" format
start_date_str = start_date.strftime('%Y-%m-%dT%H:%M')
end_date_str = end_date.strftime('%Y-%m-%dT%H:%M')

# Define the base URL with dynamic time parameters
base_url = f'https://apidatos.ree.es/es/datos/demanda/demanda-maxima-diaria?start_date={start_date_str}&end_date={end_date_str}'

# Make the request to the API
response = requests.get(base_url, params=params)
data = response.json()

# Extract and print the data
df_max_demand = data['included'][0]['attributes']['values']


In [10]:
# Convert to DataFrame
df_max_demand = pd.DataFrame(df_max_demand)

In [11]:
# Process the DataFrame
df_max_demand = process_dataframe(df_max_demand)

In [12]:
# Save the DataFrame to a CSV file
df_max_demand.to_csv('../../data/df_max_demand.csv', index=False)

#### Actual demand

In [13]:
# Define common parameters
params = {
    'time_trunc': 'hour',
    'geo_limit': 'peninsular',
    'geo_ids': '8741'
}

# Define the base URL
base_url = 'https://apidatos.ree.es/es/datos/demanda/demanda-tiempo-real?'

# Get the current date and time
current_datetime = datetime.now()

# Calculate the start date as one year before the current date
start_date = current_datetime - timedelta(days=365)

# Initialize a list to store the DataFrames of prices
dfs_r_demand = []
dfs_p_demand = []
dfs_e_demand = []

# Create a loop to fetch data month by month
while start_date <= current_datetime:
    # Calculate the end date for the current month
    year = start_date.year
    month = start_date.month
    last_day_of_month = (start_date.replace(day=28) + timedelta(days=4)).replace(day=1) - timedelta(days=1)
    end_date = last_day_of_month.replace(hour=23, minute=59, second=59)

    # Add the dates to the parameters
    params['start_date'] = start_date.isoformat()
    params['end_date'] = end_date.isoformat()

    # Make the HTTP request
    response = requests.get(base_url, params=params)
    data = response.json()

    # Extract the price data for the current month
    monthly_demand_1 = data['included'][0]['attributes']['values']
    monthly_demand_2 = data['included'][1]['attributes']['values']
    monthly_demand_3 = data['included'][2]['attributes']['values']

    # Create a DataFrame for the current month
    df_month_1 = pd.DataFrame(monthly_demand_1)
    df_month_2 = pd.DataFrame(monthly_demand_2)
    df_month_3 = pd.DataFrame(monthly_demand_3)

    # Add the DataFrames to the list
    dfs_r_demand.append(df_month_1)
    dfs_p_demand.append(df_month_2)
    dfs_e_demand.append(df_month_3)

    # Move to the next month
    start_date = last_day_of_month + timedelta(days=1)

# Concatenate all DataFrames in the list to create a single DataFrame
df_real_demand = pd.concat(dfs_r_demand, ignore_index=True)
df_programed_demand = pd.concat(dfs_p_demand, ignore_index=True)
df_expected_demand = pd.concat(dfs_e_demand, ignore_index=True)


In [14]:
# Process the DataFrames
df_real_demand = process_dataframe(df_real_demand)
df_expected_demand = process_dataframe(df_expected_demand)
df_programed_demand = process_dataframe(df_programed_demand)

In [15]:
# Save the DataFrames to CSV
df_real_demand.to_csv('../../data/df_real_demand.csv', index=False)
df_expected_demand.to_csv('../../data/df_expected_demand.csv', index=False)
df_programed_demand.to_csv('../../data/df_programed_demand.csv', index=False)

#### Structure Generation

In [16]:
# Define common parameters
params = {
    'time_trunc': 'day',
    'geo_limit': 'peninsular',
    'geo_ids': '8741'
}

# Define the base URL
base_url = 'https://apidatos.ree.es/es/datos/generacion/estructura-generacion?'

# Get the current date and time
current_datetime = datetime.now()

# Calculate the start date as one year before the current date
params['start_date'] = current_datetime - timedelta(days=365)

# Calculate the end date as the current date
params['end_date'] = current_datetime

# Initialize a dictionary to store DataFrames for each 'type'
dfs_structure_by_type = {}

# Make the initial HTTP request and store the data
response = requests.get(base_url, params=params)
data = response.json()



In [17]:
# Loop through the 'included' list and store the DataFrames
for i in range(len(data['included'])):
    type_name = data['included'][i]['type']
    df = pd.DataFrame(data['included'][i]['attributes']['values'])
    # Renombrar las columnas con un prefijo único basado en el tipo de generación
    df = df.rename(columns={'value': f'value_{type_name}'})
    dfs_structure_by_type[type_name] = df

# Crear un DataFrame consolidado inicialmente con la columna 'datetime'
consolidated_df = pd.DataFrame(data['included'][0]['attributes']['values'])[['datetime']]

# Fusionar los DataFrames en función de la columna 'datetime'
for key, df in dfs_structure_by_type.items():
    consolidated_df = pd.merge(consolidated_df, df, on='datetime', how='outer')

# Quitar las columnas que contienen 'percentage'
df_structure_merge = consolidated_df.loc[:, ~consolidated_df.columns.str.contains('percentage')]
# Quitar todos los strings que sean value_
df_structure_merge.columns = df_structure_merge.columns.str.replace('value_', '')


  consolidated_df = pd.merge(consolidated_df, df, on='datetime', how='outer')
  consolidated_df = pd.merge(consolidated_df, df, on='datetime', how='outer')
  consolidated_df = pd.merge(consolidated_df, df, on='datetime', how='outer')
  consolidated_df = pd.merge(consolidated_df, df, on='datetime', how='outer')
  consolidated_df = pd.merge(consolidated_df, df, on='datetime', how='outer')
  consolidated_df = pd.merge(consolidated_df, df, on='datetime', how='outer')


In [18]:
df_structure_merge

Unnamed: 0,datetime,Hidráulica,Turbinación bombeo,Nuclear,Carbón,Fuel + Gas,Ciclo combinado,Eólica,Solar fotovoltaica,Solar térmica,Otras renovables,Cogeneración,Residuos no renovables,Residuos renovables,Generación total
0,2022-10-25T00:00:00.000+02:00,30080.301,10971.145,119122.962,5995.381,,159026.494,244455.554,45358.291,796.962,11195.358,32890.611,3980.7345,2048.0415,665921.835
1,2022-10-26T00:00:00.000+02:00,38650.143,7785.445,118898.293,5719.703,,202505.778,176570.620,43052.518,374.961,11209.556,33696.408,4227.1300,2063.9650,644754.520
2,2022-10-27T00:00:00.000+02:00,23648.834,10923.012,118905.370,3517.236,0.001,94982.595,308167.676,58365.066,2902.327,11302.942,33125.862,3827.6405,1884.2845,671552.846
3,2022-10-28T00:00:00.000+02:00,26491.342,13053.460,118980.687,4267.437,,108810.815,242578.456,56579.410,1842.236,12150.606,34739.583,3867.4525,1877.6325,625239.117
4,2022-10-29T00:00:00.000+02:00,27662.928,14771.582,119102.743,4130.189,,74687.542,249073.795,62251.113,4877.024,12212.541,32579.921,4136.1000,1879.4490,607364.927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2023-10-21T00:00:00.000+02:00,56829.100,17303.300,111193.000,7356.900,,63803.600,202876.800,84769.316,5123.584,6702.700,28985.300,3669.2500,1999.7500,590612.600
362,2023-10-22T00:00:00.000+02:00,57046.500,12121.500,110483.200,7746.600,,72234.900,176301.400,37950.063,936.537,7967.800,32733.900,3678.3000,1978.8000,521179.500
363,2023-10-23T00:00:00.000+02:00,74980.900,19090.800,96024.600,9928.700,,153019.300,153475.400,49859.870,317.630,8341.000,44593.200,3817.0500,2013.6500,615462.100
364,2023-10-24T00:00:00.000+02:00,59575.700,4916.700,95985.600,7637.200,,77535.500,299245.700,74564.379,3318.621,8202.300,43972.200,3399.4500,2075.4500,680428.800


In [19]:
df_structure_merge = process_dataframe(df_structure_merge)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'].dt.tz_localize(None)


In [20]:
# Drop columns index 5
df_structure_merge = df_structure_merge.drop(df_structure_merge.columns[5], axis=1)


In [21]:
df_structure_merge.to_csv('../../data/df_structure_merge.csv', index=False)

#### Installed power

In [22]:
# Define common parameters
params = {
    'time_trunc': 'day',
    'geo_limit': 'peninsular',
    'geo_ids': '8741'
}

# Get the current date and time
current_datetime = datetime.now()

# Round the current date to the previous night at 00:00
end_date = current_datetime.replace(hour=0, minute=0, second=0, microsecond=0) - timedelta(days=1)

# Calculate the start date as one year ago at 00:00
start_date = end_date - timedelta(days=365)

# Convert the dates to text strings in the "00:00" format
start_date_str = start_date.strftime('%Y-%m-%dT%H:%M')
end_date_str = end_date.strftime('%Y-%m-%dT%H:%M')

# Define the base URL with dynamic time parameters
base_url = f'https://apidatos.ree.es/es/datos/generacion/potencia-instalada?start_date={start_date_str}&end_date={end_date_str}'

# Make the request to the API
response = requests.get(base_url, params=params)
data = response.json()

# create dataframe for each type
dfs_power_by_type = {}

for i in range(len(data['included'])):
    type_name = data['included'][i]['type']

    # Initialize a DataFrame for the current type with the name as the key
    dfs_power_by_type[type_name] = pd.DataFrame()

    # Extract the 'values' data
    values = data['included'][i]['attributes']['values']

    # Create a DataFrame for the current month
    df_month = pd.DataFrame(values)

    # Concatenate the current month's data with the type's DataFrame
    dfs_power_by_type[type_name] = pd.concat([dfs_power_by_type[type_name], df_month], ignore_index=True)


In [23]:
# Create an empty DataFrame to store the consolidated data
consolidated_df1 = pd.DataFrame(columns=['values', 'percentage', 'datetime',])
for key, df in dfs_power_by_type.items():
    # Renombra las columnas 'values' y 'datetime' para que tengan el nombre del tipo de generación
    
    # Fusiona el DataFrame actual con el consolidado en función de la columna 'datetime'
    consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')


df_power_merge = consolidated_df1.loc[:,~consolidated_df1.columns.str.contains('percentage')]

  consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')
  consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')
  consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')
  consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')
  consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')
  consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')
  consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')
  consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')
  consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')
  consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')
  consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')
  consolidated_df1 = pd.merge(consolidated_df1, df, on='datetime', how='outer')


In [24]:
df_power_merge = process_dataframe(df_power_merge)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'].dt.tz_localize(None)


In [25]:
df_power_merge = df_power_merge.drop(df_power_merge.columns[0], axis=1)

In [26]:
# Save the DataFrames to CSV
df_power_merge.to_csv('../../data/df_power_merge.csv', index=False)

#### Energy exchange

In [27]:
# Define common parameters
params = {
    'time_trunc': 'day',
    'geo_limit': 'peninsular',
    'geo_ids': '8741'
}

# Get the current date and time
current_datetime = datetime.now()

# Round the current date to the previous night at 00:00
end_date = current_datetime.replace(hour=0, minute=0, second=0, microsecond=0) - timedelta(days=1)

# Calculate the start date as one year ago at 00:00
start_date = end_date - timedelta(days=365)

# Convert the dates to text strings in the "00:00" format
start_date_str = start_date.strftime('%Y-%m-%dT%H:%M')
end_date_str = end_date.strftime('%Y-%m-%dT%H:%M')

# Define the base URL with dynamic time parameters
base_url = f'https://apidatos.ree.es/es/datos/intercambios/todas-fronteras-programados?start_date={start_date_str}&end_date={end_date_str}'

# Make the request to the API
response = requests.get(base_url, params=params)
data = response.json()

dfs_exchange_by_country = {}

for i in range(len(data['included']) -1):
    type_name = data['included'][i]['type']

    # Initialize a DataFrame for the current type with the name as the key
    dfs_exchange_by_country[type_name] = pd.DataFrame()

    # Extract the 'values' data
    values = data['included'][i]['attributes']['content'][2]['attributes']['values']

    # Create a DataFrame for the current month
    df_month = pd.DataFrame(values)

    # Concatenate the current month's data with the type's DataFrame
    dfs_exchange_by_country[type_name] = pd.concat([dfs_exchange_by_country[type_name], df_month], ignore_index=True)



In [29]:
# Crear un DataFrame vacío para almacenar los datos consolidados
consolidated_df = pd.DataFrame(columns=['value', 'percentage', 'datetime'])

# Fusionar los DataFrames en función de la columna 'datetime'
for key, df in dfs_exchange_by_country.items():
    # Renombrar las columnas 'value' y 'datetime' con el nombre del key del diccionario
    df = df.rename(columns={'value': key, 'datetime': 'datetime'})
    
    # Fusionar el DataFrame actual con el consolidado en función de la columna 'datetime'
    if consolidated_df.empty:
        consolidated_df = df
    else:
        consolidated_df = pd.merge(consolidated_df, df, on='datetime', how='outer')

# Quitar las columnas que contienen 'percentage'
df_exchange_merge = consolidated_df.loc[:, ~consolidated_df.columns.str.contains('percentage')]


In [30]:
df_exchange_merge = process_dataframe(df_exchange_merge)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'].dt.tz_localize(None)


In [66]:
# Columnas de interés para el imputador
columns_to_impute = ["francia", "portugal", "marruecos"]

# Crear un objeto SimpleImputer para rellenar con la estrategia de la media
imp_mean = SimpleImputer(strategy='mean')

# Rellenar las columnas de interés
df_exchange_merge[columns_to_impute] = imp_mean.fit_transform(df_exchange_merge[columns_to_impute])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exchange_merge[columns_to_impute] = imp_mean.fit_transform(df_exchange_merge[columns_to_impute])


In [68]:
# Save the DataFrames to CSV
df_exchange_merge.to_csv('../../data/df_exchange_merge.csv', index=False)

#### Energy price components

In [32]:
# Define common parameters
params = {
    'time_trunc': 'month',
    'geo_limit': 'peninsular',
    'geo_ids': '8741'
}

# Define the base URL
base_url = 'https://apidatos.ree.es/es/datos/mercados/componentes-precio-energia-cierre-desglose?'

# Get the current date and time
current_datetime = datetime.now()

# Round the current date to the previous night at 00:00
end_date = current_datetime.replace(hour=0, minute=0, second=0, microsecond=0) - timedelta(days=1)

# Calculate the start date as one year ago at 00:00
start_date = end_date - timedelta(days=365)

# Convert the dates to text strings in the "00:00" format
start_date_str = start_date.strftime('%Y-%m-%dT%H:%M')
end_date_str = end_date.strftime('%Y-%m-%dT%H:%M')

params['start_date'] = start_date_str
params['end_date'] = end_date_str

# Create dictionary to store dataframes
dfs_components_daily = {}
dfs_components_intradaily = {}


# Make the request to the API
response = requests.get(base_url, params=params)
data = response.json()

# Extract and print the data
for i in range(len(data['included'])):
    type_name = data['included'][i]['type']

    # Initialize a DataFrame for the current type with the name as the key
    dfs_components_daily[type_name] = pd.DataFrame()

    # Extract the 'values' data
    values = data['included'][i]['attributes']['content'][0]['attributes']['values']


    # Create a DataFrame for the current month
    df_month = pd.DataFrame(values)

    # Concatenate the current month's data with the type's DataFrame']
    dfs_components_daily[type_name] = pd.concat([dfs_components_daily[type_name], df_month], ignore_index=True)

# Extract and print the data
for i in range(len(data['included']) - 2):
    type_name = data['included'][i]['type']

    # Initialize a DataFrame for the current type with the name as the key
    dfs_components_intradaily[type_name] = pd.DataFrame()

    # Extract the 'values' data
    values2 = data['included'][i]['attributes']['content'][1]['attributes']['values']


    # Create a DataFrame for the current month
    df_month2 = pd.DataFrame(values2)

    # Concatenate the current month's data with the type's DataFrame']
    dfs_components_intradaily[type_name] = pd.concat([dfs_components_intradaily[type_name], df_month2], ignore_index=True)



In [33]:
# Create an empty DataFrame to store the consolidated data
consolidated_df3 = pd.DataFrame(columns=['values', 'percentage', 'datetime',])
for key, df in dfs_components_daily.items():
    # Renombra las columnas 'values' y 'datetime' para que tengan el nombre del tipo de generación
    
    # Fusiona el DataFrame actual con el consolidado en función de la columna 'datetime'
    consolidated_df3 = pd.merge(consolidated_df3, df, on='datetime', how='outer')

# drop all the columns that contains 'percentage'
df_components_daily_merge = consolidated_df3.loc[:,~consolidated_df3.columns.str.contains('percentage')]

  consolidated_df3 = pd.merge(consolidated_df3, df, on='datetime', how='outer')
  consolidated_df3 = pd.merge(consolidated_df3, df, on='datetime', how='outer')
  consolidated_df3 = pd.merge(consolidated_df3, df, on='datetime', how='outer')


In [34]:
df_components_daily_merge = process_dataframe(df_components_daily_merge)
df_components_daily_merge = df_components_daily_merge.drop(df_components_daily_merge.columns[0], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'].dt.tz_localize(None)


In [35]:
# Save the DataFrames to CSV
df_components_daily_merge.to_csv('../../data/df_components_daily_merge.csv', index=False)

In [36]:
# Create an empty DataFrame to store the consolidated data
consolidated_df4 = pd.DataFrame(columns=['values', 'percentage', 'datetime',])
for key, df in dfs_components_intradaily.items():
    # Renombra las columnas 'values' y 'datetime' para que tengan el nombre del tipo de generación
    
    # Fusiona el DataFrame actual con el consolidado en función de la columna 'datetime'
    consolidated_df4 = pd.merge(consolidated_df4, df, on='datetime', how='outer')

# drop all the columns that contains 'percentage'
df_components_intradaily_merge = consolidated_df4.loc[:,~consolidated_df4.columns.str.contains('percentage')]

  consolidated_df4 = pd.merge(consolidated_df4, df, on='datetime', how='outer')


In [37]:
df_components_intradaily_merge = process_dataframe(df_components_intradaily_merge)
df_components_intradaily_merge = df_components_intradaily_merge.drop(df_components_intradaily_merge.columns[0], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = df['datetime'].dt.tz_localize(None)


In [38]:
# Save the DataFrames to CSV
df_components_intradaily_merge.to_csv('../../data/df_components_intradaily_merge.csv', index=False)

# Data resampling


#### Market exchange resampling

In [70]:
df_exchange_merge

Unnamed: 0,francia,datetime,portugal,marruecos
0,75996.034,2022-10-23 22:00:00,-37093.00,-8820.000000
1,8705.200,2022-10-24 22:00:00,-17814.90,-12860.000000
2,24084.000,2022-10-25 22:00:00,-14708.70,-12660.000000
3,910.800,2022-10-26 22:00:00,-9926.10,-11388.700000
4,27700.800,2022-10-27 22:00:00,-10911.40,-8820.000000
...,...,...,...,...
362,44405.500,2023-10-20 22:00:00,-42234.60,-14091.600000
363,32009.200,2023-10-21 22:00:00,-17327.30,-1580.000000
364,21040.950,2023-10-22 22:00:00,-17772.20,-5218.548129
365,-32987.600,2023-10-23 22:00:00,13825.85,2333.200000


In [71]:
# Crear un rango de fechas a nivel horario
rango_horario1 = pd.date_range(start=df_exchange_merge['datetime'].min(), end=df_exchange_merge['datetime'].max(), freq='H')

# Expandir los datos a nivel horario
df_exchange_hourly = pd.DataFrame({'datetime': rango_horario1})
df_exchange_hourly = df_exchange_hourly.merge(df_exchange_merge, on='datetime', how='left')

# Rellenar los valores diarios a nivel horario, columna por columna
columns_to_fill = ['francia', 'marruecos', 'portugal']

for column in columns_to_fill:
    df_exchange_hourly[column].fillna(method='ffill', inplace=True)

# Asegurarse de que todos los valores estén llenos
df_exchange_hourly.fillna(method='bfill', inplace=True)


In [72]:
# Save the DataFrames to CSV
df_exchange_hourly.to_csv('../../data/df_exchange_hourly.csv', index=False)

#### Structure Merge resampling

In [42]:
df_structure_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 366 entries, 0 to 365
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   datetime                366 non-null    datetime64[ns]
 1   Hidráulica              366 non-null    float64       
 2   Turbinación bombeo      366 non-null    float64       
 3   Nuclear                 366 non-null    float64       
 4   Carbón                  366 non-null    float64       
 5   Ciclo combinado         366 non-null    float64       
 6   Eólica                  366 non-null    float64       
 7   Solar fotovoltaica      366 non-null    float64       
 8   Solar térmica           366 non-null    float64       
 9   Otras renovables        366 non-null    float64       
 10  Cogeneración            366 non-null    float64       
 11  Residuos no renovables  366 non-null    float64       
 12  Residuos renovables     366 non-null    float64   

In [43]:
# Crear un rango de fechas a nivel horario
rango_horario = pd.date_range(start=df_structure_merge['datetime'].min(), end=df_structure_merge['datetime'].max(), freq='H')

# Expandir los datos a nivel horario
df_structure_hourly = pd.DataFrame({'datetime': rango_horario})
df_structure_hourly = df_structure_hourly.merge(df_structure_merge, on='datetime', how='left')

# Rellenar los valores diarios a nivel horario, columna por columna
columns_to_fill = ['Hidráulica', 'Turbinación bombeo','Nuclear', 'Carbón', 'Ciclo combinado', 'Eólica', 'Solar fotovoltaica',
                    'Solar térmica', 'Otras renovables', 'Cogeneración', 'Residuos no renovables',
                     'Residuos renovables', 'Generación total']

for column in columns_to_fill:
    df_structure_hourly[column].fillna(method='ffill', inplace=True)

# Asegurarse de que todos los valores estén llenos
df_structure_hourly.fillna(method='bfill', inplace=True)




In [44]:
# Save the DataFrames to CSV
df_structure_hourly.to_csv('../../data/df_structure_hourly.csv', index=False)

#### Demand resampling

In [45]:
df_merge_demand = df_real_demand.merge(df_programed_demand, on='datetime', how='inner') \
    .merge(df_expected_demand, on='datetime', how='inner')



In [46]:
df_merge_demand.rename(columns={'value_x': 'real_de'}, inplace=True)
df_merge_demand.rename(columns={'value_y': 'prog_de'}, inplace=True)
df_merge_demand.rename(columns={'value': 'exp_de'}, inplace=True)



In [47]:
# move value_x to the end
df_merge_demand = df_merge_demand[['datetime', 'prog_de', 'exp_de', 'real_de']]

# Final dataframe

In [48]:
df_yearly_prices = pd.read_csv('../../data/df_yearly_prices.csv')

In [49]:
df_yearly_prices['datetime'] = pd.to_datetime(df_yearly_prices['datetime'])
df_yearly_prices['datetime'] = df_yearly_prices['datetime'].dt.tz_localize(None)



In [50]:
df_merge= df_merge_demand.merge(df_yearly_prices, on='datetime', how='inner')

In [51]:
df_merge = df_merge.merge(df_structure_hourly, on='datetime', how='inner')

In [74]:
df_merge = df_merge.merge(df_exchange_hourly, on='datetime', how='inner')

In [76]:
df_merge.corr()['value'].sort_values(ascending=False)

  df_merge.corr()['value'].sort_values(ascending=False)


value                     1.000000
Ciclo combinado           0.548996
Carbón                    0.481342
Otras renovables          0.479792
prog_de                   0.470350
exp_de                    0.464427
real_de                   0.463098
Residuos no renovables    0.320079
Cogeneración              0.300333
portugal                  0.188581
Hidráulica                0.172758
Residuos renovables       0.153651
Generación total          0.118149
marruecos                 0.075704
Nuclear                   0.068232
francia                  -0.030176
Solar térmica            -0.175854
Solar fotovoltaica       -0.179699
Eólica                   -0.284648
Turbinación bombeo       -0.350245
Name: value, dtype: float64

In [78]:
# move column value to column index 1

df_merge = df_merge[['datetime', 'value','prog_de', 'exp_de','real_de',
                     	'Hidráulica', 'Turbinación bombeo', 'Nuclear', 'Carbón',
                        'Ciclo combinado', 'Eólica', 'Solar fotovoltaica',	'Solar térmica', 'Otras renovables',
                        'Cogeneración',	'Residuos no renovables', 'Residuos renovables',
                        'Generación total',	'francia', 'portugal', 'marruecos']]


In [79]:
df_merge.to_csv('../../data/df_merge_final.csv', index=False)