In [1]:
import pandas as pd

# Read the CSV file directly
df = pd.read_csv(r"C:\Users\user\Desktop\msc-thesis-incomplete-markets-LDES\models\Complete_markets_risk_averse_central_planner\data_final\data_preparation_new\data_preparation\load_processing\data\espeni.csv")

# Convert ELEXM_utc to datetime (already UTC, so no change needed)
df['ELEXM_utc'] = pd.to_datetime(df['ELEXM_utc'])

# Extract year
df['year'] = df['ELEXM_utc'].dt.year

# Calculate hour of the year (hours since start of year), ensuring UTC timezone
df['hour_of_year'] = (df['ELEXM_utc'] - pd.to_datetime(df['year'].astype(str) + '-01-01', utc=True)).dt.total_seconds() / 3600

# Filter out half-hourly data (keep only whole hours)
df = df[df['hour_of_year'].apply(lambda x: x % 1 == 0)]

# Select required columns and rename POWER_ESPENI_MW to value
processed_df = df[['year', 'hour_of_year', 'POWER_ESPENI_MW']].copy()
processed_df.rename(columns={'POWER_ESPENI_MW': 'value'}, inplace=True)

# Handle invalid or missing data
processed_df = processed_df.dropna()
processed_df = processed_df[processed_df['value'].notnull() & processed_df['value'].apply(lambda x: isinstance(x, (int, float)))]

# Calculate normalized column: value divided by the maximum value for each year
processed_df['normalized'] = processed_df.groupby('year')['value'].transform(lambda x: x / x.max())

# Save to processed_data.csv
processed_df.to_csv('processed_data.csv', index=False)

# Print the first few rows for verification
print(processed_df.head().to_csv(index=False))

year,hour_of_year,value,normalized
2008,7437.0,44020,0.7590833060302461
2008,7438.0,40228,0.6936938490455415
2008,7439.0,37640,0.6490662344156852
2008,7440.0,36831,0.6351157938300771
2008,7441.0,36397,0.627631873911469



In [7]:
import pandas as pd
import numpy as np

# Parameters
n = 12  # Desired number of years
output_file = 'sampled_normalized_data_12.csv'

# Load the processed data
df = pd.read_csv('processed_data.csv')

# Filter for whole hours (should already be done, but ensure for safety)
df = df[df['hour_of_year'].apply(lambda x: x % 1 == 0)]

# Count hours per year
hours_per_year = df.groupby('year')['hour_of_year'].count()

# Identify years with at least 8760 hours (set Y)
Y = hours_per_year[hours_per_year >= 8760].index.tolist()

# For leap years, remove data beyond 8760 hours (last day of leap year)
def filter_leap_year(df_year):
    year = df_year['year'].iloc[0]
    is_leap = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)
    if is_leap:
        return df_year[df_year['hour_of_year'] < 8760]
    return df_year

# Apply filtering to standardize to 8760 hours
df_filtered = df[df['year'].isin(Y)].groupby('year').apply(filter_leap_year).reset_index(drop=True)

# Verify each year has exactly 8760 hours
hours_per_year_filtered = df_filtered.groupby('year')['hour_of_year'].count()
valid_years = hours_per_year_filtered[hours_per_year_filtered == 8760].index.tolist()

if not valid_years:
    raise ValueError("No years with exactly 8760 hours found.")

# Determine how to construct the list of years
len_Y = len(valid_years)
num_full_sets = n // len_Y  # Number of complete Y sets
remainder = n % len_Y  # Number of additional years needed
print(remainder)
# Build the list of selected years
selected_years = []
for _ in range(num_full_sets):
    selected_years.extend(valid_years)  # Add full Y set
if remainder > 0:
    print(1)
    # Randomly sample remainder years
    np.random.seed(42)  # For reproducibility
    selected_years.extend(np.random.choice(valid_years, size=remainder, replace=True))

# Randomly shuffle the selected years
np.random.seed(42)  # For reproducibility
np.random.shuffle(selected_years)

# Create new DataFrame
new_df_list = []
for i, year in enumerate(selected_years[:n]):  # Ensure exactly n years
    year_data = df_filtered[df_filtered['year'] == year].copy()
    year_data['Y'] = i+1
    year_data['T'] = range(1,8761)  # 0 to 8759
    year_data['value'] = year_data['normalized']
    year_data = year_data[['Y', 'T', 'value']]
    new_df_list.append(year_data)

# Concatenate all years
new_df = pd.concat(new_df_list, ignore_index=True)

# Save to CSV
new_df.to_csv(output_file, index=False)

# Print the first few rows for verification
print(new_df.head().to_csv(index=False))

0
Y,T,value
1,1,0.5521266194084576
1,2,0.5526562372688014
1,3,0.5224272793937913
1,4,0.5069054021021755
1,5,0.4889798745213069



  df_filtered = df[df['year'].isin(Y)].groupby('year').apply(filter_leap_year).reset_index(drop=True)
