In [10]:
import json
import pandas as pd

# Load config
with open('../models/config.json', 'r') as f:
    config = json.load(f)

preprocess = config['preprocessing']
# Output Path
OUTPUT_PATH = preprocess['output_path']
# Input Path
INPUT_PATH = preprocess['input_path']

# Load the dataset
dataset = pd.read_csv(INPUT_PATH + '240624_validation_data.csv')

# Define the dates for which data will be extracted
dates = ['2023-03-16', '2023-04-09', '2023-04-10', '2023-05-01', '2023-05-15', '2023-06-19', '2023-06-22', '2023-07-09', '2023-07-10']

# Convert 'time' column to datetime
dataset['time'] = pd.to_datetime(dataset['time'], utc=True)

# Create dataframe with necessary columns
df_data = pd.DataFrame({
    'time': dataset['time'],
    'actual_demand': dataset['delivered heat'],
    'forecast_demand': dataset['predicted heat'],
    'temperatures': dataset['temperatures']  # Assuming the dataset has a 'temperature' column
})

# Add hour column
df_data['hour'] = df_data['time'].dt.hour

# Calculate error and absolute error
df_data['error'] = df_data['actual_demand'] - df_data['forecast_demand']

# Calculate the mean and standard deviation of the error for the whole dataset
mu_all = df_data['error'].mean()
sigma_all = df_data['error'].std()
print('Mean of the error for the entire dataset:', mu_all)
print('Standard deviation of the error for the entire dataset:', sigma_all)

# Function to filter data for selected period
def filter_period_data(df, selected_date, selected_period):
    selected_date = pd.to_datetime(selected_date, utc=True)
    if selected_period == 0:  # Day
        start_date = selected_date.normalize()
        end_date = start_date + pd.Timedelta(hours=23)
    elif selected_period == 1:  # Week
        start_date = selected_date.normalize()
        end_date = start_date + pd.Timedelta(days=6, hours=23)
    elif selected_period == 2:  # Month
        start_date = selected_date.normalize()
        end_date = (start_date + pd.DateOffset(months=1)) - pd.Timedelta(hours=1)
    else:
        raise ValueError("selected_period must be 0 (day), 1 (week), or 2 (month)")

    # Filter data for the selected period
    period_data = df[(df['time'] >= start_date) & (df['time'] <= end_date)].reset_index(drop=True)
    return period_data

# Select a specific date and period
selected_date = '2023-07-10'
selected_period = 0  # 0 for day, 1 for week, 2 for month

# Filter data for the selected period
period_data = filter_period_data(df_data, selected_date, selected_period)

# Calculate average temperature for the selected period
average_temperature = period_data['temperatures'].mean()
print('Average temperature for the selected period:', average_temperature)

# Generate expected timestamps
expected_time = pd.date_range(start=period_data['time'].min(), end=period_data['time'].max(), freq='H', tz='UTC')

# Check for missing timestamps
missing_time = expected_time.difference(period_data['time'])

print('Expected number of data points:', len(expected_time))
print('Actual number of data points:', len(period_data))

if missing_time.empty:
    print('The selected period has all the expected data.')
else:
    print(f'The selected period is missing {len(missing_time)} data points.')
    print('Missing timestamps:')
    print(missing_time)

# Display the filtered data
print(period_data.head())


Mean of the error for the entire dataset: -8.39020298257356
Standard deviation of the error for the entire dataset: 31.42172005784749
Average temperature for the selected period: 22.833333333333332
Expected number of data points: 24
Actual number of data points: 24
The selected period has all the expected data.
                       time  actual_demand  forecast_demand  temperatures  \
0 2023-07-10 00:00:00+00:00      19.491667        37.182262          20.8   
1 2023-07-10 01:00:00+00:00      42.829167        44.471830          20.9   
2 2023-07-10 02:00:00+00:00     144.383336        87.021062          19.6   
3 2023-07-10 03:00:00+00:00      42.000000        59.673830          18.8   
4 2023-07-10 04:00:00+00:00      70.545834        63.712583          19.6   

   hour      error  
0     0 -17.690595  
1     1  -1.642663  
2     2  57.362274  
3     3 -17.673829  
4     4   6.833251  


  expected_time = pd.date_range(start=period_data['time'].min(), end=period_data['time'].max(), freq='H', tz='UTC')
