In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from Energy.HelpFunctions.get_energy_data import get_energy_data, prepare_data
from HelpFunctions.date_and_time import most_recent_thursday, split_time
from Energy.Models.baseline import baseline
from HelpFunctions.calc_score import evaluate_horizon
from HelpFunctions.mix_models import mix_models
from Energy.Models.Model1 import model1
from Energy.Models.Model2 import model2
from Energy.Models.Model4_population import model4_population
from Energy.Models.Model4 import model4
from Energy.Models.Model3 import model3
from Energy.Models.Model5 import model5
from Energy.Models.Model4_holidays_2 import model4_holidays_2
from Energy.Models.Model4_sunhours import model4_sunhours
# import importlib
# importlib.reload(Energy.Models)



If needed: Fetch energy-data first

In [2]:
# from Energy.HelpFunctions.get_energy_data import fetch_energy_data
# fetch_energy_data()

  energydata = pd.concat([energydata, pd.DataFrame(rawdata, columns=col_names)])
100%|██████████| 263/263 [00:23<00:00, 11.17it/s]


In [3]:
# Load the energy data and pass it straight through the project's preparation step.
df = prepare_data(get_energy_data())

# Cross validate baseline model

Remove everything from the most recent Thursday night (midnight) onwards, so that only data before that cut-off is used for cross-validation.

In [4]:
# Exclusive cut-off: only rows whose index lies strictly before it are kept.
start_date_excl = most_recent_thursday(df)
df_cval = df[df.index < start_date_excl]

Repeatedly run the model. Record predictions and true values (observations). Make sure the observations are available for the most recent prediction.

In [5]:
from HelpFunctions.mix_models import mix_models_per_horizon


def evaluate_models(models, df, last_x, years=False, months=False, weeks=False):
    """Evaluate every model in ``models`` and store the result on the model dict.

    Exactly one of ``years``, ``months`` and ``weeks`` has to be set. The three
    flags are converted with ``int()`` and forwarded to ``evaluate_model``.

    Parameters
    ----------
    models : list of dict
        Each dict must contain a 'name' (used for the progress message) plus
        whatever ``evaluate_model`` reads. The key 'evaluation' is added to
        (or overwritten on) every dict in place.
    df :
        Data handed unchanged to ``evaluate_model``.
    last_x : int
        Number of iterations, handed unchanged to ``evaluate_model``.
    years, months, weeks : bool
        Selects the step unit; the three values must sum to 1.

    Raises
    ------
    ValueError
        If the three flags do not sum to exactly 1.
    """
    unit_flags = (years, months, weeks)
    if sum(unit_flags) != 1:
        raise ValueError("Exactly one of the boolean parameters (years, months, weeks) must be True.")

    # evaluate_model receives integers rather than booleans.
    n_years, n_months, n_weeks = (int(flag) for flag in unit_flags)

    for entry in models:
        print(f'*********** Start the evaluation of Model {entry["name"]} ***********')
        entry['evaluation'] = evaluate_model(entry, df, last_x, n_years, n_months, n_weeks)
        
def evaluate_model(model, df, last_x, years, months, weeks):
    """Repeatedly run one model on shrinking data and score its predictions.

    In each of the ``last_x`` iterations the current data is passed to
    ``split_time`` and only the first returned part is kept (presumably the
    data before the split point -- confirm against ``split_time``). The model
    is run on that part, and its predictions are joined with the observed
    'gesamt' values looked up in the full ``df``.

    Parameters
    ----------
    model : dict
        'function' is either a callable taking the data and returning a
        prediction frame, or an indexable pair whose two elements are passed
        to ``mix_models_per_horizon`` together with the data.
    df : pandas.DataFrame
        Full data; must contain a 'gesamt' column and be indexable by the
        predicted 'forecast_date' values.
    last_x : int
        Number of iterations.
    years, months, weeks : int
        Forwarded to ``split_time`` as ``num_years`` / ``num_months`` /
        ``num_weeks``.

    Returns
    -------
    pandas.DataFrame
        All per-iteration frames stacked (index: forecast date), each holding
        the prediction columns, 'gesamt' and a 'score' column. Empty frame if
        ``last_x`` is 0.
    """
    quantile_cols = ['q0.025', 'q0.25', 'q0.5', 'q0.75', 'q0.975']
    df_before = df
    per_iteration = []

    for w in range(last_x):
        print(f'Iteration {w} of {last_x}')
        # Only the first part is needed; each iteration splits the remainder again.
        df_before, _ = split_time(df_before, num_years=years, num_months=months, num_weeks=weeks)

        # A callable is a single model; anything else is treated as a pair to mix.
        if callable(model['function']):
            pred = model['function'](df_before)
        else:
            pred = mix_models_per_horizon(model['function'][0], model['function'][1], df_before)

        # Observations come from the full frame, not from the truncated one.
        obs = pd.DataFrame({'gesamt': df.loc[pred['forecast_date']]["gesamt"]})
        pred = pred.set_index('forecast_date')
        merged_df = pd.merge(pred, obs, left_index=True, right_index=True)

        # Score row by row and assign positionally, so repeated index labels
        # cannot overwrite each other.
        merged_df['score'] = [
            evaluate_horizon(row[quantile_cols], row['gesamt'])
            for _, row in merged_df.iterrows()
        ]
        per_iteration.append(merged_df)

    # pd.concat rejects an empty list, so keep the empty-frame result explicitly.
    if not per_iteration:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(per_iteration)

## Evaluation of selected Models

In [6]:
from Energy.Models import mstl

# Models to evaluate. Every entry carries a display 'name' and a 'function';
# evaluate_model() calls 'function' directly when it is callable and otherwise
# hands its first two elements to mix_models_per_horizon().
models = [
    {'name': 'baseline', 'function': baseline},
    {'name': 'mstl', 'function': mstl.mstl},
    # {'name': 'model5', 'function': model5},
]

In [7]:
# Evaluate every entry of `models` on the cross-validation data: 10 iterations
# with the "weeks" step unit. Results end up under each model's 'evaluation' key.
evaluate_models(models, df_cval, last_x=10, weeks=True)

*********** Start the evaluation of Model baseline ***********
Iteration 0 of 10


AttributeError: 'DataFrame' object has no attribute 'weekday'

In [None]:
models[0]['evaluation']

### Save evaluations in pkl file

In [None]:
# with open('./Model evaluations/mm_m5_bl_m4_x.pkl', 'wb') as f:
#     pickle.dump(models, f)

# with open('./Model evaluations/m4_pop_m5.pkl', 'rb') as f:
#     models2 = pickle.load(f)

### Create a table that only contains the different scores of the different models

In [None]:
# scores = [m['evaluation']['score'][m['evaluation']['horizon'] == '36 hour'] for m in models]
# horizons = ['36 hour', '40 hour', '44 hour', '60 hour', '64 hour', '68 hour']
# 
# 
# names = [m['name'] for m in models]
# score_df = pd.concat(scores, axis=1,keys=names)

In [None]:
# One column per (horizon, model) pair, named '<model name>: <horizon>' and
# holding that model's scores for that horizon.
horizons = ['36 hour', '40 hour', '44 hour', '60 hour', '64 hour', '68 hour']

names = [f'{m["name"]}: {h}' for h in horizons for m in models]
scores = [
    m['evaluation'].loc[m['evaluation']['horizon'] == h, 'score']
    for h in horizons
    for m in models
]

score_df = pd.concat(scores, axis=1, keys=names)

In [None]:
score_df

### Plot the densities of the obtained scores

In [None]:
# Plot every model that is present in `models`. The name that used to be
# hard-coded here ('m5_bl_m4_x') is not among the models defined in this
# notebook, so selecting its columns from `score_df` failed with a KeyError.
models_display = [m['name'] for m in models]
# models_display = ['baseline', 'model4_sunhours', 'model4_holidays_2', 'model4', 'model4_population']
for h in [f'{hours} hour' for hours in (36, 40, 44, 60, 64, 68)]:
    # One figure per horizon, with one density curve per displayed model.
    fig, ax = plt.subplots()
    sns.kdeplot(data=score_df.loc[:, [f'{m}: {h}' for m in models_display]], fill=True, ax=ax)

    # Adding labels and title
    ax.set_xlabel('Score')
    ax.set_ylabel('Density')
    ax.set_title(f'Density Plot of Scores: {h} horizon')

    # Save first, then display the plot
    fig.savefig(f'plots/density_plot_{h.replace(" ", "_")}.png')
    plt.show()

In [None]:
for h in [f'{hours} hour' for hours in (36, 40, 44, 60, 64, 68)]:
    # Restrict to the rows where the first displayed model has a score for
    # this horizon, then draw one line per displayed model.
    reference_col = f'{models_display[0]}: {h}'
    scored_rows = score_df[score_df[reference_col].notna()]
    for model_name in models_display:
        col = f'{model_name}: {h}'
        plt.plot(scored_rows.index, scored_rows[col], label=col)

    # Axis labels, title and a fixed y-range shared by all horizons
    plt.xlabel('time')
    plt.ylabel('score')
    plt.title(f'Comparison of scores over time: {h}')
    plt.ylim(0, 50)

    plt.legend()
    plt.savefig(f'plots/line_plot_{h.replace(" ", "_")}.png')

    plt.show()

#### Plot Whole evaluation for a model

In [None]:
[m['name'] for m in models]

In [None]:


import matplotlib.pyplot as plt

def plot_evaluation(evaluation_data):
    """Plot every numeric column of ``evaluation_data`` against its index.

    All lines share a single 12x8 figure, which is saved to
    'plots/evaluation_overview.png' and then shown.

    Parameters
    ----------
    evaluation_data : pandas.DataFrame
        Frame whose numeric columns are drawn; the index supplies the x-values.
    """
    numeric_names = evaluation_data.select_dtypes(include='number').columns

    plt.figure(figsize=(12, 8))
    x_values = evaluation_data.index
    for col_name in numeric_names:
        plt.plot(x_values, evaluation_data[col_name], label=col_name)

    # Labels, legend and grid
    plt.xlabel('Forecast Date')
    plt.ylabel('Values')
    plt.title('Line Plot for Numeric Columns')
    plt.legend()
    plt.grid(True)

    plt.savefig('plots/evaluation_overview.png')
    plt.show()

# Plot the numeric evaluation columns of the second entry of `models`,
# restricted to the rows of the '36 hour' horizon.
plot_evaluation(models[1]['evaluation'][models[1]['evaluation']['horizon'] == '36 hour'])

### Search for outliers (highest scores per horizon)

In [None]:
# Inspect every model that is present in `models`. The name that used to be
# hard-coded here ('model4_holidays_2') is not among the models defined in this
# notebook, so looking up its column in `score_df` failed with a KeyError.
# The former `horizons = [36]` assignment was unused and overwrote the
# `horizons` list of the score-table cell, so it has been dropped.
models_display = [m['name'] for m in models]
for h in [f'{hours} hour' for hours in (36, 40, 44, 60, 64, 68)]:
    for c in [f'{m}: {h}' for m in models_display]:
        # Ascending sort + tail(10): the ten highest scores, largest last.
        top_scores = score_df[c].sort_values().dropna().tail(10)

        print(f'{c}')
        print(top_scores)

In [None]:
[m['name'] for m in models]

In [None]:
# NOTE(review): the `models` list defined in this notebook has only two entries,
# so index 5 raises IndexError unless `models` was replaced in the meantime
# (e.g. loaded from a pickle file) -- confirm which entry is meant.
models[5]