In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import time
import json
import requests
from datetime import datetime

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", 100)
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)})
sns.set_theme()

In [3]:
#I. PIPELINE

## accessing the open energy data from Energinet Denmark, through their public API
def get_PowerSystem_data():

    url = 'https://api.energidataservice.dk/dataset/PowerSystemRightNow?start=now-P1Y&end=now&sort=Minutes1DK'
    response = requests.get(url)

    if response.status_code != 200:
        print("Request failed with status code:", response.status_code)

    selected_records = response.json()['records'][::5]
    df = pd.DataFrame(selected_records).fillna(0)

    total_generation = df.ProductionGe100MW + df.ProductionLt100MW + df.SolarPower + df.OffshoreWindPower + df.OnshoreWindPower

    net_imports = df.Exchange_DK1_DE + df.Exchange_DK1_NL + df.Exchange_DK1_GB + df.Exchange_DK1_NO + df.Exchange_DK1_SE + \
                    df.Exchange_DK1_DK2 + df.Exchange_DK2_DE + df.Exchange_DK2_SE + df.Exchange_Bornholm_SE

    imbalance = df.ImbalanceDK1 + df.ImbalanceDK2

    df['Demand'] = total_generation + net_imports - imbalance
    df['Renewables'] = df.SolarPower + df.OffshoreWindPower + df.OnshoreWindPower
    actionable = df['Demand'] - df['Renewables']

    return df

In [4]:
data = get_PowerSystem_data()
data.tail(5)

Unnamed: 0,Minutes1UTC,Minutes1DK,CO2Emission,ProductionGe100MW,ProductionLt100MW,SolarPower,OffshoreWindPower,OnshoreWindPower,Exchange_Sum,Exchange_DK1_DE,Exchange_DK1_NL,Exchange_DK1_GB,Exchange_DK1_NO,Exchange_DK1_SE,Exchange_DK1_DK2,Exchange_DK2_DE,Exchange_DK2_SE,Exchange_Bornholm_SE,aFRR_ActivatedDK1,aFRR_ActivatedDK2,mFRR_ActivatedDK1,mFRR_ActivatedDK2,ImbalanceDK1,ImbalanceDK2,Demand,Renewables
105006,2024-11-04T10:22:00,2024-11-04T11:22:00,121.77,1012.32,474.42,454.78,52.5,67.92,2997.93,-338.55,688.25,-334.0,1632.21,709.0,-407.74,-426.03,1056.02,11.03,26.14,-37.81,-195.0,0.0,-97.43,217.01,4532.55,575.2
105007,2024-11-04T10:27:00,2024-11-04T11:27:00,121.98,1004.86,483.35,468.34,47.7,64.02,2977.73,-368.38,688.25,-334.0,1631.47,709.0,-407.87,-425.95,1065.25,12.09,24.89,0.12,-195.0,0.0,-42.87,212.25,4468.75,580.06
105008,2024-11-04T10:32:00,2024-11-04T11:32:00,121.85,998.78,483.09,469.29,49.06,56.98,3011.32,-332.16,688.25,-334.0,1631.39,709.0,-407.87,-426.04,1062.76,12.12,10.92,6.42,-195.0,0.0,-98.08,201.41,4557.32,575.33
105009,2024-11-04T10:37:00,2024-11-04T11:37:00,121.32,1020.74,483.96,480.73,49.66,49.8,2943.24,-366.44,688.25,-334.0,1631.73,709.0,-407.87,-425.99,1028.3,12.39,22.01,9.58,-195.0,0.0,-47.65,229.67,4438.24,580.19
105010,2024-11-04T10:42:00,2024-11-04T11:42:00,121.71,993.32,479.63,467.01,46.3,50.64,3029.77,-303.01,688.25,-334.0,1632.34,709.0,-407.87,-425.92,1050.92,12.19,7.94,10.4,-195.0,0.0,-146.92,209.91,4595.81,563.95


In [None]:
#II. PREDICT CO2 EMISSION LEVELS BASED ON POWER PRODUCTION

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error


In [8]:
def infer_co2_levels(data):

    X = data.iloc[:, 3:-2]
    X1 = pd.concat([data['Renewables'],X.iloc[:, :2]], axis=1)
    X2 = X.iloc[:, 6:-6]
    X = pd.concat([X1,X2], axis=1).to_numpy()
    #print(X.head(5))

    y = data.CO2Emission.to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=9)

    rg = DecisionTreeRegressor()

    param_grid = {
    'max_depth': [None,7,8,9,10,12],
    'max_leaf_nodes': [None,40,45,50]
    }

    grid_search = GridSearchCV(
        estimator=rg,
        param_grid=param_grid,
        scoring='neg_mean_absolute_error',  
        cv=5,                               
        n_jobs=-1,                          
        verbose=1)

    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_score = -grid_search.best_score_
    print("Decision Tree results:")
    print("Best Parameters:", best_params)
    print("Best Mean Absolute Error (MAE) from Cross-Validation:", best_score)

    best_rg = grid_search.best_estimator_
    y_pred = best_rg.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_pred)
    print("Test Mean Absolute Error (MAE):", test_mae)
    print("*"*20)
    print("*"*20)

    return best_rg


In [None]:
from joblib import dump
dump(infer_co2_levels(data), 'decision_tree_regressor.joblib')
print("Model saved successfully.")
#

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Decision Tree results:
Best Parameters: {'max_depth': None, 'max_leaf_nodes': None}
Best Mean Absolute Error (MAE) from Cross-Validation: 3.839532635948222
Test Mean Absolute Error (MAE): 3.458259983582828
********************
********************
Model saved successfully.
