In [1]:
!pip install pmdarima
!pip install workalendar
!pip install prophet
!pip install -q --upgrade linear-tree

Defaulting to user installation because normal site-packages is not writeable
Collecting pmdarima
  Downloading pmdarima-2.0.4-cp312-cp312-win_amd64.whl.metadata (8.0 kB)
Collecting joblib>=0.11 (from pmdarima)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting Cython!=0.29.18,!=0.29.31,>=0.29 (from pmdarima)
  Downloading Cython-3.0.11-cp312-cp312-win_amd64.whl.metadata (3.2 kB)
Collecting scikit-learn>=0.22 (from pmdarima)
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.3.2 (from pmdarima)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting statsmodels>=0.13.2 (from pmdarima)
  Downloading statsmodels-0.14.4-cp312-cp312-win_amd64.whl.metadata (9.5 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn>=0.22->pmdarima)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Collecting patsy>=0.5.6 (from statsmodels>=0.13.2->pmdarima)
  Downloading patsy-0.5.6-py2.py3-none-any

ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\febin\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python312\\site-packages\\prophet\\stan_model\\cmdstan-2.33.1\\stan\\lib\\stan_math\\lib\\tbb_2020.3\\include\\tbb\\internal\\_deprecated_header_message_guard.h'



In [4]:
import datetime
import json
import os
from joblib import Parallel, delayed
from time import sleep, time
import logging

import itertools
import holidays
import keras
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pmdarima as pm
import requests
import seaborn as sns
import statsmodels
import statsmodels.tsa.api as sm
import tensorflow as tf
import xgboost as xgb
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation, Dropout
from lineartree import LinearBoostRegressor
from matplotlib import rcParams  # Used to set default paremeters
from prophet import Prophet
from prophet.diagnostics import cross_validation
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import month_plot, plot_acf, plot_pacf, quarter_plot
from workalendar.europe import UnitedKingdom

# Figure default parameters
Define the default font family and font sizes used by all the plots in this notebook.

In [5]:
# Apply the seaborn grid style first so the font settings below are layered
# on top of it.
sns.set_style("whitegrid")

# Base font sizes shared by the settings below.
small_size = 12
medium_size = 14
large_size = 16

# Set every default in one place. "axes.titlesize" appears exactly once here:
# it was previously set to small_size and then immediately overridden with
# large_size, so large_size is the value that actually took effect.
rcParams.update(
    {
        # Default fonts: sans-serif family, trying Arial first, then Tahoma.
        "font.family": "sans-serif",
        "font.sans-serif": ["Arial", "Tahoma"],
        # Font sizes for individual figure elements.
        "font.size": small_size,  # default text size
        "axes.titlesize": large_size,  # axes title
        "axes.labelsize": medium_size,  # x and y labels
        "xtick.labelsize": small_size,  # x tick labels
        "ytick.labelsize": small_size,  # y tick labels
        "legend.fontsize": small_size,  # legend text
    }
)

# **Load data**

In [25]:
# Path of the CSV file with the historic demand data
file_path = '/content/historic_demand_2009_2024.csv'

# Load and tidy the data in a single chain:
#   1. read the CSV, using its first column as the index;
#   2. keep only the columns whose name does not start with "Unnamed";
#   3. normalise "settlement_date" to "yyyy-mm-dd".
# NOTE: after step 3 the column holds *strings*, not datetimes -- it is parsed
# with pd.to_datetime and then formatted back to text with strftime.
# `.assign` builds a new frame, so no column is written into a `.loc` slice
# (which can raise SettingWithCopyWarning).
df = (
    pd.read_csv(file_path, index_col=0)
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .assign(
        settlement_date=lambda frame: pd.to_datetime(
            frame['settlement_date']
        ).dt.strftime('%Y-%m-%d')
    )
)

# EDA and Feature engineering
In this section, I will go through several steps to understand and transform the original dataset. The steps are:

- **Data understanding.** Take a first look at the time series dataset.
- **Data preparation.** Deal with null values, drop uninformative features and remove outliers.
- **Feature understanding.** Understand the trend and the different seasonalities in the data.

**Data understanding**

The dataset includes a description of all the columns, but for this project I will only use three columns:

- SETTLEMENT_DATE: date in format dd/mm/yyyy.
- SETTLEMENT_PERIOD: the half-hourly period in which the historic outturn occurred.
- TSD (Transmission System Demand): equal to the ND plus the additional generation required to meet station load, pump storage pumping and interconnector exports. Measured in MW.

TSD is the target variable, and the aim is to predict future demand using different models.

In [26]:
# Show 7 random rows for a first look at the data; the fixed seed makes the
# displayed rows the same on every run of the notebook.
df.sample(n=7, random_state=42)

Unnamed: 0,settlement_date,settlement_period,nd,tsd,england_wales_demand,embedded_wind_generation,embedded_wind_capacity,embedded_solar_generation,embedded_solar_capacity,non_bm_stor,...,ifa2_flow,britned_flow,moyle_flow,east_west_flow,nemo_flow,nsl_flow,eleclink_flow,scottish_transfer,viking_flow,is_holiday
91502,2014-03-22,5,26906,28867,24421,1006,2524,0,4520,0,...,0,926,2,-144,0,,,,,0
37610,2011-02-23,27,46380,48380,42104,1407,1734,7,109,0,...,0,0,-401,0,0,,,,,0
134044,2016-08-24,21,32301,33232,29098,416,4353,4870,11100,0,...,0,1004,0,-423,0,,,,,0
252149,2023-05-21,8,17389,18803,15650,602,6538,0,15493,0,...,992,502,-452,-457,999,1296.0,999.0,-556.0,0.0,0
161570,2018-03-21,3,31162,33181,28541,809,5978,0,13052,0,...,0,999,297,0,0,,,,,0
210376,2020-12-31,41,36410,37018,33516,1019,6527,0,13080,0,...,-1,0,400,504,1015,0.0,0.0,,,0
88425,2014-01-17,8,28241,31023,25388,805,2524,0,3403,0,...,0,1000,-180,-478,0,,,,,0
