In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import warnings
import math
from math import sqrt
import time
warnings.filterwarnings("ignore")


## 1. Load consumption data | Load consumption in MW

In [2]:
# Load xlsx dataframes
df1 = pd.read_excel('load_2006_2015.xlsx')


In [3]:
load_2006_NL = df1[df1['Country'] == 'NL']


In [4]:
# desplay
load_2006_NL

Unnamed: 0,Country,Year,Month,Day,Coverage ratio,0,1,2,3,4,...,14,15,16,17,18,19,20,21,22,23
83568,NL,2006,1,1,100.0,10215.0,9979.0,9460.0,8833.0,8525.0,...,11305.0,11202.0,11553.0,12691.0,13134.0,13094.0,12719.0,12130.0,11464.0,10482.0
83569,NL,2006,1,2,100.0,9590.0,9017.0,8758.0,8580.0,8621.0,...,15464.0,15202.0,15292.0,16301.0,15926.0,15583.0,14725.0,13936.0,12905.0,12165.0
83570,NL,2006,1,3,100.0,11064.0,10145.0,9717.0,9573.0,9634.0,...,15600.0,15435.0,15708.0,16592.0,15995.0,15563.0,14732.0,13958.0,12922.0,12184.0
83571,NL,2006,1,4,100.0,11088.0,10221.0,9859.0,9700.0,9730.0,...,15896.0,15799.0,15973.0,16581.0,15907.0,15465.0,14568.0,13837.0,12773.0,12076.0
83572,NL,2006,1,5,100.0,10925.0,10029.0,9612.0,9386.0,9501.0,...,16042.0,16003.0,16233.0,16514.0,15921.0,15569.0,14851.0,13926.0,12833.0,12113.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87215,NL,2015,12,27,100.0,10730.0,10168.0,9861.0,9705.0,9668.0,...,13267.0,13263.0,13813.0,14660.0,14457.0,14031.0,13477.0,12873.0,12324.0,11567.0
87216,NL,2015,12,28,100.0,10889.0,10405.0,10122.0,10084.0,10159.0,...,14589.0,14647.0,15316.0,16485.0,15884.0,15335.0,14600.0,13816.0,12970.0,12135.0
87217,NL,2015,12,29,100.0,11382.0,10746.0,10430.0,10297.0,10346.0,...,14967.0,14937.0,15461.0,16379.0,15811.0,15274.0,14617.0,13845.0,12982.0,12225.0
87218,NL,2015,12,30,100.0,11424.0,10827.0,10526.0,10358.0,10376.0,...,14510.0,14602.0,15152.0,16237.0,15798.0,15292.0,14616.0,13851.0,12958.0,12205.0


In [5]:
df2 = pd.read_excel('load_2015_2019.xlsx')

In [6]:
df2.head()

Unnamed: 0,MeasureItem,DateUTC,DateShort,TimeFrom,TimeTo,CountryCode,Cov_ratio,Value,Value_ScaleTo100
0,Monthly Hourly Load Values,2014-12-31 23:00:00,2014-12-31,23:00:00,00:00:00,DE,98,46419.79,47367.132653
1,Monthly Hourly Load Values,2015-01-01 00:00:00,2015-01-01,00:00:00,01:00:00,DE,98,44898.3,45814.591837
2,Monthly Hourly Load Values,2015-01-01 01:00:00,2015-01-01,01:00:00,02:00:00,DE,98,43305.31,44189.091837
3,Monthly Hourly Load Values,2015-01-01 02:00:00,2015-01-01,02:00:00,03:00:00,DE,98,41918.17,42773.642857
4,Monthly Hourly Load Values,2015-01-01 03:00:00,2015-01-01,03:00:00,04:00:00,DE,98,41330.17,42173.642857


In [7]:
load_2015_NL = df2[df2['CountryCode'] == 'NL']


In [8]:
load_2015_NL.head()

Unnamed: 0,MeasureItem,DateUTC,DateShort,TimeFrom,TimeTo,CountryCode,Cov_ratio,Value,Value_ScaleTo100
8790,Monthly Hourly Load Values,2015-12-31 23:00:00,2015-12-31,23:00:00,00:00:00,NL,100,11373.0,11373.0
8824,Monthly Hourly Load Values,2016-01-01 00:00:00,2016-01-01,00:00:00,01:00:00,NL,100,11102.0,11102.0
8859,Monthly Hourly Load Values,2016-01-01 01:00:00,2016-01-01,01:00:00,02:00:00,NL,100,10657.0,10657.0
8894,Monthly Hourly Load Values,2016-01-01 02:00:00,2016-01-01,02:00:00,03:00:00,NL,100,10238.0,10238.0
8929,Monthly Hourly Load Values,2016-01-01 03:00:00,2016-01-01,03:00:00,04:00:00,NL,100,10018.0,10018.0


In [9]:
def preprocess_load_consumption(load_2006_path: str, load_2015_path: str) -> pd.DataFrame:
    # Load and preprocess data for 2006-2015
    load_2006 = pd.read_excel(load_2006_path, index_col=0)
    load_2006_NL = load_2006[load_2006['Country'] == 'NL']
    
    # Reshape and clean up columns
    load_2006_melted = pd.melt(load_2006_NL, id_vars=["Year", "Month", "Day"], 
                               var_name="Hour", value_name="loadConsumption")
    load_2006_melted['date'] = pd.to_datetime(load_2006_melted[['Year', 'Month', 'Day']])
    load_2006_melted['datetime'] = pd.to_datetime(load_2006_melted[['Year', 'Month', 'Day']]) + \
                                   pd.to_timedelta(load_2006_melted['Hour'].astype(int) - 1, unit='h')
    load_2006_melted.drop(columns=['Country', 'Coverage ratio', 'Month', 'Day'], inplace=True)
    
    # Load and preprocess data for 2015-2019
    load_2015_2017 = pd.read_excel(load_2015_path, sheet_name='2015-2017')
    load_2018_2019 = pd.read_excel(load_2015_path, sheet_name='2018-2019')
    load_2015_2019 = pd.concat([load_2015_2017, load_2018_2019])

    # Filter for NL data and handle date/time columns
    load_2015_2019_NL = load_2015_2019[load_2015_2019['CountryCode'] == 'NL']
    load_2015_2019_NL['date'] = pd.to_datetime(load_2015_2019_NL['DateShort'], dayfirst=True)
    load_2015_2019_NL = load_2015_2019_NL[load_2015_2019_NL['date'] >= '2016-01-01']
    
    # Format datetime columns and clean up columns
    load_2015_2019_NL['datetime'] = pd.to_datetime(load_2015_2019_NL['DateUTC'], dayfirst=True, errors='coerce')
    load_2015_2019_NL['Hour'] = load_2015_2019_NL['datetime'].dt.hour + 1
    load_2015_2019_NL.rename(columns={'Value': 'loadConsumption'}, inplace=True)
    load_2015_2019_NL = load_2015_2019_NL[['datetime', 'date', 'Year', 'Hour', 'loadConsumption']]

    # Combine 2006-2015 and 2015-2019 data
    load_total = pd.concat([load_2006_melted, load_2015_2019_NL], ignore_index=True)

    return load_total

# Paths for the data files
load_2006_path = 'load_2006_2015.xlsx'
load_2015_path = 'load_2015_2019.xlsx'


In [10]:
load_2015_2017 = pd.read_excel(load_2015_path, sheet_name='2015-2017')
load_2018_2019 = pd.read_excel(load_2015_path, sheet_name='2018-2019')
load_2018_2019

Unnamed: 0,MeasureItem,DateUTC,DateShort,TimeFrom,TimeTo,CountryCode,Cov_ratio,Value,Value_ScaleTo100
0,Monthly Hourly Load Values,2018-01-01 00:00:00,2018-01-01,00:00:00,01:00:00,AL,100,746.45,746.45
1,Monthly Hourly Load Values,2018-01-01 00:00:00,2018-01-01,00:00:00,01:00:00,AT,100,6414.00,6414.00
2,Monthly Hourly Load Values,2018-01-01 00:00:00,2018-01-01,00:00:00,01:00:00,BA,100,1249.00,1249.00
3,Monthly Hourly Load Values,2018-01-01 00:00:00,2018-01-01,00:00:00,01:00:00,BE,100,8379.00,8379.00
4,Monthly Hourly Load Values,2018-01-01 00:00:00,2018-01-01,00:00:00,01:00:00,BG,100,3424.49,3424.49
...,...,...,...,...,...,...,...,...,...
476955,Monthly Hourly Load Values,2019-10-31 22:00:00,2019-10-31,22:00:00,23:00:00,HR,100,1767.00,1767.00
476956,Monthly Hourly Load Values,2019-10-31 22:00:00,2019-10-31,22:00:00,23:00:00,HU,100,4804.89,4804.89
476957,Monthly Hourly Load Values,2019-10-31 22:00:00,2019-10-31,22:00:00,23:00:00,MK,100,872.00,872.00
476958,Monthly Hourly Load Values,2019-10-31 22:00:00,2019-10-31,22:00:00,23:00:00,PT,100,5921.00,5921.00


In [11]:
# Concatenate the two datasets for 2015-2019
load = pd.concat([load_2015_2017, load_2018_2019])

# Filter for rows where CountryCode is 'NL' and create a copy to avoid warnings
load_NL = load.loc[load['CountryCode'] == 'NL'].copy()

# Convert DateShort to datetime format
load_NL['DateShort'] = pd.to_datetime(load_NL['DateShort'], dayfirst=True)

# Filter rows based on the date condition
load_NL = load_NL.loc[load_NL['DateShort'] >= '2016-01-01']

# Reset index after filtering
load_NL.reset_index(drop=True, inplace=True)


In [12]:
load_NL

Unnamed: 0,MeasureItem,DateUTC,DateShort,TimeFrom,TimeTo,CountryCode,Cov_ratio,Value,Value_ScaleTo100
0,Monthly Hourly Load Values,2016-01-01 00:00:00,2016-01-01,00:00:00,01:00:00,NL,100,11102.0,11102.0
1,Monthly Hourly Load Values,2016-01-01 01:00:00,2016-01-01,01:00:00,02:00:00,NL,100,10657.0,10657.0
2,Monthly Hourly Load Values,2016-01-01 02:00:00,2016-01-01,02:00:00,03:00:00,NL,100,10238.0,10238.0
3,Monthly Hourly Load Values,2016-01-01 03:00:00,2016-01-01,03:00:00,04:00:00,NL,100,10018.0,10018.0
4,Monthly Hourly Load Values,2016-01-01 04:00:00,2016-01-01,04:00:00,05:00:00,NL,100,10017.0,10017.0
...,...,...,...,...,...,...,...,...,...
29177,Monthly Hourly Load Values,2019-04-30 17:00:00,2019-04-30,17:00:00,18:00:00,NL,100,13910.0,13910.0
29178,Monthly Hourly Load Values,2019-04-30 18:00:00,2019-04-30,18:00:00,19:00:00,NL,100,13773.0,13773.0
29179,Monthly Hourly Load Values,2019-04-30 19:00:00,2019-04-30,19:00:00,20:00:00,NL,100,13652.0,13652.0
29180,Monthly Hourly Load Values,2019-04-30 20:00:00,2019-04-30,20:00:00,21:00:00,NL,100,13048.0,13048.0


In [13]:
# Filter relevant columns
load_NL = load_NL[['DateUTC', 'DateShort', 'Value']].copy()

# Convert date columns to datetime format
load_NL['date'] = pd.to_datetime(load_NL['DateShort'], dayfirst=True, format='%d/%m/%Y')
load_NL['datetime'] = pd.to_datetime(load_NL['DateUTC'], format='%d/%m/%Y %H:%M')

# Extract hour, year, and week directly from the datetime columns
load_NL['Hour'] = load_NL['datetime'].dt.hour + 1  # 1–24 range
load_NL['Year'] = load_NL['date'].dt.year

# Rename 'Value' to 'loadConsumption' 
load_NL.rename(columns={'Value': 'loadConsumption'}, inplace=True)

# Drop intermediate columns that are no longer needed
load_NL.drop(columns=['DateShort', 'DateUTC'], inplace=True)


In [14]:
# Convert column names to strings to use .isdigit() safely
load_2006_NL.columns = load_2006_NL.columns.astype(str)

# Identify columns corresponding to hours by checking if column names are numeric
hour_columns = [col for col in load_2006_NL.columns if col.isdigit()]

# Reshape data to long format, focusing only on necessary columns
df_melted = pd.melt(
    load_2006_NL,
    id_vars=["Year", "Month", "Day"],  # Only necessary columns
    value_vars=hour_columns,  # Limit to hour columns only
    var_name="Hour",
    value_name="loadConsumption"
)

# Convert hour column to integer for datetime operations
df_melted['Hour'] = df_melted['Hour'].astype(int)

# Convert to datetime formats for date and datetime columns
df_melted['date'] = pd.to_datetime(df_melted[['Year', 'Month', 'Day']])
df_melted['datetime'] = pd.to_datetime(df_melted['date']) + pd.to_timedelta(df_melted['Hour'] - 1, unit='h')

# Drop unnecessary columns for cleaner output
df_melted.drop(columns=['Month', 'Day'], inplace=True)


In [15]:
df_melted

Unnamed: 0,Year,Hour,loadConsumption,date,datetime
0,2006,0,10215.0,2006-01-01,2005-12-31 23:00:00
1,2006,0,9590.0,2006-01-02,2006-01-01 23:00:00
2,2006,0,11064.0,2006-01-03,2006-01-02 23:00:00
3,2006,0,11088.0,2006-01-04,2006-01-03 23:00:00
4,2006,0,10925.0,2006-01-05,2006-01-04 23:00:00
...,...,...,...,...,...
87643,2015,23,11567.0,2015-12-27,2015-12-27 22:00:00
87644,2015,23,12135.0,2015-12-28,2015-12-28 22:00:00
87645,2015,23,12225.0,2015-12-29,2015-12-29 22:00:00
87646,2015,23,12205.0,2015-12-30,2015-12-30 22:00:00


In [16]:
load_2006_2019 = pd.concat([df_melted, load_NL])
load_2006_2019.reset_index(inplace=True, drop=True)

In [17]:
#load_2006_2019.to_excel('load_2006_2019.xlsx', index=False, encoding='utf-8')


In [18]:
files = ['load_2019.xlsx','load_2020.xlsx', 'load_2021.xlsx', 'load_2022.xlsx', 'load_2023.xlsx']

filtered_data = []

for file in files:
    df = pd.read_excel(file)  
    df_nl = df[df['CountryCode'] == 'NL']  # Filter rows for the Netherlands
    filtered_data.append(df_nl)  

# Combine all filtered data into a single DataFrame
combined_2019_2023 = pd.concat(filtered_data, ignore_index=True)

print(combined_2019_2023.head())


                  MeasureItem             DateUTC  DateShort  \
0  Monthly Hourly Load Values 2019-01-01 00:00:00 2019-01-01   
1  Monthly Hourly Load Values 2019-01-01 01:00:00 2019-01-01   
2  Monthly Hourly Load Values 2019-01-01 02:00:00 2019-01-01   
3  Monthly Hourly Load Values 2019-01-01 03:00:00 2019-01-01   
4  Monthly Hourly Load Values 2019-01-01 04:00:00 2019-01-01   

              TimeFrom               TimeTo CountryCode  Cov_ratio  \
0  1970-01-01 00:00:00  1970-01-01 01:00:00          NL        100   
1  1970-01-01 01:00:00  1970-01-01 02:00:00          NL        100   
2  1970-01-01 02:00:00  1970-01-01 03:00:00          NL        100   
3  1970-01-01 03:00:00  1970-01-01 04:00:00          NL        100   
4  1970-01-01 04:00:00  1970-01-01 05:00:00          NL        100   

        Value  Value_ScaleTo100              CreateDate  \
0  11193.6350        11193.6350 2024-05-29 11:37:27.260   
1  10933.3600        10933.3600 2024-05-29 11:37:27.260   
2  10666.5800    

In [20]:
combined_2019_2023.head()

Unnamed: 0,MeasureItem,DateUTC,DateShort,TimeFrom,TimeTo,CountryCode,Cov_ratio,Value,Value_ScaleTo100,CreateDate,UpdateDate
0,Monthly Hourly Load Values,2019-01-01 00:00:00,2019-01-01,1970-01-01 00:00:00,1970-01-01 01:00:00,NL,100,11193.635,11193.635,2024-05-29 11:37:27.260,2024-05-29 11:37:27.260
1,Monthly Hourly Load Values,2019-01-01 01:00:00,2019-01-01,1970-01-01 01:00:00,1970-01-01 02:00:00,NL,100,10933.36,10933.36,2024-05-29 11:37:27.260,2024-05-29 11:37:27.260
2,Monthly Hourly Load Values,2019-01-01 02:00:00,2019-01-01,1970-01-01 02:00:00,1970-01-01 03:00:00,NL,100,10666.58,10666.58,2024-05-29 11:37:27.260,2024-05-29 11:37:27.260
3,Monthly Hourly Load Values,2019-01-01 03:00:00,2019-01-01,1970-01-01 03:00:00,1970-01-01 04:00:00,NL,100,10413.3575,10413.3575,2024-05-29 11:37:27.260,2024-05-29 11:37:27.260
4,Monthly Hourly Load Values,2019-01-01 04:00:00,2019-01-01,1970-01-01 04:00:00,1970-01-01 05:00:00,NL,100,10440.7925,10440.7925,2024-05-29 11:37:27.260,2024-05-29 11:37:27.260


In [21]:
missing_percentage = combined_2019_2023.isnull().mean() * 100

# Display the missing percentage for each column
print(missing_percentage)


MeasureItem          0.000000
DateUTC              0.000000
DateShort            0.000000
TimeFrom             0.000000
TimeTo               0.000000
CountryCode          0.000000
Cov_ratio            0.000000
Value                0.000000
Value_ScaleTo100     0.000000
CreateDate          19.989047
UpdateDate          19.989047
dtype: float64


In [22]:
# Drop columns with missing values
columns_to_drop = ['CreateDate', 'UpdateDate']  

combined_2019_2023 = combined_2019_2023.drop(columns=columns_to_drop)

In [23]:
combined_2019_2023

Unnamed: 0,MeasureItem,DateUTC,DateShort,TimeFrom,TimeTo,CountryCode,Cov_ratio,Value,Value_ScaleTo100
0,Monthly Hourly Load Values,2019-01-01 00:00:00,2019-01-01,1970-01-01 00:00:00,1970-01-01 01:00:00,NL,100,11193.6350,11193.6350
1,Monthly Hourly Load Values,2019-01-01 01:00:00,2019-01-01,1970-01-01 01:00:00,1970-01-01 02:00:00,NL,100,10933.3600,10933.3600
2,Monthly Hourly Load Values,2019-01-01 02:00:00,2019-01-01,1970-01-01 02:00:00,1970-01-01 03:00:00,NL,100,10666.5800,10666.5800
3,Monthly Hourly Load Values,2019-01-01 03:00:00,2019-01-01,1970-01-01 03:00:00,1970-01-01 04:00:00,NL,100,10413.3575,10413.3575
4,Monthly Hourly Load Values,2019-01-01 04:00:00,2019-01-01,1970-01-01 04:00:00,1970-01-01 05:00:00,NL,100,10440.7925,10440.7925
...,...,...,...,...,...,...,...,...,...
43819,Monthly Hourly Load Values,2023-12-31 19:00:00,2023-12-31,1970-01-01 19:00:00,1970-01-01 20:00:00,NL,100,12673.4650,12673.4650
43820,Monthly Hourly Load Values,2023-12-31 20:00:00,2023-12-31,1970-01-01 20:00:00,1970-01-01 21:00:00,NL,100,12366.1950,12366.1950
43821,Monthly Hourly Load Values,2023-12-31 21:00:00,2023-12-31,1970-01-01 21:00:00,1970-01-01 22:00:00,NL,100,12143.1775,12143.1775
43822,Monthly Hourly Load Values,2023-12-31 22:00:00,2023-12-31,1970-01-01 22:00:00,1970-01-01 23:00:00,NL,100,11993.1200,11993.1200


In [24]:
combined_2019_2023 = combined_2019_2023[['DateUTC', 'DateShort', 'Value']].copy()

# Convert date columns to datetime format
combined_2019_2023['date'] = pd.to_datetime(combined_2019_2023['DateShort'], dayfirst=True, format='%Y-%m-%d')  
combined_2019_2023['datetime'] = pd.to_datetime(combined_2019_2023['DateUTC'], format='%Y-%m-%d %H:%M')

# Extract hour, year, and other necessary components
combined_2019_2023['Hour'] = combined_2019_2023['datetime'].dt.hour + 1  # Adjust for 1–24 hour range
combined_2019_2023['Year'] = combined_2019_2023['date'].dt.year

#Rename 'Value' column to match the old dataset naming convention
combined_2019_2023.rename(columns={'Value': 'loadConsumption'}, inplace=True)

combined_2019_2023.drop(columns=['DateShort', 'DateUTC'], inplace=True)

#Check the transformed dataset
print(combined_2019_2023.head())
print(combined_2019_2023.info())


   loadConsumption       date            datetime  Hour  Year
0       11193.6350 2019-01-01 2019-01-01 00:00:00     1  2019
1       10933.3600 2019-01-01 2019-01-01 01:00:00     2  2019
2       10666.5800 2019-01-01 2019-01-01 02:00:00     3  2019
3       10413.3575 2019-01-01 2019-01-01 03:00:00     4  2019
4       10440.7925 2019-01-01 2019-01-01 04:00:00     5  2019
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43824 entries, 0 to 43823
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   loadConsumption  43824 non-null  float64       
 1   date             43824 non-null  datetime64[ns]
 2   datetime         43824 non-null  datetime64[ns]
 3   Hour             43824 non-null  int64         
 4   Year             43824 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(2)
memory usage: 1.7 MB
None


In [25]:
#combined_data.to_csv('load_data2019-2023.csv', index=False)

In [26]:
total_data_2006_2023= pd.concat([combined_2019_2023, load_2006_2019], axis=0).sort_index()


In [27]:
total_data_2006_2023

Unnamed: 0,loadConsumption,date,datetime,Hour,Year
0,11193.635,2019-01-01,2019-01-01 00:00:00,1,2019
0,10215.000,2006-01-01,2005-12-31 23:00:00,0,2006
1,9590.000,2006-01-02,2006-01-01 23:00:00,0,2006
1,10933.360,2019-01-01,2019-01-01 01:00:00,2,2019
2,10666.580,2019-01-01,2019-01-01 02:00:00,3,2019
...,...,...,...,...,...
116825,13910.000,2019-04-30,2019-04-30 17:00:00,18,2019
116826,13773.000,2019-04-30,2019-04-30 18:00:00,19,2019
116827,13652.000,2019-04-30,2019-04-30 19:00:00,20,2019
116828,13048.000,2019-04-30,2019-04-30 20:00:00,21,2019


In [29]:
print(total_data_2006_2023['datetime'].dtypes)


datetime64[ns]


In [30]:
total_data_2006_2023['datetime'] = pd.to_datetime(total_data_2006_2023['datetime'], errors='coerce')


In [31]:
print(total_data_2006_2023['datetime'].isnull().sum())


0


In [33]:
total_data_2006_2023 = total_data_2006_2023.sort_values(by='datetime').reset_index(drop=True)


In [34]:
print(total_data_2006_2023.head())
print(total_data_2006_2023.tail())


   loadConsumption       date            datetime  Hour  Year
0          10215.0 2006-01-01 2005-12-31 23:00:00     0  2006
1           9979.0 2006-01-01 2006-01-01 00:00:00     1  2006
2           9460.0 2006-01-01 2006-01-01 01:00:00     2  2006
3           8833.0 2006-01-01 2006-01-01 02:00:00     3  2006
4           8525.0 2006-01-01 2006-01-01 03:00:00     4  2006
        loadConsumption       date            datetime  Hour  Year
160649       12673.4650 2023-12-31 2023-12-31 19:00:00    20  2023
160650       12366.1950 2023-12-31 2023-12-31 20:00:00    21  2023
160651       12143.1775 2023-12-31 2023-12-31 21:00:00    22  2023
160652       11993.1200 2023-12-31 2023-12-31 22:00:00    23  2023
160653       11922.8925 2023-12-31 2023-12-31 23:00:00    24  2023


In [35]:
complete_dates = pd.date_range(start=total_data_2006_2023.index.min(),
                               end=total_data_2006_2023.index.max(),
                               freq='H')  # Adjust frequency if needed


In [36]:
missing_dates = complete_dates.difference(total_data_2006_2023.index)
print(f"Number of missing dates: {len(missing_dates)}")
print("Missing dates:")
print(missing_dates)


Number of missing dates: 1
Missing dates:
DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq='H')


In [37]:
total_data_2006_2023 = total_data_2006_2023[total_data_2006_2023.index != '1970-01-01']


In [38]:
print('1970-01-01' in total_data_2006_2023.index)


False


In [39]:
missing_data = total_data_2006_2023.isnull().sum()
print(missing_data)


loadConsumption    10
date                0
datetime            0
Hour                0
Year                0
dtype: int64


In [40]:
def fill_with_average(series):

    return series.fillna((series.shift() + series.shift(-1)) / 2)

# Apply to a specific to loadConsumption
total_data_2006_2023['loadConsumption'] = fill_with_average(total_data_2006_2023['loadConsumption'])


In [41]:
missing_data = total_data_2006_2023.isnull().sum()
print(missing_data)

loadConsumption    0
date               0
datetime           0
Hour               0
Year               0
dtype: int64


In [42]:
data_2019 = total_data_2006_2023[total_data_2006_2023['Year'] == 2019]


In [43]:
duplicates_date = data_2019[data_2019.duplicated(subset=['datetime'], keep=False)]
print("Duplicates based on 'datetime':")
print(duplicates_date)


Duplicates based on 'datetime':
        loadConsumption       date            datetime  Hour  Year
113952       11466.0000 2019-01-01 2019-01-01 00:00:00     1  2019
113953       11193.6350 2019-01-01 2019-01-01 00:00:00     1  2019
113954       10933.3600 2019-01-01 2019-01-01 01:00:00     2  2019
113955       11207.0000 2019-01-01 2019-01-01 01:00:00     2  2019
113956       10940.0000 2019-01-01 2019-01-01 02:00:00     3  2019
...                 ...        ...                 ...   ...   ...
119703       13661.3600 2019-04-30 2019-04-30 19:00:00    20  2019
119704       13059.5350 2019-04-30 2019-04-30 20:00:00    21  2019
119705       13048.0000 2019-04-30 2019-04-30 20:00:00    21  2019
119706       12420.8925 2019-04-30 2019-04-30 21:00:00    22  2019
119707       12408.0000 2019-04-30 2019-04-30 21:00:00    22  2019

[5755 rows x 5 columns]


In [44]:
data_2019_cleaned = data_2019.drop_duplicates(subset=['datetime'], keep='first')


In [45]:
# Remove the old 2019 data
total_data_2006_2023 = total_data_2006_2023[total_data_2006_2023['Year'] != 2019]

# Append the cleaned 2019 data
total_data_2006_2023 = pd.concat([total_data_2006_2023, data_2019_cleaned])

# Sort and reset index
total_data_2006_2023 = total_data_2006_2023.sort_values(by='datetime').reset_index(drop=True)


In [46]:
duplicates_date_total = total_data_2006_2023[total_data_2006_2023.duplicated(subset=['datetime'], keep=False)]
print("Duplicates based on 'datetime':")
print(duplicates_date_total)

Duplicates based on 'datetime':
        loadConsumption       date            datetime  Hour  Year
124826        9700.3175 2020-03-29 2020-03-29 03:00:00     4  2020
124827        9800.4350 2020-03-29 2020-03-29 03:00:00     4  2020
133562        9888.5300 2021-03-28 2021-03-28 03:00:00     4  2021
133563        9812.5000 2021-03-28 2021-03-28 03:00:00     4  2021
142298        9292.0550 2022-03-27 2022-03-27 03:00:00     4  2022
142299        9285.1450 2022-03-27 2022-03-27 03:00:00     4  2022
151034        9313.6250 2023-03-26 2023-03-26 03:00:00     4  2023
151035        9338.1725 2023-03-26 2023-03-26 03:00:00     4  2023


In [47]:
total_data_2006_2023 = total_data_2006_2023.drop_duplicates(subset=['datetime'], keep='first')


In [48]:
total_data_2006_2023.to_csv('data_2006_2023.csv', index=False)
