In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
df = pd.read_csv('../data_preprocessing/Merged_data/002_verwaltung_with_holidays_indoor_person_counts.csv')
df['DateTime'] = pd.to_datetime(df['DateTime'], format='%Y-%m-%d %H:%M:%S')

In [3]:
df.set_index('DateTime',inplace=True)

In [4]:
df = df.astype({'is_holiday': 'int32', 'day_of_week': 'int32', 'hour_of_day': 'int32', 'is_working_hour': 'int32'})

In [5]:
# Define the date ranges for training and test data
train_end_date = df.index.min() + pd.DateOffset(months=9)
test_start_date = train_end_date

# Split the DataFrame
train_data = df[:train_end_date]
test_data = df[test_start_date:]

In [6]:
# Load and prepare the data
# Assuming `df` is your DataFrame and it has been preprocessed as shown in the provided context
input_cols = ['is_holiday', 'day_of_week', 'hour_of_day', 'is_working_hour', 'number_of_people',
              'Temperature', 'Humidity', 'Dewpoint', 'Sun Duration', 'Precipitation Height',
              'Wind Speed', 'Wind Direction', 'indoor_temperature', 'temperature_difference']
target_col = 'heating_15min_diff'

# Split the data into training and test sets
X_train = train_data[input_cols].astype(np.float16)
y_train = train_data[target_col].astype(np.float16)
X_test = test_data[input_cols].astype(np.float16)
y_test = test_data[target_col].astype(np.float16)


In [7]:
# Define the kernel: Constant * RBF
kernel = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-3, 1e3))

# Initialize and fit the GPR model
gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=1e-2)
gpr.fit(X_train, y_train)

MemoryError: Unable to allocate 10.2 GiB for an array with shape (26205, 26205, 2) and data type float64

In [None]:
# Make predictions
y_pred, sigma = gpr.predict(X_test, return_std=True)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'R² Score: {r2:.4f}')

In [None]:
# Plot the actual vs predicted values
plt.figure(figsize=(14, 7))
plt.plot(y_test, label='Actual', color='b')
plt.plot(y_pred, label='Predicted', color='r', linestyle='--')
plt.fill_between(np.arange(len(y_pred)), y_pred - 1.96 * sigma, y_pred + 1.96 * sigma, color='r', alpha=0.2)
plt.xlabel('Time Step')
plt.ylabel('Heating 15min Diff')
plt.title('Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()