In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Market Analysis Team - Analysis and Prediction - OMV

### Author: Samuel Nicolas Bernát (2024)

This notebook analyzes the electricity production of gas-powered power plants in Austria for OMV Gas Marketing & Trading GmbH (OMV) and its relationship to the general consumption of gas in Austria. It also showcases a Linear Regression model and a prediction made with it; the trained model is saved and reloaded via joblib.

### Table of Contents
1. [Data Loading](#Data-Loading)
2. [Data Preparation](#Data-Preparation)
3. [Linear Regression Prediction and ML](#Linear-Regression-Prediction-and-ML)
4. [Prediction of Gas Consumption in Austria](#Prediction-of-Gas-Consumption-in-Austria)
5. [Key Findings](#Key-Findings)

## Data Loading
The data is loaded from CSV files. The data sources are the ENTSO-E Transparency Platform and AGGM web portals.

In [None]:
import pandas as pd
import plotly.express as px

In [None]:
# Raw inputs, read as-is from the local CSV exports (no parsing or filtering yet).
entsoe_df = pd.read_csv('data-omv/electricity_production.csv')   # electricity production export
aggm_df = pd.read_csv('data-omv/gas_consumption.csv')            # gas consumption export

## Data Preparation

In [None]:
# Normalize dtypes: identifier columns as strings, timestamps as datetimes.
aggm_df['name'] = aggm_df['name'].astype(str)
entsoe_df['station_name'] = entsoe_df['station_name'].astype(str)

aggm_df['datetime'] = pd.to_datetime(aggm_df['datetime'])
entsoe_df['datetime'] = pd.to_datetime(entsoe_df['datetime'])

# Keep only the AGGM rows whose name contains 'ErmittelterEKVOesterreich' (the consumption data).
# The explicit .copy() yields an independent frame, so the sign flip below never writes
# into a slice of aggm_df (this is what used to trigger pandas' SettingWithCopyWarning).
is_consumption = aggm_df['name'].str.contains('ErmittelterEKVOesterreich', na=False)
aggm_consumption_df = aggm_df[is_consumption].copy()

# Flip the sign of the values.
# NOTE(review): presumably the source reports consumption as negative numbers - confirm.
aggm_consumption_df['value_mwhd'] = aggm_consumption_df['value_mwhd'] * -1

print(aggm_consumption_df)
ac_df = aggm_consumption_df     # alias consumed by the regression cells further down

In [None]:
# Sum the production of all power plants into 24-hour bins that start at 16:00.
# The chain works on a derived frame instead of calling set_index(..., inplace=True) on
# entsoe_df, so entsoe_df keeps its 'datetime' column and this cell can be re-run safely
# (the in-place version raised a KeyError on the second execution).
entsoe_production_df = (
    entsoe_df
    .assign(datetime=pd.to_datetime(entsoe_df['datetime']))   # no-op if already datetime
    .set_index('datetime')['value_mwhd']
    .resample('24h', offset='16h')
    .sum()
    .reset_index()
)

ec_df = entsoe_production_df    # alias consumed by the regression cells further down

Visualization of the relationship between the electricity production of gas-powered plants and the general gas consumption in Austria:

In [None]:
# Tag each series with its origin; .assign returns a new frame, so the frames
# produced by the earlier cells are left untouched.
aggm_consumption_df = aggm_consumption_df.assign(source='AGGM')
entsoe_production_df = entsoe_production_df.assign(source='ENTSOE')

# Stack both series into one long frame (ENTSOE rows first, then AGGM).
plot_columns = ['datetime', 'value_mwhd', 'source']
combined_df = pd.concat(
    [entsoe_production_df[plot_columns], aggm_consumption_df[plot_columns]],
    ignore_index=True,
)

# One line per source on a shared time axis.
fig = px.line(
    combined_df,
    x='datetime',
    y='value_mwhd',
    color='source',
    title='Electricity Production vs. Gas Consumption Comparison',
)
fig.update_layout(xaxis_title='Date', yaxis_title='Consumption MWh/d', legend_title='Source')

fig.show()


## Linear Regression Prediction and ML
Preparing data:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import joblib

# Truncate both time axes to the calendar day so the two series can be joined per day.
# .dt.normalize() keeps the column as a datetime dtype (time set to midnight) regardless of
# the pandas version, and .assign builds new frames: ec_df / ac_df are not modified, so this
# cell can be re-run (the previous in-place set_index made a second run fail).
entsoe_daily = ec_df.assign(datetime=ec_df['datetime'].dt.normalize())
aggm_daily = ac_df.assign(datetime=ac_df['datetime'].dt.normalize())

# Inner join on the day; overlapping column names get the _entsoe / _aggm suffixes.
# The 'name' column (coming from the AGGM frame) is not needed for the regression,
# and rows with missing values are dropped.
data = (
    pd.merge(entsoe_daily, aggm_daily, on='datetime', suffixes=('_entsoe', '_aggm'))
    .drop(columns=['name'])
    .dropna()
)

print(data)

In [None]:
# Feature: ENTSOE electricity production; target: AGGM gas consumption.
y = data['value_mwhd_aggm']
X = data[['value_mwhd_entsoe']]

# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the linear regression and persist it so it can be reloaded later.
model = LinearRegression()
model.fit(X_train, y_train)
joblib.dump(model, 'gas_consumption_linear_regression_model.pkl')

# Predictions on the test split (used for the metrics) and on all rows (used for the plot).
y_pred = model.predict(X_test)
data['predicted_aggm_consumption'] = model.predict(X)

# Plot the actual vs predicted values using Plotly Express.
fig = px.line(
    data,
    x='datetime',
    y=['value_mwhd_aggm', 'predicted_aggm_consumption'],
    title='Actual vs predicted AGGM gas consumption in Austria',
)
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Consumption MWh/d',
    legend_title='Actual value/Prediction',
)

# Replace the raw column names in the legend with readable labels.
trace_labels = {
    'value_mwhd_aggm': 'Actual Gas Consumption',
    'predicted_aggm_consumption': 'Predicted Gas Consumption',
}
for column_name, label in trace_labels.items():
    fig.update_traces(name=label, selector=dict(name=column_name))
fig.show()


In the plot above, we can visually compare the actual gas consumption in Austria with the values predicted by the model.
The performance metrics below are not ideal, but they are good enough for a showcase. Ideally, the root mean squared error should be as low as possible, because it measures how far the predicted values deviate from the actual ones. The R2 score should be as close to 1 as possible: the closer it is to 1, the better the model explains the variation in gas consumption.

In [None]:
# Evaluate the fitted model on the held-out test split.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)     # same unit as the target, unlike the squared error

print(f'Root mean squared error: {rmse}')
print(f'R2 score: {r2}')

## Prediction of Gas Consumption in Austria
This prediction is based on the trained model and on data describing the expected electricity production of gas-powered power plants in Austria. From this data we try to predict the overall gas consumption in Austria.

In [None]:
# Load the expected electricity production and aggregate it exactly like the training
# data: 24-hour bins starting at 16:00, summed, with the feature column renamed to the
# name the model was fitted on.
prediction_entsoe_df = (
    pd.read_csv('data-omv/future_gas_consumption.csv')
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .set_index('datetime')['value_mwhd']
    .resample('24h', offset='16h')
    .sum()
    .reset_index()
    .rename(columns={'value_mwhd': 'value_mwhd_entsoe'})
)
new_entsoe_consumption = prediction_entsoe_df[['value_mwhd_entsoe']]

# Reload the regression model persisted by the training cell and apply it.
loaded_model = joblib.load('gas_consumption_linear_regression_model.pkl')
aggm_predictions = loaded_model.predict(new_entsoe_consumption)

# Keep the predictions next to their inputs for comparison.
prediction_entsoe_df['predicted_aggm_consumption'] = aggm_predictions
print(prediction_entsoe_df)

In [None]:
# Plot the electricity production input together with the predicted gas consumption.
fig = px.line(
    prediction_entsoe_df,
    x='datetime',
    y=['value_mwhd_entsoe', 'predicted_aggm_consumption'],
    title='Predicted total gas consumption based on powerplants electricity production',
)
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Consumption & Production MWh/d',
    legend_title='Consumption and Production Data',
)

# Replace the raw column names in the legend with readable labels.
trace_labels = {
    'value_mwhd_entsoe': 'Actual Electricity Production',
    'predicted_aggm_consumption': 'Predicted Gas Consumption',
}
for column_name, label in trace_labels.items():
    fig.update_traces(name=label, selector=dict(name=column_name))
fig.show()

## Key Findings
- We have developed a Linear Regression model capable of predicting general gas consumption in Austria based on the electricity production of gas-powered power plants
- We have saved the model for further use
- In the graph above, we can see the predicted values for future gas consumption