### Introduction

In [1]:
# This code produces a user interface to predict average gas flow consumption (Sm3/h) based on Outside Temperature (°C)and Production status for each hour
# The information helps the operator determine when to operate heat recovery systeme to prevent peak consumption higher tariffs

In [2]:
import numpy as np
import statsmodels.api as sm
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score

### Functions

In [3]:
# Formula to calculate adjusted R2
def adjusted_r2_score(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p -1)

## Import the data

In [4]:

df=pd.read_excel('data_SR_JP.xlsx', sheet_name='data')

In [5]:
# Convert and resample data based on average
df['timestamp'] = pd.to_datetime(df['timestamp'])
df1 = df.set_index('timestamp')
# set sample to 0.5h
sample_hr = 0.5
df_hourly = df1 
if sample_hr > 0:
    df_hourly = df1.select_dtypes(include='number').resample(f'{sample_hr}H').mean()
#drop lines with no data
df_hourly.dropna(how='any', inplace=True)
df_hourly.reset_index(drop=True, inplace=True)

  df_hourly = df1.select_dtypes(include='number').resample(f'{sample_hr}H').mean()


In [6]:
#Remove outliers in predicted data
data_filtered = df_hourly[df_hourly['Gas flow (Sm3/h)'] < df_hourly['Gas flow (Sm3/h)'].quantile(0.99)]

In [7]:
X = data_filtered[['Outside_temperature (°C)', 'Production']]
y = data_filtered['Gas flow (Sm3/h)'].values

#### OLS Regression on price table for each area

In [8]:
# OLS Regression models and adjusted R²

X_ols = sm.add_constant(X)
model_OLS = sm.OLS(y, X_ols).fit()
print(model_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.773
Model:                            OLS   Adj. R-squared:                  0.772
Method:                 Least Squares   F-statistic:                 1.426e+04
Date:                Wed, 23 Apr 2025   Prob (F-statistic):               0.00
Time:                        10:53:56   Log-Likelihood:                -53715.
No. Observations:                8402   AIC:                         1.074e+05
Df Residuals:                    8399   BIC:                         1.075e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

### Linear regression model

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

In [10]:
model.coef_

array([ -3.73559207, 542.38549128])

In [11]:
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)

# Get number of samples and features
n = X.shape[0]  # Number of samples
p = X.shape[1]  # Number of features

adj_R2=adjusted_r2_score(r2, n, p)

print("Adjusted R² score:", round(adj_R2,2))


Adjusted R² score: 0.78


In [12]:
# Initialize K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = LinearRegression()

adjusted_r2_scores = []

# Perform manual cross-validation
for train_index, test_index in kf.split(X_train):
    X_tr, X_te = X_train.iloc[train_index], X_train.iloc[test_index]
    y_tr, y_te = y_train[train_index], y_train[test_index]

    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)

    r2 = r2_score(y_te, y_pred)
    n = len(y_te)
    p = X_train.shape[1]  # number of predictors
    adj_r2 = adjusted_r2_score(r2, n, p)

    adjusted_r2_scores.append(round(adj_r2,2))

print("Adjusted R² scores for each fold:", adjusted_r2_scores)
print("Average Adjusted R²:", round(np.mean(adjusted_r2_scores),2))


Adjusted R² scores for each fold: [0.79, 0.77, 0.77, 0.76, 0.77]
Average Adjusted R²: 0.77


### Creating a summary table

In [13]:
from sklearn.feature_selection import f_regression
f_regression(X,y)
# Create the summary dataframe
reg_summary = pd.DataFrame(data=X.columns.values, columns=['Features'])

# Add coefficients from the model
reg_summary['Coefficients'] = model.coef_
    
# Add p-values
p_values=f_regression(X,y)[1]
reg_summary['p-values'] = p_values.round(3)
reg_summary

Unnamed: 0,Features,Coefficients,p-values
0,Outside_temperature (°C),-3.916547,0.0
1,Production,542.868143,0.0


### Random forest regression

In [14]:
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [15]:
# Predict and evaluate
y_pred = model_rf.predict(X_test)
r2 = r2_score(y_test, y_pred)

adj_R2=adjusted_r2_score(r2, n, p)

print("Adjusted R² score:", round(adj_R2,2))


Adjusted R² score: 0.71


In [16]:
# Initialize K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)

adjusted_r2_scores = []

# Perform manual cross-validation
for train_index, test_index in kf.split(X_train):
    X_tr, X_te = X_train.iloc[train_index], X_train.iloc[test_index]
    y_tr, y_te = y_train[train_index], y_train[test_index]

    model_rf.fit(X_tr, y_tr)
    y_pred = model_rf.predict(X_te)

    r2 = r2_score(y_te, y_pred)
    n = len(y_te)
    p = X_train.shape[1]  # number of predictors
    adj_r2 = adjusted_r2_score(r2, n, p)

    adjusted_r2_scores.append(round(adj_r2,2))

print("Adjusted R² scores for each fold:", adjusted_r2_scores)
print("Average Adjusted R²:", round(np.mean(adjusted_r2_scores),2))


Adjusted R² scores for each fold: [0.71, 0.7, 0.7, 0.7, 0.7]
Average Adjusted R²: 0.7


### Feature importance Permutation Importance

In [17]:
importances = model_rf.feature_importances_

feature_names = X_train.columns
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)
importance_df

Unnamed: 0,Feature,Importance
1,Production,0.797037
0,Outside_temperature (°C),0.202963


#### Choosing the regression model

In [18]:
# Since Average Adjusted R²: 0.77 for linear regression is higher than 
# Average Adjusted R²: 0.70 for Random Forest regression
# We choose the linear regression for the rest of the code

### Creating a dashboard to predict gas consumption based on Temperature and Production

In [None]:
from dash import Dash, dcc, html, Input, Output, State, dash_table
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from sklearn.linear_model import LinearRegression

# Dummy training section (replace with actual model and training data)
model = LinearRegression()
model.fit(X_train, y_train)

# Assume model is trained and accessible as `model`

app = Dash(__name__)
hours = list(range(24))

# Initialize input data
initial_data = [{'Hour': h, 'Outside_Temperature': None, 'Production': 'False', 'Prediction': None} for h in hours]

app.layout = html.Div([
    html.H2("Hourly Prediction of Gas flow (Sm3/h) with Linear Regression"),
    dash_table.DataTable(
        id='input-table',
        columns=[
            {'name': 'Hour', 'id': 'Hour', 'editable': False},
            {'name': 'Outside_Temperature (°C)', 'id': 'Outside_Temperature', 'type': 'numeric'},
            {'name': 'Production', 'id': 'Production', 'type': 'text', 'presentation': 'dropdown'},
            {'name': 'Predicted Gas flow (Sm3/h)', 'id': 'Prediction', 'editable': False}
        ],
        dropdown={
            'Production': {
                'options': [
                    {'label': 'True', 'value': 'True'},
                    {'label': 'False', 'value': 'False'}
                ]
            }
        },
        data=initial_data,
        editable=True,
        row_deletable=False,
        style_table={'overflowX': 'auto'},
    ),
    html.Br(),
    html.Button("Predict", id='predict-button'),
    html.Br(), html.Br(),
    html.Div(id='error-message', style={'color': 'red'}),
    dcc.Graph(id='prediction-graph')
])

@app.callback(
    Output('input-table', 'data'),
    Output('error-message', 'children'),
    Input('predict-button', 'n_clicks'),
    State('input-table', 'data')
)
def update_predictions(n_clicks, rows):
    if not n_clicks:
        return rows, ''

    updated_rows = []
    for row in rows:
        temp = row['Outside_Temperature']
        prod = row['Production']
        try:
            temp = float(temp)
        except (ValueError, TypeError):
            return rows, 'Please enter numeric values for Outside_Temperature (°C).'

        if str(prod).strip().lower() not in ['true', 'false']:
            return rows, 'Production must be "True" or "False".'

        prod = 1 if str(prod).strip().lower() == 'true' else 0

        try:
            X_input = np.array([[temp, prod]])
            prediction = model.predict(X_input)[0]
            row['Prediction'] = round(prediction, 2)
        except Exception:
            return rows, 'Prediction failed. Please check inputs.'

        updated_rows.append(row)

    return updated_rows, ''

@app.callback(
    Output('prediction-graph', 'figure'),
    Input('input-table', 'data')
)
def update_graph(data):
    hours = [row['Hour'] for row in data if row['Prediction'] is not None]
    predictions = [row['Prediction'] for row in data if row['Prediction'] is not None]

    return go.Figure(
        data=go.Scatter(x=hours, y=predictions, mode='lines+markers'),
        layout=go.Layout(
            title='Predicted Gas Flow by Hour',
            xaxis={'title': 'Hour'},
            yaxis={'title': 'Predicted Gas flow (Sm3/h)'}
        )
    )

if __name__ == '__main__':
    app.run(debug=True)


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names


X does not have valid feature names, but LinearRegression was fitted with feature names


X does no