In [2]:
import pandas as pd
import numpy as np
import os
from datetime import datetime as dt, timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from urllib.parse import urlparse
import mlflow.sklearn 
from mlflow.tracking import MlflowClient
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [37]:
from transparency_epias.consumption import consumptionClient

In [38]:
obj = consumptionClient.consumptionClient()

In [27]:
today = dt.now()

In [28]:
print(str(today.date()))

2023-06-22


In [41]:
date_list, consumption_list = obj.consumption_realtime("2020-01-01",str(today.date()))

In [42]:
date_list[0]

'2020-01-01T00:00:00.000+0300'

In [43]:
consumption_list[0]

28125.46

In [64]:
df_dynamic = pd.DataFrame(
    {'DATE': date_list,
     'CONSUMPTION': consumption_list
    })

In [65]:
df_dynamic.head()

Unnamed: 0,DATE,CONSUMPTION
0,2020-01-01T00:00:00.000+0300,28125.46
1,2020-01-01T01:00:00.000+0300,26548.41
2,2020-01-01T02:00:00.000+0300,25287.99
3,2020-01-01T03:00:00.000+0300,24311.53
4,2020-01-01T04:00:00.000+0300,23730.48


In [66]:
df_dynamic.tail()

Unnamed: 0,DATE,CONSUMPTION
30439,2023-06-22T07:00:00.000+0300,30346.21
30440,2023-06-22T08:00:00.000+0300,35444.91
30441,2023-06-22T09:00:00.000+0300,38110.11
30442,2023-06-22T10:00:00.000+0300,38753.11
30443,2023-06-22T11:00:00.000+0300,38385.44


In [68]:
df_dynamic["DAY"] = pd.to_datetime(df_dynamic["DATE"]).dt.day

In [69]:
df_dynamic["MONTH"] = pd.to_datetime(df_dynamic["DATE"]).dt.month

In [70]:
df_dynamic["HOUR"] = pd.to_datetime(df_dynamic["DATE"]).dt.hour

In [71]:
df_dynamic["YEAR"] = pd.to_datetime(df_dynamic["DATE"]).dt.year

In [72]:
df_dynamic.head()

Unnamed: 0,DATE,CONSUMPTION,DAY,MONTH,HOUR,YEAR
0,2020-01-01T00:00:00.000+0300,28125.46,1,1,0,2020
1,2020-01-01T01:00:00.000+0300,26548.41,1,1,1,2020
2,2020-01-01T02:00:00.000+0300,25287.99,1,1,2,2020
3,2020-01-01T03:00:00.000+0300,24311.53,1,1,3,2020
4,2020-01-01T04:00:00.000+0300,23730.48,1,1,4,2020


In [73]:
df_dynamic.tail()

Unnamed: 0,DATE,CONSUMPTION,DAY,MONTH,HOUR,YEAR
30439,2023-06-22T07:00:00.000+0300,30346.21,22,6,7,2023
30440,2023-06-22T08:00:00.000+0300,35444.91,22,6,8,2023
30441,2023-06-22T09:00:00.000+0300,38110.11,22,6,9,2023
30442,2023-06-22T10:00:00.000+0300,38753.11,22,6,10,2023
30443,2023-06-22T11:00:00.000+0300,38385.44,22,6,11,2023


In [86]:
type(df_dynamic["CONSUMPTION"][0])

numpy.float64

In [74]:
df_dynamic = df_dynamic[["CONSUMPTION", "YEAR", "MONTH", "DAY","HOUR"]]

In [89]:
df_dynamic.head()

Unnamed: 0,CONSUMPTION,YEAR,MONTH,DAY,HOUR
0,28125.46,2020,1,1,0
1,26548.41,2020,1,1,1
2,25287.99,2020,1,1,2
3,24311.53,2020,1,1,3
4,23730.48,2020,1,1,4


In [81]:
df_dynamic["CONSUMPTION"] = df_dynamic["CONSUMPTION"].astype(float)

In [82]:
df_dynamic_agg = df_dynamic.groupby(['DAY', 'MONTH', 'YEAR']).agg({'CONSUMPTION': 'sum'})

In [83]:
df_dynamic_agg = df_dynamic_agg.reset_index()

In [90]:
df_dynamic_agg = df_dynamic_agg[["CONSUMPTION", "YEAR", "MONTH", "DAY"]]

In [91]:
df_dynamic_agg.head()

Unnamed: 0,CONSUMPTION,YEAR,MONTH,DAY
0,684709.92,2020,1,1
1,695973.69,2021,1,1
2,754257.04,2022,1,1
3,689830.56,2023,1,1
4,806502.33,2020,2,1


In [88]:
df_dynamic.to_csv('../GercekZamanliTuketim') 

In [None]:
#### STANDART WAY

In [2]:
df = pd.read_csv("GercekZamanliTuketim-15062020-15062023.csv", encoding= 'unicode_escape')

In [3]:
df.head()

Unnamed: 0,Tarih,Saat,Tüketim Miktarý (MWh)
0,15.06.2020,00:00,"28.407,29"
1,15.06.2020,01:00,"26.899,36"
2,15.06.2020,02:00,"25.893,43"
3,15.06.2020,03:00,"25.304,63"
4,15.06.2020,04:00,"25.196,80"


In [4]:
df.shape

(26300, 3)

In [5]:
columns = ["DATE", "TIME", "CONSUMPTION"]

In [6]:
df.set_axis(columns, axis='columns', inplace=True)

In [7]:
df.head()

Unnamed: 0,DATE,TIME,CONSUMPTION
0,15.06.2020,00:00,"28.407,29"
1,15.06.2020,01:00,"26.899,36"
2,15.06.2020,02:00,"25.893,43"
3,15.06.2020,03:00,"25.304,63"
4,15.06.2020,04:00,"25.196,80"


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26300 entries, 0 to 26299
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   DATE         26300 non-null  object
 1   TIME         26300 non-null  object
 2   CONSUMPTION  26300 non-null  object
dtypes: object(3)
memory usage: 616.5+ KB


In [9]:
df['DATE'] = pd.to_datetime(df['DATE'].str.replace('.','-') + ' ' +df['TIME'])

  df['DATE'] = pd.to_datetime(df['DATE'].str.replace('.','-') + ' ' +df['TIME'])


In [10]:
df.drop('TIME', axis=1, inplace=True)

In [11]:
df.head()

Unnamed: 0,DATE,CONSUMPTION
0,2020-06-15 00:00:00,"28.407,29"
1,2020-06-15 01:00:00,"26.899,36"
2,2020-06-15 02:00:00,"25.893,43"
3,2020-06-15 03:00:00,"25.304,63"
4,2020-06-15 04:00:00,"25.196,80"


In [12]:
df['CONSUMPTION'] = df['CONSUMPTION'].str.replace(',','')

In [13]:
df["DAY"] = df["DATE"].dt.day

In [14]:
df["MONTH"] = df["DATE"].dt.month

In [15]:
df["YEAR"] = df["DATE"].dt.year

In [16]:
df["HOUR"] = df["DATE"].dt.hour

In [17]:
df["CONSUMPTION"] = df["CONSUMPTION"].astype(float)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26300 entries, 0 to 26299
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   DATE         26300 non-null  datetime64[ns]
 1   CONSUMPTION  26300 non-null  float64       
 2   DAY          26300 non-null  int64         
 3   MONTH        26300 non-null  int64         
 4   YEAR         26300 non-null  int64         
 5   HOUR         26300 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(4)
memory usage: 1.2 MB


In [19]:
df.head()

Unnamed: 0,DATE,CONSUMPTION,DAY,MONTH,YEAR,HOUR
0,2020-06-15 00:00:00,28.40729,15,6,2020,0
1,2020-06-15 01:00:00,26.89936,15,6,2020,1
2,2020-06-15 02:00:00,25.89343,15,6,2020,2
3,2020-06-15 03:00:00,25.30463,15,6,2020,3
4,2020-06-15 04:00:00,25.1968,15,6,2020,4


In [25]:
# Group the data by day, month, and year, and sum the 'CONSUMPTION' column
df_day = df.groupby(['DAY', 'MONTH', 'YEAR']).agg({'CONSUMPTION': 'sum'})

# Reset the index to make 'DAY', 'MONTH', and 'YEAR' as regular columns
df_day = df_day.reset_index()

In [26]:
df_day = df_day[["CONSUMPTION", "YEAR", "MONTH", "DAY"]]

In [27]:
df_day[:5]

Unnamed: 0,CONSUMPTION,YEAR,MONTH,DAY
0,695.97369,2021,1,1
1,754.25704,2022,1,1
2,689.83056,2023,1,1
3,726.64643,2021,2,1
4,773.71833,2022,2,1


In [28]:
df_hour = df[["CONSUMPTION", "YEAR", "MONTH", "DAY", "HOUR"]]

In [29]:
df_hour[:5]

Unnamed: 0,CONSUMPTION,YEAR,MONTH,DAY,HOUR
0,28.40729,2020,6,15,0
1,26.89936,2020,6,15,1
2,25.89343,2020,6,15,2
3,25.30463,2020,6,15,3
4,25.1968,2020,6,15,4


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26300 entries, 0 to 26299
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   DATE         26300 non-null  datetime64[ns]
 1   CONSUMPTION  26300 non-null  float64       
 2   DAY          26300 non-null  int64         
 3   MONTH        26300 non-null  int64         
 4   YEAR         26300 non-null  int64         
 5   HOUR         26300 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(4)
memory usage: 1.2 MB


In [26]:
X_hour = df_hour.iloc[:, 1:]
print(X_hour.shape)
print(type(X_hour))
print(X_hour[:3])

# Output variable
y_hour = df_hour.iloc[:, 0]
print(y_hour.shape)
print(type(y_hour))
print(y_hour[:3])

(26300, 4)
<class 'pandas.core.frame.DataFrame'>
   YEAR  MONTH  DAY  HOUR
0  2020      6   15     0
1  2020      6   15     1
2  2020      6   15     2
(26300,)
<class 'pandas.core.series.Series'>
0    28.40729
1    26.89936
2    25.89343
Name: CONSUMPTION, dtype: float64


In [27]:
X_day = df_day.iloc[:, 1:]
print(X_day.shape)
print(type(X_day))
print(X_day[:3])

# Output variable
y_day = df_day.iloc[:, 0]
print(y_day.shape)
print(type(y_day))
print(y_day[:3])

(1096, 3)
<class 'pandas.core.frame.DataFrame'>
   YEAR  MONTH  DAY
0  2021      1    1
1  2022      1    1
2  2023      1    1
(1096,)
<class 'pandas.core.series.Series'>
0    695.97369
1    754.25704
2    689.83056
Name: CONSUMPTION, dtype: float64


In [28]:
# MODEL FOR HOURLY CONSUMPTION

In [29]:
X_train_hour, X_test_hour, y_train_hour, y_test_hour = train_test_split(X_hour,y_hour, test_size=0.2, random_state=42)

In [31]:
pipeline_hour = Pipeline([
    ('ct-ohe', ColumnTransformer([('ct', OneHotEncoder(handle_unknown='ignore', categories='auto'), [0, 1, 2, 3])], remainder='passthrough')),
    ('scaler', StandardScaler(with_mean=False)),
    ('estimator', TransformedTargetRegressor(regressor=xgboost.XGBRegressor(), transformer=StandardScaler()))
])

# Fit the pipeline
pipeline_hour.fit(X_train_hour, y_train_hour)
print(X_train_hour[:5])
print(X_test_hour)
y_pred_hour = pipeline_hour.predict(X_test_hour)

       YEAR  MONTH  DAY  HOUR
11507  2021      7   10    11
18018  2022      5    7    18
16190  2022      4   20    14
5332   2021      1   23     4
26004  2023      3    6    12
       YEAR  MONTH  DAY  HOUR
18303  2022      7   17    15
24612  2023      6    4    12
2973   2020     10   16    21
22501  2023      8    1    13
4699   2020     12   27    19
...     ...    ...  ...   ...
9222   2021      4    7     6
5245   2021      1   19    13
23086  2023      1    2    22
2814   2020     10   10     6
19525  2022      6    9    13

[5260 rows x 4 columns]


In [32]:
# MODEL FOR DAILY CONSUMPTION

In [33]:
X_train_day, X_test_day, y_train_day, y_test_day = train_test_split(X_day,y_day, test_size=0.2, random_state=42)

In [34]:
pipeline_day = Pipeline([
    ('ct-ohe', ColumnTransformer([('ct', OneHotEncoder(handle_unknown='ignore', categories='auto'), [0, 1, 2])], remainder='passthrough')),
    ('scaler', StandardScaler(with_mean=False)),
    ('estimator', TransformedTargetRegressor(regressor=xgboost.XGBRegressor(), transformer=StandardScaler()))
])

# Fit the pipeline
pipeline_day.fit(X_train_day, y_train_day)
print(X_train_day[:5])
print(X_test_day)
y_pred_day = pipeline_day.predict(X_test_day)

     YEAR  MONTH  DAY
2    2023      1    1
6    2021      3    1
590  2022      5   17
634  2020      8   18
842  2022      5   24
     YEAR  MONTH  DAY
44   2023      3    2
568  2020     10   16
56   2023      7    2
636  2022      8   18
486  2020      7   14
..    ...    ...  ...
757  2021      1   22
713  2021     10   20
365  2022      2   11
299  2022      4    9
286  2021     12    8

[220 rows x 3 columns]


In [35]:
X_test_day[:10]

Unnamed: 0,YEAR,MONTH,DAY
44,2023,3,2
568,2020,10,16
56,2023,7,2
636,2022,8,18
486,2020,7,14
96,2021,9,3
761,2022,2,22
51,2021,6,2
107,2023,12,3
666,2022,6,19


In [36]:
y_pred_day[:10]

array([ 871.0726 ,  787.89844,  879.00024, 1001.0668 ,  871.0579 ,
        907.2531 ,  919.4171 ,  861.8203 ,  873.6684 ,  910.99994],
      dtype=float32)

In [39]:
new_df = X_test_day.assign(CONSUMPTION = y_pred_day)

In [40]:
new_df

Unnamed: 0,YEAR,MONTH,DAY,CONSUMPTION
44,2023,3,2,871.072571
568,2020,10,16,787.898438
56,2023,7,2,879.000244
636,2022,8,18,1001.066772
486,2020,7,14,871.057922
...,...,...,...,...
757,2021,1,22,897.334412
713,2021,10,20,873.089417
365,2022,2,11,811.608643
299,2022,4,9,966.237427


In [45]:
# for key, value in new_df.iterrows():
#     print(row, int(value['YEAR']))

#### TEST FOR HOUR

In [53]:
input_for_24_hour = dt.now()

In [54]:
input_for_24_hour

datetime.datetime(2023, 6, 18, 17, 54, 25, 256793)

In [64]:
year = input_for_24_hour.year
month = input_for_24_hour.month
day=input_for_24_hour.day
hour=input_for_24_hour.hour

In [65]:
hour

17

In [66]:
type(hour)

int

In [78]:
## range will be an input from end user

In [81]:
row_list_hour = []
for i in range(24):
    hour = hour +1
    row = [year,month,day,hour]
    if int(hour) == 24:
        hour = 0
        row = [year,month,day,hour]
    row_list_hour.append(row)
print(row_list_hour)
print(len(row_list_hour))

[[2023, 6, 23, 18], [2023, 6, 23, 19], [2023, 6, 23, 20], [2023, 6, 23, 21], [2023, 6, 23, 22], [2023, 6, 23, 23], [2023, 6, 23, 0], [2023, 6, 23, 1], [2023, 6, 23, 2], [2023, 6, 23, 3], [2023, 6, 23, 4], [2023, 6, 23, 5], [2023, 6, 23, 6], [2023, 6, 23, 7], [2023, 6, 23, 8], [2023, 6, 23, 9], [2023, 6, 23, 10], [2023, 6, 23, 11], [2023, 6, 23, 12], [2023, 6, 23, 13], [2023, 6, 23, 14], [2023, 6, 23, 15], [2023, 6, 23, 16], [2023, 6, 23, 17]]
24


In [94]:
df_pred_hours = pd.DataFrame(data=row_list_hour , columns = ["YEAR", "MONTH", "DAY", "HOUR"])

In [95]:
pred_hour = pipeline_hour.predict(df_pred_hours)

In [96]:
pred_hour

array([39.083347, 38.75528 , 39.044403, 38.5382  , 37.432568, 35.840336,
       33.80017 , 32.27113 , 30.921711, 30.102102, 30.108494, 29.397594,
       29.786331, 31.639996, 36.59574 , 38.550808, 40.06878 , 40.67454 ,
       39.255787, 39.770035, 39.406483, 39.63416 , 38.826714, 39.02807 ],
      dtype=float32)

In [126]:
type(pred_hour)

numpy.ndarray

#### TEST FOR DAYS

In [85]:
input_for_days = dt.now()

In [86]:
year = input_for_24_hour.year
month = input_for_24_hour.month
day=input_for_24_hour.day

In [87]:
## range will be an input from end user

In [88]:
range_from_user = 5

In [89]:
row_list_day = []
for i in range(5):
    print(i)
    day = day +1
    row = [year,month,day]
    if int(hour) == 24:
        hour = 0
        row = [year,month,day]
    row_list_day.append(row)
print(row_list_day)
print(len(row_list_day))

0
1
2
3
4
[[2023, 6, 19], [2023, 6, 20], [2023, 6, 21], [2023, 6, 22], [2023, 6, 23]]
5


In [97]:
df_pred_days = pd.DataFrame(data=row_list_day , columns = ["YEAR", "MONTH", "DAY"])

In [100]:
pred_days = pipeline_day.predict(df_pred_days)

In [101]:
pred_days

array([792.3908 , 818.09186, 805.2938 , 795.6999 , 849.51807],
      dtype=float32)

#### MLFLOW

In [107]:
os.environ['MLFLOW_TRACKING_URI'] = 'http://192.168.1.41:5000/'
os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://192.168.1.41:9000/'

In [108]:
experiment_list = ["EpiasHour", "EpiasDay"]

for exp in experiment_list:
    if mlflow.get_experiment_by_name(exp):
        pass
    else:
        mlflow.set_experiment(exp)

2023/06/18 18:12:48 INFO mlflow.tracking.fluent: Experiment with name 'EpiasHour' does not exist. Creating a new experiment.
2023/06/18 18:12:48 INFO mlflow.tracking.fluent: Experiment with name 'EpiasDay' does not exist. Creating a new experiment.


In [109]:
client = MlflowClient()
exp_id_hour = client.get_experiment_by_name("EpiasHour")._experiment_id
exp_id_day = client.get_experiment_by_name("EpiasDay")._experiment_id

In [110]:
exp_id_hour

'1'

In [111]:
exp_id_day 

'2'

In [123]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

In [None]:
##### MLFLOW MODEL FOR HOURLY CONSUMPTION

In [124]:
with mlflow.start_run(run_name="XGReg_Hourly_Consump", experiment_id=exp_id_hour) as run:
    pipeline_hour = Pipeline([
        ('ct-ohe', ColumnTransformer([('ct', OneHotEncoder(handle_unknown='ignore', categories='auto'), [0, 1, 2, 3])], remainder='passthrough')),
        ('scaler', StandardScaler(with_mean=False)),
        ('estimator', TransformedTargetRegressor(regressor=xgboost.XGBRegressor(), transformer=StandardScaler()))
    ])

    # Fit the pipeline
    pipeline_hour.fit(X_train_hour, y_train_hour)
    y_pred_hour = pipeline_hour.predict(X_test_hour)
    print(y_pred_hour[:10])

    (rmse, mae, r2) = eval_metrics(y_test_hour, y_pred_hour)

    # mlflow.log_param("")
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # Model registry does not work with file store
    if tracking_url_type_store != "file" :
        mlflow.sklearn.log_model(pipeline_hour, "model")
        # mlflow.sklearn.log_model(estimator, "model",registered_model_name=registered_model_name)
    else:
        mlflow.sklearn.log_model(pipeline_hour, "model")

[42.043766 38.162037 35.832676 37.345707 38.103832 40.650043 40.043285
 31.803995 38.238136 27.080324]




In [125]:
with mlflow.start_run(run_name="XGReg_Daily_Consump", experiment_id=exp_id_day) as run:
    pipeline_day = Pipeline([
        ('ct-ohe', ColumnTransformer([('ct', OneHotEncoder(handle_unknown='ignore', categories='auto'), [0, 1, 2])], remainder='passthrough')),
        ('scaler', StandardScaler(with_mean=False)),
        ('estimator', TransformedTargetRegressor(regressor=xgboost.XGBRegressor(), transformer=StandardScaler()))
    ])

    # Fit the pipeline
    pipeline_day.fit(X_train_day, y_train_day)
    y_pred_day = pipeline_day.predict(X_test_day)
    print(y_pred_day[:10])

    (rmse, mae, r2) = eval_metrics(y_test_hour, y_pred_hour)

    # mlflow.log_param("")
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # Model registry does not work with file store
    if tracking_url_type_store != "file" :
        mlflow.sklearn.log_model(pipeline_day, "model")
        # mlflow.sklearn.log_model(estimator, "model",registered_model_name=registered_model_name)
    else:
        mlflow.sklearn.log_model(pipeline_day, "model")

[ 871.0726   787.89844  879.00024 1001.0668   871.0579   907.2531
  919.4171   861.8203   873.6684   910.99994]
