In [31]:
import pandas as pd 
import numpy as np 

In [32]:
train_path = '../Datasets/train.csv'
weather_path = '../Datasets/weather.csv'

In [33]:
df = pd.read_csv(train_path)
weather = pd.read_csv(weather_path)

In [34]:
df['Tarih'] = pd.to_datetime(df['Tarih'])
weather['date'] = pd.to_datetime(weather['date'])

In [35]:
def extractor(df, date_col, weather, mapping=False):
    # dont show warnings
    import warnings
    warnings.filterwarnings('ignore')
    
    df[date_col] = pd.to_datetime(df[date_col])
    
    # Extracting date features
    df['Saat'] = df[date_col].dt.hour
    df['Aylık Gün'] = df[date_col].dt.day
    df['Yıllık Gün'] = df[date_col].dt.dayofyear
    df['Haftanın günü'] = df[date_col].dt.day_name()
    df['Hafta'] = df[date_col].dt.week
    df['Ay'] = df[date_col].dt.month
    df['Çeyreklik'] = df[date_col].dt.quarter
    df['Yıl'] = df[date_col].dt.year
    
    # Extracting holiday features
    import holidays
    tr_holidays = holidays.Turkey()
    df['Özel Gün'] = df[date_col].apply(lambda x: x in tr_holidays)
    
    # Extracting seasonality features
    def get_season(month):
        if month >= 3 and month <= 5:
            return 'Spring'
        elif month >= 6 and month <= 8:
            return 'Summer'
        elif month >= 9 and month <= 11:
            return 'Autumn'
        else:
            return 'Winter'
    
    df['Mevsim'] = df[date_col].dt.month.apply(get_season)
    
    # Adding weather features
    weather['date'] = pd.to_datetime(weather['date']).dt.date
    df['date_no_time'] = df[date_col].dt.date
    df = pd.merge(df, weather, left_on='date_no_time', right_on='date', how='left')
    df.drop(['date_no_time','date'], axis=1, inplace=True)
    
    # Extracting weekend features
    import datetime
    def is_weekend(date_str):
        date_obj = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
        return 1 if date_obj.weekday() >= 5 else 0
    
    df['Hafta Sonu'] = df[date_col].dt.strftime('%Y-%m-%d').apply(is_weekend)
    
    if mapping == True:
        ozel_gun = {
            True:1,
            False:0
        }

        mevsim = {
            "Winter":1,
            "Spring":2,
            "Summer":3,
            "Autumn":4
        }

        hafta_gunleri = {
            "Monday":1,
            "Tuesday":2,
            "Wednesday":3,
            "Thursday":4,
            "Friday":5,
            "Saturday":6,
            "Sunday":7
        }
        
        df['Haftanın günü'] = df['Haftanın günü'].map(hafta_gunleri)
        df['Özel Gün'] = df['Özel Gün'].map(ozel_gun)
        df['Mevsim'] = df['Mevsim'].map(mevsim)

        return df
    
    else:
        return df

Stacked LSTM modelleri, daha karmaşık ve uzun süreli bağımlılıkları olan verilerde daha iyi sonuçlar verirken, Vanilla LSTM modelleri daha basit zaman serilerinde daha etkilidir. Ancak, her veri seti farklıdır ve hangi modelin kullanılacağına karar vermek için veri seti üzerinde deneme yanılma yapılması önerilir.

<hr>

time step splitting

In [36]:
from sklearn.preprocessing import MinMaxScaler

split_date = pd.to_datetime('2021-06-08 17:00:00')

train = df.loc[df['Tarih'] < split_date]
test = df.loc[df['Tarih'] >= split_date]

scaler = MinMaxScaler(feature_range=(0, 1))
train_scaled = scaler.fit_transform(train[['Dağıtılan Enerji (MWh)']].values)
test_scaled = scaler.transform(test[['Dağıtılan Enerji (MWh)']].values)

def create_time_steps(length):
    return list(range(-length, 0))

time_steps = 24
X_train = []
y_train = []
for i in range(time_steps, len(train)):
    X_train.append(train_scaled[i-time_steps:i, 0])
    y_train.append(train_scaled[i, 0])
X_train, y_train = np.array(X_train), np.array(y_train)

X_test = []
y_test = []
for i in range(time_steps, len(test)):
    X_test.append(test_scaled[i-time_steps:i, 0])
    y_test.append(test_scaled[i, 0])
X_test, y_test = np.array(X_test), np.array(y_test)

X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

In [37]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error

In [38]:
train_X = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
test_X = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

### Vanilla LSTM

In [39]:
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')

In [40]:
history = model.fit(train_X, y_train, epochs=50, batch_size=72, validation_data=(test_X, y_test), 
                    callbacks=[EarlyStopping(monitor='val_loss', patience=10)], verbose=0, shuffle=False)

In [41]:
import plotly.express as px
fig = px.line(history.history, y=['loss', 'val_loss'], 
              labels={'value':'Loss', 'variable':'Type', 'index':'Epoch'},
              title='Training and Validation Losses')
fig.update_xaxes(title='Epoch')
fig.update_yaxes(title='Loss')
fig.show()

In [42]:
model.fit(train_X, y_train, epochs=15, batch_size=72, validation_data=(test_X, y_test), 
                    callbacks=[EarlyStopping(monitor='val_loss', patience=10)], verbose=0, shuffle=False)

<keras.callbacks.History at 0x18cb7c3fc50>

In [43]:
# mape, rmse
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
mape = mean_absolute_percentage_error(y_test, vanilla_preds)
rmse = np.sqrt(mean_squared_error(y_test, vanilla_preds))
f"MAPE: {mape:.2f} RMSE: {rmse:.2f}"

ValueError: Found input variables with inconsistent numbers of samples: [10015, 10038]

In [None]:
import plotly.graph_objs as go

fig = go.Figure()

fig.add_trace(go.Scatter(x=test['Tarih'], y=y_test.flatten(),
                    mode='lines',
                    name='Gerçek Değerler'))

fig.add_trace(go.Scatter(x=test['Tarih'], y=vanilla_preds.flatten(),
                    mode='lines',
                    name='Tahminler'))

fig.update_layout(title='Vanilla LSTM Tahminleri',
                   xaxis_title='Tarih',
                   yaxis_title='Dağıtılan Enerji (MWh)')

fig.show()


<HR>

### Hyper Parameter Tuning

In [None]:
num_units = [50, 100, 200]
dropout_rate = [0.0, 0.1, 0.2]
epochs = [50, 100, 200]
batch_size = [72, 144, 288]

In [None]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor

def create_model(num_units=50, dropout_rate=0.0):
    model = Sequential()
    model.add(LSTM(num_units, input_shape=(train_X.shape[1], train_X.shape[2]), dropout=dropout_rate))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam', metrics=['accuracy'])
    return model

model = KerasRegressor(build_fn=create_model, verbose=0)

# Grid search hyperparameters
param_grid = dict(num_units=num_units, dropout_rate=dropout_rate, epochs=epochs, batch_size=batch_size)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)

grid_result = grid.fit(train_X, y_train, validation_data=(test_X, y_test),
                        callbacks=[EarlyStopping(monitor='val_loss', patience=10)], shuffle=False)


KerasRegressor is deprecated, use Sci-Keras (https://github.com/adriangb/scikeras) instead. See https://www.adriangb.com/scikeras/stable/migration.html for help migrating.



In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

Best: -0.017297 using {'batch_size': 72, 'dropout_rate': 0.0, 'epochs': 50, 'num_units': 100}


In [None]:
best_num_units = grid_result.best_params_['num_units']
best_dropout_rate = grid_result.best_params_['dropout_rate']
best_epochs = grid_result.best_params_['epochs']

model = Sequential()
model.add(LSTM(100, input_shape=(train_X.shape[1], train_X.shape[2]), dropout=0.0))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam', metrics=['accuracy'])

history = model.fit(train_X, y_train, epochs=50, batch_size=72, validation_data=(test_X, y_test), 
                    callbacks=[EarlyStopping(monitor='val_loss', patience=10)], verbose=0, shuffle=False)

In [None]:
y_pred = model.predict(test_X)



In [None]:
# mape, rmse
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
f"MAPE: {mape:.2f} RMSE: {rmse:.2f}"

'MAPE: 0.05 RMSE: 0.03'

In [None]:
import plotly.graph_objs as go

fig = go.Figure()

fig.add_trace(go.Scatter(x=test['Tarih'], y=y_test.flatten(),
                    mode='lines',
                    name='Gerçek Değerler'))

fig.add_trace(go.Scatter(x=test['Tarih'], y=y_pred.flatten(),
                    mode='lines',
                    name='Tahminler'))

fig.update_layout(title='Vanilla LSTM Tahminleri',
                   xaxis_title='Tarih',
                   yaxis_title='Dağıtılan Enerji (MWh)')

fig.show()