# Data Processing

In [14]:
import pandas as pd
import numpy as np
# model
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_absolute_percentage_error

In [3]:
# Load data
def load_data(dataset='df_19_24_cleaned'):
    """Load a pickled dataset from the ``../data`` directory.

    Parameters
    ----------
    dataset : str
        File name without the ``.pkl`` extension.

    Returns
    -------
    object
        Whatever ``pd.read_pickle`` deserialises from the file.
    """
    # NOTE: unpickling executes arbitrary code; only load files you trust.
    pickle_path = f'../data/{dataset}.pkl'
    return pd.read_pickle(pickle_path)

In [19]:
from sklearn.preprocessing import MinMaxScaler
# Module-level scaler shared across cells: data_scaler() fits it, and
# scaler_inverse() / scaler_inverse_2() call its inverse_transform().
scaler = MinMaxScaler()
def data_scaler(data):
    """Fit the module-level ``scaler`` on ``data`` and return the scaled frame.

    The shared scaler is re-fitted on every call, so any later inverse
    transform uses the statistics of the most recent call.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are scaled.

    Returns
    -------
    pandas.DataFrame
        Scaled values carrying the same index and column labels as ``data``.
    """
    scaled_values = scaler.fit_transform(data)
    data_scaled = pd.DataFrame(
        scaled_values,
        index=data.index,
        columns=data.columns,
    )
    print('Data is scaled')
    return data_scaled

In [12]:
# 1. Train-Test Split (keeping all hourly data points in the last 7 days of each month for testing)
def train_test_split_7(data, test_hours=24 * 7):
    """Split ``data`` so the trailing rows of every calendar month form the test set.

    Rows are selected by position inside each (year, month) group rather than
    by index label, so repeated timestamps (for example a duplicated hour at a
    daylight-saving change) are neither multiplied in the test set nor
    over-removed from the training set.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object indexed by a ``DatetimeIndex`` (its ``year`` and ``month``
        attributes are used for grouping), ordered chronologically.
    test_hours : int, optional
        Number of trailing rows per month assigned to the test set. With one
        row per hour the default of ``24 * 7`` is the last seven days.

    Returns
    -------
    tuple
        ``(train_data, test_data)``; together they contain every row of
        ``data`` exactly once, in the original order.
    """
    timestamps = data.index
    month_keys = [np.asarray(timestamps.year), np.asarray(timestamps.month)]

    # For each row, count how many rows follow it within its own month
    # (the last row of a month gets 0).
    rows_after = (
        pd.Series(np.zeros(len(data), dtype=np.int8))
        .groupby(month_keys)
        .cumcount(ascending=False)
        .to_numpy()
    )
    is_test = rows_after < test_hours

    test_data = data.loc[is_test]
    train_data = data.loc[~is_test]
    print(f'Shape of train_data: {train_data.shape}')
    print(f'Shape of test_data: {test_data.shape}')
    return train_data, test_data

## LSTM

In [27]:
def create_sequences(data, seq_length=24, target_column='price'):
    """Build sliding windows over all columns plus next-step targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are consecutive time steps.
    seq_length : int
        Number of rows in each input window.
    target_column : str
        Column whose value right after each window is used as the target.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X[k]`` holds rows ``k .. k + seq_length - 1`` of
        every column and ``y[k]`` is ``target_column`` at row
        ``k + seq_length``.
    """
    feature_matrix = data.values
    target_series = data[target_column]
    window_starts = range(len(data) - seq_length)

    # Every window keeps all features; the target is read from the row
    # immediately following the window.
    windows = [feature_matrix[start:start + seq_length] for start in window_starts]
    targets = [target_series.iloc[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(targets)

def create_sequences_2(data, seq_length=24, features=None):
    """Build sliding windows over selected columns plus next-step labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are consecutive time steps.
    seq_length : int
        Number of rows in each input window.
    features : str or sequence of str, optional
        Columns to include in every window. The FIRST entry is the column the
        label is read from. Defaults to ``['price']``. A single column name
        may be given as a plain string.

    Returns
    -------
    tuple of numpy.ndarray
        ``(sequences, labels)``: ``sequences`` has shape
        ``(n_windows, seq_length, n_features)`` and ``labels`` has shape
        ``(n_windows,)``, where ``labels[k]`` is the first feature at row
        ``k + seq_length``. When ``data`` has too few rows for a single
        window, both arrays are empty but keep those dimensions.
    """
    # A None sentinel replaces the former mutable list default.
    if features is None:
        features = ['price']
    elif isinstance(features, str):
        features = [features]

    # Always select with a list so the result is 2-D (rows x features).
    data_array = data[list(features)].values

    n_windows = max(len(data_array) - seq_length, 0)
    if n_windows == 0:
        empty_sequences = np.empty(
            (0, seq_length, data_array.shape[1]), dtype=data_array.dtype
        )
        empty_labels = np.empty((0,), dtype=data_array.dtype)
        return empty_sequences, empty_labels

    sequences = np.stack(
        [data_array[start:start + seq_length] for start in range(n_windows)]
    )
    # The label is the first feature at the step right after each window.
    labels = data_array[seq_length:, 0].copy()

    return sequences, labels

In [15]:
def create_lstm_model(input_shape):
    """Assemble and compile a single-layer LSTM regressor.

    Parameters
    ----------
    input_shape : tuple
        Forwarded to the LSTM layer's ``input_shape`` argument
        (presumably ``(timesteps, n_features)`` -- confirm against the caller).

    Returns
    -------
    Sequential
        Model compiled with the Adam optimizer and MAE loss.
    """
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=input_shape))
    # Dropout between the recurrent layer and the head to limit overfitting.
    model.add(Dropout(0.2))
    # One output unit: scalar regression target.
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mae')
    return model

## Evaluation

In [17]:
# Define sMAPE function for evaluation
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``2 * |y_pred - y_true| / (|y_true| + |y_pred|)``. A term
    whose actual and predicted values are both zero is counted as zero error
    instead of producing ``0 / 0 = NaN``, which would otherwise turn the whole
    mean into NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values; compared element by element.

    Returns
    -------
    float
        sMAPE in the range 0-200.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = 2.0 * np.abs(y_pred - y_true)
    denominator = np.abs(y_true) + np.abs(y_pred)

    # Positions with a zero denominator keep the 0.0 pre-filled in `out`.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100 * np.mean(terms)

In [28]:
def scaler_inverse(y_test_scaled, y_preds_scaled, X_test):
    """Undo the scaling of true and predicted targets with the shared ``scaler``.

    The scaled target is placed in column 0 and padded with the remaining
    feature columns taken from the last time step of each test window, so the
    array has the column count ``scaler.inverse_transform`` receives here;
    only column 0 of the result is returned. This presumes the target was the
    first column when the scaler was fitted -- confirm against the caller.

    Parameters
    ----------
    y_test_scaled : array-like
        Scaled true targets, 1-D or a single column.
    y_preds_scaled : array-like
        Scaled predictions, 1-D or a single column.
    X_test : numpy.ndarray
        3-D array indexed as ``[sample, time step, feature]``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(y_test_original, y_preds_original)``, each 1-D.
    """
    # Non-target features from the final time step of every window.
    other_features = X_test[:, -1, 1:]

    # Force both inputs to a single column; previously only y_test_scaled was
    # reshaped, so 1-D predictions failed inside np.concatenate.
    y_test_column = np.asarray(y_test_scaled).reshape(-1, 1)
    y_preds_column = np.asarray(y_preds_scaled).reshape(-1, 1)

    y_test_original = scaler.inverse_transform(
        np.concatenate((y_test_column, other_features), axis=1))[:, 0]

    y_preds_original = scaler.inverse_transform(
        np.concatenate((y_preds_column, other_features), axis=1))[:, 0]

    return y_test_original, y_preds_original
    
def scaler_inverse_2(y_test_scaled, y_preds_scaled, num_features = 1):
    """Undo the scaling of true and predicted targets, padding with zeros.

    The scaled target is placed in column 0 and the other
    ``num_features - 1`` columns are filled with zeros so the array has the
    width passed to ``scaler.inverse_transform``; only column 0 of the result
    is returned. This presumes the target was the first column when the
    scaler was fitted -- confirm against the caller.

    Parameters
    ----------
    y_test_scaled : numpy.ndarray
        Scaled true targets.
    y_preds_scaled : numpy.ndarray
        Scaled predictions.
    num_features : int, optional
        Total number of columns to build for the inverse transform.

    Returns
    -------
    tuple of numpy.ndarray
        ``(y_test_original, y_preds_original)``, each 1-D.
    """
    # Reshape predictions and true values into single columns.
    y_preds_scaled = y_preds_scaled.reshape(-1, 1)
    y_test_scaled = y_test_scaled.reshape(-1, 1)

    # Zero padding for the non-target columns.
    zeros = np.zeros((len(y_preds_scaled), num_features - 1))
    predictions_extended = np.concatenate([y_preds_scaled, zeros], axis=1)
    # Fixed: this previously concatenated the undefined name `y_test`
    # instead of the reshaped `y_test_scaled`.
    y_test_extended = np.concatenate([y_test_scaled, zeros], axis=1)

    # Inverse transform and keep only the target column.
    y_preds_original = scaler.inverse_transform(predictions_extended)[:, 0]
    y_test_original = scaler.inverse_transform(y_test_extended)[:, 0]

    return y_test_original, y_preds_original


In [29]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def eva(y_test_scaled, y_preds_scaled, X_test):
    """Report sMAPE, MAE and RMSE for predictions after undoing the scaling.

    Parameters
    ----------
    y_test_scaled : numpy.ndarray
        Scaled true targets.
    y_preds_scaled : numpy.ndarray
        Scaled predictions.
    X_test : numpy.ndarray
        Test windows, passed through to ``scaler_inverse``.

    Returns
    -------
    tuple
        ``(y_test, y_preds)`` as returned by ``scaler_inverse``.
    """
    # scaler_inverse_2 (zero padding) is the alternative inverse helper;
    # this function uses the feature-padded variant.
    actual, predicted = scaler_inverse(y_test_scaled, y_preds_scaled, X_test)

    # Symmetric MAPE
    lstm_smape = smape(actual, predicted)
    print(f"LSTM sMAPE: {lstm_smape:.2f}")

    # Mean absolute error
    mae = mean_absolute_error(actual, predicted)
    print(f"Mean Absolute Error (MAE): {mae:.2f}")

    # Root mean squared error
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

    return actual, predicted