# **1. Environment Setup and Data Loading**

In [None]:
!pip install gdown

In [None]:
!pip install keras-tuner

In [None]:
import gdown
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf
import tensorflow as tf
from tensorflow import keras as tfk
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional
import random
from bayes_opt import BayesianOptimization

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, BatchNormalization

# Fixed seed for the notebook. It is applied to Python's `random`, NumPy and
# TensorFlow so that repeated runs start from the same random state (defining
# the value without applying it has no effect on any generator).
seed = 72
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Google Drive direct-download URLs for the three .npy files used below
# (loaded as `categories`, `training_dataset` and `valid_periods` respectively).
url_categ = "https://drive.google.com/uc?id=1UiZliu_AQdlkjRhVf61Cc0_iJNvIbSyJ"
url_train = "https://drive.google.com/uc?id=1hIkzsOiDMX5B7pwxyJkiOEBV1nW6_cOB"
url_valid = "https://drive.google.com/uc?id=1nV6ugTmqf--NTzBZCpb80PO0YZmsjigs"

# Function to download and load a .npy file
def download_and_load_npy(url):
    """Fetch the file behind ``url`` with gdown and return it as a NumPy array.

    The file is saved under whatever path ``gdown.download`` reports, and
    that path is then read with ``np.load``.
    """
    local_path = gdown.download(url, quiet=True)
    array = np.load(local_path)
    return array

# Download each file and load it into memory as a NumPy array.
categories = download_and_load_npy(url_categ)
training_dataset = download_and_load_npy(url_train)
valid_periods = download_and_load_npy(url_valid)

# **2. Data Preparation**

> Converts NumPy arrays into Pandas DataFrames, handles column renaming, and replaces 0 values with NaN.

In [None]:
# Wrap each raw NumPy array in a DataFrame (no transpose is applied).
# NOTE(review): the next cell builds cdf/tdf/vdf from the same three arrays and
# these frames are not referenced again in the visible notebook — confirm they
# are still needed.
categ_df = pd.DataFrame(categories)
train_df = pd.DataFrame(training_dataset)
val_df = pd.DataFrame(valid_periods)

In [None]:
# Working DataFrames built from the raw arrays: categories, training data and
# valid periods.
cdf = pd.DataFrame(categories)
tdf = pd.DataFrame(training_dataset)
vdf = pd.DataFrame(valid_periods)
# Name the two columns of vdf: "Start" is the time instance at which a time
# series starts and "End" the time instance at which it ends.
vdf = vdf.rename(columns={0: "Start", 1: "End"})

In [None]:
# Treat every exact 0 in the training data as a missing value (NaN).
# NOTE(review): this also converts genuine zero observations, not only
# padding — presumably 0 is used purely as padding here; confirm.
tdf = tdf.replace(0, np.nan)

# **3. Handling Missing Values**

> Implements a sliding window mean to fill missing values in the time series data.




In [None]:
window_size = 5

# Fill each missing value with the mean of the non-missing values inside the
# trailing window of `window_size` positions that ends at it, moving along the
# row (each row is one series). min_periods=1 means a single valid neighbour is
# enough; positions whose whole window is missing stay NaN.
#
# The rolling mean is computed for every row at once on the transposed frame,
# which yields the same values as looping over the rows with .iloc but avoids
# one Python-level iteration and one row assignment per series. It does hold a
# second full-size frame in memory while it runs.
trailing_mean = tdf.T.rolling(window=window_size, min_periods=1).mean().T

# fillna returns a new frame, so the original tdf is left untouched.
tdf_filled = tdf.fillna(trailing_mean)

tdf_filled

In [None]:
# Continue with the gap-filled data under the original name.
tdf = tdf_filled

# **4. Data Scaling**

> Applies Robust Scaler and Min-Max Scaler to normalize and scale the time series data.



In [None]:
def build_scaled_df(df):
    """Return ``df`` rescaled by a RobustScaler followed by a MinMaxScaler.

    Both scalers are fitted on the data passed in and applied one after the
    other. The result is a new DataFrame carrying the index and column labels
    of ``df``.

    NOTE(review): min-max scaling is unchanged by a per-column shift and
    positive rescale, so the RobustScaler step likely has no effect on the
    final values — confirm whether it is needed.
    """
    values = df
    # Fit and apply each scaler in turn; the output of one feeds the next.
    for scaler in (RobustScaler(with_centering=True), MinMaxScaler()):
        values = scaler.fit_transform(values)

    # fit_transform returns a bare array, so re-attach the original labels.
    return pd.DataFrame(data=values, columns=df.columns, index=df.index)

In [None]:
# Scale the gap-filled training data; TOTAL is the frame used from here on.
TOTAL = build_scaled_df(tdf)

In [None]:
# Keep only the columns from position 2490 onward; .copy() detaches the slice
# from the full frame.
# NOTE(review): 2490 is a hard-coded cut-off — presumably chosen to keep only
# the most recent time steps; confirm.
TOTAL = TOTAL.iloc[:,2490:].copy()

# **5. Model Building and Training**

*   Removes rows with a high percentage of missing values and interpolates remaining missing values.
*   Creates input-output sequences for the LSTM model using a sliding window approach.
*   Defines a function to build an LSTM model with tunable hyperparameters and performs random search for optimal values.
*   Retrieves and displays the summary of the best-performing LSTM model from the hyperparameter tuning.









In [None]:
def threshold_df(df, threshold=0.15):
    """Drop the rows of ``df`` that have too many missing values.

    Args:
        df: DataFrame whose rows are filtered.
        threshold: Largest fraction of missing values (0.0-1.0) a row may
            have and still be kept. Defaults to 0.15.

    Returns:
        A new DataFrame holding only the rows whose fraction of missing
        values is at most ``threshold``. It is an independent copy, so it can
        be modified in place without touching ``df``.
    """
    # Fraction of NaN cells in each row.
    missing_fraction = df.isnull().mean(axis=1)

    # Explicit copy: callers go on to modify the result.
    return df.loc[missing_fraction <= threshold].copy()

In [None]:
# Drop the series (rows) that have too many missing values.
TOTAL = threshold_df(TOTAL)

# Fill the remaining gaps by linear interpolation along each row.
# Every row of TOTAL is one time series and every column one time step
# (build_sequences slides its windows along the columns), so the interpolation
# has to run with axis=1. With axis=0 a gap would be filled from the rows above
# and below it, i.e. from different, unrelated series.
# limit_direction='both' also fills leading and trailing gaps in the same pass,
# which is what the separate forward and backward calls achieved.
# The result is reassigned rather than modified in place so that a filtered
# frame is never mutated.
TOTAL = TOTAL.interpolate(method='linear', limit_direction='both', axis=1)

In [None]:
def build_sequences(df, time_steps=200, future_steps=9, max_offset=50, stride=5):
    """Cut every row of ``df`` into (input window, forecast target) pairs.

    For each row, and for each start column ``j`` in
    ``0, stride, 2*stride, ... <= max_offset``, one sample is produced:
    the input is columns ``j .. j+time_steps-1`` and the target is the
    ``future_steps`` columns that follow immediately.

    Args:
        df: DataFrame with one series per row and one time step per column.
        time_steps: Length of each input window. Defaults to 200.
        future_steps: Length of each target window. Defaults to 9.
        max_offset: Largest start column to use. Defaults to 50.
        stride: Distance between consecutive start columns. Defaults to 5.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with shapes
        ``(n_samples, time_steps)`` and ``(n_samples, future_steps)``.
        Samples are ordered row by row, and by increasing start column within
        a row. An empty ``df`` gives arrays with zero samples but the correct
        second dimension.

    Raises:
        ValueError: If a size argument is not positive, ``max_offset`` is
            negative, or ``df`` has too few columns for the last window to
            fit (which would otherwise produce ragged, unusable windows).
    """
    if time_steps < 1 or future_steps < 1 or stride < 1:
        raise ValueError("time_steps, future_steps and stride must be >= 1")
    if max_offset < 0:
        raise ValueError("max_offset must be >= 0")

    values = df.to_numpy()
    n_rows, n_cols = values.shape

    # Start column of every window taken from a row.
    starts = np.arange(0, max_offset + 1, stride)

    # The last window must fit completely inside the frame.
    needed_cols = int(starts[-1]) + time_steps + future_steps
    if n_cols < needed_cols:
        raise ValueError(
            f"df has {n_cols} columns but at least {needed_cols} are required"
        )

    # Column indices of every input / target window, one row per start column.
    x_cols = starts[:, None] + np.arange(time_steps)
    y_cols = starts[:, None] + time_steps + np.arange(future_steps)

    # Indexing yields (n_rows, n_starts, window); flattening the first two axes
    # keeps the row-major sample order described above.
    n_samples = n_rows * len(starts)
    X = values[:, x_cols].reshape(n_samples, time_steps)
    y = values[:, y_cols].reshape(n_samples, future_steps)

    return X, y

In [None]:
# Build the model inputs (X_total) and targets (y_total) from every row of TOTAL.
X_total, y_total = build_sequences(TOTAL)

In [None]:
# Insert a new axis of length 1 at position 2, so every time step carries a
# single feature (the LSTM below is declared with input_shape=(time_steps, 1)).
X_total = np.expand_dims(X_total, axis=2)

In [None]:
# Inspect the shape of the full input array.
X_total.shape

In [None]:
# Hold out 20% of the samples. shuffle=False keeps the original sample order,
# so the split is a single cut rather than a random draw.
X_total_train, X_total_test, y_total_train, y_total_test = train_test_split(X_total, y_total, test_size=0.2, shuffle=False)

In [None]:
# Inspect the shape of the training inputs.
X_total_train.shape

In [None]:
# Inspect the shape of the training targets.
y_total_train.shape

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from kerastuner.tuners import RandomSearch

# Input window length and forecast horizon read by build_model.
# NOTE(review): these repeat the values used by build_sequences; keep the two
# in sync.
time_steps = 200
future_steps = 9

def build_model(hp):
    """Build and compile a two-layer LSTM forecaster for one tuner trial.

    ``hp`` is the hyperparameter object handed in by the tuner. Two values
    are drawn from it: ``units_1`` (100 to 140 in steps of 10) and
    ``learning_rate`` (one of 1e-2, 1e-3, 1e-4). The module-level
    ``time_steps`` and ``future_steps`` fix the input length and the number
    of outputs.
    """
    # Draw the hyperparameters for this trial.
    lstm_units = hp.Int('units_1', min_value=100, max_value=140, step=10)
    step_size = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    network = Sequential([
        # First LSTM layer returns the full sequence so a second one can follow.
        LSTM(units=lstm_units, return_sequences=True, input_shape=(time_steps, 1)),
        # Second LSTM layer reuses the same unit count (it has no separate
        # hyperparameter).
        LSTM(units=lstm_units),
        # One output per forecast step.
        Dense(future_steps),
    ])

    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=step_size),
        loss='mean_squared_error',
        metrics=['MeanSquaredError'],
    )
    return network

# Configure a random search over the hyperparameters declared in build_model.
# Trials are ranked by the validation mean squared error; results are written
# under keras_tuner_logs/lstm_hyperparameter_tuning.
tuner = RandomSearch(
    build_model,
    objective='val_mean_squared_error',
    max_trials=10,
    executions_per_trial=3,
    directory='keras_tuner_logs',
    project_name='lstm_hyperparameter_tuning'
)

# Print a summary of the search space before searching.
tuner.search_space_summary()

# Run the search on the training split.
# NOTE(review): the held-out "test" split is passed as validation data, so it
# drives hyperparameter selection and no untouched test set remains — confirm
# this is intended.
tuner.search(X_total_train, y_total_train,
             validation_data=(X_total_test, y_total_test),
             epochs=10,
             batch_size=32)

# Retrieve the best model found by the search and print its architecture.
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()