In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import itertools
import random
import requests
import os


from tsfeatures import tsfeatures
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

In [2]:
url = 'https://raw.githubusercontent.com/numenta/NAB/master/data/realAWSCloudwatch/grok_asg_anomaly.csv'

data = pd.read_csv(url)
df = data.copy()
df.head()

Unnamed: 0,timestamp,value
0,2014-01-16 00:00:00,33.5573
1,2014-01-16 00:05:00,33.446
2,2014-01-16 00:10:00,33.4447
3,2014-01-16 00:15:00,33.3333
4,2014-01-16 00:20:00,33.4447


In [3]:
default_freq = 'H'

In [4]:
# Function to get a random start date from the DataFrame index
def get_random_start_date(index):
    return np.random.choice(index)

# Main function to repeat the process until non-None frequency is obtained
def find_non_none_frequency(df, offset=9):
    while True:
        # Get a random start date from the DataFrame index
        start_date = pd.to_datetime(get_random_start_date(df.index))

        # Find the index of the end date by moving 9 steps through the indices
        end_date_index = df.index.get_loc(start_date) + offset

        # Check if the end date index is within the range of the DataFrame index
        if end_date_index < len(df.index):
            # Calculate the end date using the index
            end_date = df.index[end_date_index]

            # Infer frequency within the specified date range
            subset_df = df.loc[start_date:end_date]
            freq = pd.infer_freq(subset_df.index)

            if freq is not None:
                print("Inferred frequency within range", start_date, "-", end_date, ":", freq)
                return freq  # Exit the loop and return the inferred frequency

In [5]:
def max_consecutive_missing_dates(inferred_freq, missing_dates):
    # Function to check if two dates are consecutive based on the inferred frequency
    def are_consecutive(date1, date2, freq):
        # Calculate the difference between dates based on the inferred frequency
        diff = date2 - date1
        # Check if the difference matches the frequency
        if freq == 'D':
            return diff.days == 1
        elif freq.endswith('H')| freq.endswith('h'):
             # If the frequency ends with 'H', check if it represents hourly intervals
            if freq[:-1]:  # Check if there is a multiplier
                  interval = int(freq[:-1])
                  return diff.total_seconds() == interval * 3600
            else:
                   # If no multiplier is provided, it's assumed to be one hour
                   return diff.total_seconds() == 3600
        elif freq.endswith('T') | freq.endswith('min') :
            if freq.endswith('T'):
                # Extract the interval from the frequency string
                interval = int(freq[:-1])
                return diff.seconds // 60 == interval
            else:
                interval = int(freq[:-3])
                return diff.seconds // 60 == interval
        else:
            raise ValueError("Unsupported frequency: {}".format(freq))

    # Initialize variables to track maximum length and current length
    max_consecutive_missing = 0
    current_consecutive_missing = 0

    # Iterate over the missing dates
    for i in range(1, len(missing_dates)):
        # Check if the current date is consecutive with the previous date
        if are_consecutive(missing_dates[i - 1], missing_dates[i], inferred_freq):
            # Increment current consecutive missing count
            current_consecutive_missing += 1
        else:
            # Update maximum consecutive missing count if needed
            max_consecutive_missing = max(max_consecutive_missing, current_consecutive_missing)
            # Reset current consecutive missing count
            current_consecutive_missing = 0

    # Update max_consecutive_missing if current_consecutive_missing is still greater
    max_consecutive_missing = max(max_consecutive_missing, current_consecutive_missing)

    return max_consecutive_missing

In [6]:
def preprocess(df):
    # Convert 'timestamp' column to datetime format and rename it to 'ds'
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Removing the duplicate rows
    df = df[~df.duplicated(keep='first')]

    duplicated_dates_length = len(df[df['timestamp'].duplicated(keep=False)])

    if  duplicated_dates_length > 0:
      print("Number of Duplicated Dates: "+ str(duplicated_dates_length))
      # To make the mean as the value for the numerical columns if there are different values for a particular date
      df = df.groupby('timestamp').mean()
      # Reset index to bring 'timestamp' column back
      df.reset_index(inplace=True)

    df.set_index(['timestamp'], inplace=True)
    df.sort_index()

    # Create a date range with hourly frequency covering the entire time range
    start_date = df.index.min()
    end_date = df.index.max()

    #inferred_freq = pd.infer_freq(df.index)
    inferred_freq = find_non_none_frequency(df)

    if inferred_freq is None:
      inferred_freq = default_freq # setting the default frequency
      print("Cannot infer the frequency of the timestamp of the dataset. Therefore the default frequency of " + default_freq+ " will be used")

    expected_date_range = pd.date_range(start=start_date, end=end_date, freq=inferred_freq)

    # Find the missing date entries
    missing_dates = expected_date_range[~expected_date_range.isin(df.index)]
    # Print or work with the list of missing dates
    print("Number of Missing Dates: "+ str(len(missing_dates))+"\n")

    if len(missing_dates) > 0:
      df = df.asfreq(inferred_freq)
      df.sort_index()

      # Call the function with inferred_freq and missing_dates parameters
      max_consecutive = max_consecutive_missing_dates(inferred_freq, missing_dates)
      print("Maximum length of consecutive missing dates:", max_consecutive)
      if max_consecutive > 3:
        print("It is better to use other imputation method rather than linear interpolation")

      df['value'] = df['value'].interpolate(method='linear')

    return df

In [7]:
df = preprocess(df)

Inferred frequency within range 2014-01-23 01:10:00 - 2014-01-23 01:55:00 : 5min
Number of Missing Dates: 0



In [8]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=df.index, y=df['value'], mode='lines', name='Value'))

# Update layout
fig.update_layout( xaxis_title='Timestamp', yaxis_title='Value')

# Show plot
fig.show()


TS Feature Extraction

In [9]:
dfs = []

In [10]:
def extract_features( new_df):
    new_df.index = new_df.index  # Set the index (you can perform operations here if needed)
    new_df.rename(columns={'value': 'y'}, inplace=True)
    new_df['unique_id'] = 0 
    dfs.append(new_df)  # Append the modified DataFrame to the list
    return dfs

extract_features(df.copy())

[                           y  unique_id
 timestamp                              
 2014-01-16 00:00:00  33.5573          0
 2014-01-16 00:05:00  33.4460          0
 2014-01-16 00:10:00  33.4447          0
 2014-01-16 00:15:00  33.3333          0
 2014-01-16 00:20:00  33.4447          0
 ...                      ...        ...
 2014-02-01 00:40:00   0.3380          0
 2014-02-01 00:45:00   0.0000          0
 2014-02-01 00:50:00   0.0000          0
 2014-02-01 00:55:00   0.0000          0
 2014-02-01 01:00:00   0.3340          0
 
 [4621 rows x 2 columns]]

In [11]:
combined_df = pd.concat(dfs, ignore_index=True)
# Assuming tsfeatures function is defined elsewhere and imported
features = tsfeatures(combined_df, freq=288)
#features = tsfeatures(combined_df, dict_freqs={'T': 60, '2T': 30,'3T': 20, '4T': 15,'5T': 12,'10T': 6,'15T': 4,'20T': 3,'30T': 2, 'H': 24, '2H': 12,'3H': 8, '4H': 6,'6H': 4,'8H': 3,'12H': 2, 'D': 7, 'W': 52, 'M': 12})
df_features = pd.DataFrame(features)
df_features

Unnamed: 0,unique_id,hurst,series_length,unitroot_pp,unitroot_kpss,hw_alpha,hw_beta,hw_gamma,stability,nperiods,...,entropy,crossing_points,arch_lm,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
0,0,1.246498,4621,-12.709826,20.719757,0.406679,1.427202e-10,7.085164e-08,1.025456,1,...,0.254064,1779,0.972019,0.987206,9.552269,-0.347389,0.13838,-0.605874,0.392035,0.638466


Splitting of dataset

In [12]:
def split_data(df, train_ratio=0.6):
    train_size = int(len(df) * train_ratio)
    train, val = df[:train_size], df[train_size:]
    return train, val

Exponential Smoothing

In [13]:
seasonal_periods = [6, 12, 24, 36, 48, 60, 72, 96, 120, 144]
smoothing_level = [0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 0.9]
smoothing_seasonal = [0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 0.9]

all_combinations = list(itertools.product(seasonal_periods, smoothing_level, smoothing_seasonal))


# Define the number of random combinations to sample
num_samples = 100

# Set the seed for reproducibility
random.seed(42)

# Randomly sample from all possible combinations
param_grid_exponential = random.sample(all_combinations, num_samples)

param_grid_exponential

[(72, 0.6, 0.8),
 (12, 0.2, 0.2),
 (6, 0.2, 0.8),
 (96, 0.8, 0.2),
 (24, 0.9, 0.1),
 (24, 0.5, 0.9),
 (24, 0.4, 0.4),
 (12, 0.5, 0.2),
 (96, 0.6, 0.9),
 (12, 0.1, 0.5),
 (96, 0.1, 0.5),
 (144, 0.9, 0.5),
 (144, 0.4, 0.2),
 (60, 0.6, 0.9),
 (6, 0.9, 0.4),
 (72, 0.2, 0.2),
 (48, 0.4, 0.9),
 (6, 0.4, 0.4),
 (6, 0.4, 0.2),
 (6, 0.9, 0.8),
 (24, 0.2, 0.9),
 (24, 0.5, 0.1),
 (60, 0.2, 0.9),
 (72, 0.4, 0.1),
 (6, 0.2, 0.9),
 (60, 0.9, 0.1),
 (24, 0.1, 0.5),
 (96, 0.5, 0.4),
 (72, 0.8, 0.5),
 (96, 0.4, 0.4),
 (144, 0.8, 0.1),
 (48, 0.4, 0.6),
 (24, 0.4, 0.1),
 (48, 0.6, 0.8),
 (72, 0.2, 0.1),
 (24, 0.9, 0.4),
 (120, 0.5, 0.2),
 (144, 0.1, 0.6),
 (6, 0.1, 0.5),
 (96, 0.9, 0.5),
 (120, 0.4, 0.9),
 (12, 0.6, 0.6),
 (96, 0.4, 0.1),
 (144, 0.6, 0.6),
 (36, 0.5, 0.9),
 (144, 0.2, 0.9),
 (12, 0.6, 0.4),
 (24, 0.2, 0.8),
 (96, 0.9, 0.8),
 (36, 0.5, 0.6),
 (144, 0.8, 0.6),
 (144, 0.6, 0.2),
 (36, 0.9, 0.8),
 (12, 0.1, 0.1),
 (36, 0.8, 0.2),
 (120, 0.8, 0.9),
 (36, 0.6, 0.2),
 (72, 0.4, 0.2),
 (24, 0.8,

In [14]:
def exponential_smoothing(train, val, param_grid):
    train_values = train['value']  # Extracting only the 'value' column
    val_values = val['value']      # Extracting only the 'value' column

    # Initialize variables to store best parameters and performance
    best_params = None
    best_score = float('inf')

    # Iterate over parameter grid
    for params in param_grid:
        # Extract parameters
        seasonal_periods = params[0]
        smoothing_level = params[1]
        smoothing_seasonal = params[2]

        # Fit the model with current parameters
        model = ExponentialSmoothing(train_values, trend=None, seasonal='add', seasonal_periods=seasonal_periods)
        fitted_model = model.fit(smoothing_level=smoothing_level, smoothing_seasonal=smoothing_seasonal)

        # Make predictions
        forecast = fitted_model.forecast(steps=len(val))

        # Evaluate performance
        mse = mean_squared_error(val_values, forecast)

        # Update best parameters if the current parameters yield a lower MSE
        if mse < best_score:
            best_score = mse
            best_params = params

    print("Exponential Smoothing")
    print("Best parameters of Seasonal Periods, Smoothing Level and Smoothing Seasonal:", best_params)

    # Extract the best parameters
    best_seasonal_periods = best_params[0]
    best_smoothing_level = best_params[1]
    best_smoothing_seasonal = best_params[2]

    # Fit the final model with the best parameters
    final_model = ExponentialSmoothing(train_values, trend=None, seasonal='add', seasonal_periods=best_seasonal_periods)
    final_fitted_model = final_model.fit(smoothing_level=best_smoothing_level, smoothing_seasonal=best_smoothing_seasonal)

    # Make final forecast
    final_forecast = final_fitted_model.forecast(steps=len(val))

    forecast_df = pd.DataFrame(final_forecast, index=val.index, columns=['Forecast'])

    final_mse = mean_squared_error(val_values, final_forecast)
    final_mae = mean_absolute_error(val_values, final_forecast)

    print("Mean Absolute Error:", final_mae)
    print("Mean Squared Error:", final_mse)


    return forecast_df, final_mae, final_mse


ARIMA

In [15]:
from pmdarima.arima import auto_arima

In [16]:
def arima(train, val):
    # Extracting only the 'value' column
    train_values = train['value']

    # Perform automated ARIMA test
    arima_model = auto_arima(train_values, seasonal=True)

    # Forecast on the validation data
    forecast = arima_model.predict(n_periods=len(val))

    p, d, q = arima_model.order

    # Create a DataFrame with the forecasted values
    forecast_df = pd.DataFrame(forecast, index=val.index, columns=['Forecast'])

    print("Arima")

    # Calculate MAE and MSE
    mae = mean_absolute_error(val['value'], forecast)
    mse = mean_squared_error(val['value'], forecast)

    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)

    return forecast_df, mae, mse

XGBoost

In [17]:

def create_features(df, label=None):
    """
    Creates time series features from datetime index
    """
    df['date'] = df.index
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day

    X = df[['hour','dayofweek','quarter','month','year',
            'dayofyear','dayofmonth']]
    if label:
        y = df[label]
        return X, y
    return X

In [18]:
def xgboost(train, val):
    # Feature Engineering
    lags = 12  # You can adjust this
    for i in range(1, lags + 1):
        train[f'lag_{i}'] = train['value'].shift(i)
        val[f'lag_{i}'] = val['value'].shift(i)

    # Create features and target variable
    X_train, y_train = create_features(train, label='value')
    X_val, y_val = create_features(val, label='value')

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Hyperparameter tuning
    param_grid = {
        'n_estimators': [25, 50, 100, 150, 200, 300, 400, 500, 1000],
        'max_depth': [2, 3, 5, 7, 10],
        'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.3],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2, 0.3, 0.4],
        'reg_alpha': [0, 0.1, 0.5, 1, 10],
        'reg_lambda': [0, 0.1, 0.5, 1, 10],
        'min_child_weight': [1, 3, 5, 7, 10],
    }
    xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
    search = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid, n_iter=100, cv=2, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
    search.fit(X_train_scaled, y_train)

    print("XGBoost")

    best_params = search.best_params_
    print("Best Parameters:", best_params)

    # Model training with best parameters
    model = XGBRegressor(**best_params, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Model evaluation
    forecast = model.predict(X_val_scaled)

    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_val, forecast)
    print(f'Mean Absolute Error: {mae}')

    mse = mean_squared_error(y_val, forecast)
    print(f'Mean Squared Error: {mse}')

    # Create a DataFrame with the forecasted values
    forecast_df = pd.DataFrame(forecast, index=y_val.index, columns=['Forecast'])

    return forecast_df, mae, mse

Visualization of model prediction

In [19]:
def plot_forecast_interactive(forecast_df, val, model_name):

    # Create a directory if it doesn't exist
    output_folder = os.path.join("visualization", model_name)
    os.makedirs(output_folder, exist_ok=True)

    # Plot forecast and real values
    forecast_trace = go.Scatter(x=forecast_df.index, y=forecast_df['Forecast'], mode='lines', name='Forecast')
    real_trace = go.Scatter(x=val.index, y=val['value'], mode='lines', name='Real')

    # Create the layout
    layout = go.Layout(title=f"{model_name}",xaxis=dict(title='Timestamp'), yaxis=dict(title='Value'))

    # Combine traces into a list
    data = [forecast_trace, real_trace]

    # Create the figure
    fig = go.Figure(data=data, layout=layout)

    # Show the interactive plot
    fig.show()

Stacking Approach for Predictions

In [20]:
def stacked_model_predictions(val, base_preds):
    # Splitting features and target variable
    X_train, X_val, y_train, y_val = train_test_split(base_preds, val, test_size=0.2, random_state=42)

    # Define parameter grid for Random Forest
    param_grid = {
        'n_estimators': [25, 50, 100, 150, 200],  # Number of trees in the forest
        'max_depth': [None, 10, 20, 30],      # Maximum depth of the tree
        'min_samples_split': [2, 5, 8, 10, 15],  # Minimum number of samples required to split an internal node
        'min_samples_leaf': [1, 2, 4, 6]     # Minimum number of samples required to be at a leaf node
    }

    # Initialize Random Forest regressor
    rf = RandomForestRegressor(random_state=42)

    search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=100, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
    search.fit(X_train, y_train)

    print("Stacking Approach")

    # Print the best estimator found
    print(search.best_estimator_)

    # Make predictions using the best model
    y_pred = search.best_estimator_.predict(X_val)

    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_val, y_pred)
    print("Mean Absolute Error (MAE):", mae)

    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y_val, y_pred)
    print("Mean Squared Error (MSE):", mse)

    # Calculate Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)
    print("Root Mean Squared Error (RMSE):", rmse)

    print("")

    return y_pred, y_val, mae, mse


In [21]:
def plot_predictions_stack_interactive(val, y_pred, y_val):
    # Plot predicted and actual values
    pred_trace = go.Scatter(x=val.index, y=y_pred, mode='lines', name='Predicted', line=dict(color='red'))
    val_trace = go.Scatter(x=val.index, y=y_val, mode='lines', name='Actual', line=dict(color='blue'))

    # Create the layout
    layout = go.Layout(title="Interactive Plot",
                       xaxis=dict(title='Timestamp'),
                       yaxis=dict(title='Value'))

    # Combine traces into a list
    data = [pred_trace, val_trace]

    # Create the figure
    fig = go.Figure(data=data, layout=layout)

    # Show the interactive plot
    fig.show()

In [22]:
def generate_stacked_predictions(best_models, train, val):
    base_preds = []
    val_copy=val.copy()
    for model in best_models:
        if model.lower() == "arima":
            arima_forecast_df, arima_mae, arima_mse = arima(train, val)
            plot_forecast_interactive(arima_forecast_df, val, "arima")
            arima_prediction = arima_forecast_df['Forecast'].values
            base_preds.append(arima_prediction)
        elif model.lower() == "exponential_smoothing":
            exponential_forecast_df, exponential_mae, exponential_mse = exponential_smoothing(train, val, param_grid_exponential)
            plot_forecast_interactive(exponential_forecast_df, val, "exponential_smoothing")
            exponential_prediction = exponential_forecast_df['Forecast'].values
            base_preds.append(exponential_prediction)
        elif model.lower() == "xgboost":
            xgboost_forecast_df, xgboost_mae, xgboost_mse = xgboost(train, val)
            plot_forecast_interactive(xgboost_forecast_df, val, "XGBoost")
            xgboost_prediction = xgboost_forecast_df['Forecast'].values
            base_preds.append(xgboost_prediction)

    if len(best_models) == 1:
        # If only one model is selected, return its prediction directly
        return base_preds[0]

    else:
        base_preds = np.stack(base_preds, axis=-1)
        y_pred, y_val, mae, mse = stacked_model_predictions(val_copy['value'].values, base_preds)
        plot_predictions_stack_interactive(val, y_pred, y_val)
        return y_pred

Some temporary preparations for running ensemble size moldel

In [23]:
df_features

Unnamed: 0,unique_id,hurst,series_length,unitroot_pp,unitroot_kpss,hw_alpha,hw_beta,hw_gamma,stability,nperiods,...,entropy,crossing_points,arch_lm,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
0,0,1.246498,4621,-12.709826,20.719757,0.406679,1.427202e-10,7.085164e-08,1.025456,1,...,0.254064,1779,0.972019,0.987206,9.552269,-0.347389,0.13838,-0.605874,0.392035,0.638466


In [24]:
# Replace null values with 0 in each column
df_features= df_features.fillna(0)


In [25]:
# Assuming df_features is your DataFrame
df_features['model_1'] = 1
df_features['model_2'] = 2
df_features['model_3'] = 0

# Print the DataFrame to verify the changes
df_features

Unnamed: 0,unique_id,hurst,series_length,unitroot_pp,unitroot_kpss,hw_alpha,hw_beta,hw_gamma,stability,nperiods,...,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1,model_1,model_2,model_3
0,0,1.246498,4621,-12.709826,20.719757,0.406679,1.427202e-10,7.085164e-08,1.025456,1,...,0.987206,9.552269,-0.347389,0.13838,-0.605874,0.392035,0.638466,1,2,0


Running the ensemble size model

In [26]:
# Drop unique_id and ensemble_size from features
X = df_features.drop(['unique_id'], axis=1)

In [27]:
import os
import pickle

# Directory containing the pickle files
model_directory = "model_pickle/ensemble_size"

# List all files in the directory
pickle_files = [file for file in os.listdir(model_directory) if file.endswith('.pkl')]

if len(pickle_files) == 0:
    print("No pickle files found in the directory.")
else:
    # Load the first pickle file found
    model_filename = os.path.join(model_directory, pickle_files[0])
    with open(model_filename, 'rb') as f:
        model = pickle.load(f)

    # Make predictions on your data
    predictions = model.predict(X)  # Replace df_data with your actual DataFrame containing the data

    # The variable 'predictions' now contains the predicted y values for your data
    print(predictions)

[3]


In [28]:
# Ensure predictions is a single number, not a list
prediction_value = predictions[0]

# Create a list of column names to select based on the prediction value
columns_to_select = [f"model_{i}" for i in range(1, prediction_value + 1)]

# Select only the columns starting with 'model_' and based on the prediction value
selected_columns = df_features.filter(regex='^model_', axis=1)[columns_to_select]

selected_columns 

Unnamed: 0,model_1,model_2,model_3
0,1,2,0


In [29]:
# Mapping dictionary
mapping = {
    0: 'arima',
    1: 'exponential_smoothing',
    2: 'xgboost'
}

In [30]:
# Apply mapping to each column in selected_columns
mapped_values = selected_columns.apply(lambda col: col.map(mapping))

# Combine all mapped values into a single Series
combined_values = mapped_values.stack()

# Extract unique values to get the best models
best_models = combined_values.unique().tolist()

best_models

['exponential_smoothing', 'xgboost', 'arima']

In [31]:
train, val = split_data(df)
y_pred=generate_stacked_predictions(best_models, train, val)

Exponential Smoothing
Best parameters of Seasonal Periods, Smoothing Level and Smoothing Seasonal: (48, 0.4, 0.9)
Mean Absolute Error: 16.147169583880842
Mean Squared Error: 446.7303465833384


XGBoost
Best Parameters: {'subsample': 0.9, 'reg_lambda': 0.1, 'reg_alpha': 0, 'n_estimators': 25, 'min_child_weight': 7, 'max_depth': 2, 'learning_rate': 0.3, 'gamma': 0.1, 'colsample_bytree': 0.6}
Mean Absolute Error: 16.483432930232823
Mean Squared Error: 528.4817991869792


Arima
Mean Absolute Error (MAE): 16.19826120594921
Mean Squared Error (MSE): 511.85912595884315


Stacking Approach
RandomForestRegressor(max_depth=30, min_samples_leaf=2, min_samples_split=8,
                      n_estimators=25, random_state=42)
Mean Absolute Error (MAE): 7.455973823223816
Mean Squared Error (MSE): 143.50764782096613
Root Mean Squared Error (RMSE): 11.979467760337524

