In [1]:
#!pip install tsfeatures

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import itertools
import random
import requests
import os
import json
import time
import psutil

from itertools import product
from datetime import datetime
from sklearn.impute import KNNImputer
from tsfeatures import tsfeatures
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

In [3]:
default_freq = 'H'

In [4]:
index_url = 'https://api.github.com/repos/numenta/NAB/contents/data'
# Fetching file names from the index URL
response = requests.get(index_url)
index_data = response.json()

directories = [file['name'] for file in index_data if file['type']=="dir"]
directories

['artificialNoAnomaly',
 'artificialWithAnomaly',
 'realAWSCloudwatch',
 'realAdExchange',
 'realKnownCause',
 'realTraffic',
 'realTweets']

In [5]:
base_url = 'https://raw.githubusercontent.com/numenta/NAB/master/data/'
data = {}

def addFolderAndReadAll(d_name):
    data[d_name] = {}
    response = requests.get(index_url + '/' + d_name)
    index_data = response.json()

    csv_files = [ file['name'] for file in index_data if file['type'] == "file"]
    csvs_num = 0
    for f_name in csv_files:
        data[d_name][f_name] = pd.read_csv(base_url + d_name + '/' + f_name)
        csvs_num += 1
    return csvs_num

csvs_num = sum([addFolderAndReadAll(d_name) for d_name in directories])

Preprocessing

In [6]:
# Function to get a random start date from the DataFrame index
def get_random_start_date(index):
    return np.random.choice(index)

# Main function to repeat the process until non-None frequency is obtained
def find_non_none_frequency(df, offset=9):
    while True:
        # Get a random start date from the DataFrame index
        start_date = pd.to_datetime(get_random_start_date(df.index))

        # Find the index of the end date by moving 9 steps through the indices
        end_date_index = df.index.get_loc(start_date) + offset

        # Check if the end date index is within the range of the DataFrame index
        if end_date_index < len(df.index):
            # Calculate the end date using the index
            end_date = df.index[end_date_index]

            # Infer frequency within the specified date range
            subset_df = df.loc[start_date:end_date]
            freq = pd.infer_freq(subset_df.index)

            if freq is not None:
                print("Inferred frequency within range", start_date, "-", end_date, ":", freq)
                return freq  # Exit the loop and return the inferred frequency

In [7]:
def max_consecutive_missing_dates(inferred_freq, missing_dates):
    # Function to check if two dates are consecutive based on the inferred frequency
    def are_consecutive(date1, date2, freq):
        # Calculate the difference between dates based on the inferred frequency
        diff = date2 - date1
        # Check if the difference matches the frequency
        if freq == 'D':
            return diff.days == 1
        elif freq.endswith('H')| freq.endswith('h'):
             # If the frequency ends with 'H', check if it represents hourly intervals
            if freq[:-1]:  # Check if there is a multiplier
                  interval = int(freq[:-1])
                  return diff.total_seconds() == interval * 3600
            else:
                   # If no multiplier is provided, it's assumed to be one hour
                   return diff.total_seconds() == 3600
        elif freq.endswith('T'):
            # Extract the interval from the frequency string
            interval = int(freq[:-1])
            return diff.seconds // 60 == interval
        else:
            raise ValueError("Unsupported frequency: {}".format(freq))

    # Initialize variables to track maximum length and current length
    max_consecutive_missing = 0
    current_consecutive_missing = 0

    # Iterate over the missing dates
    for i in range(1, len(missing_dates)):
        # Check if the current date is consecutive with the previous date
        if are_consecutive(missing_dates[i - 1], missing_dates[i], inferred_freq):
            # Increment current consecutive missing count
            current_consecutive_missing += 1
        else:
            # Update maximum consecutive missing count if needed
            max_consecutive_missing = max(max_consecutive_missing, current_consecutive_missing)
            # Reset current consecutive missing count
            current_consecutive_missing = 0

    # Update max_consecutive_missing if current_consecutive_missing is still greater
    max_consecutive_missing = max(max_consecutive_missing, current_consecutive_missing)

    return max_consecutive_missing

In [8]:
def preprocess(df, f_name):
    # Convert 'timestamp' column to datetime format and rename it to 'ds'
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Removing the duplicate rows
    df = df[~df.duplicated(keep='first')]

    duplicated_dates_length = len(df[df['timestamp'].duplicated(keep=False)])

    if  duplicated_dates_length > 0:
      print("Number of Duplicated Dates in "+ f_name + ": "+ str(duplicated_dates_length))
      # To make the mean as the value for the numerical columns if there are different values for a particular date
      df = df.groupby('timestamp').mean()
      # Reset index to bring 'timestamp' column back
      df.reset_index(inplace=True)

    df.set_index(['timestamp'], inplace=True)
    df.sort_index()

    # Create a date range with hourly frequency covering the entire time range
    start_date = df.index.min()
    end_date = df.index.max()

    #inferred_freq = pd.infer_freq(df.index)
    inferred_freq = find_non_none_frequency(df)

    if inferred_freq is None:
      inferred_freq = default_freq # setting the default frequency
      print("Cannot infer the frequency of the timestamp of the dataset "+ f_name+ " .Therefore the default frequency of " + default_freq+ " will be used")

    expected_date_range = pd.date_range(start=start_date, end=end_date, freq=inferred_freq)

    # Find the missing date entries
    missing_dates = expected_date_range[~expected_date_range.isin(df.index)]
    # Print or work with the list of missing dates
    print("Number of Missing Dates in "+ f_name + ": "+ str(len(missing_dates)))

    if len(missing_dates) > 0:
      df = df.asfreq(inferred_freq)
      df.sort_index()

      # Call the function with inferred_freq and missing_dates parameters
      max_consecutive = max_consecutive_missing_dates(inferred_freq, missing_dates)
      print("Maximum length of consecutive missing dates:", max_consecutive)
      if max_consecutive > 3:
        print("It is better to use other imputation method rather than linear interpolation")

      df['value'] = df['value'].interpolate(method='linear')

      print("")

    return df

In [9]:
url = 'https://raw.githubusercontent.com/numenta/NAB/master/labels/combined_labels.json'

response = requests.get(url)

if response.status_code == 200:
    labels = json.loads(response.text)
else:
    print("Failed to retrieve data from the URL:", response.status_code)

In [10]:
dir = 'realAdExchange'

# Ensure the directory exists in the data dictionary
if dir in data:
    # Loop through the files in the directory
    for f_name in data[dir]:
        print(f_name)
else:
    print(f"Directory '{dir}' not found in the data.")


exchange-2_cpc_results.csv
exchange-2_cpm_results.csv
exchange-3_cpc_results.csv
exchange-3_cpm_results.csv
exchange-4_cpc_results.csv
exchange-4_cpm_results.csv


In [11]:
for f_name in data[dir]:

    df = preprocess(data[dir][f_name], f_name)

    labels_of_one_file = labels[dir+'/'+f_name]

    df['is_anomaly'] = 0

    for anomalous_timestamp in labels_of_one_file:
      anomalous_timestamp = pd.to_datetime(anomalous_timestamp)
      try:
          df.at[anomalous_timestamp, 'is_anomaly'] = 1  # Set is_anomaly to 1 at the index location

      except KeyError:
          print(f"Anomalous timestamp {anomalous_timestamp} not found in data[{dir}][{f_name}] .")
          pass

    data[dir][f_name] = df

Number of Duplicated Dates in exchange-2_cpc_results.csv: 2
Inferred frequency within range 2011-07-26 22:00:01 - 2011-07-27 07:00:01 : h
Number of Missing Dates in exchange-2_cpc_results.csv: 25
Maximum length of consecutive missing dates: 19
It is better to use other imputation method rather than linear interpolation

Number of Duplicated Dates in exchange-2_cpm_results.csv: 2
Inferred frequency within range 2011-07-17 07:00:01 - 2011-07-17 16:00:01 : h
Number of Missing Dates in exchange-2_cpm_results.csv: 25
Maximum length of consecutive missing dates: 19
It is better to use other imputation method rather than linear interpolation

Inferred frequency within range 2011-08-22 15:15:01 - 2011-08-23 00:15:01 : h
Number of Missing Dates in exchange-3_cpc_results.csv: 109
Maximum length of consecutive missing dates: 14
It is better to use other imputation method rather than linear interpolation

Inferred frequency within range 2011-09-02 21:15:01 - 2011-09-03 06:15:01 : h
Number of Missi

In [12]:
#pip install -U kaleido

Visualization

In [13]:
# Create a directory if it doesn't exist
output_folder = "visualization/pure_format"
os.makedirs(output_folder, exist_ok=True)

In [14]:
import plotly.io as pio

for f_name in data[dir]:

    df = data[dir][f_name]
    # Create a figure using Plotly Express
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=df.index, y=df['value'], mode='lines', name='Value'))

    # Add scatter plot for anomalies
    anomalies = df[df['is_anomaly'] == 1]  # Filter DataFrame to get rows where is_anomaly is 1
    fig.add_trace(go.Scatter(x=anomalies.index, y=anomalies['value'], mode='markers', marker=dict(color='red'), name='Anomalies'))

    # Update layout
    fig.update_layout(title=f"{dir} / {f_name}", xaxis_title='Timestamp', yaxis_title='Value')

    # Show plot
    fig.show()

    # Save plot as PNG file
    file_path = os.path.join(output_folder, f"{dir}_{f_name}.png")
    pio.write_image(fig, file_path)


TS Feature Extraction

In [15]:
dfs = []

In [16]:
def extract_features(dir, file_name, new_df):
    new_df.index = new_df.index  # Set the index (you can perform operations here if needed)
    new_df.rename(columns={'value': 'y'}, inplace=True)
    new_df['unique_id'] = f"{dir}/{file_name}"  # Using the filename as unique identifier
    dfs.append(new_df)  # Append the modified DataFrame to the list
    return dfs

In [17]:
for file_name in data[dir]:
    new_df = data[dir][file_name].copy()
    extract_features(dir, file_name, new_df)

In [18]:
combined_df = pd.concat(dfs, ignore_index=True)
# Assuming tsfeatures function is defined elsewhere and imported
features = tsfeatures(combined_df, freq=288)
#features = tsfeatures(combined_df, dict_freqs={'T': 60, '2T': 30,'3T': 20, '4T': 15,'5T': 12,'10T': 6,'15T': 4,'20T': 3,'30T': 2, 'H': 24, '2H': 12,'3H': 8, '4H': 6,'6H': 4,'8H': 3,'12H': 2, 'D': 7, 'W': 52, 'M': 12})
df_features = pd.DataFrame(features)
df_features

Unnamed: 0,unique_id,hurst,series_length,unitroot_pp,unitroot_kpss,hw_alpha,hw_beta,hw_gamma,stability,nperiods,...,entropy,crossing_points,arch_lm,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
0,realAdExchange/exchange-2_cpc_results.csv,0.906114,1648,-291.651096,3.322436,0.662955,3.689681e-10,2.177951e-10,0.164412,1,...,0.523456,219,0.333751,0.84456,1.723964,0.019727,0.033048,-0.4742,0.247259,0.533564
1,realAdExchange/exchange-2_cpm_results.csv,0.720425,1648,-352.407911,0.68889,0.413818,1.154891e-09,1.32262e-09,0.041318,1,...,0.483681,228,0.270601,0.823062,2.040337,-0.122537,0.066928,-0.590594,0.359987,0.601838
2,realAdExchange/exchange-3_cpc_results.csv,0.82741,1647,-689.70209,0.784406,0.248606,0.0,0.0,0.072987,1,...,0.813993,361,0.243241,0.595922,0.603031,-0.176902,0.101971,-0.466519,0.270492,0.155811
3,realAdExchange/exchange-3_cpm_results.csv,0.898996,1647,-377.537596,3.913879,0.716322,6.293205e-10,1.216726e-10,0.143555,1,...,0.662637,257,0.098898,0.764779,1.248005,-0.05856,0.040519,-0.454639,0.217878,0.290427
4,realAdExchange/exchange-4_cpc_results.csv,0.520317,1647,-1862.138673,1.727024,0.014715,1.600749e-13,4.137589e-09,0.026716,1,...,0.941542,287,0.003958,0.040633,0.032792,-0.504576,0.263658,-0.670898,0.492239,-0.000595
5,realAdExchange/exchange-4_cpm_results.csv,0.390779,1647,-1839.697103,0.181736,0.006275,2.6367e-12,5.935964e-15,0.011183,1,...,0.943788,315,0.005727,0.030553,0.026503,-0.50421,0.26334,-0.671191,0.494804,6e-05


In [19]:
directory = 'dataset_preparation'
if not os.path.exists(directory):
    os.makedirs(directory)

# Specify the file path
file_path = os.path.join(directory, 'df_features_pure_extraction.csv')  # for CSV file
# file_path = os.path.join(directory, 'df_features_pure_extraction.pkl')  # for pickle file

# Save the DataFrame
df_features.to_csv(file_path, index=False)  # for CSV file

Splitting the dataset into training and testing sets

In [20]:
def split_data(df, train_ratio=0.7):
    train_size = int(len(df) * train_ratio)
    train, val = df[:train_size], df[train_size:]
    return train, val

Exponential Smoothing

In [21]:
seasonal_periods = [6, 12, 24, 36, 48, 60, 72, 96, 120, 144]
smoothing_level = [0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 0.9]
smoothing_seasonal = [0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 0.9]

all_combinations = list(itertools.product(seasonal_periods, smoothing_level, smoothing_seasonal))


# Define the number of random combinations to sample
num_samples = 100

# Set the seed for reproducibility
random.seed(42)

# Randomly sample from all possible combinations
param_grid_exponential = random.sample(all_combinations, num_samples)

param_grid_exponential

[(72, 0.6, 0.8),
 (12, 0.2, 0.2),
 (6, 0.2, 0.8),
 (96, 0.8, 0.2),
 (24, 0.9, 0.1),
 (24, 0.5, 0.9),
 (24, 0.4, 0.4),
 (12, 0.5, 0.2),
 (96, 0.6, 0.9),
 (12, 0.1, 0.5),
 (96, 0.1, 0.5),
 (144, 0.9, 0.5),
 (144, 0.4, 0.2),
 (60, 0.6, 0.9),
 (6, 0.9, 0.4),
 (72, 0.2, 0.2),
 (48, 0.4, 0.9),
 (6, 0.4, 0.4),
 (6, 0.4, 0.2),
 (6, 0.9, 0.8),
 (24, 0.2, 0.9),
 (24, 0.5, 0.1),
 (60, 0.2, 0.9),
 (72, 0.4, 0.1),
 (6, 0.2, 0.9),
 (60, 0.9, 0.1),
 (24, 0.1, 0.5),
 (96, 0.5, 0.4),
 (72, 0.8, 0.5),
 (96, 0.4, 0.4),
 (144, 0.8, 0.1),
 (48, 0.4, 0.6),
 (24, 0.4, 0.1),
 (48, 0.6, 0.8),
 (72, 0.2, 0.1),
 (24, 0.9, 0.4),
 (120, 0.5, 0.2),
 (144, 0.1, 0.6),
 (6, 0.1, 0.5),
 (96, 0.9, 0.5),
 (120, 0.4, 0.9),
 (12, 0.6, 0.6),
 (96, 0.4, 0.1),
 (144, 0.6, 0.6),
 (36, 0.5, 0.9),
 (144, 0.2, 0.9),
 (12, 0.6, 0.4),
 (24, 0.2, 0.8),
 (96, 0.9, 0.8),
 (36, 0.5, 0.6),
 (144, 0.8, 0.6),
 (144, 0.6, 0.2),
 (36, 0.9, 0.8),
 (12, 0.1, 0.1),
 (36, 0.8, 0.2),
 (120, 0.8, 0.9),
 (36, 0.6, 0.2),
 (72, 0.4, 0.2),
 (24, 0.8,

In [22]:
def exponential_smoothing(train, val, param_grid):
    train_values = train['value']  # Extracting only the 'value' column
    val_values = val['value']      # Extracting only the 'value' column

    # Initialize variables to store best parameters and performance
    best_params = None
    best_score = float('inf')

    # Iterate over parameter grid
    for params in param_grid:
        # Extract parameters
        seasonal_periods = params[0]
        smoothing_level = params[1]
        smoothing_seasonal = params[2]

        # Fit the model with current parameters
        model = ExponentialSmoothing(train_values, trend=None, seasonal='add', seasonal_periods=seasonal_periods)
        fitted_model = model.fit(smoothing_level=smoothing_level, smoothing_seasonal=smoothing_seasonal)

        # Make predictions
        forecast = fitted_model.forecast(steps=len(val))

        # Evaluate performance
        mse = mean_squared_error(val_values, forecast)

        # Update best parameters if the current parameters yield a lower MSE
        if mse < best_score:
            best_score = mse
            best_params = params

    print("Exponential Smoothing")
    print("Best parameters of Seasonal Periods, Smoothing Level and Smoothing Seasonal:", best_params)

    # Extract the best parameters
    best_seasonal_periods = best_params[0]
    best_smoothing_level = best_params[1]
    best_smoothing_seasonal = best_params[2]

    # Fit the final model with the best parameters
    final_model = ExponentialSmoothing(train_values, trend=None, seasonal='add', seasonal_periods=best_seasonal_periods)
    final_fitted_model = final_model.fit(smoothing_level=best_smoothing_level, smoothing_seasonal=best_smoothing_seasonal)

    # Make final forecast
    final_forecast = final_fitted_model.forecast(steps=len(val))

    forecast_df = pd.DataFrame(final_forecast, index=val.index, columns=['Forecast'])

    final_mse = mean_squared_error(val_values, final_forecast)
    final_mae = mean_absolute_error(val_values, final_forecast)

    print("Mean Absolute Error:", final_mae)
    print("Mean Squared Error:", final_mse)


    return forecast_df, final_mae, final_mse


ARIMA

In [23]:
#pip install pmdarima

In [24]:
from pmdarima.arima import auto_arima

In [25]:
def arima(train, val):
    # Extracting only the 'value' column
    train_values = train['value']

    # Perform automated ARIMA test
    arima_model = auto_arima(train_values, seasonal=True)

    # Forecast on the validation data
    forecast = arima_model.predict(n_periods=len(val))

    p, d, q = arima_model.order

    # Create a DataFrame with the forecasted values
    forecast_df = pd.DataFrame(forecast, index=val.index, columns=['Forecast'])

    print("Arima")

    # Calculate MAE and MSE
    mae = mean_absolute_error(val['value'], forecast)
    mse = mean_squared_error(val['value'], forecast)

    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)

    return forecast_df, mae, mse, p, d, q

SARIMA

In [26]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [27]:
from sklearn.base import BaseEstimator

class SARIMAXWrapper(BaseEstimator):
    def __init__(self, order=(1, 1, 1), seasonal_order=(0, 0, 0, 0)):
        self.order = order
        self.seasonal_order = seasonal_order

    def fit(self, X, y):
        self.model = SARIMAX(endog=y, order=self.order, seasonal_order=self.seasonal_order)
        self.result = self.model.fit()
        return self

    def predict(self, X):
        return self.result.forecast(steps=len(X))

    def get_params(self, deep=True):
        return {"order": self.order, "seasonal_order": self.seasonal_order}

In [28]:
def SARIMA(train, val, p, d, q):

    param_grid = {
        'seasonal_order': [(P, D, Q, s) for P in range(0, 4)
                                          for D in range(0, 4)
                                          for Q in range(0, 4)
                                          for s in [6,12,24,36,48,60,72,96,120,144,288,576]]
     }

    sarima = SARIMAXWrapper(order=(p, d, q))
    search = RandomizedSearchCV(estimator=sarima, param_distributions=param_grid, n_iter=2, cv=2, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
    search.fit(X=train, y=train['value'])

    # Print the best estimator found
    print(search.best_estimator_)

    # Make predictions using the best model
    y_pred = search.best_estimator_.predict(val['value'])

    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(val['value'], y_pred)
    print("Mean Absolute Error (MAE):", mae)

    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(val['value'], y_pred)
    print("Mean Squared Error (MSE):", mse)

    # Calculate Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)
    print("Root Mean Squared Error (RMSE):", rmse)

    print("")

    return y_pred, mae, mse

XGBoost

In [29]:
# Feature Engineering
def create_features(df, label=None):
    """
    Creates time series features from datetime index
    """
    df['date'] = df.index
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day

    X = df[['hour','dayofweek','quarter','month','year',
            'dayofyear','dayofmonth']]
    if label:
        y = df[label]
        return X, y
    return X

In [30]:
def xgboost(train, val):
    # Feature Engineering
    lags = 12  # You can adjust this
    for i in range(1, lags + 1):
        train[f'lag_{i}'] = train['value'].shift(i)
        val[f'lag_{i}'] = val['value'].shift(i)

    # Create features and target variable
    X_train, y_train = create_features(train, label='value')
    X_val, y_val = create_features(val, label='value')

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Hyperparameter tuning
    param_grid = {
        'n_estimators': [25, 50, 100, 150, 200, 300, 400, 500, 1000],
        'max_depth': [2, 3, 5, 7, 10],
        'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.3],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2, 0.3, 0.4],
        'reg_alpha': [0, 0.1, 0.5, 1, 10],
        'reg_lambda': [0, 0.1, 0.5, 1, 10],
        'min_child_weight': [1, 3, 5, 7, 10],
    }
    xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
    search = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid, n_iter=100, cv=2, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
    search.fit(X_train_scaled, y_train)

    print("XGBoost")

    best_params = search.best_params_
    print("Best Parameters:", best_params)

    # Model training with best parameters
    model = XGBRegressor(**best_params, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Model evaluation
    forecast = model.predict(X_val_scaled)

    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_val, forecast)
    print(f'Mean Absolute Error: {mae}')

    mse = mean_squared_error(y_val, forecast)
    print(f'Mean Squared Error: {mse}')

    # Create a DataFrame with the forecasted values
    forecast_df = pd.DataFrame(forecast, index=y_val.index, columns=['Forecast'])

    return forecast_df, mae, mse

Visualization of model predictions - for exponential smoothing

In [31]:
def plot_forecast_interactive(forecast_df, val, file_name, model_name):

    # Create a directory if it doesn't exist
    output_folder = os.path.join("visualization", model_name)
    os.makedirs(output_folder, exist_ok=True)

    # Plot forecast and real values
    forecast_trace = go.Scatter(x=forecast_df.index, y=forecast_df['Forecast'], mode='lines', name='Forecast')
    real_trace = go.Scatter(x=val.index, y=val['value'], mode='lines', name='Real')

    # Create the layout
    layout = go.Layout(title=f"{model_name} / {dir} / {file_name} ",
                       xaxis=dict(title='Timestamp'),
                       yaxis=dict(title='Value'))

    # Combine traces into a list
    data = [forecast_trace, real_trace]

    # Create the figure
    fig = go.Figure(data=data, layout=layout)

    # Show the interactive plot
    fig.show()

    # Save plot as PNG file
    file_path = os.path.join(output_folder, f"{dir}_{file_name}.png")
    pio.write_image(fig, file_path)

In [55]:
def plot_forecast_sarima_interactive( y_pred, val, file_name, model_name):

    # Create a directory if it doesn't exist
    output_folder = os.path.join("visualization", model_name)
    os.makedirs(output_folder, exist_ok=True)
    
    # Plot predicted and actual values
    pred_trace = go.Scatter(x=val.index, y=y_pred, mode='lines', name='Forecast')
    val_trace = go.Scatter(x=val.index, y=val['value'], mode='lines', name='Real')

    # Create the layout
    layout = go.Layout(title=f"{model_name} / {dir} / {file_name} ",
                       xaxis=dict(title='Timestamp'),
                       yaxis=dict(title='Value'))

    # Combine traces into a list
    data = [pred_trace, val_trace]

    # Create the figure
    fig = go.Figure(data=data, layout=layout)

    # Show the interactive plot
    fig.show()

    # Save plot as PNG file
    file_path = os.path.join(output_folder, f"{dir}_{file_name}.png")
    pio.write_image(fig, file_path)

Running All Models --------------------------------

In [32]:
columns = ['dir', 'file_name', 'exponential_smoothing', 'xgboost', 'arima']

In [51]:
def process_file(dir, file_name, df, param_grid_exponential):
    train, val = split_data(df)

    predicted_result = {'dir': dir, 'file_name': file_name, 'original_value': val['value'].values}
    mae_result = {'dir': dir, 'file_name': file_name}
    cpu_results = {'dir': dir, 'file_name': file_name}
    memory_results = {'dir': dir, 'file_name': file_name}
    time_results = {'dir': dir, 'file_name': file_name}

    # Exponential Smoothing
    start_time = time.time()
    start_cpu = psutil.cpu_percent()
    start_memory = psutil.virtual_memory().used

    exponential_forecast_df, exponential_mae, exponential_mse = exponential_smoothing(train, val, param_grid_exponential)

    end_time = time.time()
    end_cpu = psutil.cpu_percent()
    end_memory = psutil.virtual_memory().used

    elapsed_time = end_time - start_time
    cpu_usage = end_cpu - start_cpu
    memory_usage = end_memory - start_memory

    print("Exponential Smoothing:")
    print("Time taken:", elapsed_time, "seconds")
    print("CPU Usage:", cpu_usage, "%")
    print("Memory Usage:", memory_usage, "bytes")

    plot_forecast_interactive(exponential_forecast_df, val, file_name, "exponential_smoothing")

    mae_result['exponential_smoothing'] = exponential_mae
    cpu_results['exponential_smoothing'] = cpu_usage
    memory_results['exponential_smoothing'] = memory_usage
    time_results['exponential_smoothing'] = elapsed_time

    if 'Forecast' in exponential_forecast_df:
        predicted_result['exponential_smoothing'] = exponential_forecast_df['Forecast'].tolist()

    # ARIMA
    start_time = time.time()
    start_cpu = psutil.cpu_percent()
    start_memory = psutil.virtual_memory().used

    arima_forecast_df, arima_mae, arima_mse, p,d,q = arima(train, val)

    end_time = time.time()
    end_cpu = psutil.cpu_percent()
    end_memory = psutil.virtual_memory().used

    elapsed_time = end_time - start_time
    cpu_usage = end_cpu - start_cpu
    memory_usage = end_memory - start_memory

    print("ARIMA:")
    print("Time taken:", elapsed_time, "seconds")
    print("CPU Usage:", cpu_usage, "%")
    print("Memory Usage:", memory_usage, "bytes")

    plot_forecast_interactive(arima_forecast_df, val, file_name, "arima")

    mae_result['arima'] = arima_mae
    cpu_results['arima'] = cpu_usage
    memory_results['arima'] = memory_usage
    time_results['arima'] = elapsed_time

    if 'Forecast' in arima_forecast_df:
        predicted_result['arima'] = arima_forecast_df['Forecast'].tolist()

    # SARIMA
    # sarima_forecast, arima_mae, arima_mse = SARIMA(train, val,p,d,q)
    # plot_forecast_sarima_interactive( sarima_forecast, val, file_name, "sarima")

    # end_time = time.time()
    # end_cpu = psutil.cpu_percent()
    # end_memory = psutil.virtual_memory().used

    # elapsed_time = end_time - start_time
    # cpu_usage = end_cpu - start_cpu
    # memory_usage = end_memory - start_memory
    
    # XGBoost
    start_time = time.time()
    start_cpu = psutil.cpu_percent()
    start_memory = psutil.virtual_memory().used

    xgboost_forecast_df, xgboost_mae, xgboost_mse = xgboost(train, val)

    end_time = time.time()
    end_cpu = psutil.cpu_percent()
    end_memory = psutil.virtual_memory().used

    elapsed_time = end_time - start_time
    cpu_usage = end_cpu - start_cpu
    memory_usage = end_memory - start_memory

    print("XGBoost:")
    print("Time taken:", elapsed_time, "seconds")
    print("CPU Usage:", cpu_usage, "%")
    print("Memory Usage:", memory_usage, "bytes")

    plot_forecast_interactive(xgboost_forecast_df, val, file_name, "XGBoost")

    mae_result['xgboost'] = xgboost_mae
    cpu_results['xgboost'] = cpu_usage
    memory_results['xgboost'] = memory_usage
    time_results['xgboost'] = elapsed_time

    if 'Forecast' in xgboost_forecast_df:
        predicted_result['xgboost'] = xgboost_forecast_df['Forecast'].tolist()

    mae_df = pd.DataFrame([mae_result])
    predicted_df = pd.DataFrame([predicted_result])
    cpu_df = pd.DataFrame([cpu_results])
    memory_df = pd.DataFrame([memory_results])
    time_df = pd.DataFrame([time_results])

    return cpu_df, memory_df, time_df, mae_df, predicted_df

In [56]:
# Collect results for all files
all_cpu_results = []
all_memory_results = []
all_time_results = []
all_mae_results = []
all_predicted_results = []

for file_name in data[dir]:  # Assuming you have a list of file names called 'files'
    df = data[dir][file_name]
    print(f"Iterating for {dir} / {file_name}")
    cpu_df, memory_df, time_df, mae_df, predicted_df = process_file(dir, file_name, df, param_grid_exponential)
    all_cpu_results.append(cpu_df)
    all_memory_results.append(memory_df)
    all_time_results.append(time_df)
    all_mae_results.append(mae_df)
    all_predicted_results.append(predicted_df)

# Concatenate results into single dataframes
cpu_df_final = pd.concat(all_cpu_results)
memory_df_final = pd.concat(all_memory_results)
time_df_final = pd.concat(all_time_results)
mae_df_final = pd.concat(all_mae_results)
predicted_df_final = pd.concat(all_predicted_results)


# Reset index for all dataframes
cpu_df_final.reset_index(drop=True, inplace=True)
memory_df_final.reset_index(drop=True, inplace=True)
time_df_final.reset_index(drop=True, inplace=True)
mae_df_final.reset_index(drop=True, inplace=True)
predicted_df_final.reset_index(drop=True, inplace=True)

Iterating for realAdExchange / exchange-2_cpc_results.csv
Arima
Mean Absolute Error (MAE): 0.03982344629700707
Mean Squared Error (MSE): 0.0023452307731685507
ARIMA:
Time taken: 0.801781415939331 seconds
CPU Usage: 0.6999999999999993 %
Memory Usage: 4411392 bytes


 This problem is unconstrained.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            3     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -2.38358D+00    |proj g|=  2.28566D+00
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            3     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -2.48694D+00    |proj g|=  3.94516D+00

At iterate    5    f= -2.48798D+00    |proj g|=  1.38897D-01

At iterate   10    f= -2.48844D+00    |proj g|=  3.10506D+00



   evaluations in the last line search.  Termination
   may possibly be caused by a bad search direction.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    3      3     23      1     0     0   3.335D-02  -2.384D+00
  F =  -2.3840740932969777     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             

At iterate   15    f= -2.49148D+00    |proj g|=  1.29648D-01

At iterate   20    f= -2.49152D+00    |proj g|=  1.08205D-02


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            1     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -1.43973D+00    |proj g|=  8.41617D+00

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    3     22     36      1     0     0   7.287D-03  -2.492D+00
  F =  -2.4915185515667315     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            1     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -1.5

 This problem is unconstrained.



At iterate    5    f= -1.45710D+00    |proj g|=  2.65482D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    1      5      9      1     0     0   2.655D-03  -1.457D+00
  F =  -1.4570985412332369     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             

At iterate    5    f= -1.57385D+00    |proj g|=  7.95795D-03



 Bad direction in the line search;
   refresh the lbfgs memory and restart the iteration.

 Line search cannot locate an adequate point after MAXLS
  function and gradient evaluations.
  Previous x, f and g restored.
 Possible causes: 1 error in function or gradient evaluation;
                  2 rounding error dominate computation.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    1      7     50      2     0     0   3.799D-04  -1.574D+00
  F =  -1.5738537928440810     

ABNORMAL_TERMINATION_IN_LNSRCH                              
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            3     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -2.54118D+00    |proj g|=  9.02011D-01


 This problem is unconstrained.



At iterate    5    f= -2.54124D+00    |proj g|=  1.52048D-02

At iterate   10    f= -2.54124D+00    |proj g|=  2.09473D-01

At iterate   15    f= -2.54153D+00    |proj g|=  1.21443D+00

At iterate   20    f= -2.54176D+00    |proj g|=  6.31599D-03

At iterate   25    f= -2.54179D+00    |proj g|=  4.35932D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    3     28     47      1     0     0   2.513D-02  -2.542D+00
  F =  -2.5417885233927655     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
SARIMAXWrapper(order=(0, 1, 0), seasonal_order=(2, 1, 0, 36))
Mean Absolute Error (MAE): 0.12266712657188389
Mean

Iterating for realAdExchange / exchange-2_cpm_results.csv
Arima
Mean Absolute Error (MAE): 0.13444456440004043
Mean Squared Error (MSE): 0.027845264414521002
ARIMA:
Time taken: 13.179426670074463 seconds
CPU Usage: -10.100000000000001 %
Memory Usage: 47419392 bytes


  warn('Non-invertible starting MA parameters found.'


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            9     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -2.80236D-01    |proj g|=  2.19804D+00


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            7     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -4.76137D-01    |proj g|=  3.04643D-02


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            7     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -1.50446D-01    |proj g|=  2.55177D-02


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            9     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -6.42768D-01    |proj g|=  4.06149D+00


 This problem is unconstrained.



At iterate    5    f= -3.74051D-01    |proj g|=  3.25617D+00

At iterate    5    f= -4.77435D-01    |proj g|=  5.59968D-03

At iterate   10    f= -4.95829D-01    |proj g|=  3.49817D-01

At iterate    5    f= -6.82043D-01    |proj g|=  2.19724D-01

At iterate    5    f= -1.50677D-01    |proj g|=  9.92298D-03

At iterate   15    f= -6.40946D-01    |proj g|=  5.51105D-01

At iterate   10    f= -8.86813D-01    |proj g|=  7.43807D-01

At iterate   10    f= -4.79355D-01    |proj g|=  5.38537D-02

At iterate   10    f= -1.50756D-01    |proj g|=  7.98239D-03

At iterate   20    f= -6.78788D-01    |proj g|=  3.78776D-01

At iterate   15    f= -1.00908D+00    |proj g|=  3.64606D-01

At iterate   25    f= -6.99926D-01    |proj g|=  3.01947D-01

At iterate   20    f= -1.03122D+00    |proj g|=  4.13359D-01

At iterate   15    f= -1.50790D-01    |proj g|=  1.38977D-04

At iterate   30    f= -7.04839D-01    |proj g|=  1.11063D-02

At iterate   25    f= -1.06014D+00    |proj g|=  1.95890D-02

At iter




           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    7     21     34      1     0     0   1.027D-03  -4.827D-01
  F = -0.48266071806478855     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             

At iterate   40    f= -1.08536D+00    |proj g|=  5.96478D-02

At iterate   45    f= -1.09177D+00    |proj g|=  1.68158D-01

At iterate   50    f= -1.10673D+00    |proj g|=  1.55911D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at



RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            9     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -4.75634D-01    |proj g|=  2.93370D+00


 This problem is unconstrained.



At iterate    5    f= -6.20806D-01    |proj g|=  5.63593D+00

At iterate   10    f= -8.01929D-01    |proj g|=  1.30600D+00

At iterate   15    f= -8.61863D-01    |proj g|=  1.03135D-01

At iterate   20    f= -8.71267D-01    |proj g|=  5.66805D-02

At iterate   25    f= -8.89720D-01    |proj g|=  2.43419D+00

At iterate   30    f= -9.25513D-01    |proj g|=  1.79119D-01

At iterate   35    f= -9.36239D-01    |proj g|=  3.87592D-02

At iterate   40    f= -9.50227D-01    |proj g|=  1.02313D+00

At iterate   45    f= -9.80856D-01    |proj g|=  7.69066D-02

At iterate   50    f= -9.83665D-01    |proj g|=  8.61571D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tn

Iterating for realAdExchange / exchange-3_cpc_results.csv
Arima
Mean Absolute Error (MAE): 0.035637924354173504
Mean Squared Error (MSE): 0.0038581082735454006
ARIMA:
Time taken: 17.866833209991455 seconds
CPU Usage: -29.099999999999994 %
Memory Usage: -214700032 bytes


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -1.38317D+00    |proj g|=  5.85971D+00
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -8.84239D-01    |proj g|=  3.32757D+00


 This problem is unconstrained.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            6     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -1.83272D-01    |proj g|=  2.16862D-01


 This problem is unconstrained.



At iterate    5    f= -1.44092D+00    |proj g|=  2.18020D-01

At iterate   10    f= -1.44794D+00    |proj g|=  8.72207D-01

At iterate    5    f= -9.36940D-01    |proj g|=  5.41127D-02

At iterate   15    f= -1.44946D+00    |proj g|=  2.29473D-02
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            6     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -4.92922D-01    |proj g|=  3.96163D+00


 This problem is unconstrained.



At iterate   10    f= -9.48403D-01    |proj g|=  4.72553D-01

At iterate   20    f= -1.45143D+00    |proj g|=  2.14826D-01

At iterate   25    f= -1.45362D+00    |proj g|=  1.99457D-01

At iterate   15    f= -9.49315D-01    |proj g|=  1.19675D-01

At iterate   30    f= -1.45921D+00    |proj g|=  4.17819D-02

At iterate    5    f= -1.87215D-01    |proj g|=  2.12182D-01

At iterate   35    f= -1.46196D+00    |proj g|=  1.39261D-01

At iterate    5    f= -5.64358D-01    |proj g|=  3.78809D-01

At iterate   20    f= -9.53617D-01    |proj g|=  5.13200D-02

At iterate   40    f= -1.46234D+00    |proj g|=  4.23446D-03

At iterate   45    f= -1.46234D+00    |proj g|=  2.47764D-03

At iterate   10    f= -6.88687D-01    |proj g|=  3.25722D-02

At iterate   25    f= -9.54272D-01    |proj g|=  1.35683D-01

At iterate   10    f= -1.88635D-01    |proj g|=  1.12563D-01

At iterate   50    f= -1.46234D+00    |proj g|=  3.78973D-03

           * * *

Tit   = total number of iterations
Tnf   = total nu




At iterate   15    f= -6.89525D-01    |proj g|=  2.46250D-02

At iterate   30    f= -9.54947D-01    |proj g|=  1.88088D-02

At iterate   35    f= -9.55057D-01    |proj g|=  9.85444D-03

At iterate   15    f= -1.89722D-01    |proj g|=  3.69265D-03

At iterate   20    f= -6.91165D-01    |proj g|=  8.67396D-03

At iterate   40    f= -9.55631D-01    |proj g|=  1.08516D-01

At iterate   45    f= -9.56578D-01    |proj g|=  8.80008D-02

At iterate   25    f= -6.92105D-01    |proj g|=  2.18167D-02

At iterate   20    f= -1.89815D-01    |proj g|=  6.75008D-03

At iterate   50    f= -9.57833D-01    |proj g|=  3.35406D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tn




At iterate   30    f= -6.96562D-01    |proj g|=  2.01008D-01

At iterate   25    f= -1.90077D-01    |proj g|=  5.08303D-03

At iterate   35    f= -6.98780D-01    |proj g|=  1.20355D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    6     29     39      1     0     0   3.915D-04  -1.901D-01
  F = -0.19008023882561581     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             

At iterate   40    f= -6.98851D-01    |proj g|=  2.26113D-02



 Bad direction in the line search;
   refresh the lbfgs memory and restart the iteration.



At iterate   45    f= -6.98860D-01    |proj g|=  1.48697D-02

At iterate   50    f= -6.98864D-01    |proj g|=  2.91227D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    6     50     96      2     0     0   2.912D-03  -6.989D-01
  F = -0.69886401417812472     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 




RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -1.10293D+00    |proj g|=  4.12512D+00


 This problem is unconstrained.



At iterate    5    f= -1.15923D+00    |proj g|=  9.69144D-01

At iterate   10    f= -1.16978D+00    |proj g|=  2.27269D-01

At iterate   15    f= -1.17242D+00    |proj g|=  3.56740D-02

At iterate   20    f= -1.17333D+00    |proj g|=  3.74917D-01

At iterate   25    f= -1.17868D+00    |proj g|=  1.95666D-01

At iterate   30    f= -1.17952D+00    |proj g|=  7.32594D-03

At iterate   35    f= -1.17959D+00    |proj g|=  1.78009D-03

At iterate   40    f= -1.17959D+00    |proj g|=  3.18554D-02

At iterate   45    f= -1.17969D+00    |proj g|=  7.72213D-02

At iterate   50    f= -1.18015D+00    |proj g|=  6.66977D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tn

Iterating for realAdExchange / exchange-3_cpm_results.csv
Arima
Mean Absolute Error (MAE): 0.40712147448614444
Mean Squared Error (MSE): 0.31539810861848017
ARIMA:
Time taken: 1.5419957637786865 seconds
CPU Usage: -14.600000000000001 %
Memory Usage: 38449152 bytes


 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            1     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.89264D-01    |proj g|=  9.55548D-01
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            3     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -4.20863D-01    |proj g|=  3.55231D-01
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            3     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -9.14619D-02    |proj g|=  1.58389D+00
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            1     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  3.38148D-01    |proj g|=  7.83319D-01

At iterate    5    f= -4.21753D-01    |proj g|=  4.78328D-02

At iterate    5    f=

 This problem is unconstrained.



At iterate    5    f= -2.68257D-01    |proj g|=  7.83894D-02

At iterate   10    f= -2.68666D-01    |proj g|=  4.26698D-03

At iterate   15    f= -2.68681D-01    |proj g|=  2.18024D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    3     17     22      1     0     0   6.011D-05  -2.687D-01
  F = -0.26868105159153166     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
SARIMAXWrapper(order=(0, 1, 0), seasonal_order=(2, 1, 0, 36))
Mean Absolute Error (MAE): 1.2477892181280488
Mean Squared Error (MSE): 2.0625727364806856
Root Mean Squared Error (RMSE): 1.436165985003365



Iterating for realAdExchange / exchange-4_cpc_results.csv
Arima
Mean Absolute Error (MAE): 0.04663620716568602
Mean Squared Error (MSE): 0.034832344789766455
ARIMA:
Time taken: 34.96389865875244 seconds
CPU Usage: -3.6000000000000014 %
Memory Usage: -192610304 bytes


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -1.91763D+00    |proj g|=  9.75529D+00


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -4.78068D-01    |proj g|=  2.26884D+00


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            6     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.19785D-02    |proj g|=  2.26952D-01


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            6     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -1.00764D+00    |proj g|=  7.49838D-01


 This problem is unconstrained.



At iterate    5    f= -5.40420D-01    |proj g|=  8.89356D-02

At iterate    5    f= -1.96674D+00    |proj g|=  1.08011D+00

At iterate   10    f= -1.97222D+00    |proj g|=  4.33437D-02

At iterate    5    f=  1.96026D-02    |proj g|=  2.20786D-02

At iterate    5    f= -1.00913D+00    |proj g|=  8.39285D-01

At iterate   10    f= -5.41771D-01    |proj g|=  2.96330D-02

At iterate   15    f= -1.97276D+00    |proj g|=  2.76902D-01

At iterate   10    f= -1.01031D+00    |proj g|=  1.77036D-02

At iterate   15    f= -5.41900D-01    |proj g|=  2.28566D-02

At iterate   10    f=  1.87282D-02    |proj g|=  3.10417D-03

At iterate   20    f= -1.97282D+00    |proj g|=  2.39893D-02

At iterate   15    f= -1.01058D+00    |proj g|=  2.49696D-03

At iterate   20    f= -5.41945D-01    |proj g|=  1.35835D-02

At iterate   25    f= -1.97288D+00    |proj g|=  9.33500D-02

At iterate   15    f=  1.86810D-02    |proj g|=  6.66807D-04

At iterate   25    f= -5.41963D-01    |proj g|=  1.26173D-02

At iter


 Bad direction in the line search;
   refresh the lbfgs memory and restart the iteration.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    6     22     32      1     0     0   1.083D-04   1.868D-02
  F =   1.8679478440215756E-002

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             



 Bad direction in the line search;
   refresh the lbfgs memory and restart the iteration.

 Line search cannot locate an adequate point after MAXLS
  function and gradient evaluations.
  Previous x, f and g restored.
 Possible causes: 1 error in function or gradient evaluation;
                  2 rounding error dominate computation.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    6     17     84      3     0     0   3.473D-03  -1.011D+00
  F =  -1.0105779828535804     

ABNORMAL_TERMINATION_IN_LNSRCH                              
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -8.05593D-01    |proj g|=  2.80035D+00


 This problem is unconstrained.



At iterate    5    f= -8.48194D-01    |proj g|=  1.29434D-01

At iterate   10    f= -8.49228D-01    |proj g|=  9.97509D-02

At iterate   15    f= -8.49711D-01    |proj g|=  5.43145D-02

At iterate   20    f= -8.49815D-01    |proj g|=  2.53112D-02

At iterate   25    f= -8.49861D-01    |proj g|=  1.76772D-03

At iterate   30    f= -8.49863D-01    |proj g|=  1.31341D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    8     31     40      1     0     0   1.286D-04  -8.499D-01
  F = -0.84986268049715630     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
SARIMAXWrapper(order=(0, 0, 5), seasonal_order=(2, 

Iterating for realAdExchange / exchange-4_cpm_results.csv
Arima
Mean Absolute Error (MAE): 0.23210168552712246
Mean Squared Error (MSE): 1.0729040486288866
ARIMA:
Time taken: 6.69688081741333 seconds
CPU Usage: -15.699999999999996 %
Memory Usage: -9367552 bytes


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           10     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -5.88804D-02    |proj g|=  5.64448D-01


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           10     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.24494D+00    |proj g|=  3.59761D-01


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  3.64808D-01    |proj g|=  1.52004D-02
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.19588D+00    |proj g|=  1.91241D+00


 This problem is unconstrained.



At iterate    5    f=  1.18075D+00    |proj g|=  4.11206D-03

At iterate    5    f= -2.82747D-01    |proj g|=  2.33122D-01

At iterate   10    f=  1.18066D+00    |proj g|=  1.66476D-03

At iterate   15    f=  1.18056D+00    |proj g|=  1.75429D-03

At iterate   10    f= -2.93060D-01    |proj g|=  1.19631D-02

At iterate    5    f=  3.64140D-01    |proj g|=  1.40751D-02

At iterate   15    f= -2.93564D-01    |proj g|=  1.51177D-02

At iterate   20    f=  1.18048D+00    |proj g|=  9.73826D-03

At iterate    5    f=  1.36367D+00    |proj g|=  3.83531D-02

At iterate   20    f= -2.95317D-01    |proj g|=  6.59022D-02

At iterate   25    f=  1.17871D+00    |proj g|=  4.46526D-02

At iterate   30    f=  1.16200D+00    |proj g|=  4.65831D-02

At iterate   10    f=  1.35381D+00    |proj g|=  3.71463D-02

At iterate   25    f= -2.96111D-01    |proj g|=  6.31803D-03

At iterate   35    f=  1.15906D+00    |proj g|=  1.33799D-03

At iterate   10    f=  3.59215D-01    |proj g|=  1.83873D-02

At iter




At iterate   25    f=  3.57860D-01    |proj g|=  1.43402D-02

At iterate   30    f=  3.57590D-01    |proj g|=  8.14344D-03

At iterate   35    f=  3.57535D-01    |proj g|=  1.43700D-03

At iterate   40    f=  3.57531D-01    |proj g|=  1.47236D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    8     43     52      1     0     0   8.849D-05   3.575D-01
  F =  0.35753124819734894     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           10     M =           10

At X0         0 variables are exactly at the bounds

At iter

 This problem is unconstrained.



At iterate    5    f=  9.37481D-01    |proj g|=  9.55030D-03

At iterate   10    f=  9.37294D-01    |proj g|=  3.42588D-03

At iterate   15    f=  9.37235D-01    |proj g|=  5.76231D-03

At iterate   20    f=  9.36143D-01    |proj g|=  4.15012D-02

At iterate   25    f=  9.27283D-01    |proj g|=  6.61007D-02

At iterate   30    f=  9.14757D-01    |proj g|=  2.92757D-02

At iterate   35    f=  9.14270D-01    |proj g|=  6.56057D-03

At iterate   40    f=  9.14215D-01    |proj g|=  8.94272D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
   10     41     49      1     0     0   7.282D-06   9.142D-01
  F =  0.914214521226

In [35]:
mae_df_final

Unnamed: 0,dir,file_name,exponential_smoothing,arima,xgboost
0,realAdExchange,exchange-2_cpc_results.csv,0.014086,0.039823,0.014942
1,realAdExchange,exchange-2_cpm_results.csv,0.064484,0.134445,0.067883
2,realAdExchange,exchange-3_cpc_results.csv,0.030153,0.035638,0.030289
3,realAdExchange,exchange-3_cpm_results.csv,0.184865,0.407121,0.274369
4,realAdExchange,exchange-4_cpc_results.csv,0.043871,0.046636,0.04651
5,realAdExchange,exchange-4_cpm_results.csv,0.228934,0.232102,0.228178


In [36]:
time_df_final

Unnamed: 0,dir,file_name,exponential_smoothing,arima,xgboost
0,realAdExchange,exchange-2_cpc_results.csv,1.456841,0.734529,2.260342
1,realAdExchange,exchange-2_cpm_results.csv,1.600408,8.854248,1.558423
2,realAdExchange,exchange-3_cpc_results.csv,1.65338,19.933057,1.358517
3,realAdExchange,exchange-3_cpm_results.csv,1.696618,1.396211,1.860822
4,realAdExchange,exchange-4_cpc_results.csv,1.641152,18.576991,1.62863
5,realAdExchange,exchange-4_cpm_results.csv,1.822876,6.391324,1.616319


In [37]:
cpu_df_final

Unnamed: 0,dir,file_name,exponential_smoothing,arima,xgboost
0,realAdExchange,exchange-2_cpc_results.csv,3.8,8.2,69.4
1,realAdExchange,exchange-2_cpm_results.csv,-9.7,8.1,54.6
2,realAdExchange,exchange-3_cpc_results.csv,-6.6,21.7,44.3
3,realAdExchange,exchange-3_cpm_results.csv,-27.0,15.0,71.7
4,realAdExchange,exchange-4_cpc_results.csv,-12.1,29.1,39.8
5,realAdExchange,exchange-4_cpm_results.csv,-11.0,25.6,49.0


In [38]:
memory_df_final

Unnamed: 0,dir,file_name,exponential_smoothing,arima,xgboost
0,realAdExchange,exchange-2_cpc_results.csv,51466240,6602752,973967360
1,realAdExchange,exchange-2_cpm_results.csv,26943488,43753472,9109504
2,realAdExchange,exchange-3_cpc_results.csv,24686592,18157568,23285760
3,realAdExchange,exchange-3_cpm_results.csv,-13430784,4722688,-29622272
4,realAdExchange,exchange-4_cpc_results.csv,22077440,20455424,-3481600
5,realAdExchange,exchange-4_cpm_results.csv,4927488,5672960,-14385152


In [39]:
predicted_df_final

Unnamed: 0,dir,file_name,original_value,exponential_smoothing,arima,xgboost
0,realAdExchange,exchange-2_cpc_results.csv,"[0.0707963246554, 0.0606611570248, 0.044240139...","[0.08239459853498723, 0.07737155060185466, 0.0...","[0.0739106901218, 0.0739106901218, 0.073910690...","[0.08348382264375687, 0.08224911242723465, 0.0..."
1,realAdExchange,exchange-2_cpm_results.csv,"[0.176334283725, 0.168131389871, 0.12613429076...","[0.2023933033637229, 0.17521874384645642, 0.15...","[0.20361228624551847, 0.2111542908079831, 0.22...","[0.22574101388454437, 0.18855653703212738, 0.1..."
2,realAdExchange,exchange-3_cpc_results.csv,"[0.0833674873834, 0.115031691858, 0.1125559481...","[0.08662693547130892, 0.10909349776190366, 0.1...","[0.1012880208061552, 0.11480172722007481, 0.11...","[0.09825707972049713, 0.10299435257911682, 0.1..."
3,realAdExchange,exchange-3_cpm_results.csv,"[0.544978141492, 0.758639317537, 0.73238189853...","[0.6532485519967688, 0.814394912520056, 0.8096...","[0.548446739751, 0.548446739751, 0.54844673975...","[0.576107382774353, 0.576107382774353, 0.56887..."
4,realAdExchange,exchange-4_cpc_results.csv,"[0.0686437659033, 0.0956501182033, 0.059973604...","[0.07563152461680721, 0.09054939726141119, 0.0...","[0.08002699251479879, 0.07317117125065187, 0.0...","[0.06404364109039307, 0.06404364109039307, 0.0..."
5,realAdExchange,exchange-4_cpm_results.csv,"[0.42784912914, 0.595515446142, 0.315128224551...","[0.4650382897217444, 0.5413448956773631, 0.266...","[0.5017022758048947, 0.4527749809280139, 0.425...","[0.4120841324329376, 0.4120841324329376, 0.405..."


In [40]:
# Define the directory to save the CSV files
output_dir = "dataset_preparation"

# Check if the directory exists, if not, create it
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save dataframes as CSV files
cpu_df_final.to_csv(os.path.join(output_dir, "cpu_results.csv"), index=False)
memory_df_final.to_csv(os.path.join(output_dir, "memory_results.csv"), index=False)
time_df_final.to_csv(os.path.join(output_dir, "time_results.csv"), index=False)
mae_df_final.to_csv(os.path.join(output_dir, "mae_results.csv"), index=False)
predicted_df_final.to_csv(os.path.join(output_dir, "predicted_results.csv"), index=False)


Ranking

In [41]:
# Define a function to rank models based on MAE values for each row
def rank_models(row):
    mae_values = row[['exponential_smoothing', 'arima', 'xgboost']]
    model_rank = mae_values.sort_values().index.tolist()
    return model_rank

In [42]:
# Apply the function to each row of the DataFrame
df_features['model_rank'] = mae_df_final.apply(rank_models, axis=1)
df_features

Unnamed: 0,unique_id,hurst,series_length,unitroot_pp,unitroot_kpss,hw_alpha,hw_beta,hw_gamma,stability,nperiods,...,crossing_points,arch_lm,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1,model_rank
0,realAdExchange/exchange-2_cpc_results.csv,0.906114,1648,-291.651096,3.322436,0.662955,3.689681e-10,2.177951e-10,0.164412,1,...,219,0.333751,0.84456,1.723964,0.019727,0.033048,-0.4742,0.247259,0.533564,"[exponential_smoothing, xgboost, arima]"
1,realAdExchange/exchange-2_cpm_results.csv,0.720425,1648,-352.407911,0.68889,0.413818,1.154891e-09,1.32262e-09,0.041318,1,...,228,0.270601,0.823062,2.040337,-0.122537,0.066928,-0.590594,0.359987,0.601838,"[exponential_smoothing, xgboost, arima]"
2,realAdExchange/exchange-3_cpc_results.csv,0.82741,1647,-689.70209,0.784406,0.248606,0.0,0.0,0.072987,1,...,361,0.243241,0.595922,0.603031,-0.176902,0.101971,-0.466519,0.270492,0.155811,"[exponential_smoothing, xgboost, arima]"
3,realAdExchange/exchange-3_cpm_results.csv,0.898996,1647,-377.537596,3.913879,0.716322,6.293205e-10,1.216726e-10,0.143555,1,...,257,0.098898,0.764779,1.248005,-0.05856,0.040519,-0.454639,0.217878,0.290427,"[exponential_smoothing, xgboost, arima]"
4,realAdExchange/exchange-4_cpc_results.csv,0.520317,1647,-1862.138673,1.727024,0.014715,1.600749e-13,4.137589e-09,0.026716,1,...,287,0.003958,0.040633,0.032792,-0.504576,0.263658,-0.670898,0.492239,-0.000595,"[exponential_smoothing, xgboost, arima]"
5,realAdExchange/exchange-4_cpm_results.csv,0.390779,1647,-1839.697103,0.181736,0.006275,2.6367e-12,5.935964e-15,0.011183,1,...,315,0.005727,0.030553,0.026503,-0.50421,0.26334,-0.671191,0.494804,6e-05,"[xgboost, exponential_smoothing, arima]"


In [43]:
directory = 'dataset_preparation'
if not os.path.exists(directory):
    os.makedirs(directory)

# Specify the file path
file_path = os.path.join(directory, 'df_features_with_ranking.csv')  # for CSV file

# Save the DataFrame
df_features.to_csv(file_path, index=False)  # for CSV file

Stacking Approach for training

In [44]:
def stacked_model_predictions(val, base_preds):
    # Splitting features and target variable
    X_train, X_val, y_train, y_val = train_test_split(base_preds, val, test_size=0.2, random_state=42)

    # Define parameter grid for Random Forest
    param_grid = {
        'n_estimators': [25, 50, 100, 150, 200],  # Number of trees in the forest
        'max_depth': [None, 10, 20, 30],      # Maximum depth of the tree
        'min_samples_split': [2, 5, 8, 10, 15],  # Minimum number of samples required to split an internal node
        'min_samples_leaf': [1, 2, 4, 6]     # Minimum number of samples required to be at a leaf node
    }

    # Initialize Random Forest regressor
    rf = RandomForestRegressor(random_state=42)

    search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=100, cv=2, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
    search.fit(X_train, y_train)

    print("Stacking Approach")

    # Print the best estimator found
    print(search.best_estimator_)

    # Make predictions using the best model
    y_pred = search.best_estimator_.predict(X_val)

    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_val, y_pred)
    print("Mean Absolute Error (MAE):", mae)

    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y_val, y_pred)
    print("Mean Squared Error (MSE):", mse)

    # Calculate Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)
    print("Root Mean Squared Error (RMSE):", rmse)

    print("")

    return y_pred, y_val, mae, mse

Finding the appropriate ensemble size

In [45]:
# Create an empty column named 'ensemble_size'
df_features['ensemble_size'] = np.nan
# Display the DataFrame with the new empty column
df_features

Unnamed: 0,unique_id,hurst,series_length,unitroot_pp,unitroot_kpss,hw_alpha,hw_beta,hw_gamma,stability,nperiods,...,arch_lm,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1,model_rank,ensemble_size
0,realAdExchange/exchange-2_cpc_results.csv,0.906114,1648,-291.651096,3.322436,0.662955,3.689681e-10,2.177951e-10,0.164412,1,...,0.333751,0.84456,1.723964,0.019727,0.033048,-0.4742,0.247259,0.533564,"[exponential_smoothing, xgboost, arima]",
1,realAdExchange/exchange-2_cpm_results.csv,0.720425,1648,-352.407911,0.68889,0.413818,1.154891e-09,1.32262e-09,0.041318,1,...,0.270601,0.823062,2.040337,-0.122537,0.066928,-0.590594,0.359987,0.601838,"[exponential_smoothing, xgboost, arima]",
2,realAdExchange/exchange-3_cpc_results.csv,0.82741,1647,-689.70209,0.784406,0.248606,0.0,0.0,0.072987,1,...,0.243241,0.595922,0.603031,-0.176902,0.101971,-0.466519,0.270492,0.155811,"[exponential_smoothing, xgboost, arima]",
3,realAdExchange/exchange-3_cpm_results.csv,0.898996,1647,-377.537596,3.913879,0.716322,6.293205e-10,1.216726e-10,0.143555,1,...,0.098898,0.764779,1.248005,-0.05856,0.040519,-0.454639,0.217878,0.290427,"[exponential_smoothing, xgboost, arima]",
4,realAdExchange/exchange-4_cpc_results.csv,0.520317,1647,-1862.138673,1.727024,0.014715,1.600749e-13,4.137589e-09,0.026716,1,...,0.003958,0.040633,0.032792,-0.504576,0.263658,-0.670898,0.492239,-0.000595,"[exponential_smoothing, xgboost, arima]",
5,realAdExchange/exchange-4_cpm_results.csv,0.390779,1647,-1839.697103,0.181736,0.006275,2.6367e-12,5.935964e-15,0.011183,1,...,0.005727,0.030553,0.026503,-0.50421,0.26334,-0.671191,0.494804,6e-05,"[xgboost, exponential_smoothing, arima]",


In [46]:
def find_ensemble_size(model_rank, unique_id):
    # Extract directory and file name from unique_id
    dir_name, file_name = unique_id.split('/')

    # Get the first element from the list of model_rank
    model_name = model_rank[0]

    # Find the row in mae_df dataframe that matches the directory and file name
    row = mae_df_final[(mae_df_final['dir'] == dir_name) & (mae_df_final['file_name'] == file_name)]

    # Find the value in the column that matches the model_name
    model_mae = row[model_name].iloc[0]

    # Determine the ensemble size based on the model MAE value
    if model_mae < 0.05:
        print("No need for stacking apporach, since first model has MAE less than 0.05")
        return 1
    else:
        previous_mae = model_mae
        print("Going for stacking Approach")
        i = 0  # Initialize the count of models
        while i < len(model_rank):
            i += 2  # Increment the count of models
            models_to_use = model_rank[:i]  # Take the first i models from the model_rank list
            base_preds = []  # Initialize base_preds as a list
            # Get the predicted values for the selected models
            for model in models_to_use:
                # Find the respective row in predicted_df
                model_row = predicted_df_final[(predicted_df_final['dir'] == dir_name) & (predicted_df_final['file_name'] == file_name)]
                # Get the predicted value for the model
                pred_value = model_row[model].iloc[0]
                # Append the predicted value to base_preds
                base_preds.append(pred_value)

            val_row = predicted_df_final[(predicted_df_final['dir'] == dir_name) & (predicted_df_final['file_name'] == file_name)]
            val = val_row['original_value'].iloc[0]

            base_preds = np.stack(base_preds, axis=-1)
            y_pred, y_val, mae, mse = stacked_model_predictions(val, base_preds)

            if mae > previous_mae:
                print("MAE increased after adding", i, "models, so returning the previous ensemble size")
                return i - 1

            # Update previous MAE with current MAE
            previous_mae = mae

            # If MAE is less than 0.05, return the current ensemble size
            if mae < 0.08:
                return i

    # If none of the models have MAE less than 0.05, return the total count of models
    return len(model_rank)


In [47]:
# Iterate over rows in df_features
for index, row in df_features.iterrows():
    # Extract values from the current row
    model_rank = row['model_rank']
    unique_id = row['unique_id']

    # Print the unique ID before calling the function
    print("Unique ID:", unique_id)

    # Call the function to find ensemble size
    ensemble_size = find_ensemble_size(model_rank, unique_id)

    print("")

    # Assign the ensemble size to the 'ensemble_size' column
    df_features.at[index, 'ensemble_size'] = ensemble_size

Unique ID: realAdExchange/exchange-2_cpc_results.csv
No need for stacking apporach, since first model has MAE less than 0.05

Unique ID: realAdExchange/exchange-2_cpm_results.csv
Going for stacking Approach
Stacking Approach
RandomForestRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=15,
                      n_estimators=150, random_state=42)
Mean Absolute Error (MAE): 0.05756971890372835
Mean Squared Error (MSE): 0.0061126590682754845
Root Mean Squared Error (RMSE): 0.07818349613745527


Unique ID: realAdExchange/exchange-3_cpc_results.csv
No need for stacking apporach, since first model has MAE less than 0.05

Unique ID: realAdExchange/exchange-3_cpm_results.csv
Going for stacking Approach
Stacking Approach
RandomForestRegressor(max_depth=30, min_samples_leaf=6, min_samples_split=15,
                      n_estimators=25, random_state=42)
Mean Absolute Error (MAE): 0.16910277605591773
Mean Squared Error (MSE): 0.057525299584235697
Root Mean Squared Error (RMSE): 0.2398

In [48]:
# Display the updated DataFrame
df_features

Unnamed: 0,unique_id,hurst,series_length,unitroot_pp,unitroot_kpss,hw_alpha,hw_beta,hw_gamma,stability,nperiods,...,arch_lm,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1,model_rank,ensemble_size
0,realAdExchange/exchange-2_cpc_results.csv,0.906114,1648,-291.651096,3.322436,0.662955,3.689681e-10,2.177951e-10,0.164412,1,...,0.333751,0.84456,1.723964,0.019727,0.033048,-0.4742,0.247259,0.533564,"[exponential_smoothing, xgboost, arima]",1.0
1,realAdExchange/exchange-2_cpm_results.csv,0.720425,1648,-352.407911,0.68889,0.413818,1.154891e-09,1.32262e-09,0.041318,1,...,0.270601,0.823062,2.040337,-0.122537,0.066928,-0.590594,0.359987,0.601838,"[exponential_smoothing, xgboost, arima]",2.0
2,realAdExchange/exchange-3_cpc_results.csv,0.82741,1647,-689.70209,0.784406,0.248606,0.0,0.0,0.072987,1,...,0.243241,0.595922,0.603031,-0.176902,0.101971,-0.466519,0.270492,0.155811,"[exponential_smoothing, xgboost, arima]",1.0
3,realAdExchange/exchange-3_cpm_results.csv,0.898996,1647,-377.537596,3.913879,0.716322,6.293205e-10,1.216726e-10,0.143555,1,...,0.098898,0.764779,1.248005,-0.05856,0.040519,-0.454639,0.217878,0.290427,"[exponential_smoothing, xgboost, arima]",3.0
4,realAdExchange/exchange-4_cpc_results.csv,0.520317,1647,-1862.138673,1.727024,0.014715,1.600749e-13,4.137589e-09,0.026716,1,...,0.003958,0.040633,0.032792,-0.504576,0.263658,-0.670898,0.492239,-0.000595,"[exponential_smoothing, xgboost, arima]",1.0
5,realAdExchange/exchange-4_cpm_results.csv,0.390779,1647,-1839.697103,0.181736,0.006275,2.6367e-12,5.935964e-15,0.011183,1,...,0.005727,0.030553,0.026503,-0.50421,0.26334,-0.671191,0.494804,6e-05,"[xgboost, exponential_smoothing, arima]",1.0


In [49]:
directory = 'dataset_preparation'
if not os.path.exists(directory):
    os.makedirs(directory)

# Specify the file path
file_path = os.path.join(directory, 'df_features_with_ensemble_size.csv')  # for CSV file

# Save the DataFrame
df_features.to_csv(file_path, index=False)  # for CSV file