In [1]:
#!pip install tsfeatures

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import itertools
import random
import requests
import os
import json
import time
import psutil

from itertools import product
from datetime import datetime
from sklearn.impute import KNNImputer
from tsfeatures import tsfeatures
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

In [3]:
default_freq = 'H'

In [4]:
import requests

index_url = 'https://api.github.com/repos/numenta/NAB/contents/data'

# Fetching file names from the index URL
response = requests.get(index_url)

# Check if the response was successful (status code 200)
if response.status_code == 200:
    # Parse the JSON response
    index_data = response.json()
    
    # Extract directory names
    directories = [file['name'] for file in index_data if file.get('type') == "dir"]
    print(directories)
else:
    print("Failed to fetch data:", response.status_code)


['artificialNoAnomaly', 'artificialWithAnomaly', 'realAWSCloudwatch', 'realAdExchange', 'realKnownCause', 'realTraffic', 'realTweets']


In [5]:
base_url = 'https://raw.githubusercontent.com/numenta/NAB/master/data/'
data = {}

def addFolderAndReadAll(d_name):
    data[d_name] = {}
    response = requests.get(index_url + '/' + d_name)
    index_data = response.json()

    csv_files = [ file['name'] for file in index_data if file['type'] == "file"]
    csvs_num = 0
    for f_name in csv_files:
        data[d_name][f_name] = pd.read_csv(base_url + d_name + '/' + f_name)
        csvs_num += 1
    return csvs_num

csvs_num = sum([addFolderAndReadAll(d_name) for d_name in directories])

Preprocessing

In [6]:
# Function to get a random start date from the DataFrame index
def get_random_start_date(index):
    return np.random.choice(index)

# Main function to repeat the process until non-None frequency is obtained
def find_non_none_frequency(df, offset=9):
    while True:
        # Get a random start date from the DataFrame index
        start_date = pd.to_datetime(get_random_start_date(df.index))

        # Find the index of the end date by moving 9 steps through the indices
        end_date_index = df.index.get_loc(start_date) + offset

        # Check if the end date index is within the range of the DataFrame index
        if end_date_index < len(df.index):
            # Calculate the end date using the index
            end_date = df.index[end_date_index]

            # Infer frequency within the specified date range
            subset_df = df.loc[start_date:end_date]
            freq = pd.infer_freq(subset_df.index)

            if freq is not None:
                print("Inferred frequency within range", start_date, "-", end_date, ":", freq)
                return freq  # Exit the loop and return the inferred frequency

In [7]:
def max_consecutive_missing_dates(inferred_freq, missing_dates):
    # Function to check if two dates are consecutive based on the inferred frequency
    def are_consecutive(date1, date2, freq):
        # Calculate the difference between dates based on the inferred frequency
        diff = date2 - date1
        # Check if the difference matches the frequency
        if freq == 'D':
            return diff.days == 1
        elif freq.endswith('H')| freq.endswith('h'):
             # If the frequency ends with 'H', check if it represents hourly intervals
            if freq[:-1]:  # Check if there is a multiplier
                  interval = int(freq[:-1])
                  return diff.total_seconds() == interval * 3600
            else:
                   # If no multiplier is provided, it's assumed to be one hour
                   return diff.total_seconds() == 3600
        elif freq.endswith('T') | freq.endswith('min') :
            if freq.endswith('T'):
                # Extract the interval from the frequency string
                interval = int(freq[:-1])
                return diff.seconds // 60 == interval
            else:
                interval = int(freq[:-3])
                return diff.seconds // 60 == interval
        else:
            raise ValueError("Unsupported frequency: {}".format(freq))

    # Initialize variables to track maximum length and current length
    max_consecutive_missing = 0
    current_consecutive_missing = 0

    # Iterate over the missing dates
    for i in range(1, len(missing_dates)):
        # Check if the current date is consecutive with the previous date
        if are_consecutive(missing_dates[i - 1], missing_dates[i], inferred_freq):
            # Increment current consecutive missing count
            current_consecutive_missing += 1
        else:
            # Update maximum consecutive missing count if needed
            max_consecutive_missing = max(max_consecutive_missing, current_consecutive_missing)
            # Reset current consecutive missing count
            current_consecutive_missing = 0

    # Update max_consecutive_missing if current_consecutive_missing is still greater
    max_consecutive_missing = max(max_consecutive_missing, current_consecutive_missing)

    return max_consecutive_missing

In [8]:
def preprocess(df, f_name):
    # Convert 'timestamp' column to datetime format and rename it to 'ds'
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Removing the duplicate rows
    df = df[~df.duplicated(keep='first')]

    duplicated_dates_length = len(df[df['timestamp'].duplicated(keep=False)])

    if  duplicated_dates_length > 0:
      print("Number of Duplicated Dates in "+ f_name + ": "+ str(duplicated_dates_length))
      # To make the mean as the value for the numerical columns if there are different values for a particular date
      df = df.groupby('timestamp').mean()
      # Reset index to bring 'timestamp' column back
      df.reset_index(inplace=True)

    df.set_index(['timestamp'], inplace=True)
    df.sort_index()

    # Create a date range with hourly frequency covering the entire time range
    start_date = df.index.min()
    end_date = df.index.max()

    #inferred_freq = pd.infer_freq(df.index)
    inferred_freq = find_non_none_frequency(df)

    if inferred_freq is None:
      inferred_freq = default_freq # setting the default frequency
      print("Cannot infer the frequency of the timestamp of the dataset "+ f_name+ " .Therefore the default frequency of " + default_freq+ " will be used")

    expected_date_range = pd.date_range(start=start_date, end=end_date, freq=inferred_freq)

    # Find the missing date entries
    missing_dates = expected_date_range[~expected_date_range.isin(df.index)]
    # Print or work with the list of missing dates
    print("Number of Missing Dates in "+ f_name + ": "+ str(len(missing_dates))+"\n")

    if len(missing_dates) > 0:
      df = df.asfreq(inferred_freq)
      df.sort_index()

      # Call the function with inferred_freq and missing_dates parameters
      max_consecutive = max_consecutive_missing_dates(inferred_freq, missing_dates)
      print("Maximum length of consecutive missing dates:", max_consecutive)
      if max_consecutive > 3:
        print("It is better to use other imputation method rather than linear interpolation")

      df['value'] = df['value'].interpolate(method='linear')

    return df

In [9]:
url = 'https://raw.githubusercontent.com/numenta/NAB/master/labels/combined_labels.json'

response = requests.get(url)

if response.status_code == 200:
    labels = json.loads(response.text)
else:
    print("Failed to retrieve data from the URL:", response.status_code)

In [10]:
# List of directories
dirs = ['realAdExchange', 'realAWSCloudwatch', 'realKnownCause', 'realTweets']


# Loop through each directory
for dir in dirs:
    for f_name in data[dir]:
        print("")
        print(f"Iterating over file: {dir} / {f_name}")
        df = preprocess(data[dir][f_name], f_name)
        labels_of_one_file = labels[dir+'/'+f_name]
        df['is_anomaly'] = 0
        for anomalous_timestamp in labels_of_one_file:
            anomalous_timestamp = pd.to_datetime(anomalous_timestamp)
            try:
                df.at[anomalous_timestamp, 'is_anomaly'] = 1  # Set is_anomaly to 1 at the index location
            except KeyError:
                print(f"Anomalous timestamp {anomalous_timestamp} not found in data[{dir}][{f_name}].")
                pass
        data[dir][f_name] = df  # Assign the modified DataFrame back to the data dictionary



Iterating over file: realAWSCloudwatch / ec2_cpu_utilization_24ae8d.csv
Inferred frequency within range 2014-02-15 19:05:00 - 2014-02-15 19:50:00 : 5min
Number of Missing Dates in ec2_cpu_utilization_24ae8d.csv: 0


Iterating over file: realAWSCloudwatch / ec2_cpu_utilization_53ea38.csv
Inferred frequency within range 2014-02-27 13:30:00 - 2014-02-27 14:15:00 : 5min
Number of Missing Dates in ec2_cpu_utilization_53ea38.csv: 0


Iterating over file: realAWSCloudwatch / ec2_cpu_utilization_5f5533.csv
Inferred frequency within range 2014-02-26 23:37:00 - 2014-02-27 00:22:00 : 5min
Number of Missing Dates in ec2_cpu_utilization_5f5533.csv: 0


Iterating over file: realAWSCloudwatch / ec2_cpu_utilization_77c1ca.csv
Inferred frequency within range 2014-04-10 05:55:00 - 2014-04-10 06:40:00 : 5min
Number of Missing Dates in ec2_cpu_utilization_77c1ca.csv: 0


Iterating over file: realAWSCloudwatch / ec2_cpu_utilization_825cc2.csv
Inferred frequency within range 2014-04-16 03:04:00 - 2014-04-1

In [11]:
#pip install -U kaleido

In [12]:
df=data['realAWSCloudwatch']['ec2_cpu_utilization_c6585a.csv']
df

Unnamed: 0_level_0,value,is_anomaly
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-02 14:29:00,0.066,0
2014-04-02 14:34:00,0.066,0
2014-04-02 14:39:00,0.068,0
2014-04-02 14:44:00,0.134,0
2014-04-02 14:49:00,0.066,0
...,...,...
2014-04-16 14:04:00,0.066,0
2014-04-16 14:09:00,0.068,0
2014-04-16 14:14:00,0.134,0
2014-04-16 14:19:00,0.068,0


In [13]:
df=data['realAWSCloudwatch']['ec2_cpu_utilization_ac20cd.csv']
df

Unnamed: 0_level_0,value,is_anomaly
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-02 14:29:00,42.652,0
2014-04-02 14:34:00,41.362,0
2014-04-02 14:39:00,43.408,0
2014-04-02 14:44:00,40.262,0
2014-04-02 14:49:00,40.328,0
...,...,...
2014-04-16 14:29:00,99.434,0
2014-04-16 14:34:00,99.132,0
2014-04-16 14:39:00,99.248,0
2014-04-16 14:44:00,98.552,0


Visualization

In [14]:
# Create a directory if it doesn't exist
output_folder = "visualization/pure_format"
os.makedirs(output_folder, exist_ok=True)

In [15]:
import plotly.io as pio


# Loop through each directory
for dir in dirs:
    print(f"Iterating over directory: {dir}")
    for f_name in data[dir]:
        print(f"Iterating over file: {f_name}")

        # Retrieve DataFrame for the current file
        df = data[dir][f_name]

        # Create a figure using Plotly Express
        fig = go.Figure()

        # Add line plot for value
        fig.add_trace(go.Scatter(x=df.index, y=df['value'], mode='lines', name='Value'))

        anomalies = df[df['is_anomaly'] == 1]  # Filter DataFrame to get rows where is_anomaly is 1
        fig.add_trace(go.Scatter(x=anomalies.index, y=anomalies['value'], mode='markers', marker=dict(color='red'), name='Anomalies'))
        
        # Update layout
        fig.update_layout(title=f"{dir} / {f_name}", xaxis_title='Timestamp', yaxis_title='Value')

        # Show plot
        fig.show()

        # Save plot as PNG file
        file_path = os.path.join(output_folder, f"{dir}_{f_name}.png")
        pio.write_image(fig, file_path)




Iterating over directory: realAWSCloudwatch
Iterating over file: ec2_cpu_utilization_24ae8d.csv


Iterating over file: ec2_cpu_utilization_53ea38.csv


Iterating over file: ec2_cpu_utilization_5f5533.csv


Iterating over file: ec2_cpu_utilization_77c1ca.csv


Iterating over file: ec2_cpu_utilization_825cc2.csv


Iterating over file: ec2_cpu_utilization_ac20cd.csv


Iterating over file: ec2_cpu_utilization_c6585a.csv


Iterating over file: ec2_cpu_utilization_fe7f93.csv


Iterating over file: ec2_disk_write_bytes_1ef3de.csv


Iterating over file: ec2_disk_write_bytes_c0d644.csv


Iterating over file: ec2_network_in_257a54.csv


Iterating over file: ec2_network_in_5abac7.csv


Iterating over file: elb_request_count_8c0756.csv


Iterating over file: grok_asg_anomaly.csv


Iterating over file: iio_us-east-1_i-a2eb1cd9_NetworkIn.csv


Iterating over file: rds_cpu_utilization_cc0c53.csv


Iterating over file: rds_cpu_utilization_e47b3b.csv


TS Feature Extraction

In [16]:
dfs = []

In [17]:
def extract_features(dir, file_name, new_df):
    new_df.index = new_df.index  # Set the index (you can perform operations here if needed)
    new_df.rename(columns={'value': 'y'}, inplace=True)
    new_df['unique_id'] = f"{dir}/{file_name}"  # Using the filename as unique identifier
    dfs.append(new_df)  # Append the modified DataFrame to the list
    return dfs

In [18]:
# Iterate over each directory
for dir in dirs:
    # Iterate over each file in the current directory
    for file_name in data[dir]:
        new_df = data[dir][file_name].copy()
        extract_features(dir, file_name, new_df)


In [19]:
combined_df = pd.concat(dfs, ignore_index=True)
# Assuming tsfeatures function is defined elsewhere and imported
features = tsfeatures(combined_df, freq=288)
#features = tsfeatures(combined_df, dict_freqs={'T': 60, '2T': 30,'3T': 20, '4T': 15,'5T': 12,'10T': 6,'15T': 4,'20T': 3,'30T': 2, 'H': 24, '2H': 12,'3H': 8, '4H': 6,'6H': 4,'8H': 3,'12H': 2, 'D': 7, 'W': 52, 'M': 12})
df_features = pd.DataFrame(features)
df_features

Unnamed: 0,unique_id,hurst,series_length,unitroot_pp,unitroot_kpss,hw_alpha,hw_beta,hw_gamma,stability,nperiods,...,entropy,crossing_points,arch_lm,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
0,realAWSCloudwatch/ec2_cpu_utilization_24ae8d.csv,0.542713,4032,-4111.795313,0.209819,8.965784e-07,5.825583e-07,2.509908e-06,0.002084,1,...,0.938935,572,0.000133,-0.040142,0.002776,-0.508071,0.260179,-0.66523,0.468612,0.109389
1,realAWSCloudwatch/ec2_cpu_utilization_53ea38.csv,,4032,-5479.60826,2.076331,0.07161681,9.947451e-12,4.689581e-13,0.014688,1,...,0.575929,2248,0.039873,-0.117574,0.446556,-0.580917,1.108369,-0.731136,1.681924,0.571083
2,realAWSCloudwatch/ec2_cpu_utilization_5f5533.csv,1.592583,4032,-6910.25339,31.742803,0.02078785,0.02078767,0.0,0.561932,1,...,0.538841,1533,0.301557,0.30337,2.948069,-0.680217,1.842674,-0.73344,2.022622,0.413182
3,realAWSCloudwatch/ec2_cpu_utilization_77c1ca.csv,0.95088,4032,-654.999557,0.489215,0.9999121,3.794699e-06,1.98382e-07,0.076766,1,...,0.808056,1691,0.587577,0.818554,1.326405,0.123969,0.098248,-0.371219,0.192937,0.11613
4,realAWSCloudwatch/ec2_cpu_utilization_825cc2.csv,1.023656,4034,-62.565968,1.451362,0.6450128,3.541616e-09,1.218704e-06,0.482681,1,...,0.49305,851,0.970268,0.969111,8.553823,-0.278467,0.084796,-0.585181,0.351371,0.012096
5,realAWSCloudwatch/ec2_cpu_utilization_ac20cd.csv,1.089914,4037,-8.202838,12.161875,0.4353097,9.016227e-11,9.422139e-08,0.834827,1,...,0.310188,1638,0.996375,0.988133,9.567909,-0.491751,0.279253,-0.680912,0.52923,0.320249
6,realAWSCloudwatch/ec2_cpu_utilization_c6585a.csv,,4032,-4057.127914,0.045319,1.686688e-05,1.160683e-05,4.175973e-06,0.000147,1,...,0.93828,2207,0.000148,-0.037444,0.003042,-0.50427,0.25519,-0.66564,0.469975,0.2004
7,realAWSCloudwatch/ec2_cpu_utilization_fe7f93.csv,0.802616,4032,-992.787092,0.200876,0.9998869,8.837181e-07,1.69687e-07,0.029572,1,...,0.846909,769,0.366378,0.726203,0.896081,0.086053,0.188679,-0.326595,0.247232,0.084554
8,realAWSCloudwatch/ec2_disk_write_bytes_1ef3de.csv,,4730,-3490.599993,1.035371,0.021986,0.0,0.0,0.018448,1,...,0.92517,228,0.078363,0.250538,0.077008,-0.36958,0.154378,-0.596925,0.377258,0.018929
9,realAWSCloudwatch/ec2_disk_write_bytes_c0d644.csv,,4032,-3277.146318,0.374319,0.1896721,0.0,0.0,0.029484,1,...,0.912983,424,0.095276,0.36703,0.31585,-0.407642,0.184379,-0.629772,0.432614,0.045452


In [20]:
directory = 'dataset_preparation'
if not os.path.exists(directory):
    os.makedirs(directory)

# Specify the file path
file_path = os.path.join(directory, 'df_features_pure_extraction.csv')  # for CSV file
# file_path = os.path.join(directory, 'df_features_pure_extraction.pkl')  # for pickle file

# Save the DataFrame
df_features.to_csv(file_path, index=False)  # for CSV file

Splitting the dataset into training and testing sets

In [21]:
def split_data(df, train_ratio=0.7):
    train_size = int(len(df) * train_ratio)
    train, val = df[:train_size], df[train_size:]
    return train, val

Exponential Smoothing

In [22]:
seasonal_periods = [6, 12, 24, 36, 48, 60, 72, 96, 120, 144]
smoothing_level = [0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 0.9]
smoothing_seasonal = [0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 0.9]

all_combinations = list(itertools.product(seasonal_periods, smoothing_level, smoothing_seasonal))


# Define the number of random combinations to sample
num_samples = 100

# Set the seed for reproducibility
random.seed(42)

# Randomly sample from all possible combinations
param_grid_exponential = random.sample(all_combinations, num_samples)

param_grid_exponential

[(72, 0.6, 0.8),
 (12, 0.2, 0.2),
 (6, 0.2, 0.8),
 (96, 0.8, 0.2),
 (24, 0.9, 0.1),
 (24, 0.5, 0.9),
 (24, 0.4, 0.4),
 (12, 0.5, 0.2),
 (96, 0.6, 0.9),
 (12, 0.1, 0.5),
 (96, 0.1, 0.5),
 (144, 0.9, 0.5),
 (144, 0.4, 0.2),
 (60, 0.6, 0.9),
 (6, 0.9, 0.4),
 (72, 0.2, 0.2),
 (48, 0.4, 0.9),
 (6, 0.4, 0.4),
 (6, 0.4, 0.2),
 (6, 0.9, 0.8),
 (24, 0.2, 0.9),
 (24, 0.5, 0.1),
 (60, 0.2, 0.9),
 (72, 0.4, 0.1),
 (6, 0.2, 0.9),
 (60, 0.9, 0.1),
 (24, 0.1, 0.5),
 (96, 0.5, 0.4),
 (72, 0.8, 0.5),
 (96, 0.4, 0.4),
 (144, 0.8, 0.1),
 (48, 0.4, 0.6),
 (24, 0.4, 0.1),
 (48, 0.6, 0.8),
 (72, 0.2, 0.1),
 (24, 0.9, 0.4),
 (120, 0.5, 0.2),
 (144, 0.1, 0.6),
 (6, 0.1, 0.5),
 (96, 0.9, 0.5),
 (120, 0.4, 0.9),
 (12, 0.6, 0.6),
 (96, 0.4, 0.1),
 (144, 0.6, 0.6),
 (36, 0.5, 0.9),
 (144, 0.2, 0.9),
 (12, 0.6, 0.4),
 (24, 0.2, 0.8),
 (96, 0.9, 0.8),
 (36, 0.5, 0.6),
 (144, 0.8, 0.6),
 (144, 0.6, 0.2),
 (36, 0.9, 0.8),
 (12, 0.1, 0.1),
 (36, 0.8, 0.2),
 (120, 0.8, 0.9),
 (36, 0.6, 0.2),
 (72, 0.4, 0.2),
 (24, 0.8,

In [23]:
def exponential_smoothing(train, val, param_grid):
    train_values = train['value']  # Extracting only the 'value' column
    val_values = val['value']      # Extracting only the 'value' column

    # Initialize variables to store best parameters and performance
    best_params = None
    best_score = float('inf')

    # Iterate over parameter grid
    for params in param_grid:
        # Extract parameters
        seasonal_periods = params[0]
        smoothing_level = params[1]
        smoothing_seasonal = params[2]

        # Fit the model with current parameters
        model = ExponentialSmoothing(train_values, trend=None, seasonal='add', seasonal_periods=seasonal_periods)
        fitted_model = model.fit(smoothing_level=smoothing_level, smoothing_seasonal=smoothing_seasonal)

        # Make predictions
        forecast = fitted_model.forecast(steps=len(val))

        # Evaluate performance
        mse = mean_squared_error(val_values, forecast)

        # Update best parameters if the current parameters yield a lower MSE
        if mse < best_score:
            best_score = mse
            best_params = params

    print("Exponential Smoothing")
    print("Best parameters of Seasonal Periods, Smoothing Level and Smoothing Seasonal:", best_params)

    # Extract the best parameters
    best_seasonal_periods = best_params[0]
    best_smoothing_level = best_params[1]
    best_smoothing_seasonal = best_params[2]

    # Fit the final model with the best parameters
    final_model = ExponentialSmoothing(train_values, trend=None, seasonal='add', seasonal_periods=best_seasonal_periods)
    final_fitted_model = final_model.fit(smoothing_level=best_smoothing_level, smoothing_seasonal=best_smoothing_seasonal)

    # Make final forecast
    final_forecast = final_fitted_model.forecast(steps=len(val))

    forecast_df = pd.DataFrame(final_forecast, index=val.index, columns=['Forecast'])

    final_mse = mean_squared_error(val_values, final_forecast)
    final_mae = mean_absolute_error(val_values, final_forecast)

    print("Mean Absolute Error:", final_mae)
    print("Mean Squared Error:", final_mse)


    return forecast_df, final_mae, final_mse


ARIMA

In [24]:
#pip install pmdarima

In [25]:
from pmdarima.arima import auto_arima

In [26]:
def arima(train, val):
    # Extracting only the 'value' column
    train_values = train['value']

    # Perform automated ARIMA test
    arima_model = auto_arima(train_values, seasonal=True)

    # Forecast on the validation data
    forecast = arima_model.predict(n_periods=len(val))

    p, d, q = arima_model.order

    # Create a DataFrame with the forecasted values
    forecast_df = pd.DataFrame(forecast, index=val.index, columns=['Forecast'])

    print("Arima")

    # Calculate MAE and MSE
    mae = mean_absolute_error(val['value'], forecast)
    mse = mean_squared_error(val['value'], forecast)

    print("Mean Absolute Error (MAE):", mae)
    print("Mean Squared Error (MSE):", mse)

    return forecast_df, mae, mse, p, d, q

SARIMA

In [27]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [28]:
from sklearn.base import BaseEstimator

class SARIMAXWrapper(BaseEstimator):
    def __init__(self, order=(1, 1, 1), seasonal_order=(0, 0, 0, 0)):
        self.order = order
        self.seasonal_order = seasonal_order

    def fit(self, X, y):
        self.model = SARIMAX(endog=y, order=self.order, seasonal_order=self.seasonal_order)
        self.result = self.model.fit()
        return self

    def predict(self, X):
        return self.result.forecast(steps=len(X))

    def get_params(self, deep=True):
        return {"order": self.order, "seasonal_order": self.seasonal_order}

In [29]:
def SARIMA(train, val, p, d, q):

    param_grid = {
        'seasonal_order': [(P, D, Q, s) for P in range(0, 4)
                                          for D in range(0, 4)
                                          for Q in range(0, 4)
                                          for s in [6,12,24,36,48,60,72,96,120,144,288,576]]
     }

    sarima = SARIMAXWrapper(order=(p, d, q))
    search = RandomizedSearchCV(estimator=sarima, param_distributions=param_grid, n_iter=2, cv=2, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
    search.fit(X=train, y=train['value'])

    # Print the best estimator found
    print(search.best_estimator_)

    # Make predictions using the best model
    y_pred = search.best_estimator_.predict(val['value'])

    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(val['value'], y_pred)
    print("Mean Absolute Error (MAE):", mae)

    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(val['value'], y_pred)
    print("Mean Squared Error (MSE):", mse)

    # Calculate Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)
    print("Root Mean Squared Error (RMSE):", rmse)

    print("")

    return y_pred, mae, mse

XGBoost

In [30]:
# Feature Engineering
def create_features(df, label=None):
    """
    Creates time series features from datetime index
    """
    df['date'] = df.index
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day

    X = df[['hour','dayofweek','quarter','month','year',
            'dayofyear','dayofmonth']]
    if label:
        y = df[label]
        return X, y
    return X

In [31]:
def xgboost(train, val):
    # Feature Engineering
    lags = 12  # You can adjust this
    for i in range(1, lags + 1):
        train[f'lag_{i}'] = train['value'].shift(i)
        val[f'lag_{i}'] = val['value'].shift(i)

    # Create features and target variable
    X_train, y_train = create_features(train, label='value')
    X_val, y_val = create_features(val, label='value')

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Hyperparameter tuning
    param_grid = {
        'n_estimators': [25, 50, 100, 150, 200, 300, 400, 500, 1000],
        'max_depth': [2, 3, 5, 7, 10],
        'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.3],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2, 0.3, 0.4],
        'reg_alpha': [0, 0.1, 0.5, 1, 10],
        'reg_lambda': [0, 0.1, 0.5, 1, 10],
        'min_child_weight': [1, 3, 5, 7, 10],
    }
    xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
    search = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid, n_iter=100, cv=2, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
    search.fit(X_train_scaled, y_train)

    print("XGBoost")

    best_params = search.best_params_
    print("Best Parameters:", best_params)

    # Model training with best parameters
    model = XGBRegressor(**best_params, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Model evaluation
    forecast = model.predict(X_val_scaled)

    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_val, forecast)
    print(f'Mean Absolute Error: {mae}')

    mse = mean_squared_error(y_val, forecast)
    print(f'Mean Squared Error: {mse}')

    # Create a DataFrame with the forecasted values
    forecast_df = pd.DataFrame(forecast, index=y_val.index, columns=['Forecast'])

    return forecast_df, mae, mse

Prophet

In [32]:
#pip install prophet

In [33]:
from prophet import Prophet

In [34]:
period = [6, 12, 24, 36, 48, 60, 72, 96, 120, 144, 288]
fourier_order = [1, 2, 4, 5, 6, 8, 10]
seasonality_prior_scale = [0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 7, 10]
changepoint_prior_scale = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 7, 10]

all_combinations = list(itertools.product(period, fourier_order, seasonality_prior_scale, changepoint_prior_scale))


# Define the number of random combinations to sample
num_samples = 100

# Set the seed for reproducibility
random.seed(42)

# Randomly sample from all possible combinations
param_grid_prophet = random.sample(all_combinations, num_samples)

param_grid_prophet

[(24, 2, 0.1, 0.001),
 (6, 5, 0.5, 0.005),
 (60, 4, 1, 0.5),
 (48, 8, 0.5, 0.1),
 (48, 4, 0.5, 5),
 (24, 8, 0.01, 0.5),
 (12, 10, 10, 10),
 (288, 6, 0.5, 1),
 (12, 6, 7, 2),
 (120, 2, 2, 0.001),
 (6, 6, 0.2, 0.1),
 (6, 6, 0.01, 2),
 (12, 8, 5, 10),
 (48, 2, 7, 0.5),
 (48, 5, 5, 1),
 (144, 8, 10, 10),
 (6, 5, 2, 0.01),
 (288, 10, 2, 0.05),
 (36, 10, 0.05, 0.2),
 (288, 6, 0.5, 0.001),
 (120, 2, 0.1, 5),
 (48, 4, 0.01, 10),
 (120, 8, 0.2, 0.05),
 (60, 4, 10, 5),
 (6, 1, 7, 7),
 (36, 1, 5, 10),
 (120, 2, 5, 0.001),
 (72, 6, 0.5, 0.5),
 (60, 4, 10, 0.1),
 (36, 1, 0.1, 0.05),
 (48, 2, 0.2, 10),
 (72, 5, 10, 0.5),
 (12, 10, 10, 0.5),
 (12, 8, 2, 1),
 (96, 4, 7, 2),
 (12, 10, 0.1, 0.001),
 (96, 1, 0.01, 0.005),
 (72, 6, 10, 1),
 (60, 2, 0.05, 0.005),
 (6, 8, 10, 0.05),
 (120, 10, 5, 0.05),
 (288, 5, 0.1, 0.005),
 (24, 5, 0.01, 0.2),
 (96, 4, 2, 5),
 (12, 5, 5, 1),
 (288, 8, 0.2, 2),
 (60, 8, 0.01, 0.05),
 (96, 1, 0.2, 5),
 (36, 8, 0.1, 0.5),
 (12, 4, 0.5, 10),
 (6, 10, 0.1, 0.5),
 (48, 5, 0.05

In [35]:
def prophet(train, val, periods, param_grid):
    # Make a copy of the dataframe to avoid modifying the original
    train_copy = train.copy()

    # Create a new column 'ds' with the index values
    train_copy['ds'] = train_copy.index
    # Rename columns to 'ds' and 'y'
    train_copy = train_copy.rename(columns={'value': 'y'})

    # Initialize variables to store best parameters and performance
    best_params = None
    best_score = float('inf')

    print("********************************************** New iteration Begins **************************************************************")

    # Iterate over parameter grid
    for params in param_grid:

        # Fit the model with current parameters
        model = Prophet(seasonality_mode='multiplicative', weekly_seasonality=False, yearly_seasonality=False, seasonality_prior_scale= params[2], changepoint_prior_scale= params[3])
        model.add_seasonality(name='hourly', period=params[0], fourier_order=params[1])
        model.fit(train_copy)
        future_dates = model.make_future_dataframe(periods=periods)
        forecast_df = model.predict(future_dates)
        future_forecast = forecast_df[forecast_df['ds'].isin(val.index)]
        val_forecast = val[val.index.isin(future_forecast['ds'])]

        # Evaluate performance
        mse = mean_squared_error(val_forecast['value'], future_forecast['yhat'])

        # Update best parameters if the current parameters yield a lower MSE
        if mse < best_score:
            best_score = mse
            best_params = params

    print("Best parameters:", best_params)
    print("Best Mean Squared Error:", best_score)

    best_period = best_params[0]
    best_fourier_order = best_params[1]

    # Fit the final model with the best parameters
    final_model = Prophet(seasonality_mode='multiplicative', weekly_seasonality=False, yearly_seasonality=False)
    final_model.add_seasonality(name='hourly', period=best_period, fourier_order=best_fourier_order)
    final_model.fit(train_copy)
    future_dates = final_model.make_future_dataframe(periods=15)
    forecast_df = final_model.predict(future_dates)

    return forecast_df

Visualization of model predictions - for exponential smoothing

In [36]:
def plot_forecast_interactive(forecast_df, val, file_name, model_name):

    # Create a directory if it doesn't exist
    output_folder = os.path.join("visualization", model_name)
    os.makedirs(output_folder, exist_ok=True)

    # Plot forecast and real values
    forecast_trace = go.Scatter(x=forecast_df.index, y=forecast_df['Forecast'], mode='lines', name='Forecast')
    real_trace = go.Scatter(x=val.index, y=val['value'], mode='lines', name='Real')

    # Create the layout
    layout = go.Layout(title=f"{model_name} / {dir} / {file_name} ",
                       xaxis=dict(title='Timestamp'),
                       yaxis=dict(title='Value'))

    # Combine traces into a list
    data = [forecast_trace, real_trace]

    # Create the figure
    fig = go.Figure(data=data, layout=layout)

    # Show the interactive plot
    fig.show()

    # Save plot as PNG file
    file_path = os.path.join(output_folder, f"{dir}_{file_name}.png")
    pio.write_image(fig, file_path)

In [37]:
def plot_forecast_sarima_interactive( y_pred, val, file_name, model_name):

    # Create a directory if it doesn't exist
    output_folder = os.path.join("visualization", model_name)
    os.makedirs(output_folder, exist_ok=True)
    
    # Plot predicted and actual values
    pred_trace = go.Scatter(x=val.index, y=y_pred, mode='lines', name='Forecast')
    val_trace = go.Scatter(x=val.index, y=val['value'], mode='lines', name='Real')

    # Create the layout
    layout = go.Layout(title=f"{model_name} / {dir} / {file_name} ",
                       xaxis=dict(title='Timestamp'),
                       yaxis=dict(title='Value'))

    # Combine traces into a list
    data = [pred_trace, val_trace]

    # Create the figure
    fig = go.Figure(data=data, layout=layout)

    # Show the interactive plot
    fig.show()

    # Save plot as PNG file
    file_path = os.path.join(output_folder, f"{dir}_{file_name}.png")
    pio.write_image(fig, file_path)

In [38]:
def plot_forecast_prophet_interactive(forecast_df, val, file_name, model_name):

    # Create a directory if it doesn't exist
    output_folder = os.path.join("visualization", model_name)
    os.makedirs(output_folder, exist_ok=True)

    future_forecast = forecast_df[forecast_df['ds'].isin(val.index)]
    # Plot forecast and real values
    forecast_trace = go.Scatter(x=future_forecast['ds'], y=future_forecast['yhat'], mode='lines', name='Forecast')
    forecast_upper_trace = go.Scatter(x=future_forecast['ds'], y=future_forecast['yhat_upper'], mode='lines', name='Forecast upper bound')
    real_trace = go.Scatter(x=val.index, y=val['value'], mode='lines', name='Real')

    # Create the layout
    layout = go.Layout(title=f"{model_name} / {dir} / {file_name} ",
                       xaxis=dict(title='Timestamp'),
                       yaxis=dict(title='Value'))

    # Combine traces into a list
    data = [forecast_trace, forecast_upper_trace, real_trace]

    # Create the figure
    fig = go.Figure(data=data, layout=layout)

    # Show the interactive plot
    fig.show()

    # Save plot as PNG file
    file_path = os.path.join(output_folder, f"{dir}_{file_name}.png")
    pio.write_image(fig, file_path)

Running All Models --------------------------------

In [39]:
columns = ['dir', 'file_name', 'exponential_smoothing', 'xgboost', 'arima']

In [40]:
def process_file(dir, file_name, df, param_grid_exponential):
    train, val = split_data(df)

    predicted_result = {'dir': dir, 'file_name': file_name, 'original_value': val['value'].values}
    mae_result = {'dir': dir, 'file_name': file_name}
    cpu_results = {'dir': dir, 'file_name': file_name}
    memory_results = {'dir': dir, 'file_name': file_name}
    time_results = {'dir': dir, 'file_name': file_name}

    # Exponential Smoothing
    start_time = time.time()
    start_cpu = psutil.cpu_percent()
    start_memory = psutil.virtual_memory().used

    exponential_forecast_df, exponential_mae, exponential_mse = exponential_smoothing(train, val, param_grid_exponential)

    end_time = time.time()
    end_cpu = psutil.cpu_percent()
    end_memory = psutil.virtual_memory().used

    elapsed_time = end_time - start_time
    cpu_usage = end_cpu - start_cpu
    memory_usage = end_memory - start_memory

    print("Exponential Smoothing:")
    print("Time taken:", elapsed_time, "seconds")
    print("CPU Usage:", cpu_usage, "%")
    print("Memory Usage:", memory_usage, "bytes")

    plot_forecast_interactive(exponential_forecast_df, val, file_name, "exponential_smoothing")

    mae_result['exponential_smoothing'] = exponential_mae
    cpu_results['exponential_smoothing'] = cpu_usage
    memory_results['exponential_smoothing'] = memory_usage
    time_results['exponential_smoothing'] = elapsed_time

    if 'Forecast' in exponential_forecast_df:
        predicted_result['exponential_smoothing'] = exponential_forecast_df['Forecast'].tolist()

    # ARIMA
    start_time = time.time()
    start_cpu = psutil.cpu_percent()
    start_memory = psutil.virtual_memory().used

    arima_forecast_df, arima_mae, arima_mse, p,d,q = arima(train, val)

    end_time = time.time()
    end_cpu = psutil.cpu_percent()
    end_memory = psutil.virtual_memory().used

    elapsed_time = end_time - start_time
    cpu_usage = end_cpu - start_cpu
    memory_usage = end_memory - start_memory

    print("ARIMA:")
    print("Time taken:", elapsed_time, "seconds")
    print("CPU Usage:", cpu_usage, "%")
    print("Memory Usage:", memory_usage, "bytes")

    plot_forecast_interactive(arima_forecast_df, val, file_name, "arima")

    mae_result['arima'] = arima_mae
    cpu_results['arima'] = cpu_usage
    memory_results['arima'] = memory_usage
    time_results['arima'] = elapsed_time

    if 'Forecast' in arima_forecast_df:
        predicted_result['arima'] = arima_forecast_df['Forecast'].tolist()

    # SARIMA
    # sarima_forecast, arima_mae, arima_mse = SARIMA(train, val,p,d,q)
    # plot_forecast_sarima_interactive( sarima_forecast, val, file_name, "sarima")

    # Prophet
    # prophet_forecast_df = prophet(train,val, 15, param_grid_prophet)
    # plot_forecast_prophet_interactive(prophet_forecast_df, val, file_name, "Prophet")
    
    # XGBoost
    start_time = time.time()
    start_cpu = psutil.cpu_percent()
    start_memory = psutil.virtual_memory().used

    xgboost_forecast_df, xgboost_mae, xgboost_mse = xgboost(train, val)

    end_time = time.time()
    end_cpu = psutil.cpu_percent()
    end_memory = psutil.virtual_memory().used

    elapsed_time = end_time - start_time
    cpu_usage = end_cpu - start_cpu
    memory_usage = end_memory - start_memory

    print("XGBoost:")
    print("Time taken:", elapsed_time, "seconds")
    print("CPU Usage:", cpu_usage, "%")
    print("Memory Usage:", memory_usage, "bytes")

    plot_forecast_interactive(xgboost_forecast_df, val, file_name, "XGBoost")

    mae_result['xgboost'] = xgboost_mae
    cpu_results['xgboost'] = cpu_usage
    memory_results['xgboost'] = memory_usage
    time_results['xgboost'] = elapsed_time

    if 'Forecast' in xgboost_forecast_df:
        predicted_result['xgboost'] = xgboost_forecast_df['Forecast'].tolist()

    mae_df = pd.DataFrame([mae_result])
    predicted_df = pd.DataFrame([predicted_result])
    cpu_df = pd.DataFrame([cpu_results])
    memory_df = pd.DataFrame([memory_results])
    time_df = pd.DataFrame([time_results])

    return cpu_df, memory_df, time_df, mae_df, predicted_df

In [41]:
# Collect results for all files
all_cpu_results = []
all_memory_results = []
all_time_results = []
all_mae_results = []
all_predicted_results = []

# Iterate over each directory
for dir in dirs:
    print(f"Iterating over directory: {dir}")

    # Iterate over each file in the current directory
    for file_name in data[dir]:
        df = data[dir][file_name]
        print(f"Iterating for {dir} / {file_name}")
        cpu_df, memory_df, time_df, mae_df, predicted_df = process_file(dir, file_name, df, param_grid_exponential)
        all_cpu_results.append(cpu_df)
        all_memory_results.append(memory_df)
        all_time_results.append(time_df)
        all_mae_results.append(mae_df)
        all_predicted_results.append(predicted_df)

# Concatenate results into single dataframes
cpu_df_final = pd.concat(all_cpu_results)
memory_df_final = pd.concat(all_memory_results)
time_df_final = pd.concat(all_time_results)
mae_df_final = pd.concat(all_mae_results)
predicted_df_final = pd.concat(all_predicted_results)


# Reset index for all dataframes
cpu_df_final.reset_index(drop=True, inplace=True)
memory_df_final.reset_index(drop=True, inplace=True)
time_df_final.reset_index(drop=True, inplace=True)
mae_df_final.reset_index(drop=True, inplace=True)
predicted_df_final.reset_index(drop=True, inplace=True)

Iterating over directory: realAWSCloudwatch
Iterating for realAWSCloudwatch / ec2_cpu_utilization_24ae8d.csv
Exponential Smoothing
Best parameters of Seasonal Periods, Smoothing Level and Smoothing Seasonal: (6, 0.5, 0.2)
Mean Absolute Error: 0.034856783695231194
Mean Squared Error: 0.012299290489793559
Exponential Smoothing:
Time taken: 1.9532980918884277 seconds
CPU Usage: 0.1999999999999993 %
Memory Usage: 45707264 bytes


Arima
Mean Absolute Error (MAE): 0.029461025139542445
Mean Squared Error (MSE): 0.01200702114243026
ARIMA:
Time taken: 2.2317569255828857 seconds
CPU Usage: 12.400000000000002 %
Memory Usage: -26701824 bytes


XGBoost
Best Parameters: {'subsample': 0.6, 'reg_lambda': 0, 'reg_alpha': 0.1, 'n_estimators': 1000, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.3, 'gamma': 0.4, 'colsample_bytree': 0.7}
Mean Absolute Error: 0.03177538835844718
Mean Squared Error: 0.011639932723884779
XGBoost:
Time taken: 2.360193967819214 seconds
CPU Usage: 64.10000000000001 %
Memory Usage: 1071931392 bytes


Iterating for realAWSCloudwatch / ec2_cpu_utilization_53ea38.csv
Exponential Smoothing
Best parameters of Seasonal Periods, Smoothing Level and Smoothing Seasonal: (12, 0.1, 0.1)
Mean Absolute Error: 0.04206396347371923
Mean Squared Error: 0.003917869005381942
Exponential Smoothing:
Time taken: 2.0499186515808105 seconds
CPU Usage: -4.399999999999999 %
Memory Usage: 8060928 bytes


Arima
Mean Absolute Error (MAE): 0.06967636448039934
Mean Squared Error (MSE): 0.00940386532134269
ARIMA:
Time taken: 46.08349013328552 seconds
CPU Usage: 39.1 %
Memory Usage: 404594688 bytes


KeyboardInterrupt: 

In [None]:
mae_df_final

In [None]:
time_df_final

In [None]:
cpu_df_final

In [None]:
memory_df_final

In [None]:
predicted_df_final

In [None]:
# Define the directory to save the CSV files
output_dir = "dataset_preparation"

# Check if the directory exists, if not, create it
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save dataframes as CSV files
cpu_df_final.to_csv(os.path.join(output_dir, "cpu_results.csv"), index=False)
memory_df_final.to_csv(os.path.join(output_dir, "memory_results.csv"), index=False)
time_df_final.to_csv(os.path.join(output_dir, "time_results.csv"), index=False)
mae_df_final.to_csv(os.path.join(output_dir, "mae_results.csv"), index=False)
predicted_df_final.to_csv(os.path.join(output_dir, "predicted_results.csv"), index=False)


Ranking

In [None]:
# Define a function to rank models based on MAE values for each row
def rank_models(row):
    mae_values = row[['exponential_smoothing', 'arima', 'xgboost']]
    model_rank = mae_values.sort_values().index.tolist()
    return model_rank

In [None]:
# Apply the function to each row of the DataFrame
df_features['model_rank'] = mae_df_final.apply(rank_models, axis=1)
df_features

In [None]:
directory = 'dataset_preparation'
if not os.path.exists(directory):
    os.makedirs(directory)

# Specify the file path
file_path = os.path.join(directory, 'df_features_with_ranking.csv')  # for CSV file

# Save the DataFrame
df_features.to_csv(file_path, index=False)  # for CSV file

Stacking Approach for training

In [None]:
def stacked_model_predictions(val, base_preds):
    # Splitting features and target variable
    X_train, X_val, y_train, y_val = train_test_split(base_preds, val, test_size=0.2, random_state=42)

    # Define parameter grid for Random Forest
    param_grid = {
        'n_estimators': [25, 50, 100, 150, 200],  # Number of trees in the forest
        'max_depth': [None, 10, 20, 30],      # Maximum depth of the tree
        'min_samples_split': [2, 5, 8, 10, 15],  # Minimum number of samples required to split an internal node
        'min_samples_leaf': [1, 2, 4, 6]     # Minimum number of samples required to be at a leaf node
    }

    # Initialize Random Forest regressor
    rf = RandomForestRegressor(random_state=42)

    search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=100, cv=2, scoring='neg_mean_absolute_error', n_jobs=-1, random_state=42)
    search.fit(X_train, y_train)

    print("Stacking Approach")

    # Print the best estimator found
    print(search.best_estimator_)

    # Make predictions using the best model
    y_pred = search.best_estimator_.predict(X_val)

    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_val, y_pred)
    print("Mean Absolute Error (MAE):", mae)

    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y_val, y_pred)
    print("Mean Squared Error (MSE):", mse)

    # Calculate Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)
    print("Root Mean Squared Error (RMSE):", rmse)

    print("")

    return y_pred, y_val, mae, mse

Finding the appropriate ensemble size

In [None]:
# Create an empty column named 'ensemble_size'
df_features['ensemble_size'] = np.nan
# Display the DataFrame with the new empty column
df_features

In [None]:
def find_ensemble_size(model_rank, unique_id):
    # Extract directory and file name from unique_id
    dir_name, file_name = unique_id.split('/')

    # Get the first element from the list of model_rank
    model_name = model_rank[0]

    # Find the row in mae_df dataframe that matches the directory and file name
    row = mae_df_final[(mae_df_final['dir'] == dir_name) & (mae_df_final['file_name'] == file_name)]

    # Find the value in the column that matches the model_name
    model_mae = row[model_name].iloc[0]

    # Determine the ensemble size based on the model MAE value
    if model_mae < 0.05:
        print("No need for stacking apporach, since first model has MAE less than 0.05")
        return 1
    else:
        previous_mae = model_mae
        print("Going for stacking Approach")
        i = 0  # Initialize the count of models
        while i < len(model_rank):
            i += 2  # Increment the count of models
            models_to_use = model_rank[:i]  # Take the first i models from the model_rank list
            base_preds = []  # Initialize base_preds as a list
            # Get the predicted values for the selected models
            for model in models_to_use:
                # Find the respective row in predicted_df
                model_row = predicted_df_final[(predicted_df_final['dir'] == dir_name) & (predicted_df_final['file_name'] == file_name)]
                # Get the predicted value for the model
                pred_value = model_row[model].iloc[0]
                # Append the predicted value to base_preds
                base_preds.append(pred_value)

            val_row = predicted_df_final[(predicted_df_final['dir'] == dir_name) & (predicted_df_final['file_name'] == file_name)]
            val = val_row['original_value'].iloc[0]

            base_preds = np.stack(base_preds, axis=-1)
            y_pred, y_val, mae, mse = stacked_model_predictions(val, base_preds)

            if mae > previous_mae:
                print("MAE increased after adding", i, "models, so returning the previous ensemble size")
                return i - 1

            # Update previous MAE with current MAE
            previous_mae = mae

            # If MAE is less than 0.05, return the current ensemble size
            if mae < 0.08:
                return i

    # If none of the models have MAE less than 0.05, return the total count of models
    return len(model_rank)


In [None]:
# Iterate over rows in df_features
for index, row in df_features.iterrows():
    # Extract values from the current row
    model_rank = row['model_rank']
    unique_id = row['unique_id']

    # Print the unique ID before calling the function
    print("Unique ID:", unique_id)

    # Call the function to find ensemble size
    ensemble_size = find_ensemble_size(model_rank, unique_id)

    print("")

    # Assign the ensemble size to the 'ensemble_size' column
    df_features.at[index, 'ensemble_size'] = ensemble_size

In [None]:
# Display the updated DataFrame
df_features

In [None]:
directory = 'dataset_preparation'
if not os.path.exists(directory):
    os.makedirs(directory)

# Specify the file path
file_path = os.path.join(directory, 'df_features_with_ensemble_size.csv')  # for CSV file

# Save the DataFrame
df_features.to_csv(file_path, index=False)  # for CSV file

print("CSV saved successfully")

Form X and Y for ensemble size training

In [None]:
# Find null values in each column
null_values_per_column = df_features.isnull().sum()

# Print the result
print("Null values per column:")
print(null_values_per_column)


In [None]:
# Replace null values with 0 in each column
df_features= df_features.fillna(0)



In [None]:
# Drop unique_id and ensemble_size from features
X = df_features.drop(['unique_id', 'ensemble_size'], axis=1)

# Set ensemble_size as target variable
y = df_features['ensemble_size']

Convert model_rank column to integer to pass for random forest

In [None]:
X['model_rank']

In [None]:
# Assign a numerical value to each model based on its position in the list
model_mapping = {model: i for i, model in enumerate(sorted(set(model for sublist in X['model_rank'] for model in sublist)))}

# Map the model names to numerical values
X['model_rank_encoded'] = X['model_rank'].apply(lambda x: [model_mapping[model] for model in x])

X['model_rank_encoded'] 

In [None]:
# Convert the list of numerical values into separate columns
encoded_models = pd.DataFrame(X['model_rank_encoded'].tolist(), columns=[f'model_{i+1}' for i in range(len(model_mapping))])
encoded_models 

In [None]:
# Concatenate the encoded models with the original DataFrame
X_final = pd.concat([X, encoded_models], axis=1)
X_final.head(2)

In [None]:
# Drop unique_id and ensemble_size from features
X_final = X_final.drop(['model_rank', 'model_rank_encoded'], axis=1)
X_final.head(2)

In [None]:
directory = 'dataset_preparation'
if not os.path.exists(directory):
    os.makedirs(directory)

# Specify the file path
file_path = os.path.join(directory, 'df_features_with_encoded_ranking.csv')  # for CSV file

# Save the DataFrame
X_final.to_csv(file_path, index=False)  # for CSV file

print("CSV saved successfully")

In [None]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_final, y, test_size=0.2, random_state=42, shuffle=True)

In [None]:
X_train.shape

In [None]:
X_train.dtypes

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import pickle

In [None]:
# Choose a Model
ensemble_size_random_forest_model = RandomForestClassifier()

# Train the model
ensemble_size_random_forest_model.fit(X_train, y_train)

# Evaluate the model
y_val_pred = ensemble_size_random_forest_model.predict(X_val)

# Combine y_val and y_val_pred into a DataFrame for comparison
comparison_df = pd.DataFrame({'Actual': y_val, 'Random_forst_Predicted': y_val_pred})
print(comparison_df)

In [None]:
# Choose a Model
ensemble_size_logistic_regression_model = LogisticRegression()

# Train the model
ensemble_size_logistic_regression_model.fit(X_train, y_train)

# Evaluate the model
y_val_pred_logistic = ensemble_size_logistic_regression_model.predict(X_val)

# Add the logistic regression predictions to the existing comparison_df
comparison_df['Logistic_Predicted'] = y_val_pred_logistic

# Print the updated DataFrame
print(comparison_df)


In [None]:
# Choose a Model
ensemble_size_svm_model = SVC()

# Train the model
ensemble_size_svm_model.fit(X_train, y_train)

# Evaluate the model
y_val_pred_svm = ensemble_size_svm_model.predict(X_val)

# Add the SVM predictions to the existing comparison_df
comparison_df['SVM_Predicted'] = y_val_pred_svm

# Print the updated DataFrame
print(comparison_df)


In [None]:
# Save the model as a pickle file
model_directory = "model_pickle"
if not os.path.exists(model_directory):
    os.makedirs(model_directory)

In [None]:
# Total number of predictions (total rows)
total_predictions = comparison_df.shape[0]

# Number of correctly classified predictions for each model
correct_random_forest = (comparison_df['Actual'] == comparison_df['Random_forst_Predicted']).sum()
correct_logistic_regression = (comparison_df['Actual'] == comparison_df['Logistic_Predicted']).sum()
correct_svm = (comparison_df['Actual'] == comparison_df['SVM_Predicted']).sum()

# Print the results
print("Total predictions:", total_predictions)
print("Correctly classified predictions for Random Forest:", correct_random_forest)
print("Correctly classified predictions for Logistic Regression:", correct_logistic_regression)
print("Correctly classified predictions for SVM:", correct_svm)


In [None]:
# Determine which model has the highest number of correct predictions
models_correct = {
    'Random Forest': correct_random_forest,
    'Logistic Regression': correct_logistic_regression,
    'SVM': correct_svm
}

best_model = max(models_correct, key=models_correct.get)

# Save the corresponding model to a pickle file
if best_model == 'Random Forest':
    model_filename = os.path.join(model_directory, 'random_forest_model.pkl')
    with open(model_filename, 'wb') as f:
        pickle.dump(ensemble_size_random_forest_model, f)
    print("Random Forest model saved as:", model_filename)
elif best_model == 'Logistic Regression':
    model_filename = os.path.join(model_directory, 'logistic_regression_model.pkl')
    with open(model_filename, 'wb') as f:
        pickle.dump(ensemble_size_logistic_regression_model, f)
    print("Logistic Regression model saved as:", model_filename)
else:
    model_filename = os.path.join(model_directory, 'svm_model.pkl')
    with open(model_filename, 'wb') as f:
        pickle.dump(ensemble_size_svm_model, f)
    print("SVM model saved as:", model_filename)