In [2]:
#!pip install tsfeatures

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import itertools
import random
import requests
import os
import json

from itertools import product
from datetime import datetime
from sklearn.impute import KNNImputer
from tsfeatures import tsfeatures
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

In [4]:
default_freq = 'H'

In [5]:
index_url = 'https://api.github.com/repos/numenta/NAB/contents/data'
# Fetching file names from the index URL
response = requests.get(index_url)
index_data = response.json()

directories = [file['name'] for file in index_data if file['type']=="dir"]
directories

['artificialNoAnomaly',
 'artificialWithAnomaly',
 'realAWSCloudwatch',
 'realAdExchange',
 'realKnownCause',
 'realTraffic',
 'realTweets']

In [6]:
base_url = 'https://raw.githubusercontent.com/numenta/NAB/master/data/'
data = {}

def addFolderAndReadAll(d_name):
    data[d_name] = {}
    response = requests.get(index_url + '/' + d_name)
    index_data = response.json()

    csv_files = [ file['name'] for file in index_data if file['type'] == "file"]
    csvs_num = 0
    for f_name in csv_files:
        data[d_name][f_name] = pd.read_csv(base_url + d_name + '/' + f_name)
        csvs_num += 1
    return csvs_num

csvs_num = sum([addFolderAndReadAll(d_name) for d_name in directories])

Preprocessing

In [7]:
# Function to get a random start date from the DataFrame index
def get_random_start_date(index):
    return np.random.choice(index)

# Main function to repeat the process until non-None frequency is obtained
def find_non_none_frequency(df, offset=9):
    while True:
        # Get a random start date from the DataFrame index
        start_date = pd.to_datetime(get_random_start_date(df.index))

        # Find the index of the end date by moving 9 steps through the indices
        end_date_index = df.index.get_loc(start_date) + offset

        # Check if the end date index is within the range of the DataFrame index
        if end_date_index < len(df.index):
            # Calculate the end date using the index
            end_date = df.index[end_date_index]

            # Infer frequency within the specified date range
            subset_df = df.loc[start_date:end_date]
            freq = pd.infer_freq(subset_df.index)

            if freq is not None:
                print("Inferred frequency within range", start_date, "-", end_date, ":", freq)
                return freq  # Exit the loop and return the inferred frequency

In [8]:
def max_consecutive_missing_dates(inferred_freq, missing_dates):
    # Function to check if two dates are consecutive based on the inferred frequency
    def are_consecutive(date1, date2, freq):
        # Calculate the difference between dates based on the inferred frequency
        diff = date2 - date1
        # Check if the difference matches the frequency
        if freq == 'D':
            return diff.days == 1
        elif freq.endswith('H')| freq.endswith('h'):
             # If the frequency ends with 'H', check if it represents hourly intervals
            if freq[:-1]:  # Check if there is a multiplier
                  interval = int(freq[:-1])
                  return diff.total_seconds() == interval * 3600
            else:
                   # If no multiplier is provided, it's assumed to be one hour
                   return diff.total_seconds() == 3600
        elif freq.endswith('T'):
            # Extract the interval from the frequency string
            interval = int(freq[:-1])
            return diff.seconds // 60 == interval
        else:
            raise ValueError("Unsupported frequency: {}".format(freq))

    # Initialize variables to track maximum length and current length
    max_consecutive_missing = 0
    current_consecutive_missing = 0

    # Iterate over the missing dates
    for i in range(1, len(missing_dates)):
        # Check if the current date is consecutive with the previous date
        if are_consecutive(missing_dates[i - 1], missing_dates[i], inferred_freq):
            # Increment current consecutive missing count
            current_consecutive_missing += 1
        else:
            # Update maximum consecutive missing count if needed
            max_consecutive_missing = max(max_consecutive_missing, current_consecutive_missing)
            # Reset current consecutive missing count
            current_consecutive_missing = 0

    # Update max_consecutive_missing if current_consecutive_missing is still greater
    max_consecutive_missing = max(max_consecutive_missing, current_consecutive_missing)

    return max_consecutive_missing

In [9]:
def preprocess(df, f_name):
    # Convert 'timestamp' column to datetime format and rename it to 'ds'
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Removing the duplicate rows
    df = df[~df.duplicated(keep='first')]

    duplicated_dates_length = len(df[df['timestamp'].duplicated(keep=False)])

    if  duplicated_dates_length > 0:
      print("Number of Duplicated Dates in "+ f_name + ": "+ str(duplicated_dates_length))
      # To make the mean as the value for the numerical columns if there are different values for a particular date
      df = df.groupby('timestamp').mean()
      # Reset index to bring 'timestamp' column back
      df.reset_index(inplace=True)

    df.set_index(['timestamp'], inplace=True)
    df.sort_index()

    # Create a date range with hourly frequency covering the entire time range
    start_date = df.index.min()
    end_date = df.index.max()

    #inferred_freq = pd.infer_freq(df.index)
    inferred_freq = find_non_none_frequency(df)

    if inferred_freq is None:
      inferred_freq = default_freq # setting the default frequency
      print("Cannot infer the frequency of the timestamp of the dataset "+ f_name+ " .Therefore the default frequency of " + default_freq+ " will be used")

    expected_date_range = pd.date_range(start=start_date, end=end_date, freq=inferred_freq)

    # Find the missing date entries
    missing_dates = expected_date_range[~expected_date_range.isin(df.index)]
    # Print or work with the list of missing dates
    print("Number of Missing Dates in "+ f_name + ": "+ str(len(missing_dates)))

    if len(missing_dates) > 0:
      df = df.asfreq(inferred_freq)
      df.sort_index()

      # Call the function with inferred_freq and missing_dates parameters
      max_consecutive = max_consecutive_missing_dates(inferred_freq, missing_dates)
      print("Maximum length of consecutive missing dates:", max_consecutive)
      if max_consecutive > 3:
        print("It is better to use other imputation method rather than linear interpolation")

      df['value'] = df['value'].interpolate(method='linear')

      print("")

    return df

In [10]:
url = 'https://raw.githubusercontent.com/numenta/NAB/master/labels/combined_labels.json'

response = requests.get(url)

if response.status_code == 200:
    labels = json.loads(response.text)
else:
    print("Failed to retrieve data from the URL:", response.status_code)

In [11]:
dir = 'realAdExchange'

# Ensure the directory exists in the data dictionary
if dir in data:
    # Loop through the files in the directory
    for f_name in data[dir]:
        print(f_name)
else:
    print(f"Directory '{dir}' not found in the data.")


exchange-2_cpc_results.csv
exchange-2_cpm_results.csv
exchange-3_cpc_results.csv
exchange-3_cpm_results.csv
exchange-4_cpc_results.csv
exchange-4_cpm_results.csv


In [12]:
for f_name in data[dir]:

    df = preprocess(data[dir][f_name], f_name)

    labels_of_one_file = labels[dir+'/'+f_name]

    df['is_anomaly'] = 0

    for anomalous_timestamp in labels_of_one_file:
      anomalous_timestamp = pd.to_datetime(anomalous_timestamp)
      try:
          df.at[anomalous_timestamp, 'is_anomaly'] = 1  # Set is_anomaly to 1 at the index location

      except KeyError:
          print(f"Anomalous timestamp {anomalous_timestamp} not found in data[{dir}][{f_name}] .")
          pass

    data[dir][f_name] = df

Number of Duplicated Dates in exchange-2_cpc_results.csv: 2
Inferred frequency within range 2011-07-20 11:00:01 - 2011-07-20 20:00:01 : h
Number of Missing Dates in exchange-2_cpc_results.csv: 25
Maximum length of consecutive missing dates: 19
It is better to use other imputation method rather than linear interpolation

Number of Duplicated Dates in exchange-2_cpm_results.csv: 2
Inferred frequency within range 2011-08-04 22:00:01 - 2011-08-05 07:00:01 : h
Number of Missing Dates in exchange-2_cpm_results.csv: 25
Maximum length of consecutive missing dates: 19
It is better to use other imputation method rather than linear interpolation

Inferred frequency within range 2011-08-09 19:15:01 - 2011-08-10 04:15:01 : h
Number of Missing Dates in exchange-3_cpc_results.csv: 109
Maximum length of consecutive missing dates: 14
It is better to use other imputation method rather than linear interpolation

Inferred frequency within range 2011-08-16 14:15:01 - 2011-08-16 23:15:01 : h
Number of Missi

In [13]:
#pip install -U kaleido

In [14]:
# Create a directory if it doesn't exist
output_folder = "visualization/pure_format"
os.makedirs(output_folder, exist_ok=True)

Visualization

V

In [15]:
import plotly.io as pio

for f_name in data[dir]:

    df = data[dir][f_name]
    # Create a figure using Plotly Express
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=df.index, y=df['value'], mode='lines', name='Value'))

    # Add scatter plot for anomalies
    anomalies = df[df['is_anomaly'] == 1]  # Filter DataFrame to get rows where is_anomaly is 1
    fig.add_trace(go.Scatter(x=anomalies.index, y=anomalies['value'], mode='markers', marker=dict(color='red'), name='Anomalies'))

    # Update layout
    fig.update_layout(title=f"{dir} / {f_name}", xaxis_title='Timestamp', yaxis_title='Value')

    # Show plot
    fig.show()

    # Save plot as PNG file
    file_path = os.path.join(output_folder, f"{dir}_{f_name}.png")
    pio.write_image(fig, file_path)


TS Feature Extraction

In [18]:
dfs = []

In [19]:
def extract_features(dir, file_name, new_df):
    new_df.index = new_df.index  # Set the index (you can perform operations here if needed)
    new_df.rename(columns={'value': 'y'}, inplace=True)
    new_df['unique_id'] = f"{dir}/{file_name}"  # Using the filename as unique identifier
    dfs.append(new_df)  # Append the modified DataFrame to the list
    return dfs

In [20]:
for file_name in data[dir]:
    new_df = data[dir][file_name].copy()
    extract_features(dir, file_name, new_df)

In [21]:
combined_df = pd.concat(dfs, ignore_index=True)
# Assuming tsfeatures function is defined elsewhere and imported
features = tsfeatures(combined_df, freq=288)
#features = tsfeatures(combined_df, dict_freqs={'T': 60, '2T': 30,'3T': 20, '4T': 15,'5T': 12,'10T': 6,'15T': 4,'20T': 3,'30T': 2, 'H': 24, '2H': 12,'3H': 8, '4H': 6,'6H': 4,'8H': 3,'12H': 2, 'D': 7, 'W': 52, 'M': 12})
df_features = pd.DataFrame(features)
df_features

Unnamed: 0,unique_id,hurst,series_length,unitroot_pp,unitroot_kpss,hw_alpha,hw_beta,hw_gamma,stability,nperiods,...,entropy,crossing_points,arch_lm,x_acf1,x_acf10,diff1_acf1,diff1_acf10,diff2_acf1,diff2_acf10,seas_acf1
0,realAdExchange/exchange-2_cpc_results.csv,0.906114,1648,-291.651096,3.322436,0.662955,3.689681e-10,2.177951e-10,0.164412,1,...,0.523456,219,0.333751,0.84456,1.723964,0.019727,0.033048,-0.4742,0.247259,0.533564
1,realAdExchange/exchange-2_cpm_results.csv,0.720425,1648,-352.407911,0.68889,0.413818,1.154891e-09,1.32262e-09,0.041318,1,...,0.483681,228,0.270601,0.823062,2.040337,-0.122537,0.066928,-0.590594,0.359987,0.601838
2,realAdExchange/exchange-3_cpc_results.csv,0.82741,1647,-689.70209,0.784406,0.248606,0.0,0.0,0.072987,1,...,0.813993,361,0.243241,0.595922,0.603031,-0.176902,0.101971,-0.466519,0.270492,0.155811
3,realAdExchange/exchange-3_cpm_results.csv,0.898996,1647,-377.537596,3.913879,0.716322,6.293205e-10,1.216726e-10,0.143555,1,...,0.662637,257,0.098898,0.764779,1.248005,-0.05856,0.040519,-0.454639,0.217878,0.290427
4,realAdExchange/exchange-4_cpc_results.csv,0.520317,1647,-1862.138673,1.727024,0.014715,1.600749e-13,4.137589e-09,0.026716,1,...,0.941542,287,0.003958,0.040633,0.032792,-0.504576,0.263658,-0.670898,0.492239,-0.000595
5,realAdExchange/exchange-4_cpm_results.csv,0.390779,1647,-1839.697103,0.181736,0.006275,2.6367e-12,5.935964e-15,0.011183,1,...,0.943788,315,0.005727,0.030553,0.026503,-0.50421,0.26334,-0.671191,0.494804,6e-05


In [22]:
directory = 'dataset_preparation'
if not os.path.exists(directory):
    os.makedirs(directory)

# Specify the file path
file_path = os.path.join(directory, 'df_features_pure_extraction.csv')  # for CSV file
# file_path = os.path.join(directory, 'df_features_pure_extraction.pkl')  # for pickle file

# Save the DataFrame
df_features.to_csv(file_path, index=False)  # for CSV file